Deploying to gh-pages from @ Farama-Foundation/Gymnasium@898b747dcc 🚀
0.27.0/.buildinfo (new file, +4)
@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 309340cd494e6110e8779bfdf20952bc
tags: d77d1c0d9ca2f4c8421862c7c5a0d620
0.27.0/.nojekyll (new file, empty)
0.27.0/404.html (new file, +661)
@@ -0,0 +1,661 @@
<!doctype html>
<html class="no-js" lang="en">
<head><meta charset="utf-8"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<meta name="color-scheme" content="light dark">
<meta name="description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)">
<meta property="og:title" content="Gymnasium Documentation" />
<meta property="og:type" content="website" />
<meta property="og:description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)" />
<meta property="og:url" content="https://gymnasium.farama.org/404.html" /><meta property="og:image" content="https://gymnasium.farama.org/_static/img/gymnasium-github.png" /><meta name="twitter:card" content="summary_large_image"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
<link rel="index" title="Index" href="/genindex/" /><link rel="search" title="Search" href="/search/" />
<link rel="canonical" href="https://gymnasium.farama.org/404.html" />

<link rel="shortcut icon" href="/_static/favicon.png"/><meta name="generator" content="sphinx-5.3.0, furo 2022.09.15.dev1"/>
<title>404 - Page Not Found - Gymnasium Documentation</title>
<link rel="stylesheet" type="text/css" href="/_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="/_static/styles/furo.css?digest=3cf7b839e8c50b5f3a39bb99d90baa7b845de926" />
<link rel="stylesheet" type="text/css" href="/_static/styles/furo-extensions.css?digest=91b9f2a71a58ed2481980f1e5725e16457fde93d" />
<style>
  body {
    --color-code-background: #f8f8f8;
    --color-code-foreground: black;
  }
  @media not print {
    body[data-theme="dark"] {
      --color-code-background: #202020;
      --color-code-foreground: #d0d0d0;
    }
    @media (prefers-color-scheme: dark) {
      body:not([data-theme="light"]) {
        --color-code-background: #202020;
        --color-code-foreground: #d0d0d0;
      }
    }
  }
</style></head>
|
||||
<body>
|
||||
<header class="farama-header" aria-label="Farama header">
|
||||
<div class="farama-header__container">
|
||||
<div class="farama-header__left--mobile">
|
||||
<label class="nav-overlay-icon" for="__navigation">
|
||||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||||
<defs></defs>
|
||||
<line x1="0.5" y1="4" x2="23.5" y2="4"></line>
|
||||
<line x1="0.232" y1="12" x2="23.5" y2="12"></line>
|
||||
<line x1="0.232" y1="20" x2="23.5" y2="20"></line>
|
||||
</svg>
|
||||
<!-- <svg viewBox="0 0 24 24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||||
<line x1="0.5" y1="4.5" x2="23.5" y2="4.5" style="fill: none; "></line>
|
||||
<line x1="0.5" y1="12" x2="14" y2="12" ></line>
|
||||
<line x1="0.5" y1="19.5" x2="23.5" y2="19.5"></line>
|
||||
<polyline style="stroke-width: 0px;" points="17 7 22 12 17 17"></polyline>
|
||||
</svg> -->
|
||||
<!-- <svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" style="width:20px">
|
||||
<defs></defs>
|
||||
<rect y="1" width="22" height="22" rx="2" ry="2" style="fill: none;" x="1"></rect>
|
||||
<line x1="8" y1="1" x2="8" y2="23"></line>
|
||||
<polyline style="stroke-linecap: round; fill: none; stroke-linejoin: round;" points="13 7 17 12 13 17"></polyline>
|
||||
</svg> -->
|
||||
</label>
|
||||
</div>
|
||||
<div class="farama-header__left farama-header__center--mobile">
|
||||
<a href="/">
|
||||
<img class="farama-header__logo only-light" src="/_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||||
<img class="farama-header__logo only-dark" src="/_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||||
</a>
|
||||
</div>
|
||||
<div class="farama-header__right">
|
||||
<div class="farama-header-menu">
|
||||
<button class="farama-header-menu__btn" aria-label="Open Farama Menu" aria-expanded="false" aria-haspopup="true" aria-controls="farama-menu">
|
||||
<img class="farama-white-logo-invert" src="/_static/img/farama-logo-header.svg">
|
||||
<svg viewBox="0 0 24 24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||||
<polyline style="stroke-linecap: round; stroke-linejoin: round; fill: none; stroke-width: 2px;" points="1 7 12 18 23 7"></polyline>
|
||||
</svg>
|
||||
</button>
|
||||
<div class="farama-header-menu-container farama-hidden" aria-hidden="true" id="farama-menu">
|
||||
<div class="farama-header-menu__header">
|
||||
<a href="https://farama.org">
|
||||
<img class="farama-header-menu__logo farama-white-logo-invert" src="/_static/img/farama_solid_white.svg" alt="Farama Foundation logo">
|
||||
<span>Farama Foundation</span>
|
||||
</a>
|
||||
<div class="farama-header-menu-header__right">
|
||||
<button id="farama-close-menu">
|
||||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" fill="none" stroke="currentColor"
|
||||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon-close">
|
||||
<line x1="3" y1="21" x2="21" y2="3"></line>
|
||||
<line x1="3" y1="3" x2="21" y2="21"></line>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="farama-header-menu__body">
|
||||
<!-- Response from farama.org/api/projects.json -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</header>
<script>
  document.body.dataset.theme = localStorage.getItem("theme") || "auto";
</script>
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||||
<symbol id="svg-toc" viewBox="0 0 24 24">
|
||||
<title>Contents</title>
|
||||
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
|
||||
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-menu" viewBox="0 0 24 24">
|
||||
<title>Menu</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
|
||||
<line x1="3" y1="12" x2="21" y2="12"></line>
|
||||
<line x1="3" y1="6" x2="21" y2="6"></line>
|
||||
<line x1="3" y1="18" x2="21" y2="18"></line>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
|
||||
<title>Expand</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
|
||||
<polyline points="9 18 15 12 9 6"></polyline>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-sun" viewBox="0 0 24 24">
|
||||
<title>Light mode</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
|
||||
<circle cx="12" cy="12" r="5"></circle>
|
||||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-moon" viewBox="0 0 24 24">
|
||||
<title>Dark mode</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
|
||||
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
|
||||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-sun-half" viewBox="0 0 24 24">
|
||||
<title>Auto light/dark mode</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
|
||||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||||
<circle cx="12" cy="12" r="9" />
|
||||
<path d="M13 12h5" />
|
||||
<path d="M13 15h4" />
|
||||
<path d="M13 18h1" />
|
||||
<path d="M13 9h4" />
|
||||
<path d="M13 6h1" />
|
||||
</svg>
|
||||
</symbol>
|
||||
</svg>
|
||||
|
||||
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
|
||||
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
|
||||
<label class="overlay sidebar-overlay" for="__navigation">
|
||||
<div class="visually-hidden">Hide navigation sidebar</div>
|
||||
</label>
|
||||
<label class="overlay toc-overlay" for="__toc">
|
||||
<div class="visually-hidden">Hide table of contents sidebar</div>
|
||||
</label>
|
||||
|
||||
|
||||
|
||||
<div class="page">
|
||||
<!--<header class="mobile-header">
|
||||
<div class="header-left">
|
||||
<label class="nav-overlay-icon" for="__navigation">
|
||||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||||
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
|
||||
</label>
|
||||
</div>
|
||||
<div class="header-center">
|
||||
<a href="/"><div class="brand">Gymnasium Documentation</div></a>
|
||||
</div>
|
||||
<div class="header-right">
|
||||
<div class="theme-toggle-container theme-toggle-header">
|
||||
<button class="theme-toggle">
|
||||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||||
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||||
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||||
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||||
</button>
|
||||
</div>
|
||||
<label class="toc-overlay-icon toc-header-icon no-toc" for="__toc">
|
||||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||||
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
|
||||
</label>
|
||||
</div>
|
||||
</header>-->
|
||||
<aside class="sidebar-drawer">
|
||||
<div class="sidebar-container">
|
||||
|
||||
<div class="sidebar-sticky"><a class="farama-sidebar__title" href="/">
|
||||
<img class="farama-header__logo only-light" src="/_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||||
<img class="farama-header__logo only-dark" src="/_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||||
</a><form class="sidebar-search-container" method="get" action="../search/" role="search">
|
||||
<input class="sidebar-search" placeholder=Search name="q" aria-label="Search">
|
||||
<input type="hidden" name="check_keywords" value="yes">
|
||||
<input type="hidden" name="area" value="default">
|
||||
</form>
|
||||
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
|
||||
<p class="caption" role="heading"><span class="caption-text">Introduction</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/content/basic_usage/">Basic Usage</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/content/gym_compatibility/">Compatibility with Gym</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/content/migration-guide/">v21 to v26 Migration Guide</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">API</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/api/env/">Env</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/api/registry/">Registry</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/api/spaces/">Spaces</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/spaces/fundamental/">Fundamental Spaces</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/spaces/composite/">Composite Spaces</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/spaces/utils/">Spaces Utils</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/spaces/vector_utils/">Spaces Vector Utils</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/api/wrappers/">Wrappers</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/wrappers/misc_wrappers/">Misc Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/wrappers/action_wrappers/">Action Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/wrappers/observation_wrappers/">Observation Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/wrappers/reward_wrappers/">Reward Wrappers</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/api/vector/">Vector</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/api/utils/">Utils</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/api/experimental/">Experimental</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" role="switch" type="checkbox"/><label for="toctree-checkbox-3"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/experimental/functional/">Functional Environment</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/experimental/wrappers/">Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/experimental/vector/">Vectorizing Environment</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/api/experimental/vector_wrappers/">Vector Environment Wrappers</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Environments</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/environments/classic_control/">Classic Control</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" role="switch" type="checkbox"/><label for="toctree-checkbox-4"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/classic_control/acrobot/">Acrobot</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/classic_control/cart_pole/">Cart Pole</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/classic_control/mountain_car_continuous/">Mountain Car Continuous</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/classic_control/mountain_car/">Mountain Car</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/classic_control/pendulum/">Pendulum</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/environments/box2d/">Box2D</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" role="switch" type="checkbox"/><label for="toctree-checkbox-5"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/box2d/bipedal_walker/">Bipedal Walker</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/box2d/car_racing/">Car Racing</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/box2d/lunar_lander/">Lunar Lander</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/environments/toy_text/">Toy Text</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" role="switch" type="checkbox"/><label for="toctree-checkbox-6"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/toy_text/blackjack/">Blackjack</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/toy_text/taxi/">Taxi</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/toy_text/cliff_walking/">Cliff Walking</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/toy_text/frozen_lake/">Frozen Lake</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/environments/mujoco/">MuJoCo</a><input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" role="switch" type="checkbox"/><label for="toctree-checkbox-7"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/ant/">Ant</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/half_cheetah/">Half Cheetah</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/hopper/">Hopper</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/humanoid_standup/">Humanoid Standup</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/humanoid/">Humanoid</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/inverted_double_pendulum/">Inverted Double Pendulum</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/inverted_pendulum/">Inverted Pendulum</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/reacher/">Reacher</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/swimmer/">Swimmer</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/pusher/">Pusher</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/mujoco/walker2d/">Walker2D</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="/environments/atari/">Atari</a><input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" role="switch" type="checkbox"/><label for="toctree-checkbox-8"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/adventure/">Adventure</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/air_raid/">Air Raid</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/alien/">Alien</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/amidar/">Amidar</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/assault/">Assault</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/asterix/">Asterix</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/asteroids/">Asteroids</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/atlantis/">Atlantis</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/bank_heist/">Bank Heist</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/battle_zone/">Battle Zone</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/beam_rider/">Beam Rider</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/berzerk/">Berzerk</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/bowling/">Bowling</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/boxing/">Boxing</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/breakout/">Breakout</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/carnival/">Carnival</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/centipede/">Centipede</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/chopper_command/">Chopper Command</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/crazy_climber/">Crazy Climber</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/defender/">Defender</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/demon_attack/">Demon Attack</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/double_dunk/">Double Dunk</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/elevator_action/">Elevator Action</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/enduro/">Enduro</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/fishing_derby/">FishingDerby</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/freeway/">Freeway</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/frostbite/">Frostbite</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/gopher/">Gopher</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/gravitar/">Gravitar</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/hero/">Hero</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/ice_hockey/">IceHockey</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/jamesbond/">Jamesbond</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/journey_escape/">JourneyEscape</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/kangaroo/">Kangaroo</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/krull/">Krull</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/kung_fu_master/">Kung Fu Master</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/montezuma_revenge/">Montezuma Revenge</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/ms_pacman/">Ms Pacman</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/name_this_game/">Name This Game</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/phoenix/">Phoenix</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/pitfall/">Pitfall</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/pong/">Pong</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/pooyan/">Pooyan</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/private_eye/">PrivateEye</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/qbert/">Qbert</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/riverraid/">Riverraid</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/road_runner/">Road Runner</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/robotank/">Robot Tank</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/seaquest/">Seaquest</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/skiing/">Skiings</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/solaris/">Solaris</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/space_invaders/">SpaceInvaders</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/star_gunner/">StarGunner</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/tennis/">Tennis</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/time_pilot/">TimePilot</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/tutankham/">Tutankham</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/up_n_down/">Up n’ Down</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/venture/">Venture</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/video_pinball/">Video Pinball</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/wizard_of_wor/">Wizard of Wor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="/environments/atari/zaxxon/">Zaxxon</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/environments/third_party_environments/">Third-party Environments</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Tutorials</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/tutorials/blackjack_tutorial/">Solving Blackjack with Q-Learning</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/tutorials/environment_creation/">Make your own custom environment</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/tutorials/handling_time_limits/">Handling Time Limits</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/tutorials/implementing_custom_wrappers/">Implementing Custom Wrappers</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="/tutorials/reinforce_invpend_gym_v26/">Training using REINFORCE for Mujoco</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Development</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Github</a></li>
|
||||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium/blob/main/docs/README.md">Contribute to the Docs</a></li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</aside>
|
||||
<div class="main">
|
||||
<div class="content">
|
||||
<div class="article-container">
|
||||
<a href="#" class="back-to-top muted-link">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
|
||||
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
|
||||
</svg>
|
||||
<span>Back to top</span>
|
||||
</a>
|
||||
<div class="content-icon-container">
|
||||
|
||||
|
||||
<div class="edit-this-page">
|
||||
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/edit/main/docs/404.md" title="Edit this page">
|
||||
<svg aria-hidden="true" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||||
<path d="M4 20h4l10.5 -10.5a1.5 1.5 0 0 0 -4 -4l-10.5 10.5v4" />
|
||||
<line x1="13.5" y1="6.5" x2="17.5" y2="10.5" />
|
||||
</svg>
|
||||
<span class="visually-hidden">Edit this page</span>
|
||||
</a>
|
||||
</div><div class="theme-toggle-container theme-toggle-content">
|
||||
<button class="theme-toggle">
|
||||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||||
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||||
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||||
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||||
</button>
|
||||
</div>
|
||||
<label class="toc-overlay-icon toc-content-icon no-toc" for="__toc">
|
||||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||||
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
|
||||
</label>
|
||||
</div>
|
||||
<article role="main">
|
||||
|
||||
<section id="page-not-found">
|
||||
<h1>404 - Page Not Found<a class="headerlink" href="#page-not-found" title="Permalink to this heading">#</a></h1>
|
||||
<section id="the-requested-page-could-not-be-found">
|
||||
<h2>The requested page could not be found.<a class="headerlink" href="#the-requested-page-could-not-be-found" title="Permalink to this heading">#</a></h2>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
</article>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="related-pages">
|
||||
|
||||
|
||||
</div>
|
||||
<div class="bottom-of-page">
|
||||
<div class="left-details">
|
||||
<div class="copyright">
|
||||
Copyright © 2022 Farama Foundation
|
||||
</div>
|
||||
<!--
|
||||
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
|
||||
|
||||
<a href="https://github.com/pradyunsg/furo">Furo</a>
|
||||
-->
|
||||
</div>
|
||||
<div class="right-details">
|
||||
<div class="icons">
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</footer>
|
||||
</div>
|
||||
<aside class="toc-drawer no-toc">
|
||||
|
||||
|
||||
|
||||
</aside>
|
||||
</div>
|
||||
</div>
|
||||
<script>
  const toggleMenu = () => {
    const menuBtn = document.querySelector(".farama-header-menu__btn");
    const menuContainer = document.querySelector(".farama-header-menu-container");
    if (document.querySelector(".farama-header-menu").classList.contains("active")) {
      menuBtn.ariaExpanded = "false";
      menuContainer.ariaHidden = "true";
    } else {
      menuBtn.ariaExpanded = "true";
      menuContainer.ariaHidden = "false";
    }
    document.querySelector(".farama-header-menu").classList.toggle("active");
  }

  document.querySelector(".farama-header-menu__btn").addEventListener("click", toggleMenu);
  document.getElementById("farama-close-menu").addEventListener("click", toggleMenu);
</script>
|
||||
|
||||
|
||||
<script>
|
||||
(() => {
|
||||
if (!localStorage.getItem("shownCookieAlert")) {
|
||||
const boxElem = document.createElement("div");
|
||||
boxElem.classList.add("cookie-alert");
|
||||
const containerElem = document.createElement("div");
|
||||
containerElem.classList.add("cookie-alert__container");
|
||||
const textElem = document.createElement("p");
|
||||
textElem.innerHTML = `This page uses <a href="https://analytics.google.com/">
|
||||
Google Analytics</a> to collect statistics. You can disable it by blocking
|
||||
the JavaScript coming from www.google-analytics.com.`;
|
||||
containerElem.appendChild(textElem);
|
||||
const closeBtn = document.createElement("button");
|
||||
closeBtn.innerHTML = `<?xml version="1.0" ?><svg viewBox="0 0 32 32" xmlns="http://www.w3.org/2000/svg"><defs><style>.cls-1{fill:none;stroke:#000;stroke-linecap:round;stroke-linejoin:round;stroke-width:2px;}</style></defs><title/><g id="cross"><line class="cls-1" x1="7" x2="25" y1="7" y2="25"/><line class="cls-1" x1="7" x2="25" y1="25" y2="7"/></g></svg>`
|
||||
closeBtn.onclick = () => {
|
||||
localStorage.setItem("shownCookieAlert", "true");
|
||||
boxElem.style.display = "none";
|
||||
}
|
||||
containerElem.appendChild(closeBtn);
|
||||
boxElem.appendChild(containerElem);
|
||||
document.body.appendChild(boxElem);
|
||||
}
|
||||
})()
|
||||
</script>
|
||||
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=G-6H9C8TWXZ8"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-6H9C8TWXZ8');
</script>
|
||||
|
||||
<script data-url_root="../" id="documentation_options" src="/_static/documentation_options.js"></script>
<script src="/_static/jquery.js"></script>
<script src="/_static/underscore.js"></script>
<script src="/_static/_sphinx_javascript_frameworks_compat.js"></script>
<script src="/_static/doctools.js"></script>
<script src="/_static/sphinx_highlight.js"></script>
<script src="/_static/scripts/furo.js"></script>
|
||||
|
||||
<script>
|
||||
const createCORSRequest = (method, url) => {
|
||||
let xhr = new XMLHttpRequest();
|
||||
xhr.responseType = 'json';
|
||||
|
||||
if ("withCredentials" in xhr) {
|
||||
xhr.open(method, url, true);
|
||||
} else if (typeof XDomainRequest != "undefined") {
|
||||
// IE8 & IE9
|
||||
xhr = new XDomainRequest();
|
||||
xhr.open(method, url);
|
||||
} else {
|
||||
// CORS not supported.
|
||||
xhr = null;
|
||||
}
|
||||
return xhr;
|
||||
};
|
||||
|
||||
const url = 'https://farama.org/api/projects.json';
|
||||
const imagesBasepath = "https://farama.org/assets/images"
|
||||
const method = 'GET';
|
||||
let xhr = createCORSRequest(method, url);
|
||||
|
||||
xhr.onload = () => {
|
||||
const jsonResponse = xhr.response;
|
||||
const sections = {
|
||||
"Documentation": [],
|
||||
"Mature Projects": [],
|
||||
"Incubating Projects": [],
|
||||
"Foundation": [
|
||||
{
|
||||
name: "About",
|
||||
link: "https://farama.org/about"
|
||||
},
|
||||
{
|
||||
name: "Standards",
|
||||
link: "https://farama.org/project_standards",
|
||||
},
|
||||
{
|
||||
name: "Donate",
|
||||
link: "https://farama.org/donations"
|
||||
}
|
||||
]
|
||||
}
|
||||
Object.keys(jsonResponse).forEach(key => {
|
||||
projectJson = jsonResponse[key];
|
||||
if (projectJson.website !== null) {
|
||||
projectJson.link = projectJson.website;
|
||||
sections["Documentation"].push(projectJson)
|
||||
} else if (projectJson.type == "mature") {
|
||||
projectJson.link = projectJson.github;
|
||||
sections["Mature Projects"].push(projectJson)
|
||||
} else {
|
||||
projectJson.link = projectJson.github;
|
||||
sections["Incubating Projects"].push(projectJson)
|
||||
}
|
||||
})
|
||||
|
||||
const menuContainer = document.querySelector(".farama-header-menu__body");
|
||||
|
||||
Object.keys(sections).forEach((key, i) => {
|
||||
projects = sections[key];
|
||||
const sectionElem = Object.assign(
|
||||
document.createElement('div'), {
|
||||
className:'farama-header-menu__section',
|
||||
style: "padding-left: 24px"
|
||||
}
|
||||
)
|
||||
sectionElem.appendChild(Object.assign(document.createElement('span'),
|
||||
{
|
||||
className:'farama-header-menu__section-title' ,
|
||||
innerText: key
|
||||
}
|
||||
))
|
||||
const ulElem = Object.assign(document.createElement('ul'),
|
||||
{
|
||||
className:'farama-header-menu-list',
|
||||
}
|
||||
)
|
||||
for (let project of projects) {
|
||||
const liElem = document.createElement("li");
|
||||
const aElem = Object.assign(document.createElement("a"),
|
||||
{
|
||||
href: project.link
|
||||
}
|
||||
);
|
||||
liElem.appendChild(aElem);
|
||||
if (key !== "Foundation") {
|
||||
const imgElem = Object.assign(document.createElement("img"),
|
||||
{
|
||||
src: project.image ? imagesBasepath + project.image : imagesBasepath + "/farama_black.svg",
|
||||
alt: `${project.name} logo`,
|
||||
className: "farama-black-logo-invert"
|
||||
}
|
||||
);
|
||||
aElem.appendChild(imgElem);
|
||||
}
|
||||
aElem.appendChild(document.createTextNode(project.name));
|
||||
ulElem.appendChild(liElem);
|
||||
}
|
||||
sectionElem.appendChild(ulElem);
|
||||
menuContainer.appendChild(sectionElem)
|
||||
});
|
||||
}
|
||||
|
||||
xhr.onerror = function() {
|
||||
console.error("Unable to load projects");
|
||||
};
|
||||
|
||||
xhr.send();
|
||||
</script>
|
||||
|
||||
|
||||
<script>
  const versioningConfig = {
    githubUser: 'Farama-Foundation',
    githubRepo: 'Gymnasium',
  };
  fetch('/_static/versioning/versioning_menu.html').then(response => {
    if (response.status === 200) {
      response.text().then(text => {
        const container = document.createElement("div");
        container.innerHTML = text;
        document.querySelector("body").appendChild(container);
        // innerHTML doesn't evaluate scripts, so we need to add them dynamically
        Array.from(container.querySelectorAll("script")).forEach(oldScript => {
          const newScript = document.createElement("script");
          Array.from(oldScript.attributes).forEach(attr => newScript.setAttribute(attr.name, attr.value));
          newScript.appendChild(document.createTextNode(oldScript.innerHTML));
          oldScript.parentNode.replaceChild(newScript, oldScript);
        });
      });
    } else {
      console.warn("Unable to load versioning menu", response);
    }
  });
</script></body>
</html>
0.27.0/CNAME (new file, +1)
@@ -0,0 +1 @@
gymnasium.farama.org
0.27.0/README/index.html (new file, +741)
@@ -0,0 +1,741 @@
<!doctype html>
<html class="no-js" lang="en">
<head><meta charset="utf-8"/>
<meta name="viewport" content="width=device-width,initial-scale=1"/>
<meta name="color-scheme" content="light dark">
<meta name="description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)">
<meta property="og:title" content="Gymnasium Documentation" />
<meta property="og:type" content="website" />
<meta property="og:description" content="A standard API for reinforcement learning and a diverse set of reference environments (formerly Gym)" />
<meta property="og:url" content="https://gymnasium.farama.org/README.html" /><meta property="og:image" content="https://gymnasium.farama.org/_static/img/gymnasium-github.png" /><meta name="twitter:card" content="summary_large_image"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
<link rel="index" title="Index" href="../genindex/" /><link rel="search" title="Search" href="../search/" />
<link rel="canonical" href="https://gymnasium.farama.org/README.html" />

<link rel="shortcut icon" href="../_static/favicon.png"/><meta name="generator" content="sphinx-5.3.0, furo 2022.09.15.dev1"/>
<title>Gymnasium-docs - Gymnasium Documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../_static/styles/furo.css?digest=3cf7b839e8c50b5f3a39bb99d90baa7b845de926" />
<link rel="stylesheet" type="text/css" href="../_static/styles/furo-extensions.css?digest=91b9f2a71a58ed2481980f1e5725e16457fde93d" />
<style>
  body {
    --color-code-background: #f8f8f8;
    --color-code-foreground: black;
  }
  @media not print {
    body[data-theme="dark"] {
      --color-code-background: #202020;
      --color-code-foreground: #d0d0d0;
    }
    @media (prefers-color-scheme: dark) {
      body:not([data-theme="light"]) {
        --color-code-background: #202020;
        --color-code-foreground: #d0d0d0;
      }
    }
  }
</style></head>
|
||||
<body>
|
||||
<header class="farama-header" aria-label="Farama header">
|
||||
<div class="farama-header__container">
|
||||
<div class="farama-header__left--mobile">
|
||||
<label class="nav-overlay-icon" for="__navigation">
|
||||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||||
<defs></defs>
|
||||
<line x1="0.5" y1="4" x2="23.5" y2="4"></line>
|
||||
<line x1="0.232" y1="12" x2="23.5" y2="12"></line>
|
||||
<line x1="0.232" y1="20" x2="23.5" y2="20"></line>
|
||||
</svg>
|
||||
<!-- <svg viewBox="0 0 24 24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||||
<line x1="0.5" y1="4.5" x2="23.5" y2="4.5" style="fill: none; "></line>
|
||||
<line x1="0.5" y1="12" x2="14" y2="12" ></line>
|
||||
<line x1="0.5" y1="19.5" x2="23.5" y2="19.5"></line>
|
||||
<polyline style="stroke-width: 0px;" points="17 7 22 12 17 17"></polyline>
|
||||
</svg> -->
|
||||
<!-- <svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" style="width:20px">
|
||||
<defs></defs>
|
||||
<rect y="1" width="22" height="22" rx="2" ry="2" style="fill: none;" x="1"></rect>
|
||||
<line x1="8" y1="1" x2="8" y2="23"></line>
|
||||
<polyline style="stroke-linecap: round; fill: none; stroke-linejoin: round;" points="13 7 17 12 13 17"></polyline>
|
||||
</svg> -->
|
||||
</label>
|
||||
</div>
|
||||
<div class="farama-header__left farama-header__center--mobile">
|
||||
<a href="../">
|
||||
<img class="farama-header__logo only-light" src="../_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||||
<img class="farama-header__logo only-dark" src="../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||||
</a>
|
||||
</div>
|
||||
<div class="farama-header__right">
|
||||
<div class="farama-header-menu">
|
||||
<button class="farama-header-menu__btn" aria-label="Open Farama Menu" aria-expanded="false" aria-haspopup="true" aria-controls="farama-menu">
|
||||
<img class="farama-white-logo-invert" src="../_static/img/farama-logo-header.svg">
|
||||
<svg viewBox="0 0 24 24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
|
||||
<polyline style="stroke-linecap: round; stroke-linejoin: round; fill: none; stroke-width: 2px;" points="1 7 12 18 23 7"></polyline>
|
||||
</svg>
|
||||
</button>
|
||||
<div class="farama-header-menu-container farama-hidden" aria-hidden="true" id="farama-menu">
|
||||
<div class="farama-header-menu__header">
|
||||
<a href="https://farama.org">
|
||||
<img class="farama-header-menu__logo farama-white-logo-invert" src="../_static/img/farama_solid_white.svg" alt="Farama Foundation logo">
|
||||
<span>Farama Foundation</span>
|
||||
</a>
|
||||
<div class="farama-header-menu-header__right">
|
||||
<button id="farama-close-menu">
|
||||
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" fill="none" stroke="currentColor"
|
||||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon-close">
|
||||
<line x1="3" y1="21" x2="21" y2="3"></line>
|
||||
<line x1="3" y1="3" x2="21" y2="21"></line>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="farama-header-menu__body">
|
||||
<!-- Response from farama.org/api/projects.json -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</header>
<script>
  document.body.dataset.theme = localStorage.getItem("theme") || "auto";
</script>
<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
|
||||
<symbol id="svg-toc" viewBox="0 0 24 24">
|
||||
<title>Contents</title>
|
||||
<svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 1024 1024">
|
||||
<path d="M408 442h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8zm-8 204c0 4.4 3.6 8 8 8h480c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8H408c-4.4 0-8 3.6-8 8v56zm504-486H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zm0 632H120c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h784c4.4 0 8-3.6 8-8v-56c0-4.4-3.6-8-8-8zM115.4 518.9L271.7 642c5.8 4.6 14.4.5 14.4-6.9V388.9c0-7.4-8.5-11.5-14.4-6.9L115.4 505.1a8.74 8.74 0 0 0 0 13.8z"/>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-menu" viewBox="0 0 24 24">
|
||||
<title>Menu</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-menu">
|
||||
<line x1="3" y1="12" x2="21" y2="12"></line>
|
||||
<line x1="3" y1="6" x2="21" y2="6"></line>
|
||||
<line x1="3" y1="18" x2="21" y2="18"></line>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-arrow-right" viewBox="0 0 24 24">
|
||||
<title>Expand</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather-chevron-right">
|
||||
<polyline points="9 18 15 12 9 6"></polyline>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-sun" viewBox="0 0 24 24">
|
||||
<title>Light mode</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="feather-sun">
|
||||
<circle cx="12" cy="12" r="5"></circle>
|
||||
<line x1="12" y1="1" x2="12" y2="3"></line>
|
||||
<line x1="12" y1="21" x2="12" y2="23"></line>
|
||||
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
|
||||
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
|
||||
<line x1="1" y1="12" x2="3" y2="12"></line>
|
||||
<line x1="21" y1="12" x2="23" y2="12"></line>
|
||||
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
|
||||
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-moon" viewBox="0 0 24 24">
|
||||
<title>Dark mode</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-moon">
|
||||
<path stroke="none" d="M0 0h24v24H0z" fill="none" />
|
||||
<path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z" />
|
||||
</svg>
|
||||
</symbol>
|
||||
<symbol id="svg-sun-half" viewBox="0 0 24 24">
|
||||
<title>Auto light/dark mode</title>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor"
|
||||
stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round" class="icon-tabler-shadow">
|
||||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||||
<circle cx="12" cy="12" r="9" />
|
||||
<path d="M13 12h5" />
|
||||
<path d="M13 15h4" />
|
||||
<path d="M13 18h1" />
|
||||
<path d="M13 9h4" />
|
||||
<path d="M13 6h1" />
|
||||
</svg>
|
||||
</symbol>
|
||||
</svg>
|
||||
|
||||
<input type="checkbox" class="sidebar-toggle" name="__navigation" id="__navigation">
|
||||
<input type="checkbox" class="sidebar-toggle" name="__toc" id="__toc">
|
||||
<label class="overlay sidebar-overlay" for="__navigation">
|
||||
<div class="visually-hidden">Hide navigation sidebar</div>
|
||||
</label>
|
||||
<label class="overlay toc-overlay" for="__toc">
|
||||
<div class="visually-hidden">Hide table of contents sidebar</div>
|
||||
</label>
|
||||
|
||||
|
||||
|
||||
<div class="page">
|
||||
<!--<header class="mobile-header">
|
||||
<div class="header-left">
|
||||
<label class="nav-overlay-icon" for="__navigation">
|
||||
<div class="visually-hidden">Toggle site navigation sidebar</div>
|
||||
<i class="icon"><svg><use href="#svg-menu"></use></svg></i>
|
||||
</label>
|
||||
</div>
|
||||
<div class="header-center">
|
||||
<a href="../"><div class="brand">Gymnasium Documentation</div></a>
|
||||
</div>
|
||||
<div class="header-right">
|
||||
<div class="theme-toggle-container theme-toggle-header">
|
||||
<button class="theme-toggle">
|
||||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||||
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||||
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||||
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||||
</button>
|
||||
</div>
|
||||
<label class="toc-overlay-icon toc-header-icon" for="__toc">
|
||||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||||
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
|
||||
</label>
|
||||
</div>
|
||||
</header>-->
|
||||
<aside class="sidebar-drawer">
|
||||
<div class="sidebar-container">
|
||||
|
||||
<div class="sidebar-sticky"><a class="farama-sidebar__title" href="../">
|
||||
<img class="farama-header__logo only-light" src="../_static/img/gymnasium_black.svg" alt="Light Logo"/>
|
||||
<img class="farama-header__logo only-dark" src="../_static/img/gymnasium_white.svg" alt="Dark Logo"/>
|
||||
<span class="farama-header__title">Gymnasium Documentation</span>
|
||||
</a><form class="sidebar-search-container" method="get" action="../search/" role="search">
|
||||
<input class="sidebar-search" placeholder=Search name="q" aria-label="Search">
|
||||
<input type="hidden" name="check_keywords" value="yes">
|
||||
<input type="hidden" name="area" value="default">
|
||||
</form>
|
||||
<div id="searchbox"></div><div class="sidebar-scroll"><div class="sidebar-tree">
|
||||
<p class="caption" role="heading"><span class="caption-text">Introduction</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../content/basic_usage/">Basic Usage</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../content/gym_compatibility/">Compatibility with Gym</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../content/migration-guide/">v21 to v26 Migration Guide</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">API</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../api/env/">Env</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../api/registry/">Registry</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../api/spaces/">Spaces</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" role="switch" type="checkbox"/><label for="toctree-checkbox-1"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/spaces/fundamental/">Fundamental Spaces</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/spaces/composite/">Composite Spaces</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/spaces/utils/">Spaces Utils</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/spaces/vector_utils/">Spaces Vector Utils</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../api/wrappers/">Wrappers</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" role="switch" type="checkbox"/><label for="toctree-checkbox-2"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/wrappers/misc_wrappers/">Misc Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/wrappers/action_wrappers/">Action Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/wrappers/observation_wrappers/">Observation Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/wrappers/reward_wrappers/">Reward Wrappers</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../api/vector/">Vector</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../api/utils/">Utils</a></li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../api/experimental/">Experimental</a><input class="toctree-checkbox" id="toctree-checkbox-3" name="toctree-checkbox-3" role="switch" type="checkbox"/><label for="toctree-checkbox-3"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/experimental/functional/">Functional Environment</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/experimental/wrappers/">Wrappers</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/experimental/vector/">Vectorizing Environment</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../api/experimental/vector_wrappers/">Vector Environment Wrappers</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Environments</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../environments/classic_control/">Classic Control</a><input class="toctree-checkbox" id="toctree-checkbox-4" name="toctree-checkbox-4" role="switch" type="checkbox"/><label for="toctree-checkbox-4"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/classic_control/acrobot/">Acrobot</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/classic_control/cart_pole/">Cart Pole</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/classic_control/mountain_car_continuous/">Mountain Car Continuous</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/classic_control/mountain_car/">Mountain Car</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/classic_control/pendulum/">Pendulum</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../environments/box2d/">Box2D</a><input class="toctree-checkbox" id="toctree-checkbox-5" name="toctree-checkbox-5" role="switch" type="checkbox"/><label for="toctree-checkbox-5"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/box2d/bipedal_walker/">Bipedal Walker</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/box2d/car_racing/">Car Racing</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/box2d/lunar_lander/">Lunar Lander</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../environments/toy_text/">Toy Text</a><input class="toctree-checkbox" id="toctree-checkbox-6" name="toctree-checkbox-6" role="switch" type="checkbox"/><label for="toctree-checkbox-6"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/toy_text/blackjack/">Blackjack</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/toy_text/taxi/">Taxi</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/toy_text/cliff_walking/">Cliff Walking</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/toy_text/frozen_lake/">Frozen Lake</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../environments/mujoco/">MuJoCo</a><input class="toctree-checkbox" id="toctree-checkbox-7" name="toctree-checkbox-7" role="switch" type="checkbox"/><label for="toctree-checkbox-7"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/ant/">Ant</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/half_cheetah/">Half Cheetah</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/hopper/">Hopper</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/humanoid_standup/">Humanoid Standup</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/humanoid/">Humanoid</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/inverted_double_pendulum/">Inverted Double Pendulum</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/inverted_pendulum/">Inverted Pendulum</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/reacher/">Reacher</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/swimmer/">Swimmer</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/pusher/">Pusher</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/mujoco/walker2d/">Walker2D</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1 has-children"><a class="reference internal" href="../environments/atari/">Atari</a><input class="toctree-checkbox" id="toctree-checkbox-8" name="toctree-checkbox-8" role="switch" type="checkbox"/><label for="toctree-checkbox-8"><div class="visually-hidden">Toggle child pages in navigation</div><i class="icon"><svg><use href="#svg-arrow-right"></use></svg></i></label><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/adventure/">Adventure</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/air_raid/">Air Raid</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/alien/">Alien</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/amidar/">Amidar</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/assault/">Assault</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/asterix/">Asterix</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/asteroids/">Asteroids</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/atlantis/">Atlantis</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/bank_heist/">Bank Heist</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/battle_zone/">Battle Zone</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/beam_rider/">Beam Rider</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/berzerk/">Berzerk</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/bowling/">Bowling</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/boxing/">Boxing</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/breakout/">Breakout</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/carnival/">Carnival</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/centipede/">Centipede</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/chopper_command/">Chopper Command</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/crazy_climber/">Crazy Climber</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/defender/">Defender</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/demon_attack/">Demon Attack</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/double_dunk/">Double Dunk</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/elevator_action/">Elevator Action</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/enduro/">Enduro</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/fishing_derby/">FishingDerby</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/freeway/">Freeway</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/frostbite/">Frostbite</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/gopher/">Gopher</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/gravitar/">Gravitar</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/hero/">Hero</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/ice_hockey/">IceHockey</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/jamesbond/">Jamesbond</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/journey_escape/">JourneyEscape</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/kangaroo/">Kangaroo</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/krull/">Krull</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/kung_fu_master/">Kung Fu Master</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/montezuma_revenge/">Montezuma Revenge</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/ms_pacman/">Ms Pacman</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/name_this_game/">Name This Game</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/phoenix/">Phoenix</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/pitfall/">Pitfall</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/pong/">Pong</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/pooyan/">Pooyan</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/private_eye/">PrivateEye</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/qbert/">Qbert</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/riverraid/">Riverraid</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/road_runner/">Road Runner</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/robotank/">Robot Tank</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/seaquest/">Seaquest</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/skiing/">Skiings</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/solaris/">Solaris</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/space_invaders/">SpaceInvaders</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/star_gunner/">StarGunner</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/tennis/">Tennis</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/time_pilot/">TimePilot</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/tutankham/">Tutankham</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/up_n_down/">Up n’ Down</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/venture/">Venture</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/video_pinball/">Video Pinball</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/wizard_of_wor/">Wizard of Wor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../environments/atari/zaxxon/">Zaxxon</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../environments/third_party_environments/">Third-party Environments</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Tutorials</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../tutorials/blackjack_tutorial/">Solving Blackjack with Q-Learning</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../tutorials/environment_creation/">Make your own custom environment</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../tutorials/handling_time_limits/">Handling Time Limits</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../tutorials/implementing_custom_wrappers/">Implementing Custom Wrappers</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../tutorials/reinforce_invpend_gym_v26/">Training using REINFORCE for Mujoco</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Development</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Github</a></li>
|
||||
<li class="toctree-l1"><a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium/blob/main/docs/README.md">Contribute to the Docs</a></li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</aside>
|
||||
<div class="main">
|
||||
<div class="content">
|
||||
<div class="article-container">
|
||||
<a href="#" class="back-to-top muted-link">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24">
|
||||
<path d="M13 20h-2V8l-5.5 5.5-1.42-1.42L12 4.16l7.92 7.92-1.42 1.42L13 8v12z"></path>
|
||||
</svg>
|
||||
<span>Back to top</span>
|
||||
</a>
|
||||
<div class="content-icon-container">
|
||||
|
||||
|
||||
<div class="edit-this-page">
|
||||
<a class="muted-link" href="https://github.com/Farama-Foundation/Gymnasium/edit/main/docs/README.md" title="Edit this page">
|
||||
<svg aria-hidden="true" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" fill="none" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path stroke="none" d="M0 0h24v24H0z" fill="none"/>
|
||||
<path d="M4 20h4l10.5 -10.5a1.5 1.5 0 0 0 -4 -4l-10.5 10.5v4" />
|
||||
<line x1="13.5" y1="6.5" x2="17.5" y2="10.5" />
|
||||
</svg>
|
||||
<span class="visually-hidden">Edit this page</span>
|
||||
</a>
|
||||
</div><div class="theme-toggle-container theme-toggle-content">
|
||||
<button class="theme-toggle">
|
||||
<div class="visually-hidden">Toggle Light / Dark / Auto color theme</div>
|
||||
<svg class="theme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
|
||||
<svg class="theme-icon-when-dark"><use href="#svg-moon"></use></svg>
|
||||
<svg class="theme-icon-when-light"><use href="#svg-sun"></use></svg>
|
||||
</button>
|
||||
</div>
|
||||
<label class="toc-overlay-icon toc-content-icon" for="__toc">
|
||||
<div class="visually-hidden">Toggle table of contents sidebar</div>
|
||||
<i class="icon"><svg><use href="#svg-toc"></use></svg></i>
|
||||
</label>
|
||||
</div>
|
||||
<article role="main">
|
||||
|
||||
<section id="gymnasium-docs">
|
||||
<h1>Gymnasium-docs<a class="headerlink" href="#gymnasium-docs" title="Permalink to this heading">#</a></h1>
|
||||
<p>This folder contains the documentation for <a class="reference external" href="https://github.com/Farama-Foundation/Gymnasium">Gymnasium</a>.</p>
|
||||
<p>If you are modifying a non-environment page or an Atari environment page, please open a PR against this repo. Otherwise, follow the steps below:</p>
|
||||
<section id="instructions-for-modifying-environment-pages">
|
||||
<h2>Instructions for modifying environment pages<a class="headerlink" href="#instructions-for-modifying-environment-pages" title="Permalink to this heading">#</a></h2>
|
||||
<section id="editing-an-environment-page">
|
||||
<h3>Editing an environment page<a class="headerlink" href="#editing-an-environment-page" title="Permalink to this heading">#</a></h3>
|
||||
<p>If you are editing an Atari environment, directly edit the Markdown file in this repository.</p>
|
||||
<p>Otherwise, fork Gymnasium and edit the docstring in the environment’s Python file. Then, pip install your Gymnasium fork and run <code class="docutils literal notranslate"><span class="pre">docs/scripts/gen_mds.py</span></code> in this repo. This will automatically generate a Markdown documentation file for the environment.</p>
|
||||
</section>
|
||||
<section id="adding-a-new-environment">
|
||||
<h3>Adding a new environment<a class="headerlink" href="#adding-a-new-environment" title="Permalink to this heading">#</a></h3>
|
||||
<section id="atari-env">
|
||||
<h4>Atari env<a class="headerlink" href="#atari-env" title="Permalink to this heading">#</a></h4>
|
||||
<p>For Atari envs, add a Markdown file into <code class="docutils literal notranslate"><span class="pre">pages/environments/atari</span></code> then complete the <strong>other steps</strong>.</p>
|
||||
</section>
|
||||
<section id="non-atari-env">
|
||||
<h4>Non-Atari env<a class="headerlink" href="#non-atari-env" title="Permalink to this heading">#</a></h4>
|
||||
<p>Ensure the environment is in Gymnasium (or your fork). Ensure that the environment’s Python file has a properly formatted Markdown docstring. Pip install Gymnasium (or your fork), then run <code class="docutils literal notranslate"><span class="pre">docs/scripts/gen_mds.py</span></code>. This will automatically generate a Markdown page for the environment. Then complete the <span class="xref myst">other steps</span>.</p>
|
||||
</section>
|
||||
<section id="other-steps">
|
||||
<h4>Other steps<a class="headerlink" href="#other-steps" title="Permalink to this heading">#</a></h4>
|
||||
<ul class="simple">
|
||||
<li><p>Add the corresponding GIF into the <code class="docutils literal notranslate"><span class="pre">docs/_static/videos/{ENV_TYPE}</span></code> folder, where <code class="docutils literal notranslate"><span class="pre">ENV_TYPE</span></code> is the category of your new environment (e.g. mujoco). Follow the snake_case naming convention. Alternatively, run <code class="docutils literal notranslate"><span class="pre">docs/scripts/gen_gifs.py</span></code>.</p></li>
|
||||
<li><p>Edit <code class="docutils literal notranslate"><span class="pre">docs/environments/{ENV_TYPE}/index.md</span></code>, and add the name of the file corresponding to your new environment to the <code class="docutils literal notranslate"><span class="pre">toctree</span></code>.</p></li>
|
||||
</ul>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
||||
<section id="build-the-documentation">
|
||||
<h2>Build the Documentation<a class="headerlink" href="#build-the-documentation" title="Permalink to this heading">#</a></h2>
|
||||
<p>Install the required packages and Gymnasium (or your fork):</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">pip</span> <span class="n">install</span> <span class="n">gymnasium</span>
|
||||
<span class="n">cd</span> <span class="n">docs</span>
|
||||
<span class="n">pip</span> <span class="n">install</span> <span class="o">-</span><span class="n">r</span> <span class="n">requirements</span><span class="o">.</span><span class="n">txt</span> <span class="c1"># To install document builder requirements.</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>To build the documentation once:</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">docs</span>
|
||||
<span class="n">make</span> <span class="n">dirhtml</span> <span class="n">_build</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>To rebuild the documentation automatically every time a change is made:</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">docs</span>
|
||||
<span class="n">sphinx</span><span class="o">-</span><span class="n">autobuild</span> <span class="o">-</span><span class="n">b</span> <span class="n">dirhtml</span> <span class="o">.</span> <span class="n">_build</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="writing-tutorials">
|
||||
<h2>Writing Tutorials<a class="headerlink" href="#writing-tutorials" title="Permalink to this heading">#</a></h2>
|
||||
<p>We use Sphinx-Gallery to build the tutorials inside the <code class="docutils literal notranslate"><span class="pre">docs/tutorials</span></code> directory. Check <code class="docutils literal notranslate"><span class="pre">docs/tutorials/demo.py</span></code> to see an example of a tutorial and <a class="reference external" href="https://sphinx-gallery.github.io/stable/syntax.html">Sphinx-Gallery documentation</a> for more information.</p>
|
||||
<p>To convert Jupyter Notebooks to the python tutorials you can use <a class="reference external" href="https://gist.github.com/mgoulao/f07f5f79f6cd9a721db8a34bba0a19a7">this script</a>.</p>
|
||||
<p>If you want Sphinx-Gallery to execute the tutorial (which adds outputs and plots), then the file name should start with <code class="docutils literal notranslate"><span class="pre">run_</span></code>. Note that this adds to the build time, so make sure the script doesn’t take more than a few seconds to execute.</p>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
</article>
|
||||
</div>
|
||||
<footer>
|
||||
|
||||
<div class="related-pages">
|
||||
|
||||
|
||||
</div>
|
||||
<div class="bottom-of-page">
|
||||
<div class="left-details">
|
||||
<div class="copyright">
|
||||
Copyright © 2022 Farama Foundation
|
||||
</div>
|
||||
<!--
|
||||
Made with <a href="https://www.sphinx-doc.org/">Sphinx</a> and <a class="muted-link" href="https://pradyunsg.me">@pradyunsg</a>'s
|
||||
|
||||
<a href="https://github.com/pradyunsg/furo">Furo</a>
|
||||
-->
|
||||
</div>
|
||||
<div class="right-details">
|
||||
<div class="icons">
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</footer>
|
||||
</div>
|
||||
<aside class="toc-drawer">
|
||||
|
||||
|
||||
<div class="toc-sticky toc-scroll">
|
||||
<div class="toc-title-container">
|
||||
<span class="toc-title">
|
||||
On this page
|
||||
</span>
|
||||
</div>
|
||||
<div class="toc-tree-container">
|
||||
<div class="toc-tree">
|
||||
<ul>
|
||||
<li><a class="reference internal" href="#">Gymnasium-docs</a><ul>
|
||||
<li><a class="reference internal" href="#instructions-for-modifying-environment-pages">Instructions for modifying environment pages</a><ul>
|
||||
<li><a class="reference internal" href="#editing-an-environment-page">Editing an environment page</a></li>
|
||||
<li><a class="reference internal" href="#adding-a-new-environment">Adding a new environment</a><ul>
|
||||
<li><a class="reference internal" href="#atari-env">Atari env</a></li>
|
||||
<li><a class="reference internal" href="#non-atari-env">Non-Atari env</a></li>
|
||||
<li><a class="reference internal" href="#other-steps">Other steps</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><a class="reference internal" href="#build-the-documentation">Build the Documentation</a></li>
|
||||
<li><a class="reference internal" href="#writing-tutorials">Writing Tutorials</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</aside>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
const toggleMenu = () => {
|
||||
const menuBtn = document.querySelector(".farama-header-menu__btn");
|
||||
const menuContainer = document.querySelector(".farama-header-menu-container");
|
||||
if (document.querySelector(".farama-header-menu").classList.contains("active")) {
|
||||
menuBtn.ariaExpanded = "false";
|
||||
menuContainer.ariaHidden = "true";
|
||||
} else {
|
||||
menuBtn.ariaExpanded = "true";
|
||||
menuContainer.ariaHidden = "false";
|
||||
}
|
||||
document.querySelector(".farama-header-menu").classList.toggle("active");
|
||||
}
|
||||
|
||||
document.querySelector(".farama-header-menu__btn").addEventListener("click", toggleMenu);
|
||||
document.getElementById("farama-close-menu").addEventListener("click", toggleMenu);
|
||||
</script>
|
||||
|
||||
|
||||
<script>
|
||||
(() => {
|
||||
if (!localStorage.getItem("shownCookieAlert")) {
|
||||
const boxElem = document.createElement("div");
|
||||
boxElem.classList.add("cookie-alert");
|
||||
const containerElem = document.createElement("div");
|
||||
containerElem.classList.add("cookie-alert__container");
|
||||
const textElem = document.createElement("p");
|
||||
textElem.innerHTML = `This page uses <a href="https://analytics.google.com/">
|
||||
Google Analytics</a> to collect statistics. You can disable it by blocking
|
||||
the JavaScript coming from www.google-analytics.com.`;
|
||||
containerElem.appendChild(textElem);
|
||||
const closeBtn = document.createElement("button");
|
||||
closeBtn.innerHTML = `<?xml version="1.0" ?><svg viewBox="0 0 32 32" xmlns="http://www.w3.org/2000/svg"><defs><style>.cls-1{fill:none;stroke:#000;stroke-linecap:round;stroke-linejoin:round;stroke-width:2px;}</style></defs><title/><g id="cross"><line class="cls-1" x1="7" x2="25" y1="7" y2="25"/><line class="cls-1" x1="7" x2="25" y1="25" y2="7"/></g></svg>`
|
||||
closeBtn.onclick = () => {
|
||||
localStorage.setItem("shownCookieAlert", "true");
|
||||
boxElem.style.display = "none";
|
||||
}
|
||||
containerElem.appendChild(closeBtn);
|
||||
boxElem.appendChild(containerElem);
|
||||
document.body.appendChild(boxElem);
|
||||
}
|
||||
})()
|
||||
</script>
|
||||
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=G-6H9C8TWXZ8"></script>
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
function gtag(){dataLayer.push(arguments);}
|
||||
gtag('js', new Date());
|
||||
|
||||
gtag('config', 'G-6H9C8TWXZ8');
|
||||
</script>
|
||||
|
||||
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
|
||||
<script src="../_static/jquery.js"></script>
|
||||
<script src="../_static/underscore.js"></script>
|
||||
<script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
|
||||
<script src="../_static/doctools.js"></script>
|
||||
<script src="../_static/sphinx_highlight.js"></script>
|
||||
<script src="../_static/scripts/furo.js"></script>
|
||||
|
||||
<script>
|
||||
const createCORSRequest = (method, url) => {
|
||||
let xhr = new XMLHttpRequest();
|
||||
xhr.responseType = 'json';
|
||||
|
||||
if ("withCredentials" in xhr) {
|
||||
xhr.open(method, url, true);
|
||||
} else if (typeof XDomainRequest != "undefined") {
|
||||
// IE8 & IE9
|
||||
xhr = new XDomainRequest();
|
||||
xhr.open(method, url);
|
||||
} else {
|
||||
// CORS not supported.
|
||||
xhr = null;
|
||||
}
|
||||
return xhr;
|
||||
};
|
||||
|
||||
const url = 'https://farama.org/api/projects.json';
|
||||
const imagesBasepath = "https://farama.org/assets/images"
|
||||
const method = 'GET';
|
||||
let xhr = createCORSRequest(method, url);
|
||||
|
||||
xhr.onload = () => {
|
||||
const jsonResponse = xhr.response;
|
||||
const sections = {
|
||||
"Documentation": [],
|
||||
"Mature Projects": [],
|
||||
"Incubating Projects": [],
|
||||
"Foundation": [
|
||||
{
|
||||
name: "About",
|
||||
link: "https://farama.org/about"
|
||||
},
|
||||
{
|
||||
name: "Standards",
|
||||
link: "https://farama.org/project_standards",
|
||||
},
|
||||
{
|
||||
name: "Donate",
|
||||
link: "https://farama.org/donations"
|
||||
}
|
||||
]
|
||||
}
|
||||
Object.keys(jsonResponse).forEach(key => {
|
||||
projectJson = jsonResponse[key];
|
||||
if (projectJson.website !== null) {
|
||||
projectJson.link = projectJson.website;
|
||||
sections["Documentation"].push(projectJson)
|
||||
} else if (projectJson.type == "mature") {
|
||||
projectJson.link = projectJson.github;
|
||||
sections["Mature Projects"].push(projectJson)
|
||||
} else {
|
||||
projectJson.link = projectJson.github;
|
||||
sections["Incubating Projects"].push(projectJson)
|
||||
}
|
||||
})
|
||||
|
||||
const menuContainer = document.querySelector(".farama-header-menu__body");
|
||||
|
||||
Object.keys(sections).forEach((key, i) => {
|
||||
projects = sections[key];
|
||||
const sectionElem = Object.assign(
|
||||
document.createElement('div'), {
|
||||
className:'farama-header-menu__section',
|
||||
style: "padding-left: 24px"
|
||||
}
|
||||
)
|
||||
sectionElem.appendChild(Object.assign(document.createElement('span'),
|
||||
{
|
||||
className:'farama-header-menu__section-title' ,
|
||||
innerText: key
|
||||
}
|
||||
))
|
||||
const ulElem = Object.assign(document.createElement('ul'),
|
||||
{
|
||||
className:'farama-header-menu-list',
|
||||
}
|
||||
)
|
||||
for (let project of projects) {
|
||||
const liElem = document.createElement("li");
|
||||
const aElem = Object.assign(document.createElement("a"),
|
||||
{
|
||||
href: project.link
|
||||
}
|
||||
);
|
||||
liElem.appendChild(aElem);
|
||||
if (key !== "Foundation") {
|
||||
const imgElem = Object.assign(document.createElement("img"),
|
||||
{
|
||||
src: project.image ? imagesBasepath + project.image : imagesBasepath + "/farama_black.svg",
|
||||
alt: `${project.name} logo`,
|
||||
className: "farama-black-logo-invert"
|
||||
}
|
||||
);
|
||||
aElem.appendChild(imgElem);
|
||||
}
|
||||
aElem.appendChild(document.createTextNode(project.name));
|
||||
ulElem.appendChild(liElem);
|
||||
}
|
||||
sectionElem.appendChild(ulElem);
|
||||
menuContainer.appendChild(sectionElem)
|
||||
});
|
||||
}
|
||||
|
||||
xhr.onerror = function() {
|
||||
console.error("Unable to load projects");
|
||||
};
|
||||
|
||||
xhr.send();
|
||||
</script>
|
||||
|
||||
|
||||
<script>
|
||||
const versioningConfig = {
|
||||
githubUser: 'Farama-Foundation',
|
||||
githubRepo: 'Gymnasium',
|
||||
};
|
||||
fetch('/_static/versioning/versioning_menu.html').then(response => {
|
||||
if (response.status === 200) {
|
||||
response.text().then(text => {
|
||||
const container = document.createElement("div");
|
||||
container.innerHTML = text;
|
||||
document.querySelector("body").appendChild(container);
|
||||
// innerHtml doenst evaluate scripts, we need to add them dynamically
|
||||
Array.from(container.querySelectorAll("script")).forEach(oldScript => {
|
||||
const newScript = document.createElement("script");
|
||||
Array.from(oldScript.attributes).forEach(attr => newScript.setAttribute(attr.name, attr.value));
|
||||
newScript.appendChild(document.createTextNode(oldScript.innerHTML));
|
||||
oldScript.parentNode.replaceChild(newScript, oldScript);
|
||||
});
|
||||
});
|
||||
} else {
|
||||
console.warn("Unable to load versioning menu", response);
|
||||
}
|
||||
});
|
||||
</script></body>
|
||||
</html>
|
@@ -0,0 +1,510 @@
|
||||
# fmt: off
|
||||
"""
|
||||
Make your own custom environment
|
||||
================================
|
||||
|
||||
This documentation gives an overview of creating new environments, along with the
relevant wrappers, utilities and tests included in Gymnasium that are designed
for the creation of new environments. You can clone gym-examples to play
|
||||
with the code that is presented here. We recommend that you use a virtual environment:
|
||||
|
||||
.. code:: console
|
||||
|
||||
git clone https://github.com/Farama-Foundation/gym-examples
|
||||
cd gym-examples
|
||||
python -m venv .env
|
||||
source .env/bin/activate
|
||||
pip install -e .
|
||||
|
||||
Subclassing gymnasium.Env
|
||||
-------------------------
|
||||
|
||||
Before learning how to create your own environment you should check out
|
||||
`the documentation of Gymnasium’s API </api/core>`__.
|
||||
|
||||
We will be concerned with a subset of gym-examples that looks like this:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
gym-examples/
|
||||
README.md
|
||||
setup.py
|
||||
gym_examples/
|
||||
__init__.py
|
||||
envs/
|
||||
__init__.py
|
||||
grid_world.py
|
||||
wrappers/
|
||||
__init__.py
|
||||
relative_position.py
|
||||
reacher_weighted_reward.py
|
||||
discrete_action.py
|
||||
clip_reward.py
|
||||
|
||||
To illustrate the process of subclassing ``gymnasium.Env``, we will
|
||||
implement a very simplistic game, called ``GridWorldEnv``. We will write
|
||||
the code for our custom environment in
|
||||
``gym-examples/gym_examples/envs/grid_world.py``. The environment
|
||||
consists of a 2-dimensional square grid of fixed size (specified via the
|
||||
``size`` parameter during construction). The agent can move vertically
|
||||
or horizontally between grid cells in each timestep. The goal of the
|
||||
agent is to navigate to a target on the grid that has been placed
|
||||
randomly at the beginning of the episode.
|
||||
|
||||
- Observations provide the location of the target and agent.
|
||||
- There are 4 actions in our environment, corresponding to the
|
||||
movements “right”, “up”, “left”, and “down”.
|
||||
- A done signal is issued as soon as the agent has navigated to the
|
||||
grid cell where the target is located.
|
||||
- Rewards are binary and sparse, meaning that the immediate reward is
|
||||
  always zero, unless the agent has reached the target, in which case it is 1.
|
||||
|
||||
An episode in this environment (with ``size=5``) might look like this,
where the blue dot is the agent and the red square represents the
target.
|
||||
|
||||
Let us look at the source code of ``GridWorldEnv`` piece by piece:
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Declaration and Initialization
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#
|
||||
# Our custom environment will inherit from the abstract class
|
||||
# ``gymnasium.Env``. You shouldn’t forget to add the ``metadata``
|
||||
# attribute to your class. There, you should specify the render-modes that
|
||||
# are supported by your environment (e.g. ``"human"``, ``"rgb_array"``,
|
||||
# ``"ansi"``) and the framerate at which your environment should be
|
||||
# rendered. Every environment should support ``None`` as render-mode; you
|
||||
# don’t need to add it in the metadata. In ``GridWorldEnv``, we will
|
||||
# support the modes “rgb_array” and “human” and render at 4 FPS.
|
||||
#
|
||||
# The ``__init__`` method of our environment will accept the integer
|
||||
# ``size``, which determines the size of the square grid. We will set up
|
||||
# some variables for rendering and define ``self.observation_space`` and
|
||||
# ``self.action_space``. In our case, observations should provide
|
||||
# information about the location of the agent and target on the
|
||||
# 2-dimensional grid. We will choose to represent observations in the form
|
||||
# of dictionaries with keys ``"agent"`` and ``"target"``. An observation
|
||||
# may look like ``{"agent": array([1, 0]), "target": array([0, 3])}``.
|
||||
# Since we have 4 actions in our environment (“right”, “up”, “left”,
|
||||
# “down”), we will use ``Discrete(4)`` as an action space. Here is the
|
||||
# declaration of ``GridWorldEnv`` and the implementation of ``__init__``:
|
||||
|
||||
import numpy as np
|
||||
import pygame
|
||||
|
||||
import gymnasium as gym
|
||||
from gymnasium import spaces
|
||||
|
||||
|
||||
class GridWorldEnv(gym.Env):
|
||||
metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}
|
||||
|
||||
def __init__(self, render_mode=None, size=5):
|
||||
self.size = size # The size of the square grid
|
||||
self.window_size = 512 # The size of the PyGame window
|
||||
|
||||
# Observations are dictionaries with the agent's and the target's location.
|
||||
        # Each location is encoded as an element of {0, ..., `size`-1}^2, i.e. a pair of grid coordinates.
|
||||
self.observation_space = spaces.Dict(
|
||||
{
|
||||
"agent": spaces.Box(0, size - 1, shape=(2,), dtype=int),
|
||||
"target": spaces.Box(0, size - 1, shape=(2,), dtype=int),
|
||||
}
|
||||
)
|
||||
|
||||
# We have 4 actions, corresponding to "right", "up", "left", "down"
|
||||
self.action_space = spaces.Discrete(4)
|
||||
|
||||
"""
|
||||
The following dictionary maps abstract actions from `self.action_space` to
|
||||
the direction we will walk in if that action is taken.
|
||||
I.e. 0 corresponds to "right", 1 to "up" etc.
|
||||
"""
|
||||
self._action_to_direction = {
|
||||
0: np.array([1, 0]),
|
||||
1: np.array([0, 1]),
|
||||
2: np.array([-1, 0]),
|
||||
3: np.array([0, -1]),
|
||||
}
|
||||
|
||||
assert render_mode is None or render_mode in self.metadata["render_modes"]
|
||||
self.render_mode = render_mode
|
||||
|
||||
"""
|
||||
If human-rendering is used, `self.window` will be a reference
|
||||
to the window that we draw to. `self.clock` will be a clock that is used
|
||||
to ensure that the environment is rendered at the correct framerate in
|
||||
human-mode. They will remain `None` until human-mode is used for the
|
||||
first time.
|
||||
"""
|
||||
self.window = None
|
||||
self.clock = None
|
||||
|
||||
# %%
|
||||
# Constructing Observations From Environment States
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#
|
||||
# Since we will need to compute observations both in ``reset`` and
|
||||
# ``step``, it is often convenient to have a (private) method ``_get_obs``
|
||||
# that translates the environment’s state into an observation. However,
|
||||
# this is not mandatory and you may as well compute observations in
|
||||
# ``reset`` and ``step`` separately:
|
||||
|
||||
def _get_obs(self):
|
||||
return {"agent": self._agent_location, "target": self._target_location}
|
||||
|
||||
# %%
|
||||
# We can also implement a similar method for the auxiliary information
|
||||
# that is returned by ``step`` and ``reset``. In our case, we would like
|
||||
# to provide the manhattan distance between the agent and the target:
|
||||
|
||||
def _get_info(self):
|
||||
return {
|
||||
"distance": np.linalg.norm(
|
||||
self._agent_location - self._target_location, ord=1
|
||||
)
|
||||
}
|
||||
|
||||
# %%
|
||||
# Oftentimes, info will also contain some data that is only available
|
||||
# inside the ``step`` method (e.g. individual reward terms). In that case,
|
||||
# we would have to update the dictionary that is returned by ``_get_info``
|
||||
# in ``step``.
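#
# A minimal sketch of this pattern inside ``step`` (the ``step_count`` variable is
# hypothetical and not part of ``GridWorldEnv``):
#
# .. code:: python
#
#     info = self._get_info()
#     info["step_count"] = step_count  # data that is only known inside ``step``
#     return observation, reward, terminated, False, info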
|
||||
|
||||
# %%
|
||||
# Reset
|
||||
# ~~~~~
|
||||
#
|
||||
# The ``reset`` method will be called to initiate a new episode. You may
|
||||
# assume that the ``step`` method will not be called before ``reset`` has
|
||||
# been called. Moreover, ``reset`` should be called whenever a done signal
|
||||
# has been issued. Users may pass the ``seed`` keyword to ``reset`` to
|
||||
# initialize any random number generator that is used by the environment
|
||||
# to a deterministic state. It is recommended to use the random number
|
||||
# generator ``self.np_random`` that is provided by the environment’s base
|
||||
# class, ``gymnasium.Env``. If you only use this RNG, you do not need to
|
||||
# worry much about seeding, *but you need to remember to call
|
||||
# ``super().reset(seed=seed)``* to make sure that ``gymnasium.Env``
|
||||
# correctly seeds the RNG. Once this is done, we can randomly set the
|
||||
# state of our environment. In our case, we randomly choose the agent’s
|
||||
# location and randomly sample target positions until the target does not
# coincide with the agent’s position.
|
||||
#
|
||||
# The ``reset`` method should return a tuple of the initial observation
|
||||
# and some auxiliary information. We can use the methods ``_get_obs`` and
|
||||
# ``_get_info`` that we implemented earlier for that:
|
||||
|
||||
def reset(self, seed=None, options=None):
|
||||
# We need the following line to seed self.np_random
|
||||
super().reset(seed=seed)
|
||||
|
||||
# Choose the agent's location uniformly at random
|
||||
self._agent_location = self.np_random.integers(0, self.size, size=2, dtype=int)
|
||||
|
||||
# We will sample the target's location randomly until it does not coincide with the agent's location
|
||||
self._target_location = self._agent_location
|
||||
while np.array_equal(self._target_location, self._agent_location):
|
||||
self._target_location = self.np_random.integers(
|
||||
0, self.size, size=2, dtype=int
|
||||
)
|
||||
|
||||
observation = self._get_obs()
|
||||
info = self._get_info()
|
||||
|
||||
if self.render_mode == "human":
|
||||
self._render_frame()
|
||||
|
||||
return observation, info
|
||||
|
||||
# %%
|
||||
# Step
|
||||
# ~~~~
|
||||
#
|
||||
# The ``step`` method usually contains most of the logic of your
|
||||
# environment. It accepts an ``action``, computes the state of the
|
||||
# environment after applying that action and returns the 5-tuple
# ``(observation, reward, terminated, truncated, info)``. Once the new state of the
# environment has been computed, we can check whether it is a terminal
# state and set ``terminated`` accordingly. Since we are using sparse binary
# rewards in ``GridWorldEnv``, computing ``reward`` is trivial once we
# know ``terminated``. To gather ``observation`` and ``info``, we can again make
|
||||
# use of ``_get_obs`` and ``_get_info``:
|
||||
|
||||
def step(self, action):
|
||||
# Map the action (element of {0,1,2,3}) to the direction we walk in
|
||||
direction = self._action_to_direction[action]
|
||||
# We use `np.clip` to make sure we don't leave the grid
|
||||
self._agent_location = np.clip(
|
||||
self._agent_location + direction, 0, self.size - 1
|
||||
)
|
||||
# An episode is done iff the agent has reached the target
|
||||
terminated = np.array_equal(self._agent_location, self._target_location)
|
||||
reward = 1 if terminated else 0 # Binary sparse rewards
|
||||
observation = self._get_obs()
|
||||
info = self._get_info()
|
||||
|
||||
if self.render_mode == "human":
|
||||
self._render_frame()
|
||||
|
||||
return observation, reward, terminated, False, info
|
||||
|
||||
# %%
|
||||
# Rendering
|
||||
# ~~~~~~~~~
|
||||
#
|
||||
# Here, we are using PyGame for rendering. A similar approach to rendering
|
||||
# is used in many environments that are included with Gymnasium and you
|
||||
# can use it as a skeleton for your own environments:
|
||||
|
||||
def render(self):
|
||||
if self.render_mode == "rgb_array":
|
||||
return self._render_frame()
|
||||
|
||||
def _render_frame(self):
|
||||
if self.window is None and self.render_mode == "human":
|
||||
pygame.init()
|
||||
pygame.display.init()
|
||||
self.window = pygame.display.set_mode(
|
||||
(self.window_size, self.window_size)
|
||||
)
|
||||
if self.clock is None and self.render_mode == "human":
|
||||
self.clock = pygame.time.Clock()
|
||||
|
||||
canvas = pygame.Surface((self.window_size, self.window_size))
|
||||
canvas.fill((255, 255, 255))
|
||||
pix_square_size = (
|
||||
self.window_size / self.size
|
||||
) # The size of a single grid square in pixels
|
||||
|
||||
# First we draw the target
|
||||
pygame.draw.rect(
|
||||
canvas,
|
||||
(255, 0, 0),
|
||||
pygame.Rect(
|
||||
pix_square_size * self._target_location,
|
||||
(pix_square_size, pix_square_size),
|
||||
),
|
||||
)
|
||||
# Now we draw the agent
|
||||
pygame.draw.circle(
|
||||
canvas,
|
||||
(0, 0, 255),
|
||||
(self._agent_location + 0.5) * pix_square_size,
|
||||
pix_square_size / 3,
|
||||
)
|
||||
|
||||
# Finally, add some gridlines
|
||||
for x in range(self.size + 1):
|
||||
pygame.draw.line(
|
||||
canvas,
|
||||
0,
|
||||
(0, pix_square_size * x),
|
||||
(self.window_size, pix_square_size * x),
|
||||
width=3,
|
||||
)
|
||||
pygame.draw.line(
|
||||
canvas,
|
||||
0,
|
||||
(pix_square_size * x, 0),
|
||||
(pix_square_size * x, self.window_size),
|
||||
width=3,
|
||||
)
|
||||
|
||||
if self.render_mode == "human":
|
||||
# The following line copies our drawings from `canvas` to the visible window
|
||||
self.window.blit(canvas, canvas.get_rect())
|
||||
pygame.event.pump()
|
||||
pygame.display.update()
|
||||
|
||||
# We need to ensure that human-rendering occurs at the predefined framerate.
|
||||
# The following line will automatically add a delay to keep the framerate stable.
|
||||
self.clock.tick(self.metadata["render_fps"])
|
||||
else: # rgb_array
|
||||
return np.transpose(
|
||||
np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
|
||||
)
|
||||
|
||||
# %%
|
||||
# Close
|
||||
# ~~~~~
|
||||
#
|
||||
# The ``close`` method should close any open resources that were used by
|
||||
# the environment. In many cases, you don’t actually have to bother to
|
||||
# implement this method. However, in our example ``render_mode`` may be
|
||||
# ``"human"`` and we might need to close the window that has been opened:
|
||||
|
||||
def close(self):
|
||||
if self.window is not None:
|
||||
pygame.display.quit()
|
||||
pygame.quit()
|
||||
|
||||
|
||||
# %%
|
||||
# In other environments ``close`` might also close files that were opened
|
||||
# or release other resources. You shouldn’t interact with the environment
|
||||
# after having called ``close``.
|
||||
|
||||
# %%
|
||||
# Registering Envs
|
||||
# ----------------
|
||||
#
|
||||
# In order for the custom environments to be detected by Gymnasium, they
|
||||
# must be registered as follows. We will choose to put this code in
|
||||
# ``gym-examples/gym_examples/__init__.py``.
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# from gymnasium.envs.registration import register
|
||||
#
|
||||
# register(
|
||||
# id="gym_examples/GridWorld-v0",
|
||||
# entry_point="gym_examples.envs:GridWorldEnv",
|
||||
# max_episode_steps=300,
|
||||
# )
|
||||
|
||||
# %%
|
||||
# The environment ID consists of three components, two of which are
|
||||
# optional: an optional namespace (here: ``gym_examples``), a mandatory
|
||||
# name (here: ``GridWorld``) and an optional but recommended version
|
||||
# (here: v0). It might have also been registered as ``GridWorld-v0`` (the
|
||||
# recommended approach), ``GridWorld`` or ``gym_examples/GridWorld``, and
|
||||
# the appropriate ID should then be used during environment creation.
|
||||
#
|
||||
# The keyword argument ``max_episode_steps=300`` will ensure that
|
||||
# GridWorld environments that are instantiated via ``gymnasium.make`` will
|
||||
# be wrapped in a ``TimeLimit`` wrapper (see `the wrapper
|
||||
# documentation </api/wrappers>`__ for more information). A done signal
|
||||
# will then be produced if the agent has reached the target *or* 300 steps
|
||||
# have been executed in the current episode. To distinguish truncation and
|
||||
# termination, you can check ``info["TimeLimit.truncated"]``.
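#
# For example, a minimal sketch (assuming the ``gym_examples`` package from this
# tutorial is installed) that runs one episode of the registered environment:
#
# .. code:: python
#
#     import gymnasium
#     import gym_examples  # importing the package runs the registration code
#
#     env = gymnasium.make("gym_examples/GridWorld-v0")  # wrapped in TimeLimit(300)
#     observation, info = env.reset(seed=42)
#     terminated = truncated = False
#     while not (terminated or truncated):
#         observation, reward, terminated, truncated, info = env.step(
#             env.action_space.sample()
#         )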
|
||||
#
|
||||
# Apart from ``id`` and ``entry_point``, you may pass the following
|
||||
# additional keyword arguments to ``register``:
|
||||
#
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
# | Name | Type | Default | Description |
|
||||
# +======================+===========+===========+===============================================================================================================+
|
||||
# | ``reward_threshold`` | ``float`` | ``None`` | The reward threshold before the task is considered solved |
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
# | ``nondeterministic`` | ``bool`` | ``False`` | Whether this environment is non-deterministic even after seeding |
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
# | ``max_episode_steps``| ``int`` | ``None`` | The maximum number of steps that an episode can consist of. If not ``None``, a ``TimeLimit`` wrapper is added |
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
# | ``order_enforce`` | ``bool`` | ``True`` | Whether to wrap the environment in an ``OrderEnforcing`` wrapper |
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
# | ``autoreset`` | ``bool`` | ``False`` | Whether to wrap the environment in an ``AutoResetWrapper`` |
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
# | ``kwargs`` | ``dict`` | ``{}`` | The default kwargs to pass to the environment class |
|
||||
# +----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+
|
||||
#
|
||||
# Most of these keywords (except for ``max_episode_steps``,
|
||||
# ``order_enforce``, ``autoreset`` and ``kwargs``) do not alter the behavior of
|
||||
# environment instances but merely provide some extra information about
|
||||
# your environment. After registration, our custom ``GridWorldEnv``
|
||||
# environment can be created with
|
||||
# ``env = gymnasium.make('gym_examples/GridWorld-v0')``.
|
||||
#
|
||||
# ``gym-examples/gym_examples/envs/__init__.py`` should have:
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# from gym_examples.envs.grid_world import GridWorldEnv
|
||||
#
|
||||
# If your environment is not registered, you may optionally pass a module
|
||||
# to import, that would register your environment before creating it like
|
||||
# this - ``env = gymnasium.make('module:Env-v0')``, where ``module``
|
||||
# contains the registration code. For the GridWorld env, the registration
|
||||
# code is run by importing ``gym_examples`` so if it were not possible to
|
||||
# import gym_examples explicitly, you could register it while making the
# environment via ``env = gymnasium.make('gym_examples:gym_examples/GridWorld-v0')``. This
# is especially useful when you’re allowed to pass only the environment ID
# into a third-party codebase (e.g. a learning library). This lets you
|
||||
# register your environment without needing to edit the library’s source
|
||||
# code.
|
||||
|
||||
# %%
|
||||
# Creating a Package
|
||||
# ------------------
|
||||
#
|
||||
# The last step is to structure our code as a Python package. This
|
||||
# involves configuring ``gym-examples/setup.py``. A minimal example of how
|
||||
# to do so is as follows:
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# from setuptools import setup
|
||||
#
|
||||
# setup(
|
||||
# name="gym_examples",
|
||||
# version="0.0.1",
|
||||
# install_requires=["gymnasium==0.26.0", "pygame==2.1.0"],
|
||||
# )
|
||||
#
|
||||
# Creating Environment Instances
|
||||
# ------------------------------
|
||||
#
|
||||
# After you have installed your package locally with
|
||||
# ``pip install -e gym-examples``, you can create an instance of the
|
||||
# environment via:
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# import gym_examples
|
||||
# env = gymnasium.make('gym_examples/GridWorld-v0')
|
||||
#
|
||||
# You can also pass keyword arguments of your environment’s constructor to
|
||||
# ``gymnasium.make`` to customize the environment. In our case, we could
|
||||
# do:
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# env = gymnasium.make('gym_examples/GridWorld-v0', size=10)
|
||||
#
|
||||
# Sometimes, you may find it more convenient to skip registration and call
|
||||
# the environment’s constructor yourself. Some may find this approach more
|
||||
# pythonic and environments that are instantiated like this are also
|
||||
# perfectly fine (but remember to add wrappers as well!).
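#
# A minimal sketch of this approach (mirroring what ``register`` and
# ``gymnasium.make`` would otherwise set up for you):
#
# .. code:: python
#
#     from gymnasium.wrappers import TimeLimit
#
#     from gym_examples.envs.grid_world import GridWorldEnv
#
#     env = TimeLimit(GridWorldEnv(size=10), max_episode_steps=300)
#     observation, info = env.reset(seed=42)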
|
||||
#
|
||||
# Using Wrappers
|
||||
# --------------
|
||||
#
|
||||
# Oftentimes, we want to use different variants of a custom environment,
|
||||
# or we want to modify the behavior of an environment that is provided by
|
||||
# Gymnasium or some other party. Wrappers allow us to do this without
|
||||
# changing the environment implementation or adding any boilerplate code.
|
||||
# Check out the `wrapper documentation </api/wrappers/>`__ for details on
|
||||
# how to use wrappers and instructions for implementing your own. In our
|
||||
# example, observations cannot be used directly in learning code because
|
||||
# they are dictionaries. However, we don’t actually need to touch our
|
||||
# environment implementation to fix this! We can simply add a wrapper on
|
||||
# top of environment instances to flatten observations into a single
|
||||
# array:
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# import gym_examples
|
||||
# from gymnasium.wrappers import FlattenObservation
|
||||
#
|
||||
# env = gymnasium.make('gym_examples/GridWorld-v0')
|
||||
# wrapped_env = FlattenObservation(env)
|
||||
# print(wrapped_env.reset()) # E.g. [3 0 3 3], {}
|
||||
#
|
||||
# Wrappers have the big advantage that they make environments highly
|
||||
# modular. For instance, instead of flattening the observations from
|
||||
# GridWorld, you might only want to look at the relative position of the
|
||||
# target and the agent. In the section on
|
||||
# `ObservationWrappers </api/wrappers/#observationwrapper>`__ we have
|
||||
# implemented a wrapper that does this job. This wrapper is also available
|
||||
# in gym-examples:
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# import gym_examples
|
||||
# from gym_examples.wrappers import RelativePosition
|
||||
#
|
||||
# env = gymnasium.make('gym_examples/GridWorld-v0')
|
||||
# wrapped_env = RelativePosition(env)
|
||||
# print(wrapped_env.reset()) # E.g. [-3 3], {}
|
@@ -0,0 +1,318 @@
|
||||
# fmt: off
|
||||
"""
|
||||
Training using REINFORCE for Mujoco
|
||||
===================================
|
||||
|
||||
.. image:: /_static/img/tutorials/reinforce_invpend_gym_v26_fig1.gif
|
||||
:width: 400
|
||||
:alt: agent-environment-diagram
|
||||
|
||||
This tutorial serves 2 purposes:
|
||||
1. To understand how to implement REINFORCE [1] from scratch to solve Mujoco's InvertedPendulum-v4
|
||||
2. To implement a deep reinforcement learning algorithm with Gymnasium's v0.26+ `step()` function
|
||||
|
||||
We will be using **REINFORCE**, one of the earliest policy gradient methods. Rather than taking on the burden of learning a value function first and then deriving a policy from it,
REINFORCE optimizes the policy directly. In other words, it is trained to increase the probability of actions that yield high Monte-Carlo returns. More on that later.
|
||||
|
||||
**Inverted Pendulum** is Mujoco's cartpole but now powered by the Mujoco physics simulator -
|
||||
which allows more complex experiments (such as varying the effects of gravity).
|
||||
This environment involves a cart that can be moved linearly, with a pole fixed to it at one end and the other end free.
The cart can be pushed left or right, and the goal is to balance the pole on top of the cart by applying forces to the cart.
More information on the environment can be found at https://gymnasium.farama.org/environments/mujoco/inverted_pendulum/
|
||||
|
||||
**Training Objectives**: To balance the pole (inverted pendulum) on top of the cart
|
||||
|
||||
**Actions**: The agent takes a 1D vector for actions. The action space is a continuous ``(action)`` in ``[-3, 3]``,
|
||||
where action represents the numerical force applied to the cart
|
||||
(with magnitude representing the amount of force and sign representing the direction)
|
||||
|
||||
**Approach**: We use PyTorch to code REINFORCE from scratch to train a Neural Network policy to master Inverted Pendulum.
|
||||
|
||||
An explanation of the Gymnasium v0.26+ `Env.step()` function
|
||||
|
||||
``env.step(A)`` allows us to take an action 'A' in the current environment 'env'. The environment then executes the action
|
||||
and returns five variables (a short usage sketch follows this list):
|
||||
|
||||
- ``next_obs``: This is the observation that the agent will receive after taking the action.
|
||||
- ``reward``: This is the reward that the agent will receive after taking the action.
|
||||
- ``terminated``: This is a boolean variable that indicates whether or not the environment has terminated.
|
||||
- ``truncated``: This is a boolean variable that indicates whether the episode ended early due to truncation, e.g., because a time limit was reached.
|
||||
- ``info``: This is a dictionary that might contain additional information about the environment.
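
A minimal usage sketch of this API (it uses the InvertedPendulum-v4 environment from this tutorial and assumes the MuJoCo dependencies are installed):

.. code:: python

    import gymnasium as gym

    env = gym.make("InvertedPendulum-v4")
    obs, info = env.reset(seed=42)
    next_obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    env.close()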
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributions.normal import Normal
|
||||
|
||||
import gymnasium as gym
|
||||
|
||||
|
||||
plt.rcParams["figure.figsize"] = (10, 5)
|
||||
|
||||
|
||||
# %%
|
||||
# Policy Network
|
||||
# ~~~~~~~~~~~~~~
|
||||
#
|
||||
# .. image:: /_static/img/tutorials/reinforce_invpend_gym_v26_fig2.png
|
||||
#
|
||||
# We start by building a policy that the agent will learn using REINFORCE.
|
||||
# A policy is a mapping from the current environment observation to a probability distribution of the actions to be taken.
|
||||
# The policy used in the tutorial is parameterized by a neural network. It consists of 2 linear layers that are shared between both the predicted mean and standard deviation.
|
||||
# On top of the shared layers, two separate linear layers estimate the mean and the standard deviation, respectively. ``nn.Tanh`` is used as a non-linearity between the hidden layers.
|
||||
# The following function estimates the mean and standard deviation of a normal distribution from which an action is sampled. Hence, the policy is expected to learn
|
||||
# appropriate weights to output means and standard deviation based on the current observation.
|
||||
|
||||
|
||||
class Policy_Network(nn.Module):
|
||||
"""Parametrized Policy Network."""
|
||||
|
||||
def __init__(self, obs_space_dims: int, action_space_dims: int):
|
||||
"""Initializes a neural network that estimates the mean and standard deviation
|
||||
of a normal distribution from which an action is sampled.
|
||||
|
||||
Args:
|
||||
obs_space_dims: Dimension of the observation space
|
||||
action_space_dims: Dimension of the action space
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
hidden_space1 = 16 # Nothing special with 16, feel free to change
|
||||
hidden_space2 = 32 # Nothing special with 32, feel free to change
|
||||
|
||||
# Shared Network
|
||||
self.shared_net = nn.Sequential(
|
||||
nn.Linear(obs_space_dims, hidden_space1),
|
||||
nn.Tanh(),
|
||||
nn.Linear(hidden_space1, hidden_space2),
|
||||
nn.Tanh(),
|
||||
)
|
||||
|
||||
# Policy Mean specific Linear Layer
|
||||
self.policy_mean_net = nn.Sequential(
|
||||
nn.Linear(hidden_space2, action_space_dims)
|
||||
)
|
||||
|
||||
# Policy Std Dev specific Linear Layer
|
||||
self.policy_stddev_net = nn.Sequential(
|
||||
nn.Linear(hidden_space2, action_space_dims)
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Conditioned on the observation, returns the mean and standard deviation
|
||||
of a normal distribution from which an action is sampled.
|
||||
|
||||
Args:
|
||||
x: Observation from the environment
|
||||
|
||||
Returns:
|
||||
action_means: predicted mean of the normal distribution
|
||||
action_stddevs: predicted standard deviation of the normal distribution
|
||||
"""
|
||||
shared_features = self.shared_net(x.float())
|
||||
|
||||
action_means = self.policy_mean_net(shared_features)
|
||||
action_stddevs = torch.log(
|
||||
1 + torch.exp(self.policy_stddev_net(shared_features))
|
||||
)
|
||||
|
||||
return action_means, action_stddevs
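

# %%
# As a quick sanity check (a sketch, not part of the training code below), the network can
# be probed with a dummy observation to confirm the output shapes:
#
# .. code:: python
#
#     net = Policy_Network(obs_space_dims=4, action_space_dims=1)
#     mean, stddev = net(torch.zeros(1, 4))
#     # both tensors have shape (1, 1) for InvertedPendulum-v4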
|
||||
|
||||
|
||||
# %%
|
||||
# Building an agent
|
||||
# ~~~~~~~~~~~~~~~~~
|
||||
#
|
||||
# .. image:: /_static/img/tutorials/reinforce_invpend_gym_v26_fig3.jpeg
|
||||
#
|
||||
# Now that we are done building the policy, let us develop **REINFORCE** which gives life to the policy network.
|
||||
# The algorithm of REINFORCE can be found above. As mentioned before, REINFORCE aims to maximize the expected Monte-Carlo return.
|
||||
#
|
||||
# Fun Fact: REINFORCE is an acronym for "'RE'ward 'I'ncrement 'N'on-negative 'F'actor times 'O'ffset 'R'einforcement times 'C'haracteristic 'E'ligibility".
|
||||
#
|
||||
# Note: The choice of hyperparameters is to train a decently performing agent. No extensive hyperparameter
|
||||
# tuning was done.
|
||||
#
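#
# Concretely, after every episode the ``update`` method below first computes the discounted
# return for each timestep,
#
# .. math::
#     G_t = \sum_{k=t}^{T} \gamma^{k-t} r_k
#
# and then minimizes the surrogate loss
#
# .. math::
#     L(\theta) = - \sum_{t} \log \pi_{\theta}(a_t | s_t) \, G_t
#
# which amounts to gradient ascent on the expected return.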
|
||||
|
||||
|
||||
class REINFORCE:
|
||||
"""REINFORCE algorithm."""
|
||||
|
||||
def __init__(self, obs_space_dims: int, action_space_dims: int):
|
||||
"""Initializes an agent that learns a policy via REINFORCE algorithm [1]
|
||||
to solve the task at hand (Inverted Pendulum v4).
|
||||
|
||||
Args:
|
||||
obs_space_dims: Dimension of the observation space
|
||||
action_space_dims: Dimension of the action space
|
||||
"""
|
||||
|
||||
# Hyperparameters
|
||||
self.learning_rate = 1e-4 # Learning rate for policy optimization
|
||||
self.gamma = 0.99 # Discount factor
|
||||
self.eps = 1e-6 # small number for mathematical stability
|
||||
|
||||
self.probs = [] # Stores probability values of the sampled action
|
||||
self.rewards = [] # Stores the corresponding rewards
|
||||
|
||||
self.net = Policy_Network(obs_space_dims, action_space_dims)
|
||||
self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=self.learning_rate)
|
||||
|
||||
def sample_action(self, state: np.ndarray) -> float:
|
||||
"""Returns an action, conditioned on the policy and observation.
|
||||
|
||||
Args:
|
||||
state: Observation from the environment
|
||||
|
||||
Returns:
|
||||
action: Action to be performed
|
||||
"""
|
||||
state = torch.tensor(np.array([state]))
|
||||
action_means, action_stddevs = self.net(state)
|
||||
|
||||
# create a normal distribution from the predicted
|
||||
# mean and standard deviation and sample an action
|
||||
distrib = Normal(action_means[0] + self.eps, action_stddevs[0] + self.eps)
|
||||
action = distrib.sample()
|
||||
prob = distrib.log_prob(action)
|
||||
|
||||
action = action.numpy()
|
||||
|
||||
self.probs.append(prob)
|
||||
|
||||
return action
|
||||
|
||||
def update(self):
|
||||
"""Updates the policy network's weights."""
|
||||
running_g = 0
|
||||
gs = []
|
||||
|
||||
# Discounted return (backwards) - [::-1] will return an array in reverse
|
||||
for R in self.rewards[::-1]:
|
||||
running_g = R + self.gamma * running_g
|
||||
gs.insert(0, running_g)
|
||||
|
||||
deltas = torch.tensor(gs)
|
||||
|
||||
loss = 0
|
||||
# minimize -1 * log_prob * discounted return (i.e. gradient ascent on the expected return)
|
||||
for log_prob, delta in zip(self.probs, deltas):
|
||||
loss += log_prob.mean() * delta * (-1)
|
||||
|
||||
# Update the policy network
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
|
||||
# Empty / zero out all episode-centric/related variables
|
||||
self.probs = []
|
||||
self.rewards = []
|
||||
|
||||
|
||||
# %%
|
||||
# Now let's train the policy using REINFORCE to master the task of Inverted Pendulum.
|
||||
#
|
||||
# Following is the overview of the training procedure
|
||||
#
|
||||
# for seed in random seeds
|
||||
# reinitialize agent
|
||||
#
|
||||
# for episode in range of max number of episodes
|
||||
# until episode is done
|
||||
# sample action based on current observation
|
||||
#
|
||||
# take action and receive reward and next observation
|
||||
#
|
||||
# store the action taken, its probability, and the observed reward
|
||||
# update the policy
|
||||
#
|
||||
# Note: Deep RL is fairly brittle with respect to random seeds in a lot of common use cases (https://spinningup.openai.com/en/latest/spinningup/spinningup.html).
|
||||
# Hence it is important to test out various seeds, which we will be doing.
|
||||
|
||||
|
||||
# Create and wrap the environment
|
||||
env = gym.make("InvertedPendulum-v4")
|
||||
wrapped_env = gym.wrappers.RecordEpisodeStatistics(env, 50) # Records episode-reward
|
||||
|
||||
total_num_episodes = int(5e3) # Total number of episodes
|
||||
# Observation-space of InvertedPendulum-v4 (4)
|
||||
obs_space_dims = env.observation_space.shape[0]
|
||||
# Action-space of InvertedPendulum-v4 (1)
|
||||
action_space_dims = env.action_space.shape[0]
|
||||
rewards_over_seeds = []
|
||||
|
||||
for seed in [1, 2, 3, 5, 8]: # Fibonacci seeds
|
||||
# set seed
|
||||
torch.manual_seed(seed)
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
|
||||
# Reinitialize agent every seed
|
||||
agent = REINFORCE(obs_space_dims, action_space_dims)
|
||||
reward_over_episodes = []
|
||||
|
||||
for episode in range(total_num_episodes):
|
||||
# In Gymnasium v26+, environments are seeded by passing ``seed`` to ``reset()`` (the separate ``seed()`` method was removed)
|
||||
obs, info = wrapped_env.reset(seed=seed)
|
||||
|
||||
done = False
|
||||
while not done:
|
||||
action = agent.sample_action(obs)
|
||||
|
||||
# Step return type - `tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]`
|
||||
# These represent the next observation, the reward from the step,
|
||||
# if the episode is terminated, if the episode is truncated and
|
||||
# additional info from the step
|
||||
obs, reward, terminated, truncated, info = wrapped_env.step(action)
|
||||
agent.rewards.append(reward)
|
||||
|
||||
# End the episode when either truncated or terminated is true
|
||||
# - truncated: The episode duration reaches max number of timesteps
|
||||
# - terminated: Any of the state space values is no longer finite, or the pole's vertical angle exceeds 0.2 radians.
|
||||
done = terminated or truncated
|
||||
|
||||
reward_over_episodes.append(wrapped_env.return_queue[-1])
|
||||
agent.update()
|
||||
|
||||
if episode % 1000 == 0:
|
||||
avg_reward = int(np.mean(wrapped_env.return_queue))
|
||||
print("Episode:", episode, "Average Reward:", avg_reward)
|
||||
|
||||
rewards_over_seeds.append(reward_over_episodes)
|
||||
|
||||
|
||||
# %%
|
||||
# Plot learning curve
|
||||
# ~~~~~~~~~~~~~~~~~~~
|
||||
#
|
||||
|
||||
rewards_to_plot = [[reward[0] for reward in rewards] for rewards in rewards_over_seeds]
|
||||
df1 = pd.DataFrame(rewards_to_plot).melt()
|
||||
df1.rename(columns={"variable": "episodes", "value": "reward"}, inplace=True)
|
||||
sns.set(style="darkgrid", context="talk", palette="rainbow")
|
||||
sns.lineplot(x="episodes", y="reward", data=df1).set(
|
||||
title="REINFORCE for InvertedPendulum-v4"
|
||||
)
|
||||
plt.show()
|
||||
|
||||
# %%
|
||||
# .. image:: /_static/img/tutorials/reinforce_invpend_gym_v26_fig4.png
|
||||
#
|
||||
# Author: Siddarth Chandrasekar
|
||||
#
|
||||
# License: MIT License
|
||||
#
|
||||
# References
|
||||
# ~~~~~~~~~~
|
||||
#
|
||||
# [1] Williams, Ronald J. “Simple statistical gradient-following
|
||||
# algorithms for connectionist reinforcement learning.” Machine Learning 8
|
||||
# (1992): 229-256.
|
||||
#
|
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Handling Time Limits
|
||||
====================
|
||||
|
||||
When using Gymnasium environments with reinforcement learning code, a common problem is that time limits are handled incorrectly. The ``done`` signal received (in previous versions of OpenAI Gym < 0.26) from ``env.step`` indicated whether an episode had ended. However, this signal did not distinguish whether the episode ended due to ``termination`` or ``truncation``.
|
||||
|
||||
Termination
|
||||
-----------
|
||||
|
||||
Termination refers to the episode ending after reaching a terminal state that is defined as part of the environment
|
||||
definition. Examples are task success, task failure, the robot falling down, etc. Notably, this also includes episodes
|
||||
ending in finite-horizon environments due to a time-limit inherent to the environment. Note that to preserve Markov
|
||||
property, a representation of the remaining time must be present in the agent's observation in finite-horizon environments.
|
||||
`(Reference) <https://arxiv.org/abs/1712.00378>`_
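
When such a time limit is part of the environment definition, one way to keep the problem
Markovian is to make the elapsed time visible to the agent. Gymnasium ships an observation
wrapper for this; a minimal sketch (wrapper behaviour may vary slightly between versions):

.. code:: python

    import gymnasium as gym
    from gymnasium.wrappers import TimeAwareObservation

    env = TimeAwareObservation(gym.make("CartPole-v1"))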
|
||||
|
||||
Truncation
|
||||
----------
|
||||
|
||||
Truncation refers to the episode ending after an externally defined condition (that is outside the scope of the Markov
|
||||
Decision Process). This could be a time-limit, a robot going out of bounds etc.
|
||||
|
||||
An infinite-horizon environment is an obvious example of where this is needed. We cannot wait forever for the episode
|
||||
to complete, so we set a practical time-limit after which we forcibly halt the episode. The last state in this case is
|
||||
not a terminal state since it has a non-zero transition probability of moving to another state as per the Markov
|
||||
Decision Process that defines the RL problem. This is also different from time-limits in finite horizon environments
|
||||
as the agent in this case has no idea about this time-limit.
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Importance in learning code
|
||||
# ---------------------------
|
||||
# Bootstrapping (using one or more estimated values of a variable to update estimates of the same variable) is a key
|
||||
# aspect of Reinforcement Learning. A value function will tell you how much discounted reward you will get from a
|
||||
# particular state if you follow a given policy. When an episode stops at any given point, by looking at the value of
|
||||
# the final state, the agent is able to estimate how much discounted reward could have been obtained if the episode had
|
||||
# continued. This is an example of handling truncation.
|
||||
#
|
||||
# More formally, a common example of bootstrapping in RL is updating the estimate of the Q-value function,
|
||||
#
|
||||
# .. math::
|
||||
# Q_{target}(o_t, a_t) = r_t + \gamma \cdot \max_{a'} Q(o_{t+1}, a')
|
||||
#
|
||||
#
|
||||
# In classical RL, the new ``Q`` estimate is a weighted average of the previous ``Q`` estimate and ``Q_target`` while in Deep
|
||||
# Q-Learning, the error between ``Q_target`` and the previous ``Q`` estimate is minimized.
|
||||
#
|
||||
# However, at the terminal state, bootstrapping is not done,
|
||||
#
|
||||
# .. math::
|
||||
# Q_{target}(o_t, a_t) = r_t
|
||||
#
|
||||
# This is where the distinction between termination and truncation becomes important. When an episode ends due to
|
||||
# termination we don't bootstrap; when it ends due to truncation, we do.
|
||||
#
|
||||
# While using gymnasium environments, the ``done`` signal (default for < v0.26) is frequently used to determine whether to
|
||||
# bootstrap or not. However, this is incorrect since it does not differentiate between termination and truncation.
|
||||
#
|
||||
# A simple example of value functions is shown below. This is an illustrative example and not part of any specific algorithm.
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# # INCORRECT
|
||||
# vf_target = rew + gamma * (1 - done) * vf_next_state
|
||||
#
|
||||
# This is incorrect in the case of episode ending due to a truncation, where bootstrapping needs to happen but it doesn't.
|
||||
|
||||
# %%
|
||||
# Solution
|
||||
# ----------
|
||||
#
|
||||
# From v0.26 onwards, Gymnasium's ``env.step`` API returns both termination and truncation information explicitly.
|
||||
# In previous versions, truncation information was supplied through the info key ``TimeLimit.truncated``.
|
||||
# The correct way to handle terminations and truncations now is,
|
||||
#
|
||||
# .. code:: python
|
||||
#
|
||||
# # terminated = done and 'TimeLimit.truncated' not in info
|
||||
# # This was needed in previous versions.
|
||||
#
|
||||
# vf_target = rew + gamma * (1 - terminated) * vf_next_state
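#
# A slightly fuller sketch (illustrative pseudocode, not tied to any particular agent
# implementation) of how the two flags are typically used inside a training loop; here
# ``policy``, ``vf`` and ``gamma`` stand in for your own policy, value function and
# discount factor:
#
# .. code:: python
#
#     obs, info = env.reset()
#     while True:
#         action = policy(obs)
#         next_obs, rew, terminated, truncated, info = env.step(action)
#
#         # bootstrap from the next state unless the episode truly terminated
#         vf_target = rew + gamma * (1 - terminated) * vf(next_obs)
#
#         if terminated or truncated:
#             break
#         obs = next_obs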
|
@@ -0,0 +1,204 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n# Make your own custom environment\n\nThis documentation overviews creating new environments and relevant\nuseful wrappers, utilities and tests included in Gymnasium designed for\nthe creation of new environments. You can clone gym-examples to play\nwith the code that is presented here. We recommend that you use a virtual environment:\n\n.. code:: console\n\n git clone https://github.com/Farama-Foundation/gym-examples\n cd gym-examples\n python -m venv .env\n source .env/bin/activate\n pip install -e .\n\n## Subclassing gymnasium.Env\n\nBefore learning how to create your own environment you should check out\n[the documentation of Gymnasium\u2019s API](/api/core)_.\n\nWe will be concerned with a subset of gym-examples that looks like this:\n\n.. code:: sh\n\n gym-examples/\n README.md\n setup.py\n gym_examples/\n __init__.py\n envs/\n __init__.py\n grid_world.py\n wrappers/\n __init__.py\n relative_position.py\n reacher_weighted_reward.py\n discrete_action.py\n clip_reward.py\n\nTo illustrate the process of subclassing ``gymnasium.Env``, we will\nimplement a very simplistic game, called ``GridWorldEnv``. We will write\nthe code for our custom environment in\n``gym-examples/gym_examples/envs/grid_world.py``. The environment\nconsists of a 2-dimensional square grid of fixed size (specified via the\n``size`` parameter during construction). The agent can move vertically\nor horizontally between grid cells in each timestep. The goal of the\nagent is to navigate to a target on the grid that has been placed\nrandomly at the beginning of the episode.\n\n- Observations provide the location of the target and agent.\n- There are 4 actions in our environment, corresponding to the\n movements \u201cright\u201d, \u201cup\u201d, \u201cleft\u201d, and \u201cdown\u201d.\n- A done signal is issued as soon as the agent has navigated to the\n grid cell where the target is located.\n- Rewards are binary and sparse, meaning that the immediate reward is\n always zero, unless the agent has reached the target, then it is 1.\n\nAn episode in this environment (with ``size=5``) might look like this:\n\nwhere the blue dot is the agent and the red square represents the\ntarget.\n\nLet us look at the source code of ``GridWorldEnv`` piece by piece:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Declaration and Initialization\n\nOur custom environment will inherit from the abstract class\n``gymnasium.Env``. You shouldn\u2019t forget to add the ``metadata``\nattribute to your class. There, you should specify the render-modes that\nare supported by your environment (e.g.\u00a0``\"human\"``, ``\"rgb_array\"``,\n``\"ansi\"``) and the framerate at which your environment should be\nrendered. Every environment should support ``None`` as render-mode; you\ndon\u2019t need to add it in the metadata. In ``GridWorldEnv``, we will\nsupport the modes \u201crgb_array\u201d and \u201chuman\u201d and render at 4 FPS.\n\nThe ``__init__`` method of our environment will accept the integer\n``size``, that determines the size of the square grid. We will set up\nsome variables for rendering and define ``self.observation_space`` and\n``self.action_space``. In our case, observations should provide\ninformation about the location of the agent and target on the\n2-dimensional grid. We will choose to represent observations in the form\nof dictionaries with keys ``\"agent\"`` and ``\"target\"``. An observation\nmay look like ``{\"agent\": array([1, 0]), \"target\": array([0, 3])}``.\nSince we have 4 actions in our environment (\u201cright\u201d, \u201cup\u201d, \u201cleft\u201d,\n\u201cdown\u201d), we will use ``Discrete(4)`` as an action space. Here is the\ndeclaration of ``GridWorldEnv`` and the implementation of ``__init__``:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\nimport pygame\n\nimport gymnasium as gym\nfrom gymnasium import spaces\n\n\nclass GridWorldEnv(gym.Env):\n metadata = {\"render_modes\": [\"human\", \"rgb_array\"], \"render_fps\": 4}\n\n def __init__(self, render_mode=None, size=5):\n self.size = size # The size of the square grid\n self.window_size = 512 # The size of the PyGame window\n\n # Observations are dictionaries with the agent's and the target's location.\n # Each location is encoded as an element of {0, ..., `size`}^2, i.e. MultiDiscrete([size, size]).\n self.observation_space = spaces.Dict(\n {\n \"agent\": spaces.Box(0, size - 1, shape=(2,), dtype=int),\n \"target\": spaces.Box(0, size - 1, shape=(2,), dtype=int),\n }\n )\n\n # We have 4 actions, corresponding to \"right\", \"up\", \"left\", \"down\"\n self.action_space = spaces.Discrete(4)\n\n \"\"\"\n The following dictionary maps abstract actions from `self.action_space` to\n the direction we will walk in if that action is taken.\n I.e. 0 corresponds to \"right\", 1 to \"up\" etc.\n \"\"\"\n self._action_to_direction = {\n 0: np.array([1, 0]),\n 1: np.array([0, 1]),\n 2: np.array([-1, 0]),\n 3: np.array([0, -1]),\n }\n\n assert render_mode is None or render_mode in self.metadata[\"render_modes\"]\n self.render_mode = render_mode\n\n \"\"\"\n If human-rendering is used, `self.window` will be a reference\n to the window that we draw to. `self.clock` will be a clock that is used\n to ensure that the environment is rendered at the correct framerate in\n human-mode. They will remain `None` until human-mode is used for the\n first time.\n \"\"\"\n self.window = None\n self.clock = None"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Constructing Observations From Environment States\n\nSince we will need to compute observations both in ``reset`` and\n``step``, it is often convenient to have a (private) method ``_get_obs``\nthat translates the environment\u2019s state into an observation. However,\nthis is not mandatory and you may as well compute observations in\n``reset`` and ``step`` separately:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _get_obs(self):\n return {\"agent\": self._agent_location, \"target\": self._target_location}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can also implement a similar method for the auxiliary information\nthat is returned by ``step`` and ``reset``. In our case, we would like\nto provide the manhattan distance between the agent and the target:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _get_info(self):\n return {\n \"distance\": np.linalg.norm(\n self._agent_location - self._target_location, ord=1\n )\n }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Oftentimes, info will also contain some data that is only available\ninside the ``step`` method (e.g.\u00a0individual reward terms). In that case,\nwe would have to update the dictionary that is returned by ``_get_info``\nin ``step``.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Reset\n\nThe ``reset`` method will be called to initiate a new episode. You may\nassume that the ``step`` method will not be called before ``reset`` has\nbeen called. Moreover, ``reset`` should be called whenever a done signal\nhas been issued. Users may pass the ``seed`` keyword to ``reset`` to\ninitialize any random number generator that is used by the environment\nto a deterministic state. It is recommended to use the random number\ngenerator ``self.np_random`` that is provided by the environment\u2019s base\nclass, ``gymnasium.Env``. If you only use this RNG, you do not need to\nworry much about seeding, *but you need to remember to call\n``super().reset(seed=seed)``* to make sure that ``gymnasium.Env``\ncorrectly seeds the RNG. Once this is done, we can randomly set the\nstate of our environment. In our case, we randomly choose the agent\u2019s\nlocation and the random sample target positions, until it does not\ncoincide with the agent\u2019s position.\n\nThe ``reset`` method should return a tuple of the initial observation\nand some auxiliary information. We can use the methods ``_get_obs`` and\n``_get_info`` that we implemented earlier for that:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def reset(self, seed=None, options=None):\n # We need the following line to seed self.np_random\n super().reset(seed=seed)\n\n # Choose the agent's location uniformly at random\n self._agent_location = self.np_random.integers(0, self.size, size=2, dtype=int)\n\n # We will sample the target's location randomly until it does not coincide with the agent's location\n self._target_location = self._agent_location\n while np.array_equal(self._target_location, self._agent_location):\n self._target_location = self.np_random.integers(\n 0, self.size, size=2, dtype=int\n )\n\n observation = self._get_obs()\n info = self._get_info()\n\n if self.render_mode == \"human\":\n self._render_frame()\n\n return observation, info"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Step\n\nThe ``step`` method usually contains most of the logic of your\nenvironment. It accepts an ``action``, computes the state of the\nenvironment after applying that action and returns the 4-tuple\n``(observation, reward, done, info)``. Once the new state of the\nenvironment has been computed, we can check whether it is a terminal\nstate and we set ``done`` accordingly. Since we are using sparse binary\nrewards in ``GridWorldEnv``, computing ``reward`` is trivial once we\nknow ``done``. To gather ``observation`` and ``info``, we can again make\nuse of ``_get_obs`` and ``_get_info``:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def step(self, action):\n # Map the action (element of {0,1,2,3}) to the direction we walk in\n direction = self._action_to_direction[action]\n # We use `np.clip` to make sure we don't leave the grid\n self._agent_location = np.clip(\n self._agent_location + direction, 0, self.size - 1\n )\n # An episode is done iff the agent has reached the target\n terminated = np.array_equal(self._agent_location, self._target_location)\n reward = 1 if terminated else 0 # Binary sparse rewards\n observation = self._get_obs()\n info = self._get_info()\n\n if self.render_mode == \"human\":\n self._render_frame()\n\n return observation, reward, terminated, False, info"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Rendering\n\nHere, we are using PyGame for rendering. A similar approach to rendering\nis used in many environments that are included with Gymnasium and you\ncan use it as a skeleton for your own environments:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def render(self):\n if self.render_mode == \"rgb_array\":\n return self._render_frame()\n\n def _render_frame(self):\n if self.window is None and self.render_mode == \"human\":\n pygame.init()\n pygame.display.init()\n self.window = pygame.display.set_mode(\n (self.window_size, self.window_size)\n )\n if self.clock is None and self.render_mode == \"human\":\n self.clock = pygame.time.Clock()\n\n canvas = pygame.Surface((self.window_size, self.window_size))\n canvas.fill((255, 255, 255))\n pix_square_size = (\n self.window_size / self.size\n ) # The size of a single grid square in pixels\n\n # First we draw the target\n pygame.draw.rect(\n canvas,\n (255, 0, 0),\n pygame.Rect(\n pix_square_size * self._target_location,\n (pix_square_size, pix_square_size),\n ),\n )\n # Now we draw the agent\n pygame.draw.circle(\n canvas,\n (0, 0, 255),\n (self._agent_location + 0.5) * pix_square_size,\n pix_square_size / 3,\n )\n\n # Finally, add some gridlines\n for x in range(self.size + 1):\n pygame.draw.line(\n canvas,\n 0,\n (0, pix_square_size * x),\n (self.window_size, pix_square_size * x),\n width=3,\n )\n pygame.draw.line(\n canvas,\n 0,\n (pix_square_size * x, 0),\n (pix_square_size * x, self.window_size),\n width=3,\n )\n\n if self.render_mode == \"human\":\n # The following line copies our drawings from `canvas` to the visible window\n self.window.blit(canvas, canvas.get_rect())\n pygame.event.pump()\n pygame.display.update()\n\n # We need to ensure that human-rendering occurs at the predefined framerate.\n # The following line will automatically add a delay to keep the framerate stable.\n self.clock.tick(self.metadata[\"render_fps\"])\n else: # rgb_array\n return np.transpose(\n np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)\n )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Close\n\nThe ``close`` method should close any open resources that were used by\nthe environment. In many cases, you don\u2019t actually have to bother to\nimplement this method. However, in our example ``render_mode`` may be\n``\"human\"`` and we might need to close the window that has been opened:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def close(self):\n if self.window is not None:\n pygame.display.quit()\n pygame.quit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In other environments ``close`` might also close files that were opened\nor release other resources. You shouldn\u2019t interact with the environment\nafter having called ``close``.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Registering Envs\n\nIn order for the custom environments to be detected by Gymnasium, they\nmust be registered as follows. We will choose to put this code in\n``gym-examples/gym_examples/__init__.py``.\n\n.. code:: python\n\n from gymnasium.envs.registration import register\n\n register(\n id=\"gym_examples/GridWorld-v0\",\n entry_point=\"gym_examples.envs:GridWorldEnv\",\n max_episode_steps=300,\n )\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The environment ID consists of three components, two of which are\noptional: an optional namespace (here: ``gym_examples``), a mandatory\nname (here: ``GridWorld``) and an optional but recommended version\n(here: v0). It might have also been registered as ``GridWorld-v0`` (the\nrecommended approach), ``GridWorld`` or ``gym_examples/GridWorld``, and\nthe appropriate ID should then be used during environment creation.\n\nThe keyword argument ``max_episode_steps=300`` will ensure that\nGridWorld environments that are instantiated via ``gymnasium.make`` will\nbe wrapped in a ``TimeLimit`` wrapper (see [the wrapper\ndocumentation](/api/wrappers)_ for more information). A done signal\nwill then be produced if the agent has reached the target *or* 300 steps\nhave been executed in the current episode. To distinguish truncation and\ntermination, you can check ``info[\"TimeLimit.truncated\"]``.\n\nApart from ``id`` and ``entrypoint``, you may pass the following\nadditional keyword arguments to ``register``:\n\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n| Name | Type | Default | Description |\n+======================+===========+===========+===============================================================================================================+\n| ``reward_threshold`` | ``float`` | ``None`` | The reward threshold before the task is considered solved |\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n| ``nondeterministic`` | ``bool`` | ``False`` | Whether this environment is non-deterministic even after seeding |\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n| ``max_episode_steps``| ``int`` | ``None`` | The maximum number of steps that an episode can consist of. If not ``None``, a ``TimeLimit`` wrapper is added |\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n| ``order_enforce`` | ``bool`` | ``True`` | Whether to wrap the environment in an ``OrderEnforcing`` wrapper |\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n| ``autoreset`` | ``bool`` | ``False`` | Whether to wrap the environment in an ``AutoResetWrapper`` |\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n| ``kwargs`` | ``dict`` | ``{}`` | The default kwargs to pass to the environment class |\n+----------------------+-----------+-----------+---------------------------------------------------------------------------------------------------------------+\n\nMost of these keywords (except for ``max_episode_steps``,\n``order_enforce`` and ``kwargs``) do not alter the behavior of\nenvironment instances but merely provide some extra information about\nyour environment. After registration, our custom ``GridWorldEnv``\nenvironment can be created with\n``env = gymnasium.make('gym_examples/GridWorld-v0')``.\n\n``gym-examples/gym_examples/envs/__init__.py`` should have:\n\n.. 
code:: python\n\n from gym_examples.envs.grid_world import GridWorldEnv\n\nIf your environment is not registered, you may optionally pass a module\nto import, that would register your environment before creating it like\nthis - ``env = gymnasium.make('module:Env-v0')``, where ``module``\ncontains the registration code. For the GridWorld env, the registration\ncode is run by importing ``gym_examples`` so if it were not possible to\nimport gym_examples explicitly, you could register while making by\n``env = gymnasium.make('gym_examples:gym_examples/GridWorld-v0)``. This\nis especially useful when you\u2019re allowed to pass only the environment ID\ninto a third-party codebase (eg. learning library). This lets you\nregister your environment without needing to edit the library\u2019s source\ncode.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Creating a Package\n\nThe last step is to structure our code as a Python package. This\ninvolves configuring ``gym-examples/setup.py``. A minimal example of how\nto do so is as follows:\n\n.. code:: python\n\n from setuptools import setup\n\n setup(\n name=\"gym_examples\",\n version=\"0.0.1\",\n install_requires=[\"gymnasium==0.26.0\", \"pygame==2.1.0\"],\n )\n\n## Creating Environment Instances\n\nAfter you have installed your package locally with\n``pip install -e gym-examples``, you can create an instance of the\nenvironment via:\n\n.. code:: python\n\n import gym_examples\n env = gymnasium.make('gym_examples/GridWorld-v0')\n\nYou can also pass keyword arguments of your environment\u2019s constructor to\n``gymnasium.make`` to customize the environment. In our case, we could\ndo:\n\n.. code:: python\n\n env = gymnasium.make('gym_examples/GridWorld-v0', size=10)\n\nSometimes, you may find it more convenient to skip registration and call\nthe environment\u2019s constructor yourself. Some may find this approach more\npythonic and environments that are instantiated like this are also\nperfectly fine (but remember to add wrappers as well!).\n\n## Using Wrappers\n\nOftentimes, we want to use different variants of a custom environment,\nor we want to modify the behavior of an environment that is provided by\nGymnasium or some other party. Wrappers allow us to do this without\nchanging the environment implementation or adding any boilerplate code.\nCheck out the [wrapper documentation](/api/wrappers/)_ for details on\nhow to use wrappers and instructions for implementing your own. In our\nexample, observations cannot be used directly in learning code because\nthey are dictionaries. However, we don\u2019t actually need to touch our\nenvironment implementation to fix this! We can simply add a wrapper on\ntop of environment instances to flatten observations into a single\narray:\n\n.. code:: python\n\n import gym_examples\n from gymnasium.wrappers import FlattenObservation\n\n env = gymnasium.make('gym_examples/GridWorld-v0')\n wrapped_env = FlattenObservation(env)\n print(wrapped_env.reset()) # E.g. [3 0 3 3], {}\n\nWrappers have the big advantage that they make environments highly\nmodular. For instance, instead of flattening the observations from\nGridWorld, you might only want to look at the relative position of the\ntarget and the agent. In the section on\n[ObservationWrappers](/api/wrappers/#observationwrapper)_ we have\nimplemented a wrapper that does this job. This wrapper is also available\nin gym-examples:\n\n.. code:: python\n\n import gym_examples\n from gym_examples.wrappers import RelativePosition\n\n env = gymnasium.make('gym_examples/GridWorld-v0')\n wrapped_env = RelativePosition(env)\n print(wrapped_env.reset()) # E.g. [-3 3], {}\n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Implementing Custom Wrappers
|
||||
============================
|
||||
|
||||
In this tutorial we will describe how to implement your own custom wrappers.
|
||||
Wrappers are a great way to add functionality to your environments in a modular way.
|
||||
This will save you a lot of boilerplate code.
|
||||
|
||||
We will show how to create a wrapper by
|
||||
|
||||
- Inheriting from :class:`gymnasium.ObservationWrapper`
|
||||
- Inheriting from :class:`gymnasium.ActionWrapper`
|
||||
- Inheriting from :class:`gymnasium.RewardWrapper`
|
||||
- Inheriting from :class:`gymnasium.Wrapper`
|
||||
|
||||
Before following this tutorial, make sure to check out the docs of the :mod:`gymnasium.wrappers` module.
|
||||
"""
|
||||
|
||||
# %%
|
||||
# Inheriting from :class:`gymnasium.ObservationWrapper`
|
||||
# -----------------------------------------------------
|
||||
# Observation wrappers are useful if you want to apply some function to the observations that are returned
|
||||
# by an environment. If you implement an observation wrapper, you only need to define this transformation
|
||||
# by implementing the :meth:`gymnasium.ObservationWrapper.observation` method. Moreover, you should remember to
|
||||
# update the observation space, if the transformation changes the shape of observations (e.g. by transforming
|
||||
# dictionaries into numpy arrays, as in the following example).
|
||||
#
|
||||
# Imagine you have a 2D navigation task where the environment returns dictionaries as observations with
|
||||
# keys ``"agent_position"`` and ``"target_position"``. A common thing to do might be to throw away some degrees of
|
||||
# freedom and only consider the position of the target relative to the agent, i.e.
|
||||
# ``observation["target_position"] - observation["agent_position"]``. For this, you could implement an
|
||||
# observation wrapper like this:
|
||||
|
||||
import numpy as np
|
||||
from gymnasium import ActionWrapper, ObservationWrapper, RewardWrapper, Wrapper
|
||||
|
||||
import gymnasium as gym
|
||||
from gymnasium.spaces import Box, Discrete
|
||||
|
||||
|
||||
class RelativePosition(ObservationWrapper):
|
||||
def __init__(self, env):
|
||||
super().__init__(env)
|
||||
self.observation_space = Box(shape=(2,), low=-np.inf, high=np.inf)
|
||||
|
||||
def observation(self, obs):
|
||||
return obs["target"] - obs["agent"]
|
||||
|
||||
|
||||
# %%
|
||||
# Inheriting from :class:`gymnasium.ActionWrapper`
|
||||
# ------------------------------------------------
|
||||
# Action wrappers can be used to apply a transformation to actions before applying them to the environment.
|
||||
# If you implement an action wrapper, you need to define that transformation by implementing
|
||||
# :meth:`gymnasium.ActionWrapper.action`. Moreover, you should specify the domain of that transformation
|
||||
# by updating the action space of the wrapper.
|
||||
#
|
||||
# Let’s say you have an environment with action space of type :class:`gymnasium.spaces.Box`, but you would only like
|
||||
# to use a finite subset of actions. Then, you might want to implement the following wrapper:
|
||||
|
||||
|
||||
class DiscreteActions(ActionWrapper):
|
||||
def __init__(self, env, disc_to_cont):
|
||||
super().__init__(env)
|
||||
self.disc_to_cont = disc_to_cont
|
||||
self.action_space = Discrete(len(disc_to_cont))
|
||||
|
||||
def action(self, act):
|
||||
return self.disc_to_cont[act]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
env = gym.make("LunarLanderContinuous-v2")
|
||||
wrapped_env = DiscreteActions(
|
||||
env, [np.array([1, 0]), np.array([-1, 0]), np.array([0, 1]), np.array([0, -1])]
|
||||
)
|
||||
print(wrapped_env.action_space) # Discrete(4)
|
||||
|
||||
|
||||
# %%
|
||||
# Inheriting from :class:`gymnasium.RewardWrapper`
|
||||
# ------------------------------------------------
|
||||
# Reward wrappers are used to transform the reward that is returned by an environment.
|
||||
# As for the previous wrappers, you need to specify that transformation by implementing the
|
||||
# :meth:`gymnasium.RewardWrapper.reward` method. Also, you might want to update the reward range of the wrapper.
|
||||
#
|
||||
# Let us look at an example: Sometimes (especially when we do not have control over the reward
|
||||
# because it is intrinsic), we want to clip the reward to a range to gain some numerical stability.
|
||||
# To do that, we could, for instance, implement the following wrapper:
|
||||
|
||||
from typing import SupportsFloat
|
||||
|
||||
|
||||
class ClipReward(RewardWrapper):
|
||||
def __init__(self, env, min_reward, max_reward):
|
||||
super().__init__(env)
|
||||
self.min_reward = min_reward
|
||||
self.max_reward = max_reward
|
||||
self.reward_range = (min_reward, max_reward)
|
||||
|
||||
def reward(self, r: SupportsFloat) -> SupportsFloat:
|
||||
return np.clip(r, self.min_reward, self.max_reward)
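

# %%
# For example (a sketch), any environment can be wrapped to clip its rewards to ``[-1, 1]``:
#
# .. code:: python
#
#     env = ClipReward(gym.make("Pendulum-v1"), -1, 1)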
|
||||
|
||||
|
||||
# %%
|
||||
# Inheriting from :class:`gymnasium.Wrapper`
|
||||
# ------------------------------------------
|
||||
# Sometimes you might need to implement a wrapper that does some more complicated modifications (e.g. modify the
|
||||
# reward based on data in ``info`` or change the rendering behavior).
|
||||
# Such wrappers can be implemented by inheriting from :class:`gymnasium.Wrapper`.
|
||||
#
|
||||
# - You can set a new action or observation space by defining ``self.action_space`` or ``self.observation_space`` in ``__init__``, respectively
|
||||
# - You can set new metadata and reward range by defining ``self.metadata`` and ``self.reward_range`` in ``__init__``, respectively
|
||||
# - You can override :meth:`gymnasium.Wrapper.step`, :meth:`gymnasium.Wrapper.render`, :meth:`gymnasium.Wrapper.close` etc.
|
||||
# If you do this, you can access the environment that was passed
|
||||
# to your wrapper (which *still* might be wrapped in some other wrapper) by accessing the attribute :attr:`env`.
|
||||
#
|
||||
# Let's also take a look at an example for this case. Most MuJoCo environments return a reward that consists
|
||||
# of different terms: For instance, there might be a term that rewards the agent for completing the task and one term that
|
||||
# penalizes large actions (i.e. energy usage). Usually, you can pass weight parameters for those terms during
|
||||
# initialization of the environment. However, *Reacher* does not allow you to do this! Nevertheless, all individual terms
|
||||
# of the reward are returned in `info`, so let us build a wrapper for Reacher that allows us to weight those terms:
|
||||
|
||||
|
||||
class ReacherRewardWrapper(Wrapper):
|
||||
def __init__(self, env, reward_dist_weight, reward_ctrl_weight):
|
||||
super().__init__(env)
|
||||
self.reward_dist_weight = reward_dist_weight
|
||||
self.reward_ctrl_weight = reward_ctrl_weight
|
||||
|
||||
def step(self, action):
|
||||
obs, _, terminated, truncated, info = self.env.step(action)
|
||||
reward = (
|
||||
self.reward_dist_weight * info["reward_dist"]
|
||||
+ self.reward_ctrl_weight * info["reward_ctrl"]
|
||||
)
|
||||
return obs, reward, terminated, truncated, info
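

# %%
# A usage sketch (Reacher requires the MuJoCo extras, e.g. ``pip install gymnasium[mujoco]``;
# the weights below are arbitrary illustrative values):
#
# .. code:: python
#
#     env = ReacherRewardWrapper(
#         gym.make("Reacher-v4"), reward_dist_weight=1.0, reward_ctrl_weight=0.1
#     )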
|
@@ -0,0 +1,455 @@
|
||||
"""
|
||||
Solving Blackjack with Q-Learning
|
||||
=================================
|
||||
|
||||
"""
|
||||
|
||||
# %%
|
||||
# .. image:: /_static/img/tutorials/blackjack_AE_loop.jpg
|
||||
# :width: 650
|
||||
# :alt: agent-environment-diagram
|
||||
# :class: only-light
|
||||
# .. image:: /_static/img/tutorials/blackjack_AE_loop_dark.png
|
||||
# :width: 650
|
||||
# :alt: agent-environment-diagram
|
||||
# :class: only-dark
|
||||
#
|
||||
# In this tutorial, we’ll explore and solve the *Blackjack-v1*
|
||||
# environment.
|
||||
#
|
||||
# **Blackjack** is one of the most popular casino card games that is also
|
||||
# infamous for being beatable under certain conditions. This version of
|
||||
# the game uses an infinite deck (we draw the cards with replacement), so
|
||||
# counting cards won’t be a viable strategy in our simulated game.
|
||||
# Full documentation can be found at https://gymnasium.farama.org/environments/toy_text/blackjack
|
||||
#
|
||||
# **Objective**: To win, your card sum should be greater than the
|
||||
# dealer's without exceeding 21.
|
||||
#
|
||||
# **Actions**: Agents can pick between two actions:
|
||||
# - stand (0): the player takes no more cards
|
||||
# - hit (1): the player will be given another card, however the player could get over 21 and bust
|
||||
#
|
||||
# **Approach**: To solve this environment by yourself, you can pick your
|
||||
# favorite discrete RL algorithm. The presented solution uses *Q-learning*
|
||||
# (a model-free RL algorithm).
|
||||
#
|
||||
|
||||
|
||||
# %%
|
||||
# Imports and Environment Setup
|
||||
# ------------------------------
|
||||
#
|
||||
|
||||
# Author: Till Zemann
|
||||
# License: MIT License
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
from matplotlib.patches import Patch
|
||||
from tqdm import tqdm
|
||||
|
||||
import gymnasium as gym
|
||||
|
||||
|
||||
# Let's start by creating the blackjack environment.
|
||||
# Note: We are going to follow the rules from Sutton & Barto.
|
||||
# Other versions of the game can be found below for you to experiment with.
|
||||
|
||||
env = gym.make("Blackjack-v1", sab=True)
|
||||
|
||||
# %%
|
||||
# .. code:: py
|
||||
#
|
||||
# # Other possible environment configurations are:
|
||||
#
|
||||
# env = gym.make('Blackjack-v1', natural=True, sab=False)
|
||||
# # Whether to give an additional reward for starting with a natural blackjack, i.e. starting with an ace and ten (sum is 21).
|
||||
#
|
||||
# env = gym.make('Blackjack-v1', natural=False, sab=False)
|
||||
# # Whether to follow the exact rules outlined in the book by Sutton and Barto. If `sab` is `True`, the keyword argument `natural` will be ignored.
|
||||
#
|
||||
|
||||
|
||||
# %%
|
||||
# Observing the environment
|
||||
# ------------------------------
|
||||
#
|
||||
# First of all, we call ``env.reset()`` to start an episode. This function
|
||||
# resets the environment to a starting position and returns an initial
|
||||
# ``observation``. We usually also set ``done = False``. This variable
|
||||
# will be useful later to check if a game is terminated (i.e., the player wins or loses).
|
||||
#
|
||||
|
||||
# reset the environment to get the first observation
|
||||
done = False
|
||||
observation, info = env.reset()
|
||||
|
||||
# observation = (16, 9, False)
|
||||
|
||||
|
||||
# %%
|
||||
# Note that our observation is a 3-tuple consisting of 3 values:
|
||||
#
|
||||
# - The player's current sum
|
||||
# - Value of the dealer's face-up card
|
||||
# - Boolean whether the player holds a usable ace (An ace is usable if it
|
||||
# counts as 11 without busting)
|
||||
#
|
||||
|
||||
|
||||
# %%
|
||||
# Executing an action
|
||||
# ------------------------------
|
||||
#
|
||||
# After receiving our first observation, we are only going to use the
|
||||
# ``env.step(action)`` function to interact with the environment. This
|
||||
# function takes an action as input and executes it in the environment.
|
||||
# Because that action changes the state of the environment, it returns
|
||||
# five useful variables to us. These are:
|
||||
#
|
||||
# - ``next_state``: This is the observation that the agent will receive
|
||||
# after taking the action.
|
||||
# - ``reward``: This is the reward that the agent will receive after
|
||||
# taking the action.
|
||||
# - ``terminated``: This is a boolean variable that indicates whether or
|
||||
# not the environment has terminated.
|
||||
# - ``truncated``: This is a boolean variable that also indicates whether
|
||||
# the episode ended by early truncation, i.e., a time limit is reached.
|
||||
# - ``info``: This is a dictionary that might contain additional
|
||||
# information about the environment.
|
||||
#
|
||||
# The ``next_state``, ``reward``, ``terminated`` and ``truncated`` variables are
|
||||
# self-explanatory, but the ``info`` variable requires some additional
|
||||
# explanation. This variable contains a dictionary that might have some
|
||||
# extra information about the environment, but in the Blackjack-v1
|
||||
# environment you can ignore it. For example in Atari environments the
|
||||
# info dictionary has a ``ale.lives`` key that tells us how many lives the
|
||||
# agent has left. If the agent has 0 lives, then the episode is over.
|
||||
#
|
||||
# Note that it is not a good idea to call ``env.render()`` in your training
|
||||
# loop because rendering slows down training by a lot. Rather try to build
|
||||
# an extra loop to evaluate and showcase the agent after training.
|
||||
#
|
||||
|
||||
# sample a random action from all valid actions
|
||||
action = env.action_space.sample()
|
||||
# action=1
|
||||
|
||||
# execute the action in our environment and receive infos from the environment
|
||||
observation, reward, terminated, truncated, info = env.step(action)
|
||||
|
||||
# observation=(24, 10, False)
|
||||
# reward=-1.0
|
||||
# terminated=True
|
||||
# truncated=False
|
||||
# info={}
|
||||
|
||||
|
||||
# %%
|
||||
# Once ``terminated = True`` or ``truncated=True``, we should stop the
|
||||
# current episode and begin a new one with ``env.reset()``. If you
|
||||
# continue executing actions without resetting the environment, it still
|
||||
# responds but the output won’t be useful for training (it might even be
|
||||
# harmful if the agent learns on invalid data).
|
||||
#
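#
# In code, the pattern looks like this (it reappears in the training loop below):
#
# .. code:: py
#
#     if terminated or truncated:
#         observation, info = env.reset()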
|
||||
|
||||
|
||||
# %%
|
||||
# Building an agent
|
||||
# ------------------------------
|
||||
#
|
||||
# Let’s build a ``Q-learning agent`` to solve *Blackjack-v1*! We’ll need
|
||||
# some functions for picking an action and updating the agents action
|
||||
# values. To ensure that the agent explores the environment, one possible
|
||||
# solution is the ``epsilon-greedy`` strategy, where we pick a random
|
||||
# action with probability ``epsilon`` and the greedy action (currently
|
||||
# valued as the best) with probability ``1 - epsilon``.
|
||||
#
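#
# The ``update`` method below implements the standard tabular Q-learning rule,
#
# .. math::
#     Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \cdot \big(r_t + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t)\big)
#
# where the term in parentheses is the temporal-difference error that we also store in
# ``training_error`` for plotting (the bootstrap term is dropped when the episode terminates).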
|
||||
|
||||
|
||||
class BlackjackAgent:
|
||||
def __init__(
|
||||
self,
|
||||
learning_rate: float,
|
||||
initial_epsilon: float,
|
||||
epsilon_decay: float,
|
||||
final_epsilon: float,
|
||||
discount_factor: float = 0.95,
|
||||
):
|
||||
"""Initialize a Reinforcement Learning agent with an empty dictionary
|
||||
of state-action values (q_values), a learning rate and an epsilon.
|
||||
|
||||
Args:
|
||||
learning_rate: The learning rate
|
||||
initial_epsilon: The initial epsilon value
|
||||
epsilon_decay: The decay for epsilon
|
||||
final_epsilon: The final epsilon value
|
||||
discount_factor: The discount factor for computing the Q-value
|
||||
"""
|
||||
self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))
|
||||
|
||||
self.lr = learning_rate
|
||||
self.discount_factor = discount_factor
|
||||
|
||||
self.epsilon = initial_epsilon
|
||||
self.epsilon_decay = epsilon_decay
|
||||
self.final_epsilon = final_epsilon
|
||||
|
||||
self.training_error = []
|
||||
|
||||
def get_action(self, obs: tuple[int, int, bool]) -> int:
|
||||
"""
|
||||
Returns the best action with probability (1 - epsilon)
|
||||
otherwise a random action with probability epsilon to ensure exploration.
|
||||
"""
|
||||
# with probability epsilon return a random action to explore the environment
|
||||
if np.random.random() < self.epsilon:
|
||||
return env.action_space.sample()
|
||||
|
||||
# with probability (1 - epsilon) act greedily (exploit)
|
||||
else:
|
||||
return int(np.argmax(self.q_values[obs]))
|
||||
|
||||
def update(
|
||||
self,
|
||||
obs: tuple[int, int, bool],
|
||||
action: int,
|
||||
reward: float,
|
||||
terminated: bool,
|
||||
next_obs: tuple[int, int, bool],
|
||||
):
|
||||
"""Updates the Q-value of an action."""
|
||||
future_q_value = (not terminated) * np.max(self.q_values[next_obs])
|
||||
temporal_difference = (
|
||||
reward + self.discount_factor * future_q_value - self.q_values[obs][action]
|
||||
)
|
||||
|
||||
self.q_values[obs][action] = (
|
||||
self.q_values[obs][action] + self.lr * temporal_difference
|
||||
)
|
||||
self.training_error.append(temporal_difference)
|
||||
|
||||
def decay_epsilon(self):
|
||||
self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)
|
||||
|
||||
|
||||
# %%
|
||||
# To train the agent, we will let the agent play one episode (one complete
|
||||
# game is called an episode) at a time and then update its Q-values after
|
||||
# each episode. The agent will have to experience a lot of episodes to
|
||||
# explore the environment sufficiently.
|
||||
#
|
||||
# Now we should be ready to build the training loop.
|
||||
#
|
||||
|
||||
# hyperparameters
|
||||
learning_rate = 0.01
|
||||
n_episodes = 100_000
|
||||
start_epsilon = 1.0
|
||||
epsilon_decay = start_epsilon / (n_episodes / 2) # reduce the exploration over time
|
||||
final_epsilon = 0.1
|
||||
|
||||
agent = BlackjackAgent(
|
||||
learning_rate=learning_rate,
|
||||
initial_epsilon=start_epsilon,
|
||||
epsilon_decay=epsilon_decay,
|
||||
final_epsilon=final_epsilon,
|
||||
)
|
||||
|
||||
# %%
|
||||
# Great, let’s train!
|
||||
#
|
||||
# Info: The current hyperparameters are set to quickly train a decent agent.
|
||||
# If you want to converge to the optimal policy, try increasing
|
||||
# the n_episodes by 10x and lower the learning_rate (e.g. to 0.001).
|
||||
#
|
||||
|
||||
|
||||
env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
|
||||
for episode in tqdm(range(n_episodes)):
|
||||
obs, info = env.reset()
|
||||
done = False
|
||||
|
||||
# play one episode
|
||||
while not done:
|
||||
action = agent.get_action(obs)
|
||||
next_obs, reward, terminated, truncated, info = env.step(action)
|
||||
|
||||
# update the agent
|
||||
agent.update(obs, action, reward, terminated, next_obs)
|
||||
|
||||
# update whether the environment is done and the current obs
|
||||
done = terminated or truncated
|
||||
obs = next_obs
|
||||
|
||||
agent.decay_epsilon()
|
||||
|
||||
|
||||
# %%
|
||||
# Visualizing the training
|
||||
# ------------------------------
|
||||
#
|
||||
|
||||
rolling_length = 500
|
||||
fig, axs = plt.subplots(ncols=3, figsize=(12, 5))
|
||||
axs[0].set_title("Episode rewards")
|
||||
reward_moving_average = (
|
||||
np.convolve(
|
||||
np.array(env.return_queue).flatten(), np.ones(rolling_length), mode="valid"
|
||||
)
|
||||
/ rolling_length
|
||||
)
|
||||
axs[0].plot(range(len(reward_moving_average)), reward_moving_average)
|
||||
axs[1].set_title("Episode lengths")
|
||||
length_moving_average = (
|
||||
np.convolve(
|
||||
np.array(env.length_queue).flatten(), np.ones(rolling_length), mode="same"
|
||||
)
|
||||
/ rolling_length
|
||||
)
|
||||
axs[1].plot(range(len(length_moving_average)), length_moving_average)
|
||||
axs[2].set_title("Training Error")
|
||||
training_error_moving_average = (
|
||||
np.convolve(np.array(agent.training_error), np.ones(rolling_length), mode="same")
|
||||
/ rolling_length
|
||||
)
|
||||
axs[2].plot(range(len(training_error_moving_average)), training_error_moving_average)
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
# %%
|
||||
# .. image:: /_static/img/tutorials/blackjack_training_plots.png
|
||||
#
|
||||
|
||||
|
||||
# %%
|
||||
# Visualizing the policy
|
||||
# ------------------------------
|
||||
|
||||
|
||||
def create_grids(agent, usable_ace=False):
|
||||
"""Create value and policy grid given an agent."""
|
||||
# convert our state-action values to state values
|
||||
# and build a policy dictionary that maps observations to actions
|
||||
state_value = defaultdict(float)
|
||||
policy = defaultdict(int)
|
||||
for obs, action_values in agent.q_values.items():
|
||||
state_value[obs] = float(np.max(action_values))
|
||||
policy[obs] = int(np.argmax(action_values))
|
||||
|
||||
player_count, dealer_count = np.meshgrid(
|
||||
# player's count, dealer's face-up card
|
||||
np.arange(12, 22),
|
||||
np.arange(1, 11),
|
||||
)
|
||||
|
||||
# create the value grid for plotting
|
||||
value = np.apply_along_axis(
|
||||
lambda obs: state_value[(obs[0], obs[1], usable_ace)],
|
||||
axis=2,
|
||||
arr=np.dstack([player_count, dealer_count]),
|
||||
)
|
||||
value_grid = player_count, dealer_count, value
|
||||
|
||||
# create the policy grid for plotting
|
||||
policy_grid = np.apply_along_axis(
|
||||
lambda obs: policy[(obs[0], obs[1], usable_ace)],
|
||||
axis=2,
|
||||
arr=np.dstack([player_count, dealer_count]),
|
||||
)
|
||||
return value_grid, policy_grid
|
||||
|
||||
|
||||
def create_plots(value_grid, policy_grid, title: str):
|
||||
"""Creates a plot using a value and policy grid."""
|
||||
# create a new figure with 2 subplots (left: state values, right: policy)
|
||||
player_count, dealer_count, value = value_grid
|
||||
fig = plt.figure(figsize=plt.figaspect(0.4))
|
||||
fig.suptitle(title, fontsize=16)
|
||||
|
||||
# plot the state values
|
||||
ax1 = fig.add_subplot(1, 2, 1, projection="3d")
|
||||
ax1.plot_surface(
|
||||
player_count,
|
||||
dealer_count,
|
||||
value,
|
||||
rstride=1,
|
||||
cstride=1,
|
||||
cmap="viridis",
|
||||
edgecolor="none",
|
||||
)
|
||||
plt.xticks(range(12, 22), range(12, 22))
|
||||
plt.yticks(range(1, 11), ["A"] + list(range(2, 11)))
|
||||
ax1.set_title(f"State values: {title}")
|
||||
ax1.set_xlabel("Player sum")
|
||||
ax1.set_ylabel("Dealer showing")
|
||||
ax1.zaxis.set_rotate_label(False)
|
||||
ax1.set_zlabel("Value", fontsize=14, rotation=90)
|
||||
ax1.view_init(20, 220)
|
||||
|
||||
# plot the policy
|
||||
fig.add_subplot(1, 2, 2)
|
||||
ax2 = sns.heatmap(policy_grid, linewidth=0, annot=True, cmap="Accent_r", cbar=False)
|
||||
ax2.set_title(f"Policy: {title}")
|
||||
ax2.set_xlabel("Player sum")
|
||||
ax2.set_ylabel("Dealer showing")
|
||||
ax2.set_xticklabels(range(12, 22))
|
||||
ax2.set_yticklabels(["A"] + list(range(2, 11)), fontsize=12)
|
||||
|
||||
# add a legend
|
||||
legend_elements = [
|
||||
Patch(facecolor="lightgreen", edgecolor="black", label="Hit"),
|
||||
Patch(facecolor="grey", edgecolor="black", label="Stick"),
|
||||
]
|
||||
ax2.legend(handles=legend_elements, bbox_to_anchor=(1.3, 1))
|
||||
return fig
|
||||
|
||||
|
||||
# state values & policy with usable ace (ace counts as 11)
|
||||
value_grid, policy_grid = create_grids(agent, usable_ace=True)
|
||||
fig1 = create_plots(value_grid, policy_grid, title="With usable ace")
|
||||
plt.show()
|
||||
|
||||
# %%
|
||||
# .. image:: /_static/img/tutorials/blackjack_with_usable_ace.png
|
||||
#
|
||||
|
||||
# state values & policy without usable ace (ace counts as 1)
|
||||
value_grid, policy_grid = create_grids(agent, usable_ace=False)
|
||||
fig2 = create_plots(value_grid, policy_grid, title="Without usable ace")
|
||||
plt.show()
|
||||
|
||||
# %%
|
||||
# .. image:: /_static/img/tutorials/blackjack_without_usable_ace.png
|
||||
#
|
||||
# It's good practice to call env.close() at the end of your script,
|
||||
# so that any resources used by the environment are released.
|
||||
#
|
||||
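# For example, a minimal sketch (``env`` here is the wrapped environment
# created above):
#
# .. code:: py
#
#     env.close()
#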
|
||||
# %%
|
||||
# Think you can do better?
|
||||
# ------------------------------
|
||||
|
||||
# You can visualize the environment using the play function
|
||||
# and try to win a few games.
|
||||
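# A minimal sketch of what that could look like; the key bindings below
# (``h`` to hit, ``s`` to stick) are an assumption, not part of the tutorial:
#
# .. code:: py
#
#     from gymnasium.utils.play import play
#
#     play(
#         gym.make("Blackjack-v1", sab=True, render_mode="rgb_array"),
#         keys_to_action={"h": 1, "s": 0},
#     )
#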
|
||||
|
||||
# %%
|
||||
# Hopefully this tutorial helped you get a grip on how to interact with
|
||||
# Gymnasium environments and set you on a journey to solve many more RL
|
||||
# challenges.
|
||||
#
|
||||
# It is recommended that you solve this environment by yourself (project
|
||||
# based learning is really effective!). You can apply your favorite
|
||||
# discrete RL algorithm or give Monte Carlo ES a try (covered in `Sutton &
|
||||
# Barto <http://incompleteideas.net/book/the-book-2nd.html>`_, section
|
||||
# 5.3) - this way you can compare your results directly to the book.
|
||||
#
|
||||
# Have fun!
|
||||
#
|
@@ -0,0 +1,272 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n# Solving Blackjack with Q-Learning\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"file://_static/img/tutorials/blackjack_AE_loop.jpg\" width=\"650\" alt=\"agent-environment-diagram\" class=\"only-light\">\n<img src=\"file://_static/img/tutorials/blackjack_AE_loop_dark.png\" width=\"650\" alt=\"agent-environment-diagram\" class=\"only-dark\">\n\nIn this tutorial, we\u2019ll explore and solve the *Blackjack-v1*\nenvironment.\n\n**Blackjack** is one of the most popular casino card games that is also\ninfamous for being beatable under certain conditions. This version of\nthe game uses an infinite deck (we draw the cards with replacement), so\ncounting cards won\u2019t be a viable strategy in our simulated game.\nFull documentation can be found at https://gymnasium.farama.org/environments/toy_text/blackjack\n\n**Objective**: To win, your card sum should be greater than the\ndealers without exceeding 21.\n\n**Actions**: Agents can pick between two actions:\n - stand (0): the player takes no more cards\n - hit (1): the player will be given another card, however the player could get over 21 and bust\n\n**Approach**: To solve this environment by yourself, you can pick your\nfavorite discrete RL algorithm. The presented solution uses *Q-learning*\n(a model-free RL algorithm).\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Imports and Environment Setup\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Author: Till Zemann\n# License: MIT License\n\nfrom __future__ import annotations\n\nfrom collections import defaultdict\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport seaborn as sns\nfrom matplotlib.patches import Patch\nfrom tqdm import tqdm\n\nimport gymnasium as gym\n\n\n# Let's start by creating the blackjack environment.\n# Note: We are going to follow the rules from Sutton & Barto.\n# Other versions of the game can be found below for you to experiment.\n\nenv = gym.make(\"Blackjack-v1\", sab=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
".. code:: py\n\n # Other possible environment configurations are:\n\n env = gym.make('Blackjack-v1', natural=True, sab=False)\n # Whether to give an additional reward for starting with a natural blackjack, i.e. starting with an ace and ten (sum is 21).\n\n env = gym.make('Blackjack-v1', natural=False, sab=False)\n # Whether to follow the exact rules outlined in the book by Sutton and Barto. If `sab` is `True`, the keyword argument `natural` will be ignored.\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Observing the environment\n\nFirst of all, we call ``env.reset()`` to start an episode. This function\nresets the environment to a starting position and returns an initial\n``observation``. We usually also set ``done = False``. This variable\nwill be useful later to check if a game is terminated (i.e., the player wins or loses).\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# reset the environment to get the first observation\ndone = False\nobservation, info = env.reset()\n\n# observation = (16, 9, False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that our observation is a 3-tuple consisting of 3 values:\n\n- The players current sum\n- Value of the dealers face-up card\n- Boolean whether the player holds a usable ace (An ace is usable if it\n counts as 11 without busting)\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Executing an action\n\nAfter receiving our first observation, we are only going to use the\n``env.step(action)`` function to interact with the environment. This\nfunction takes an action as input and executes it in the environment.\nBecause that action changes the state of the environment, it returns\nfour useful variables to us. These are:\n\n- ``next_state``: This is the observation that the agent will receive\n after taking the action.\n- ``reward``: This is the reward that the agent will receive after\n taking the action.\n- ``terminated``: This is a boolean variable that indicates whether or\n not the environment has terminated.\n- ``truncated``: This is a boolean variable that also indicates whether\n the episode ended by early truncation, i.e., a time limit is reached.\n- ``info``: This is a dictionary that might contain additional\n information about the environment.\n\nThe ``next_state``, ``reward``, ``terminated`` and ``truncated`` variables are\nself-explanatory, but the ``info`` variable requires some additional\nexplanation. This variable contains a dictionary that might have some\nextra information about the environment, but in the Blackjack-v1\nenvironment you can ignore it. For example in Atari environments the\ninfo dictionary has a ``ale.lives`` key that tells us how many lives the\nagent has left. If the agent has 0 lives, then the episode is over.\n\nNote that it is not a good idea to call ``env.render()`` in your training\nloop because rendering slows down training by a lot. Rather try to build\nan extra loop to evaluate and showcase the agent after training.\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# sample a random action from all valid actions\naction = env.action_space.sample()\n# action=1\n\n# execute the action in our environment and receive infos from the environment\nobservation, reward, terminated, truncated, info = env.step(action)\n\n# observation=(24, 10, False)\n# reward=-1.0\n# terminated=True\n# truncated=False\n# info={}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Once ``terminated = True`` or ``truncated=True``, we should stop the\ncurrent episode and begin a new one with ``env.reset()``. If you\ncontinue executing actions without resetting the environment, it still\nresponds but the output won\u2019t be useful for training (it might even be\nharmful if the agent learns on invalid data).\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Building an agent\n\nLet\u2019s build a ``Q-learning agent`` to solve *Blackjack-v1*! We\u2019ll need\nsome functions for picking an action and updating the agents action\nvalues. To ensure that the agents explores the environment, one possible\nsolution is the ``epsilon-greedy`` strategy, where we pick a random\naction with the percentage ``epsilon`` and the greedy action (currently\nvalued as the best) ``1 - epsilon``.\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class BlackjackAgent:\n def __init__(\n self,\n learning_rate: float,\n initial_epsilon: float,\n epsilon_decay: float,\n final_epsilon: float,\n discount_factor: float = 0.95,\n ):\n \"\"\"Initialize a Reinforcement Learning agent with an empty dictionary\n of state-action values (q_values), a learning rate and an epsilon.\n\n Args:\n learning_rate: The learning rate\n initial_epsilon: The initial epsilon value\n epsilon_decay: The decay for epsilon\n final_epsilon: The final epsilon value\n discount_factor: The discount factor for computing the Q-value\n \"\"\"\n self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))\n\n self.lr = learning_rate\n self.discount_factor = discount_factor\n\n self.epsilon = initial_epsilon\n self.epsilon_decay = epsilon_decay\n self.final_epsilon = final_epsilon\n\n self.training_error = []\n\n def get_action(self, obs: tuple[int, int, bool]) -> int:\n \"\"\"\n Returns the best action with probability (1 - epsilon)\n otherwise a random action with probability epsilon to ensure exploration.\n \"\"\"\n # with probability epsilon return a random action to explore the environment\n if np.random.random() < self.epsilon:\n return env.action_space.sample()\n\n # with probability (1 - epsilon) act greedily (exploit)\n else:\n return int(np.argmax(self.q_values[obs]))\n\n def update(\n self,\n obs: tuple[int, int, bool],\n action: int,\n reward: float,\n terminated: bool,\n next_obs: tuple[int, int, bool],\n ):\n \"\"\"Updates the Q-value of an action.\"\"\"\n future_q_value = (not terminated) * np.max(self.q_values[next_obs])\n temporal_difference = (\n reward + self.discount_factor * future_q_value - self.q_values[obs][action]\n )\n\n self.q_values[obs][action] = (\n self.q_values[obs][action] + self.lr * temporal_difference\n )\n self.training_error.append(temporal_difference)\n\n def decay_epsilon(self):\n self.epsilon = max(self.final_epsilon, self.epsilon - epsilon_decay)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"To train the agent, we will let the agent play one episode (one complete\ngame is called an episode) at a time and then update it\u2019s Q-values after\neach episode. The agent will have to experience a lot of episodes to\nexplore the environment sufficiently.\n\nNow we should be ready to build the training loop.\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# hyperparameters\nlearning_rate = 0.01\nn_episodes = 100_000\nstart_epsilon = 1.0\nepsilon_decay = start_epsilon / (n_episodes / 2) # reduce the exploration over time\nfinal_epsilon = 0.1\n\nagent = BlackjackAgent(\n learning_rate=learning_rate,\n initial_epsilon=start_epsilon,\n epsilon_decay=epsilon_decay,\n final_epsilon=final_epsilon,\n)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Great, let\u2019s train!\n\nInfo: The current hyperparameters are set to quickly train a decent agent.\nIf you want to converge to the optimal policy, try increasing\nthe n_episodes by 10x and lower the learning_rate (e.g. to 0.001).\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)\nfor episode in tqdm(range(n_episodes)):\n obs, info = env.reset()\n done = False\n\n # play one episode\n while not done:\n action = agent.get_action(obs)\n next_obs, reward, terminated, truncated, info = env.step(action)\n\n # update the agent\n agent.update(obs, action, reward, terminated, next_obs)\n\n # update if the environment is done and the current obs\n done = terminated or truncated\n obs = next_obs\n\n agent.decay_epsilon()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualizing the training\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"rolling_length = 500\nfig, axs = plt.subplots(ncols=3, figsize=(12, 5))\naxs[0].set_title(\"Episode rewards\")\nreward_moving_average = (\n np.convolve(\n np.array(env.return_queue).flatten(), np.ones(rolling_length), mode=\"valid\"\n )\n / rolling_length\n)\naxs[0].plot(range(len(reward_moving_average)), reward_moving_average)\naxs[1].set_title(\"Episode lengths\")\nlength_moving_average = (\n np.convolve(\n np.array(env.length_queue).flatten(), np.ones(rolling_length), mode=\"same\"\n )\n / rolling_length\n)\naxs[1].plot(range(len(length_moving_average)), length_moving_average)\naxs[2].set_title(\"Training Error\")\ntraining_error_moving_average = (\n np.convolve(np.array(agent.training_error), np.ones(rolling_length), mode=\"same\")\n / rolling_length\n)\naxs[2].plot(range(len(training_error_moving_average)), training_error_moving_average)\nplt.tight_layout()\nplt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"file://_static/img/tutorials/blackjack_training_plots.png\">\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualising the policy\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_grids(agent, usable_ace=False):\n \"\"\"Create value and policy grid given an agent.\"\"\"\n # convert our state-action values to state values\n # and build a policy dictionary that maps observations to actions\n state_value = defaultdict(float)\n policy = defaultdict(int)\n for obs, action_values in agent.q_values.items():\n state_value[obs] = float(np.max(action_values))\n policy[obs] = int(np.argmax(action_values))\n\n player_count, dealer_count = np.meshgrid(\n # players count, dealers face-up card\n np.arange(12, 22),\n np.arange(1, 11),\n )\n\n # create the value grid for plotting\n value = np.apply_along_axis(\n lambda obs: state_value[(obs[0], obs[1], usable_ace)],\n axis=2,\n arr=np.dstack([player_count, dealer_count]),\n )\n value_grid = player_count, dealer_count, value\n\n # create the policy grid for plotting\n policy_grid = np.apply_along_axis(\n lambda obs: policy[(obs[0], obs[1], usable_ace)],\n axis=2,\n arr=np.dstack([player_count, dealer_count]),\n )\n return value_grid, policy_grid\n\n\ndef create_plots(value_grid, policy_grid, title: str):\n \"\"\"Creates a plot using a value and policy grid.\"\"\"\n # create a new figure with 2 subplots (left: state values, right: policy)\n player_count, dealer_count, value = value_grid\n fig = plt.figure(figsize=plt.figaspect(0.4))\n fig.suptitle(title, fontsize=16)\n\n # plot the state values\n ax1 = fig.add_subplot(1, 2, 1, projection=\"3d\")\n ax1.plot_surface(\n player_count,\n dealer_count,\n value,\n rstride=1,\n cstride=1,\n cmap=\"viridis\",\n edgecolor=\"none\",\n )\n plt.xticks(range(12, 22), range(12, 22))\n plt.yticks(range(1, 11), [\"A\"] + list(range(2, 11)))\n ax1.set_title(f\"State values: {title}\")\n ax1.set_xlabel(\"Player sum\")\n ax1.set_ylabel(\"Dealer showing\")\n ax1.zaxis.set_rotate_label(False)\n ax1.set_zlabel(\"Value\", fontsize=14, rotation=90)\n ax1.view_init(20, 220)\n\n # plot the policy\n fig.add_subplot(1, 2, 2)\n ax2 = sns.heatmap(policy_grid, linewidth=0, annot=True, cmap=\"Accent_r\", cbar=False)\n ax2.set_title(f\"Policy: {title}\")\n ax2.set_xlabel(\"Player sum\")\n ax2.set_ylabel(\"Dealer showing\")\n ax2.set_xticklabels(range(12, 22))\n ax2.set_yticklabels([\"A\"] + list(range(2, 11)), fontsize=12)\n\n # add a legend\n legend_elements = [\n Patch(facecolor=\"lightgreen\", edgecolor=\"black\", label=\"Hit\"),\n Patch(facecolor=\"grey\", edgecolor=\"black\", label=\"Stick\"),\n ]\n ax2.legend(handles=legend_elements, bbox_to_anchor=(1.3, 1))\n return fig\n\n\n# state values & policy with usable ace (ace counts as 11)\nvalue_grid, policy_grid = create_grids(agent, usable_ace=True)\nfig1 = create_plots(value_grid, policy_grid, title=\"With usable ace\")\nplt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"file://_static/img/tutorials/blackjack_with_usable_ace.png\">\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# state values & policy without usable ace (ace counts as 1)\nvalue_grid, policy_grid = create_grids(agent, usable_ace=False)\nfig2 = create_plots(value_grid, policy_grid, title=\"Without usable ace\")\nplt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"file://_static/img/tutorials/blackjack_without_usable_ace.png\">\n\nIt's good practice to call env.close() at the end of your script,\nso that any used resources by the environment will be closed.\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Think you can do better?\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# You can visualize the environment using the play function\n# and try to win a few games."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Hopefully this Tutorial helped you get a grip of how to interact with\nOpenAI-Gym environments and sets you on a journey to solve many more RL\nchallenges.\n\nIt is recommended that you solve this environment by yourself (project\nbased learning is really effective!). You can apply your favorite\ndiscrete RL algorithm or give Monte Carlo ES a try (covered in [Sutton &\nBarto](http://incompleteideas.net/book/the-book-2nd.html), section\n5.3) - this way you can compare your results directly to the book.\n\nBest of fun!\n\n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
@@ -0,0 +1,133 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n# Training using REINFORCE for Mujoco\n\n<img src=\"file://_static/img/tutorials/reinforce_invpend_gym_v26_fig1.gif\" width=\"400\" alt=\"agent-environment-diagram\">\n\nThis tutorial serves 2 purposes:\n 1. To understand how to implement REINFORCE [1] from scratch to solve Mujoco's InvertedPendulum-v4\n 2. Implementation a deep reinforcement learning algorithm with Gymnasium's v0.26+ `step()` function\n\nWe will be using **REINFORCE**, one of the earliest policy gradient methods. Unlike going under the burden of learning a value function first and then deriving a policy out of it,\nREINFORCE optimizes the policy directly. In other words, it is trained to maximize the probability of Monte-Carlo returns. More on that later.\n\n**Inverted Pendulum** is Mujoco's cartpole but now powered by the Mujoco physics simulator -\nwhich allows more complex experiments (such as varying the effects of gravity).\nThis environment involves a cart that can moved linearly, with a pole fixed on it at one end and having another end free.\nThe cart can be pushed left or right, and the goal is to balance the pole on the top of the cart by applying forces on the cart.\nMore information on the environment could be found at https://gymnasium.farama.org/environments/mujoco/inverted_pendulum/\n\n**Training Objectives**: To balance the pole (inverted pendulum) on top of the cart\n\n**Actions**: The agent takes a 1D vector for actions. The action space is a continuous ``(action)`` in ``[-3, 3]``,\nwhere action represents the numerical force applied to the cart\n(with magnitude representing the amount of force and sign representing the direction)\n\n**Approach**: We use PyTorch to code REINFORCE from scratch to train a Neural Network policy to master Inverted Pendulum.\n\nAn explanation of the Gymnasium v0.26+ `Env.step()` function\n\n``env.step(A)`` allows us to take an action 'A' in the current environment 'env'. The environment then executes the action\nand returns five variables:\n\n- ``next_obs``: This is the observation that the agent will receive after taking the action.\n- ``reward``: This is the reward that the agent will receive after taking the action.\n- ``terminated``: This is a boolean variable that indicates whether or not the environment has terminated.\n- ``truncated``: This is a boolean variable that also indicates whether the episode ended by early truncation, i.e., a time limit is reached.\n- ``info``: This is a dictionary that might contain additional information about the environment.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from __future__ import annotations\n\nimport random\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport seaborn as sns\nimport torch\nimport torch.nn as nn\nfrom torch.distributions.normal import Normal\n\nimport gymnasium as gym\n\n\nplt.rcParams[\"figure.figsize\"] = (10, 5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Policy Network\n\n<img src=\"file://_static/img/tutorials/reinforce_invpend_gym_v26_fig2.png\">\n\nWe start by building a policy that the agent will learn using REINFORCE.\nA policy is a mapping from the current environment observation to a probability distribution of the actions to be taken.\nThe policy used in the tutorial is parameterized by a neural network. It consists of 2 linear layers that are shared between both the predicted mean and standard deviation.\nFurther, the single individual linear layers are used to estimate the mean and the standard deviation. ``nn.Tanh`` is used as a non-linearity between the hidden layers.\nThe following function estimates a mean and standard deviation of a normal distribution from which an action is sampled. Hence it is expected for the policy to learn\nappropriate weights to output means and standard deviation based on the current observation.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Policy_Network(nn.Module):\n \"\"\"Parametrized Policy Network.\"\"\"\n\n def __init__(self, obs_space_dims: int, action_space_dims: int):\n \"\"\"Initializes a neural network that estimates the mean and standard deviation\n of a normal distribution from which an action is sampled from.\n\n Args:\n obs_space_dims: Dimension of the observation space\n action_space_dims: Dimension of the action space\n \"\"\"\n super().__init__()\n\n hidden_space1 = 16 # Nothing special with 16, feel free to change\n hidden_space2 = 32 # Nothing special with 32, feel free to change\n\n # Shared Network\n self.shared_net = nn.Sequential(\n nn.Linear(obs_space_dims, hidden_space1),\n nn.Tanh(),\n nn.Linear(hidden_space1, hidden_space2),\n nn.Tanh(),\n )\n\n # Policy Mean specific Linear Layer\n self.policy_mean_net = nn.Sequential(\n nn.Linear(hidden_space2, action_space_dims)\n )\n\n # Policy Std Dev specific Linear Layer\n self.policy_stddev_net = nn.Sequential(\n nn.Linear(hidden_space2, action_space_dims)\n )\n\n def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:\n \"\"\"Conditioned on the observation, returns the mean and standard deviation\n of a normal distribution from which an action is sampled from.\n\n Args:\n x: Observation from the environment\n\n Returns:\n action_means: predicted mean of the normal distribution\n action_stddevs: predicted standard deviation of the normal distribution\n \"\"\"\n shared_features = self.shared_net(x.float())\n\n action_means = self.policy_mean_net(shared_features)\n action_stddevs = torch.log(\n 1 + torch.exp(self.policy_stddev_net(shared_features))\n )\n\n return action_means, action_stddevs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Building an agent\n\n<img src=\"file://_static/img/tutorials/reinforce_invpend_gym_v26_fig3.jpeg\">\n\nNow that we are done building the policy, let us develop **REINFORCE** which gives life to the policy network.\nThe algorithm of REINFORCE could be found above. As mentioned before, REINFORCE aims to maximize the Monte-Carlo returns.\n\nFun Fact: REINFROCE is an acronym for \" 'RE'ward 'I'ncrement 'N'on-negative 'F'actor times 'O'ffset 'R'einforcement times 'C'haracteristic 'E'ligibility\n\nNote: The choice of hyperparameters is to train a decently performing agent. No extensive hyperparameter\ntuning was done.\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class REINFORCE:\n \"\"\"REINFORCE algorithm.\"\"\"\n\n def __init__(self, obs_space_dims: int, action_space_dims: int):\n \"\"\"Initializes an agent that learns a policy via REINFORCE algorithm [1]\n to solve the task at hand (Inverted Pendulum v4).\n\n Args:\n obs_space_dims: Dimension of the observation space\n action_space_dims: Dimension of the action space\n \"\"\"\n\n # Hyperparameters\n self.learning_rate = 1e-4 # Learning rate for policy optimization\n self.gamma = 0.99 # Discount factor\n self.eps = 1e-6 # small number for mathematical stability\n\n self.probs = [] # Stores probability values of the sampled action\n self.rewards = [] # Stores the corresponding rewards\n\n self.net = Policy_Network(obs_space_dims, action_space_dims)\n self.optimizer = torch.optim.AdamW(self.net.parameters(), lr=self.learning_rate)\n\n def sample_action(self, state: np.ndarray) -> float:\n \"\"\"Returns an action, conditioned on the policy and observation.\n\n Args:\n state: Observation from the environment\n\n Returns:\n action: Action to be performed\n \"\"\"\n state = torch.tensor(np.array([state]))\n action_means, action_stddevs = self.net(state)\n\n # create a normal distribution from the predicted\n # mean and standard deviation and sample an action\n distrib = Normal(action_means[0] + self.eps, action_stddevs[0] + self.eps)\n action = distrib.sample()\n prob = distrib.log_prob(action)\n\n action = action.numpy()\n\n self.probs.append(prob)\n\n return action\n\n def update(self):\n \"\"\"Updates the policy network's weights.\"\"\"\n running_g = 0\n gs = []\n\n # Discounted return (backwards) - [::-1] will return an array in reverse\n for R in self.rewards[::-1]:\n running_g = R + self.gamma * running_g\n gs.insert(0, running_g)\n\n deltas = torch.tensor(gs)\n\n loss = 0\n # minimize -1 * prob * reward obtained\n for log_prob, delta in zip(self.probs, deltas):\n loss += log_prob.mean() * delta * (-1)\n\n # Update the policy network\n self.optimizer.zero_grad()\n loss.backward()\n self.optimizer.step()\n\n # Empty / zero out all episode-centric/related variables\n self.probs = []\n self.rewards = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now lets train the policy using REINFORCE to master the task of Inverted Pendulum.\n\nFollowing is the overview of the training procedure\n\n for seed in random seeds\n reinitialize agent\n\n for episode in range of max number of episodes\n until episode is done\n sample action based on current observation\n\n take action and receive reward and next observation\n\n store action take, its probability, and the observed reward\n update the policy\n\nNote: Deep RL is fairly brittle concerning random seed in a lot of common use cases (https://spinningup.openai.com/en/latest/spinningup/spinningup.html).\nHence it is important to test out various seeds, which we will be doing.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create and wrap the environment\nenv = gym.make(\"InvertedPendulum-v4\")\nwrapped_env = gym.wrappers.RecordEpisodeStatistics(env, 50) # Records episode-reward\n\ntotal_num_episodes = int(5e3) # Total number of episodes\n# Observation-space of InvertedPendulum-v4 (4)\nobs_space_dims = env.observation_space.shape[0]\n# Action-space of InvertedPendulum-v4 (1)\naction_space_dims = env.action_space.shape[0]\nrewards_over_seeds = []\n\nfor seed in [1, 2, 3, 5, 8]: # Fibonacci seeds\n # set seed\n torch.manual_seed(seed)\n random.seed(seed)\n np.random.seed(seed)\n\n # Reinitialize agent every seed\n agent = REINFORCE(obs_space_dims, action_space_dims)\n reward_over_episodes = []\n\n for episode in range(total_num_episodes):\n # gymnasium v26 requires users to set seed while resetting the environment\n obs, info = wrapped_env.reset(seed=seed)\n\n done = False\n while not done:\n action = agent.sample_action(obs)\n\n # Step return type - `tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]`\n # These represent the next observation, the reward from the step,\n # if the episode is terminated, if the episode is truncated and\n # additional info from the step\n obs, reward, terminated, truncated, info = wrapped_env.step(action)\n agent.rewards.append(reward)\n\n # End the episode when either truncated or terminated is true\n # - truncated: The episode duration reaches max number of timesteps\n # - terminated: Any of the state space values is no longer finite.\n done = terminated or truncated\n\n reward_over_episodes.append(wrapped_env.return_queue[-1])\n agent.update()\n\n if episode % 1000 == 0:\n avg_reward = int(np.mean(wrapped_env.return_queue))\n print(\"Episode:\", episode, \"Average Reward:\", avg_reward)\n\n rewards_over_seeds.append(reward_over_episodes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Plot learning curve\n\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"rewards_to_plot = [[reward[0] for reward in rewards] for rewards in rewards_over_seeds]\ndf1 = pd.DataFrame(rewards_to_plot).melt()\ndf1.rename(columns={\"variable\": \"episodes\", \"value\": \"reward\"}, inplace=True)\nsns.set(style=\"darkgrid\", context=\"talk\", palette=\"rainbow\")\nsns.lineplot(x=\"episodes\", y=\"reward\", data=df1).set(\n title=\"REINFORCE for InvertedPendulum-v4\"\n)\nplt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"file://_static/img/tutorials/reinforce_invpend_gym_v26_fig4.png\">\n\nAuthor: Siddarth Chandrasekar\n\nLicense: MIT License\n\n## References\n\n[1] Williams, Ronald J.. \u201cSimple statistical gradient-following\nalgorithms for connectionist reinforcement learning.\u201d Machine Learning 8\n(2004): 229-256.\n\n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n# Handling Time Limits\n\nIn using Gymnasium environments with reinforcement learning code, a common problem observed is how time limits are incorrectly handled. The ``done`` signal received (in previous versions of OpenAI Gym < 0.26) from ``env.step`` indicated whether an episode has ended. However, this signal did not distinguish whether the episode ended due to ``termination`` or ``truncation``.\n\n## Termination\n\nTermination refers to the episode ending after reaching a terminal state that is defined as part of the environment\ndefinition. Examples are - task success, task failure, robot falling down etc. Notably, this also includes episodes\nending in finite-horizon environments due to a time-limit inherent to the environment. Note that to preserve Markov\nproperty, a representation of the remaining time must be present in the agent's observation in finite-horizon environments.\n[(Reference)](https://arxiv.org/abs/1712.00378)\n\n## Truncation\n\nTruncation refers to the episode ending after an externally defined condition (that is outside the scope of the Markov\nDecision Process). This could be a time-limit, a robot going out of bounds etc.\n\nAn infinite-horizon environment is an obvious example of where this is needed. We cannot wait forever for the episode\nto complete, so we set a practical time-limit after which we forcibly halt the episode. The last state in this case is\nnot a terminal state since it has a non-zero transition probability of moving to another state as per the Markov\nDecision Process that defines the RL problem. This is also different from time-limits in finite horizon environments\nas the agent in this case has no idea about this time-limit.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Importance in learning code\nBootstrapping (using one or more estimated values of a variable to update estimates of the same variable) is a key\naspect of Reinforcement Learning. A value function will tell you how much discounted reward you will get from a\nparticular state if you follow a given policy. When an episode stops at any given point, by looking at the value of\nthe final state, the agent is able to estimate how much discounted reward could have been obtained if the episode has\ncontinued. This is an example of handling truncation.\n\nMore formally, a common example of bootstrapping in RL is updating the estimate of the Q-value function,\n\n\\begin{align}Q_{target}(o_t, a_t) = r_t + \\gamma . \\max_a(Q(o_{t+1}, a_{t+1}))\\end{align}\n\n\nIn classical RL, the new ``Q`` estimate is a weighted average of the previous ``Q`` estimate and ``Q_target`` while in Deep\nQ-Learning, the error between ``Q_target`` and the previous ``Q`` estimate is minimized.\n\nHowever, at the terminal state, bootstrapping is not done,\n\n\\begin{align}Q_{target}(o_t, a_t) = r_t\\end{align}\n\nThis is where the distinction between termination and truncation becomes important. When an episode ends due to\ntermination we don't bootstrap, when it ends due to truncation, we bootstrap.\n\nWhile using gymnasium environments, the ``done`` signal (default for < v0.26) is frequently used to determine whether to\nbootstrap or not. However, this is incorrect since it does not differentiate between termination and truncation.\n\nA simple example of value functions is shown below. This is an illustrative example and not part of any specific algorithm.\n\n.. code:: python\n\n # INCORRECT\n vf_target = rew + gamma * (1 - done) * vf_next_state\n\nThis is incorrect in the case of episode ending due to a truncation, where bootstrapping needs to happen but it doesn't.\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Solution\n\nFrom v0.26 onwards, Gymnasium's ``env.step`` API returns both termination and truncation information explicitly.\nIn the previous version truncation information was supplied through the info key ``TimeLimit.truncated``.\nThe correct way to handle terminations and truncations now is,\n\n.. code:: python\n\n # terminated = done and 'TimeLimit.truncated' not in info\n # This was needed in previous versions.\n\n vf_target = rew + gamma * (1 - terminated) * vf_next_state\n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
@@ -0,0 +1,115 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n# Implementing Custom Wrappers\n\nIn this tutorial we will describe how to implement your own custom wrappers.\nWrappers are a great way to add functionality to your environments in a modular way.\nThis will save you a lot of boilerplate code.\n\nWe will show how to create a wrapper by\n\n- Inheriting from :class:`gymnasium.ObservationWrapper`\n- Inheriting from :class:`gymnasium.ActionWrapper`\n- Inheriting from :class:`gymnasium.RewardWrapper`\n- Inheriting from :class:`gymnasium.Wrapper`\n\nBefore following this tutorial, make sure to check out the docs of the :mod:`gymnasium.wrappers` module.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Inheriting from :class:`gymnasium.ObservationWrapper`\nObservation wrappers are useful if you want to apply some function to the observations that are returned\nby an environment. If you implement an observation wrapper, you only need to define this transformation\nby implementing the :meth:`gymnasium.ObservationWrapper.observation` method. Moreover, you should remember to\nupdate the observation space, if the transformation changes the shape of observations (e.g. by transforming\ndictionaries into numpy arrays, as in the following example).\n\nImagine you have a 2D navigation task where the environment returns dictionaries as observations with\nkeys ``\"agent_position\"`` and ``\"target_position\"``. A common thing to do might be to throw away some degrees of\nfreedom and only consider the position of the target relative to the agent, i.e.\n``observation[\"target_position\"] - observation[\"agent_position\"]``. For this, you could implement an\nobservation wrapper like this:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\nfrom gym import ActionWrapper, ObservationWrapper, RewardWrapper, Wrapper\n\nimport gymnasium as gym\nfrom gymnasium.spaces import Box, Discrete\n\n\nclass RelativePosition(ObservationWrapper):\n def __init__(self, env):\n super().__init__(env)\n self.observation_space = Box(shape=(2,), low=-np.inf, high=np.inf)\n\n def observation(self, obs):\n return obs[\"target\"] - obs[\"agent\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Inheriting from :class:`gymnasium.ActionWrapper`\nAction wrappers can be used to apply a transformation to actions before applying them to the environment.\nIf you implement an action wrapper, you need to define that transformation by implementing\n:meth:`gymnasium.ActionWrapper.action`. Moreover, you should specify the domain of that transformation\nby updating the action space of the wrapper.\n\nLet\u2019s say you have an environment with action space of type :class:`gymnasium.spaces.Box`, but you would only like\nto use a finite subset of actions. Then, you might want to implement the following wrapper:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class DiscreteActions(ActionWrapper):\n def __init__(self, env, disc_to_cont):\n super().__init__(env)\n self.disc_to_cont = disc_to_cont\n self.action_space = Discrete(len(disc_to_cont))\n\n def action(self, act):\n return self.disc_to_cont[act]\n\n\nif __name__ == \"__main__\":\n env = gym.make(\"LunarLanderContinuous-v2\")\n wrapped_env = DiscreteActions(\n env, [np.array([1, 0]), np.array([-1, 0]), np.array([0, 1]), np.array([0, -1])]\n )\n print(wrapped_env.action_space) # Discrete(4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Inheriting from :class:`gymnasium.RewardWrapper`\nReward wrappers are used to transform the reward that is returned by an environment.\nAs for the previous wrappers, you need to specify that transformation by implementing the\n:meth:`gymnasium.RewardWrapper.reward` method. Also, you might want to update the reward range of the wrapper.\n\nLet us look at an example: Sometimes (especially when we do not have control over the reward\nbecause it is intrinsic), we want to clip the reward to a range to gain some numerical stability.\nTo do that, we could, for instance, implement the following wrapper:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import SupportsFloat\n\n\nclass ClipReward(RewardWrapper):\n def __init__(self, env, min_reward, max_reward):\n super().__init__(env)\n self.min_reward = min_reward\n self.max_reward = max_reward\n self.reward_range = (min_reward, max_reward)\n\n def reward(self, r: SupportsFloat) -> SupportsFloat:\n return np.clip(r, self.min_reward, self.max_reward)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Inheriting from :class:`gymnasium.Wrapper`\nSometimes you might need to implement a wrapper that does some more complicated modifications (e.g. modify the\nreward based on data in ``info`` or change the rendering behavior).\nSuch wrappers can be implemented by inheriting from :class:`gymnasium.Wrapper`.\n\n- You can set a new action or observation space by defining ``self.action_space`` or ``self.observation_space`` in ``__init__``, respectively\n- You can set new metadata and reward range by defining ``self.metadata`` and ``self.reward_range`` in ``__init__``, respectively\n- You can override :meth:`gymnasium.Wrapper.step`, :meth:`gymnasium.Wrapper.render`, :meth:`gymnasium.Wrapper.close` etc.\nIf you do this, you can access the environment that was passed\nto your wrapper (which *still* might be wrapped in some other wrapper) by accessing the attribute :attr:`env`.\n\nLet's also take a look at an example for this case. Most MuJoCo environments return a reward that consists\nof different terms: For instance, there might be a term that rewards the agent for completing the task and one term that\npenalizes large actions (i.e. energy usage). Usually, you can pass weight parameters for those terms during\ninitialization of the environment. However, *Reacher* does not allow you to do this! Nevertheless, all individual terms\nof the reward are returned in `info`, so let us build a wrapper for Reacher that allows us to weight those terms:\n\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class ReacherRewardWrapper(Wrapper):\n def __init__(self, env, reward_dist_weight, reward_ctrl_weight):\n super().__init__(env)\n self.reward_dist_weight = reward_dist_weight\n self.reward_ctrl_weight = reward_ctrl_weight\n\n def step(self, action):\n obs, _, terminated, truncated, info = self.env.step(action)\n reward = (\n self.reward_dist_weight * info[\"reward_dist\"]\n + self.reward_ctrl_weight * info[\"reward_ctrl\"]\n )\n return obs, reward, terminated, truncated, info"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
BIN
0.27.0/_images/AE_loop.png
Normal file
After Width: | Height: | Size: 337 KiB |
BIN
0.27.0/_images/AE_loop_dark.png
Normal file
After Width: | Height: | Size: 360 KiB |
BIN
0.27.0/_images/acrobot.gif
Normal file
After Width: | Height: | Size: 102 KiB |
BIN
0.27.0/_images/adventure.gif
Normal file
After Width: | Height: | Size: 165 KiB |
BIN
0.27.0/_images/air_raid.gif
Normal file
After Width: | Height: | Size: 696 KiB |
BIN
0.27.0/_images/alien.gif
Normal file
After Width: | Height: | Size: 636 KiB |
BIN
0.27.0/_images/amidar.gif
Normal file
After Width: | Height: | Size: 87 KiB |
BIN
0.27.0/_images/ant.gif
Normal file
After Width: | Height: | Size: 2.9 MiB |
BIN
0.27.0/_images/assault.gif
Normal file
After Width: | Height: | Size: 130 KiB |
BIN
0.27.0/_images/asterix.gif
Normal file
After Width: | Height: | Size: 60 KiB |
BIN
0.27.0/_images/asteroids.gif
Normal file
After Width: | Height: | Size: 59 KiB |
BIN
0.27.0/_images/atlantis.gif
Normal file
After Width: | Height: | Size: 337 KiB |
BIN
0.27.0/_images/bank_heist.gif
Normal file
After Width: | Height: | Size: 321 KiB |
BIN
0.27.0/_images/battle_zone.gif
Normal file
After Width: | Height: | Size: 696 KiB |
BIN
0.27.0/_images/beam_rider.gif
Normal file
After Width: | Height: | Size: 83 KiB |
BIN
0.27.0/_images/berzerk.gif
Normal file
After Width: | Height: | Size: 115 KiB |
BIN
0.27.0/_images/bipedal_walker.gif
Normal file
After Width: | Height: | Size: 2.0 MiB |
BIN
0.27.0/_images/blackjack.gif
Normal file
After Width: | Height: | Size: 522 KiB |
BIN
0.27.0/_images/blackjack_AE_loop.jpg
Normal file
After Width: | Height: | Size: 92 KiB |
BIN
0.27.0/_images/blackjack_AE_loop_dark.png
Normal file
After Width: | Height: | Size: 266 KiB |
BIN
0.27.0/_images/blackjack_training_plots.png
Normal file
After Width: | Height: | Size: 70 KiB |
BIN
0.27.0/_images/blackjack_with_usable_ace.png
Normal file
After Width: | Height: | Size: 102 KiB |
BIN
0.27.0/_images/blackjack_without_usable_ace.png
Normal file
After Width: | Height: | Size: 107 KiB |
BIN
0.27.0/_images/bowling.gif
Normal file
After Width: | Height: | Size: 140 KiB |
BIN
0.27.0/_images/boxing.gif
Normal file
After Width: | Height: | Size: 672 KiB |
BIN
0.27.0/_images/breakout.gif
Normal file
After Width: | Height: | Size: 46 KiB |
BIN
0.27.0/_images/car_racing.gif
Normal file
After Width: | Height: | Size: 1.2 MiB |
BIN
0.27.0/_images/carnival.gif
Normal file
After Width: | Height: | Size: 99 KiB |
BIN
0.27.0/_images/cart_pole.gif
Normal file
After Width: | Height: | Size: 57 KiB |
BIN
0.27.0/_images/centipede.gif
Normal file
After Width: | Height: | Size: 67 KiB |
BIN
0.27.0/_images/chopper_command.gif
Normal file
After Width: | Height: | Size: 615 KiB |
BIN
0.27.0/_images/cliff_walking.gif
Normal file
After Width: | Height: | Size: 41 KiB |
BIN
0.27.0/_images/crazy_climber.gif
Normal file
After Width: | Height: | Size: 16 KiB |
BIN
0.27.0/_images/defender.gif
Normal file
After Width: | Height: | Size: 170 KiB |
BIN
0.27.0/_images/demon_attack.gif
Normal file
After Width: | Height: | Size: 95 KiB |
BIN
0.27.0/_images/double_dunk.gif
Normal file
After Width: | Height: | Size: 178 KiB |
BIN
0.27.0/_images/elevator_action.gif
Normal file
After Width: | Height: | Size: 382 KiB |
BIN
0.27.0/_images/enduro.gif
Normal file
After Width: | Height: | Size: 296 KiB |
BIN
0.27.0/_images/fishing_derby.gif
Normal file
After Width: | Height: | Size: 641 KiB |
BIN
0.27.0/_images/freeway.gif
Normal file
After Width: | Height: | Size: 872 KiB |
BIN
0.27.0/_images/frostbite.gif
Normal file
After Width: | Height: | Size: 436 KiB |
BIN
0.27.0/_images/frozen_lake.gif
Normal file
After Width: | Height: | Size: 121 KiB |
BIN
0.27.0/_images/gopher.gif
Normal file
After Width: | Height: | Size: 163 KiB |
BIN
0.27.0/_images/gravitar.gif
Normal file
After Width: | Height: | Size: 45 KiB |
BIN
0.27.0/_images/half_cheetah.gif
Normal file
After Width: | Height: | Size: 2.3 MiB |
BIN
0.27.0/_images/hero.gif
Normal file
After Width: | Height: | Size: 261 KiB |
BIN
0.27.0/_images/hopper.gif
Normal file
After Width: | Height: | Size: 3.8 MiB |
BIN
0.27.0/_images/humanoid.gif
Normal file
After Width: | Height: | Size: 3.3 MiB |
BIN
0.27.0/_images/humanoid_standup.gif
Normal file
After Width: | Height: | Size: 2.5 MiB |
BIN
0.27.0/_images/ice_hockey.gif
Normal file
After Width: | Height: | Size: 235 KiB |
BIN
0.27.0/_images/inverted_double_pendulum.gif
Normal file
After Width: | Height: | Size: 3.8 MiB |
BIN
0.27.0/_images/inverted_pendulum.gif
Normal file
After Width: | Height: | Size: 504 KiB |
BIN
0.27.0/_images/jamesbond.gif
Normal file
After Width: | Height: | Size: 249 KiB |
BIN
0.27.0/_images/journey_escape.gif
Normal file
After Width: | Height: | Size: 218 KiB |
BIN
0.27.0/_images/kangaroo.gif
Normal file
After Width: | Height: | Size: 206 KiB |
BIN
0.27.0/_images/krull.gif
Normal file
After Width: | Height: | Size: 187 KiB |
BIN
0.27.0/_images/kung_fu_master.gif
Normal file
After Width: | Height: | Size: 245 KiB |
BIN
0.27.0/_images/lunar_lander.gif
Normal file
After Width: | Height: | Size: 88 KiB |
BIN
0.27.0/_images/lunar_lander_continuous.gif
Normal file
After Width: | Height: | Size: 210 KiB |
BIN
0.27.0/_images/montezuma_revenge.gif
Normal file
After Width: | Height: | Size: 182 KiB |
BIN
0.27.0/_images/mountain_car.gif
Normal file
After Width: | Height: | Size: 48 KiB |
BIN
0.27.0/_images/mountain_car_continuous.gif
Normal file
After Width: | Height: | Size: 46 KiB |
BIN
0.27.0/_images/ms_pacman.gif
Normal file
After Width: | Height: | Size: 572 KiB |
BIN
0.27.0/_images/name_this_game.gif
Normal file
After Width: | Height: | Size: 752 KiB |
BIN
0.27.0/_images/pendulum.gif
Normal file
After Width: | Height: | Size: 122 KiB |
BIN
0.27.0/_images/pendulum.png
Normal file
After Width: | Height: | Size: 9.8 KiB |
BIN
0.27.0/_images/phoenix.gif
Normal file
After Width: | Height: | Size: 163 KiB |
BIN
0.27.0/_images/pitfall.gif
Normal file
After Width: | Height: | Size: 284 KiB |
BIN
0.27.0/_images/pong.gif
Normal file
After Width: | Height: | Size: 507 KiB |
BIN
0.27.0/_images/pooyan.gif
Normal file
After Width: | Height: | Size: 615 KiB |
BIN
0.27.0/_images/private_eye.gif
Normal file
After Width: | Height: | Size: 406 KiB |
BIN
0.27.0/_images/pusher.gif
Normal file
After Width: | Height: | Size: 396 KiB |
BIN
0.27.0/_images/qbert.gif
Normal file
After Width: | Height: | Size: 129 KiB |
BIN
0.27.0/_images/reacher.gif
Normal file
After Width: | Height: | Size: 2.6 MiB |
BIN
0.27.0/_images/reinforce_invpend_gym_v26_fig1.gif
Normal file
After Width: | Height: | Size: 1.5 MiB |
BIN
0.27.0/_images/reinforce_invpend_gym_v26_fig2.png
Normal file
After Width: | Height: | Size: 116 KiB |
BIN
0.27.0/_images/reinforce_invpend_gym_v26_fig3.jpeg
Normal file
After Width: | Height: | Size: 25 KiB |
BIN
0.27.0/_images/reinforce_invpend_gym_v26_fig4.png
Normal file
After Width: | Height: | Size: 86 KiB |
BIN
0.27.0/_images/riverraid.gif
Normal file
After Width: | Height: | Size: 719 KiB |
BIN
0.27.0/_images/road_runner.gif
Normal file
After Width: | Height: | Size: 73 KiB |
BIN
0.27.0/_images/robotank.gif
Normal file
After Width: | Height: | Size: 474 KiB |
BIN
0.27.0/_images/seaquest.gif
Normal file
After Width: | Height: | Size: 406 KiB |
BIN
0.27.0/_images/skiing.gif
Normal file
After Width: | Height: | Size: 740 KiB |
BIN
0.27.0/_images/solaris.gif
Normal file
After Width: | Height: | Size: 223 KiB |
BIN
0.27.0/_images/space_invaders.gif
Normal file
After Width: | Height: | Size: 123 KiB |