From ae75ad2e44e6be930f7a57357b886e3819232898 Mon Sep 17 00:00:00 2001
From: axb2035
Date: Wed, 30 Nov 2022 08:45:19 +1030
Subject: [PATCH] Update docstrings for improved documentation (#160)

---
 docs/environments/toy_text.md | 6 +-
 gymnasium/envs/toy_text/blackjack.py | 69 ++++++++---
 gymnasium/envs/toy_text/cliffwalking.py | 72 ++++++++----
 gymnasium/envs/toy_text/frozen_lake.py | 109 +++++++++++------
 gymnasium/envs/toy_text/taxi.py | 150 ++++++++++++++----------
 5 files changed, 264 insertions(+), 142 deletions(-)

diff --git a/docs/environments/toy_text.md b/docs/environments/toy_text.md
index 2bfab7a96..0f754f9cd 100644
--- a/docs/environments/toy_text.md
+++ b/docs/environments/toy_text.md
@@ -18,8 +18,6 @@ toy_text/frozen_lake.md
:file: toy_text/list.html
```
-All toy text environments were created by us using native Python libraries such as StringIO.
+Toy text environments are designed to be extremely simple, with small discrete state and action spaces, and hence easy to learn. As a result, they are suitable for debugging implementations of reinforcement learning algorithms.
-These environments are designed to be extremely simple, with small discrete state and action spaces, and hence easy to learn. As a result, they are suitable for debugging implementations of reinforcement learning algorithms.
-
-All environments are configurable via arguments specified in each environment's documentation.
+Environments are configurable via arguments specified in each environment's documentation.

diff --git a/gymnasium/envs/toy_text/blackjack.py b/gymnasium/envs/toy_text/blackjack.py
index b8bb9ead3..cd02f7429 100644
--- a/gymnasium/envs/toy_text/blackjack.py
+++ b/gymnasium/envs/toy_text/blackjack.py
@@ -52,46 +52,72 @@ class BlackjackEnv(gym.Env):
that sum to closer to 21 (without going over 21) than the dealers cards.

## Description
- Card Values:
+ The game starts with the dealer having one face up and one face down card,
+ while the player has two face up cards. All cards are drawn from an infinite deck
+ (i.e. with replacement).
+ The card values are:
- Face cards (Jack, Queen, King) have a point value of 10.
- Aces can either count as 11 (called a 'usable ace') or 1.
- Numerical cards (2-9) have a value equal to their number.
- This game is played with an infinite deck (or with replacement).
- The game starts with the dealer having one face up and one face down card,
- while the player has two face up cards.
+ The player's total is the sum of the cards held. The player can request
+ additional cards (hit) until they decide to stop (stick) or exceed 21 (bust,
+ immediate loss).
+
+ After the player sticks, the dealer reveals their facedown card, and draws cards
+ until their sum is 17 or greater. If the dealer goes bust, the player wins.
- The player can request additional cards (hit, action=1) until they decide to stop (stick, action=0)
- or exceed 21 (bust, immediate loss).
- After the player sticks, the dealer reveals their facedown card, and draws
- until their sum is 17 or greater. If the dealer goes bust, the player wins.

If neither the player nor the dealer busts, the outcome (win, lose, draw) is
decided by whose sum is closer to 21.

+ This environment corresponds to the version of the blackjack problem
+ described in Example 5.1 in Reinforcement Learning: An Introduction
+ by Sutton and Barto [1].
+
## Action Space
- There are two actions:
- - 0: stick
- - 1: hit
+ The action shape is `(1,)` in the range `{0, 1}` indicating
+ whether to stick or hit.
+ + - 0: Stick + - 1: Hit ## Observation Space The observation consists of a 3-tuple containing: the player's current sum, the value of the dealer's one showing card (1-10 where 1 is ace), and whether the player holds a usable ace (0 or 1). - This environment corresponds to the version of the blackjack problem - described in Example 5.1 in Reinforcement Learning: An Introduction - by Sutton and Barto (http://incompleteideas.net/book/the-book-2nd.html). + The observation is returned as `(int(), int(), int())`. + + ## Starting State + The starting state is initialised in the following range. + + | Observation | Min | Max | + |---------------------------|------|------| + | Player current sum | 4 | 12 | + | Dealer showing card value | 2 | 11 | + | Usable Ace | 0 | 1 | ## Rewards - win game: +1 - lose game: -1 - draw game: 0 - win game with natural blackjack: + +1.5 (if natural is True) + +1 (if natural is False) - +1.5 (if natural is True) + ## Episode End + The episode ends if the following happens: - +1 (if natural is False) + - Termination: + 1. The player hits and the sum of hand exceeds 21. + 2. The player sticks. + + An ace will always be counted as usable (11) unless it busts the player. + + ## Information + + No additional information is returned. ## Arguments @@ -100,17 +126,22 @@ class BlackjackEnv(gym.Env): gym.make('Blackjack-v1', natural=False, sab=False) ``` - `natural=False`: Whether to give an additional reward for + `natural=False`: Whether to give an additional reward for starting with a natural blackjack, i.e. starting with an ace and ten (sum is 21). - `sab=False`: Whether to follow the exact rules outlined in the book by + `sab=False`: Whether to follow the exact rules outlined in the book by Sutton and Barto. If `sab` is `True`, the keyword argument `natural` will be ignored. If the player achieves a natural blackjack and the dealer does not, the player will win (i.e. get a reward of +1). The reverse rule does not apply. If both the player and the dealer get a natural, it will be a draw (i.e. reward 0). + ## References + [1] R. Sutton and A. Barto, “Reinforcement Learning: + An Introduction” 2020. [Online]. Available: [http://www.incompleteideas.net/book/RLbook2020.pdf](http://www.incompleteideas.net/book/RLbook2020.pdf) + ## Version History - * v0: Initial versions release (1.0.0) + * v1: Fix the natural handling in Blackjack + * v0: Initial version release """ metadata = { diff --git a/gymnasium/envs/toy_text/cliffwalking.py b/gymnasium/envs/toy_text/cliffwalking.py index 93473e8d3..5912a7649 100644 --- a/gymnasium/envs/toy_text/cliffwalking.py +++ b/gymnasium/envs/toy_text/cliffwalking.py @@ -18,39 +18,60 @@ LEFT = 3 class CliffWalkingEnv(Env): """ - This is a simple implementation of the Gridworld Cliff - reinforcement learning task. + Cliff walking involves crossing a gridworld from start to goal while avoiding falling off a cliff. - Adapted from Example 6.6 (page 106) from [Reinforcement Learning: An Introduction - by Sutton and Barto](http://incompleteideas.net/book/bookdraft2018jan1.pdf). + ## Description + The game starts with the player at location [3, 0] of the 4x12 grid world with the + goal located at [3, 11]. If the player reaches the goal the episode ends. + + A cliff runs along [3, 1..10]. If the player moves to a cliff location it + returns to the start location. + + The player makes moves until they reach the goal. + + Adapted from Example 6.6 (page 132) from Reinforcement Learning: An Introduction + by Sutton and Barto [1]. 
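For illustration only (this sketch is not part of the patch), the cliff dynamics described above can be exercised with Gymnasium's standard `make`/`reset`/`step` API:

```python
# Illustrative sketch: stepping off the cliff in CliffWalking.
import gymnasium as gym

env = gym.make("CliffWalking-v0")
obs, info = env.reset()
print(obs)  # 36 -- the start location [3, 0], i.e. 3 * 12 + 0

# Moving right from the start enters the cliff cell [3, 1]: the player is
# returned to the start location and receives a -100 reward.
obs, reward, terminated, truncated, info = env.step(1)
print(obs, reward)  # 36 -100
env.close()
```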
With inspiration from:
[https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py](https://github.com/dennybritz/reinforcement-learning/blob/master/lib/envs/cliff_walking.py)

- ## Description
- The board is a 4x12 matrix, with (using NumPy matrix indexing):
- - [3, 0] as the start at bottom-left
- - [3, 11] as the goal at bottom-right
- - [3, 1..10] as the cliff at bottom-center
+ ## Action Space
+ The action shape is `(1,)` in the range `{0, 3}` indicating
+ which direction to move the player.
- If the agent steps on the cliff, it returns to the start.
- An episode terminates when the agent reaches the goal.
+ - 0: Move up
+ - 1: Move right
+ - 2: Move down
+ - 3: Move left
- ## Actions
- There are 4 discrete deterministic actions:
- - 0: move up
- - 1: move right
- - 2: move down
- - 3: move left
+ ## Observation Space
+ There are 3 x 12 + 1 possible states. The player cannot be at the cliff, nor at
+ the goal, as the latter results in the end of the episode. What remains are all
+ the positions of the first 3 rows plus the bottom-left cell.
- ## Observations
- There are 3x12 + 1 possible states. In fact, the agent cannot be at the cliff, nor at the goal
- (as this results in the end of the episode).
- It remains all the positions of the first 3 rows plus the bottom-left cell.
- The observation is simply the current position encoded as [flattened index](https://numpy.org/doc/stable/reference/generated/numpy.unravel_index.html).
+ The observation is a value representing the player's current position as
+ current_row * nrows + current_col (where both the row and col start at 0).
+
+ For example, the starting position can be calculated as follows: 3 * 12 + 0 = 36.
+
+ The observation is returned as an `int()`.
+
+ ## Starting State
+ The episode starts with the player in state `[36]` (location [3, 0]).

## Reward
- Each time step incurs -1 reward, and stepping into the cliff incurs -100 reward.
+ Each time step incurs -1 reward, unless the player stepped into the cliff,
+ which incurs -100 reward.
+
+ ## Episode End
+ The episode terminates when the player enters state `[47]` (location [3, 11]).
+
+ ## Information
+
+ `step()` and `reset()` return a dict with the following keys:
+ - "p" - transition probability for the state.
+
+ As cliff walking is not stochastic, the transition probability returned is always 1.0.

## Arguments

```python
import gymnasium as gym
gym.make('CliffWalking-v0')
```

+ ## References
+ [1] R. Sutton and A. Barto, “Reinforcement Learning:
+ An Introduction” 2020. [Online]. Available: [http://www.incompleteideas.net/book/RLbook2020.pdf](http://www.incompleteideas.net/book/RLbook2020.pdf)
+
## Version History
- v0: Initial version release
+
"""

metadata = {

diff --git a/gymnasium/envs/toy_text/frozen_lake.py b/gymnasium/envs/toy_text/frozen_lake.py
index 65e51f8cd..4eafe867d 100644
--- a/gymnasium/envs/toy_text/frozen_lake.py
+++ b/gymnasium/envs/toy_text/frozen_lake.py
@@ -81,33 +81,72 @@ def generate_random_map(

class FrozenLakeEnv(Env):
"""
- Frozen lake involves crossing a frozen lake from Start(S) to Goal(G) without falling into any Holes(H)
- by walking over the Frozen(F) lake.
- The agent may not always move in the intended direction due to the slippery nature of the frozen lake.
+ Frozen lake involves crossing a frozen lake from start to goal without falling into any holes
+ by walking over the frozen lake.
+ The player may not always move in the intended direction due to the slippery nature of the frozen lake.
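For illustration only (not part of the patch), the slippery movement summarised above can be observed directly; this sketch assumes the standard Gymnasium API and the 4x4 map described below:

```python
# Illustrative sketch: slippery movement in FrozenLake.
import gymnasium as gym

env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)
obs, info = env.reset(seed=42)
print(obs)  # 0 -- the start location [0, 0]

# With is_slippery=True the intended move (right) is taken with probability 1/3;
# otherwise the player slips to a perpendicular direction.
obs, reward, terminated, truncated, info = env.step(2)
print(obs)  # 1 (moved right), 4 (slipped down) or 0 (slipped up and was clipped at the edge)
env.close()
```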
+ ## Description
+ The game starts with the player at location [0,0] of the frozen lake grid world with the
+ goal located at the far extent of the world, e.g. [3,3] for the 4x4 environment.
+
+ Holes in the ice are distributed in set locations when using a pre-determined map
+ or in random locations when a random map is generated.
+
+ The player makes moves until they reach the goal or fall in a hole.
+
+ The lake is slippery (unless disabled) so the player may sometimes move perpendicular
+ to the intended direction (see `is_slippery`).
+
+ Randomly generated worlds will always have a path to the goal.
+
+ Elf and stool from [https://franuka.itch.io/rpg-snow-tileset](https://franuka.itch.io/rpg-snow-tileset).
+ All other assets by Mel Tillery [http://www.cyaneus.com/](http://www.cyaneus.com/).

## Action Space
- The agent takes a 1-element vector for actions.
- The action space is `(dir)`, where `dir` decides direction to move in which can be:
+ The action shape is `(1,)` in the range `{0, 3}` indicating
+ which direction to move the player.
- - 0: LEFT
- - 1: DOWN
- - 2: RIGHT
- - 3: UP
+ - 0: Move left
+ - 1: Move down
+ - 2: Move right
+ - 3: Move up

## Observation Space
- The observation is a value representing the agent's current position as
+ The observation is a value representing the player's current position as
current_row * nrows + current_col (where both the row and col start at 0).
+ For example, the goal position in the 4x4 map can be calculated as follows: 3 * 4 + 3 = 15.

The number of possible observations is dependent on the size of the map.
- For example, the 4x4 map has 16 possible observations.
+
+ The observation is returned as an `int()`.
+
+ ## Starting State
+ The episode starts with the player in state `[0]` (location [0, 0]).

## Rewards
Reward schedule:
- - Reach goal(G): +1
- - Reach hole(H): 0
- - Reach frozen(F): 0
+ - Reach goal: +1
+ - Reach hole: 0
+ - Reach frozen: 0
+
+ ## Episode End
+ The episode ends if the following happens:
+
+ - Termination:
+ 1. The player moves into a hole.
+ 2. The player reaches the goal at `max(nrow) * max(ncol) - 1` (location `[max(nrow)-1, max(ncol)-1]`).
+
+ - Truncation (when using the time_limit wrapper):
+ 1. The length of the episode is 100 for the 4x4 environment, 200 for the 8x8 environment.
+
+ ## Information
+
+ `step()` and `reset()` return a dict with the following keys:
+ - p - transition probability for the state.
+
+ See `is_slippery` for transition probability information.
+
## Arguments

```python
import gymnasium as gym
gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
```

- `desc`: Used to specify custom map for frozen lake. For example,
+ `desc=None`: Used to specify a custom, non-preloaded map.
+ Specify a custom map, for example:
+ ```
desc=["SFFF", "FHFH", "FFFH", "HFFG"].
+ ```

- A random generated map can be specified by calling the function `generate_random_map`. For example,
+ A randomly generated map can be specified by calling the function `generate_random_map`.
+ ```
+ from gymnasium.envs.toy_text.frozen_lake import generate_random_map
- ```
- from gymnasium.envs.toy_text.frozen_lake import generate_random_map
-
- gym.make('FrozenLake-v1', desc=generate_random_map(size=8))
- ```
-
- `map_name`: ID to use any of the preloaded maps.
+ gym.make('FrozenLake-v1', desc=generate_random_map(size=8))
+ ```
+ `map_name="4x4"`: ID to use any of the preloaded maps.
+
+ ```
"4x4":[
"SFFF",
"FHFH",
"FFFH",
"HFFG"
]

"8x8": [
"SFFFFFFF",
"FFFFFFFF",
"FFFHFFFF",
"FFFFFHFF",
"FFFHFFFF",
"FHHFFFHF",
"FHFFHFHF",
"FFFHFFFG",
]
+ ```
+ If `desc=None` then `map_name` will be used. If both `desc` and `map_name` are
+ `None`, a random 8x8 map with 80% of locations frozen will be generated.

- #### Option is_slippery
- Boolean, set true by default. When True, will move in intended direction with
+ `is_slippery=True`: If true, the player will move in the intended direction with
probability of 1/3 else will move in either perpendicular direction with equal
probability of 1/3 in both directions.

- For example, if action is left and is_slippery is True, then:
- - P(move left)=1/3
- - P(move up)=1/3
- - P(move down)=1/3
+ For example, if action is left and is_slippery is True, then:
+ - P(move left)=1/3
+ - P(move up)=1/3
+ - P(move down)=1/3

- To init the environment without a default value, specify explicitly the value of is_slippery
- in the make command:
-
- import gymnasium as gym
- gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)

## Version History
* v1: Bug fixes to rewards
- * v0: Initial versions release (1.0.0)
+ * v0: Initial version release
+
"""

metadata = {

diff --git a/gymnasium/envs/toy_text/taxi.py b/gymnasium/envs/toy_text/taxi.py
index a50e5ccee..d8e0418d2 100644
--- a/gymnasium/envs/toy_text/taxi.py
+++ b/gymnasium/envs/toy_text/taxi.py
@@ -24,43 +24,69 @@ WINDOW_SIZE = (550, 350)

class TaxiEnv(Env):
"""
-
- The Taxi Problem
- from "Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition"
- by Tom Dietterich
+ The Taxi Problem involves navigating to passengers in a grid world, picking them up and dropping them
+ off at one of four locations.

## Description
- There are four designated locations in the grid world indicated by R(ed),
- G(reen), Y(ellow), and B(lue). When the episode starts, the taxi starts off
- at a random square and the passenger is at a random location. The taxi
- drives to the passenger's location, picks up the passenger, drives to the
- passenger's destination (another one of the four specified locations), and
- then drops off the passenger. Once the passenger is dropped off, the episode ends.
+ There are four designated pick-up and drop-off locations (Red, Green, Yellow and Blue) in the
+ 5x5 grid world. The taxi starts off at a random square and the passenger at one of the
+ designated locations.
+
+ The goal is to move the taxi to the passenger's location, pick up the passenger,
+ move to the passenger's desired destination, and
+ drop off the passenger. Once the passenger is dropped off, the episode ends.
+
+ The player receives positive rewards for successfully dropping off the passenger at the correct
+ location, and negative rewards for incorrect attempts to pick up/drop off the passenger and
+ for each step where another reward is not received.

Map:
- +---------+
- |R: | : :G|
- | : | : : |
- | : : : : |
- | | : | : |
- |Y| : |B: |
- +---------+
+ +---------+
+ |R: | : :G|
+ | : | : : |
+ | : : : : |
+ | | : | : |
+ |Y| : |B: |
+ +---------+

- ## Actions
- There are 6 discrete deterministic actions:
- - 0: move south
- - 1: move north
- - 2: move east
- - 3: move west
- - 4: pickup passenger
- - 5: drop off passenger
+ From "Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition"
+ by Tom Dietterich [1].

- ## Observations
+ ## Action Space
+ The action shape is `(1,)` in the range `{0, 5}` indicating
+ which direction to move the taxi or to pick up/drop off passengers.
+
+ - 0: Move south (down)
+ - 1: Move north (up)
+ - 2: Move east (right)
+ - 3: Move west (left)
+ - 4: Pickup passenger
+ - 5: Drop off passenger
+
+ ## Observation Space
There are 500 discrete states since there are 25 taxi positions, 5 possible
locations of the passenger (including the case when the passenger is in the
taxi), and 4 destination locations.

+ Destinations on the map are represented by the first letter of their color.
+
+ Passenger locations:
+ - 0: Red
+ - 1: Green
+ - 2: Yellow
+ - 3: Blue
+ - 4: In taxi
+
+ Destinations:
+ - 0: Red
+ - 1: Green
+ - 2: Yellow
+ - 3: Blue
+
+ An observation is returned as an `int()` that encodes the corresponding state, calculated by
+ `((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination`
+
Note that there are 400 states that can actually be reached during an
episode. The missing states correspond to situations in which the passenger
is at the same location as their destination, as this typically signals the
end of an episode. Four additional states can be observed right after a
successful episodes, when both the passenger and the taxi are at the destination.
This gives a total of 404 reachable discrete states.

- Each state space is represented by the tuple:
- (taxi_row, taxi_col, passenger_location, destination)
-
- An observation is an integer that encodes the corresponding state.
- The state tuple can then be decoded with the "decode" method.
-
- Passenger locations:
- - 0: R(ed)
- - 1: G(reen)
- - 2: Y(ellow)
- - 3: B(lue)
- - 4: in taxi
-
- Destinations:
- - 0: R(ed)
- - 1: G(reen)
- - 2: Y(ellow)
- - 3: B(lue)
-
- ## Info
-
- ``step`` and ``reset()`` will return an info dictionary that contains "p" and "action_mask" containing
- the probability that the state is taken and a mask of what actions will result in a change of state to speed up training.
-
- As Taxi's initial state is a stochastic, the "p" key represents the probability of the
- transition however this value is currently bugged being 1.0, this will be fixed soon.
- As the steps are deterministic, "p" represents the probability of the transition which is always 1.0
-
- For some cases, taking an action will have no effect on the state of the agent.
- In v0.25.0, ``info["action_mask"]`` contains a np.ndarray for each of the action specifying
- if the action will change the state.
-
- To sample a modifying action, use ``action = env.action_space.sample(info["action_mask"])``
- Or with a Q-value based algorithm ``action = np.argmax(q_values[obs, np.where(info["action_mask"] == 1)[0]])``.

+ ## Starting State
+ The episode starts with the player in a random state.

## Rewards
- -1 per step unless other reward is triggered.
- +20 delivering passenger.
- -10 executing "pickup" and "drop-off" actions illegally.

+ An action that results in a noop, like moving into a wall, will incur the time step
+ penalty. Noops can be avoided by sampling the `action_mask` returned in `info`.
+
+ ## Episode End
+ The episode ends if the following happens:
+
+ - Termination:
+ 1. The taxi drops off the passenger.
+
+ - Truncation (when using the time_limit wrapper):
+ 1. The length of the episode is 200.
+
+ ## Information
+
+ `step()` and `reset()` return a dict with the following keys:
+ - p - transition probability for the state.
+ - action_mask - whether actions will cause a transition to a new state.
+
+ As the taxi's movement is deterministic, the transition probability is always 1.0. Implementing
+ a transition probability in line with the Dietterich paper ('The fickle taxi task')
+ is a TODO.
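For illustration only (not part of the patch), the `info` keys and the state encoding documented above can be inspected as follows; the transition-probability entry is printed rather than hard-coded, since its exact key name may differ from the `p` shorthand used above:

```python
# Illustrative sketch: reading `info` and inverting the Taxi state encoding.
import gymnasium as gym

env = gym.make("Taxi-v3")
obs, info = env.reset(seed=0)
print(sorted(info))          # the returned keys, including 'action_mask'
print(info["action_mask"])   # 1 where an action would change the state

# obs == ((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination
destination = obs % 4
passenger_location = (obs // 4) % 5
taxi_col = (obs // 20) % 5
taxi_row = obs // 100
print(taxi_row, taxi_col, passenger_location, destination)
env.close()
```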
+
+ For some cases, taking an action will have no effect on the state of the episode.
+ In v0.25.0, ``info["action_mask"]`` contains a np.ndarray for each of the actions specifying
+ if the action will change the state.
+
+ To sample a modifying action, use ``action = env.action_space.sample(info["action_mask"])``
+ Or with a Q-value based algorithm ``action = np.argmax(q_values[obs, np.where(info["action_mask"] == 1)[0]])``.
+
+
## Arguments

```python
import gymnasium as gym
gym.make('Taxi-v3')
```

+ ## References
+ [1] T. G. Dietterich, “Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition,”
+ Journal of Artificial Intelligence Research, vol. 13, pp. 227–303, Nov. 2000, doi: 10.1613/jair.639.
+
## Version History
* v3: Map Correction + Cleaner Domain Description, v0.25.0 action masking added to the reset and step information
* v2: Disallow Taxi start location = goal location, Update Taxi observations in the rollout, Update Taxi reward threshold.
* v1: Remove (3,2) from locs, add passidx<4 check
- * v0: Initial versions release
+ * v0: Initial version release
"""

metadata = {
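To close out the Taxi documentation, an illustrative sketch (not part of the patch) of the masked sampling described in the Information section above:

```python
# Illustrative sketch: a random rollout that only samples state-changing actions
# via `action_mask`, as described in the Taxi docstring above.
import gymnasium as gym

env = gym.make("Taxi-v3")
obs, info = env.reset(seed=0)

terminated = truncated = False
total_reward = 0
while not (terminated or truncated):
    action = env.action_space.sample(info["action_mask"])
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

print(total_reward)  # episode return; the episode truncates at 200 steps if no drop-off occurs
env.close()
```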