Remove DiscreteEnv class (#2514)

Author: Carlos Luis
Date: 2021-12-22 19:25:36 +01:00
Committed by: GitHub
Parent: 4fe7efaacd
Commit: 102cd1bf4c

5 changed files with 95 additions and 93 deletions

File: gym/envs/toy_text/cliffwalking.py

@@ -1,8 +1,11 @@
-import numpy as np
 import sys
 from contextlib import closing
 from io import StringIO
-from gym.envs.toy_text import discrete
+from typing import Optional
+
+import numpy as np
+from gym import Env, spaces
+from gym.envs.toy_text.utils import categorical_sample
 
 UP = 0
 RIGHT = 1
@@ -10,7 +13,7 @@ DOWN = 2
 LEFT = 3
 
 
-class CliffWalkingEnv(discrete.DiscreteEnv):
+class CliffWalkingEnv(Env):
     """
     This is a simple implementation of the Gridworld Cliff
     reinforcement learning task.
@@ -37,29 +40,30 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         self.shape = (4, 12)
         self.start_state_index = np.ravel_multi_index((3, 0), self.shape)
 
-        nS = np.prod(self.shape)
-        nA = 4
+        self.nS = np.prod(self.shape)
+        self.nA = 4
 
         # Cliff Location
         self._cliff = np.zeros(self.shape, dtype=np.bool)
         self._cliff[3, 1:-1] = True
 
         # Calculate transition probabilities and rewards
-        P = {}
-        for s in range(nS):
+        self.P = {}
+        for s in range(self.nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = {a: [] for a in range(nA)}
-            P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
-            P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
-            P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
-            P[s][LEFT] = self._calculate_transition_prob(position, [0, -1])
+            self.P[s] = {a: [] for a in range(self.nA)}
+            self.P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
+            self.P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
+            self.P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
+            self.P[s][LEFT] = self._calculate_transition_prob(position, [0, -1])
 
         # Calculate initial state distribution
         # We always start in state (3, 0)
-        isd = np.zeros(nS)
-        isd[self.start_state_index] = 1.0
-
-        super().__init__(nS, nA, P, isd)
+        self.initial_state_distrib = np.zeros(self.nS)
+        self.initial_state_distrib[self.start_state_index] = 1.0
+
+        self.observation_space = spaces.Discrete(self.nS)
+        self.action_space = spaces.Discrete(self.nA)
 
     def _limit_coordinates(self, coord):
         """
@@ -90,6 +94,20 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         is_done = tuple(new_position) == terminal_state
         return [(1.0, new_state, -1, is_done)]
 
+    def step(self, a):
+        transitions = self.P[self.s][a]
+        i = categorical_sample([t[0] for t in transitions], self.np_random)
+        p, s, r, d = transitions[i]
+        self.s = s
+        self.lastaction = a
+        return (int(s), r, d, {"prob": p})
+
+    def reset(self, seed: Optional[int] = None):
+        super().reset(seed=seed)
+        self.s = categorical_sample(self.initial_state_distrib, self.np_random)
+        self.lastaction = None
+        return int(self.s)
+
     def render(self, mode="human"):
         outfile = StringIO() if mode == "ansi" else sys.stdout
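
Note (not part of the diff): with DiscreteEnv removed, each toy-text env now owns its P table, initial_state_distrib, spaces, step, and reset, as the hunks above show. A minimal usage sketch of the migrated environment, assuming the module path gym.envs.toy_text.cliffwalking for this file; reset takes an optional seed and returns the integer start state, and step returns (obs, reward, done, info) with the transition probability in info:

    # Minimal sketch; the module path is assumed, everything else comes from the diff above.
    from gym.envs.toy_text.cliffwalking import CliffWalkingEnv

    env = CliffWalkingEnv()
    obs = env.reset(seed=0)          # seeds via Env.reset and returns an int state
    done = False
    while not done:
        action = env.action_space.sample()           # spaces.Discrete(self.nA)
        obs, reward, done, info = env.step(action)   # info carries {"prob": p}
    env.render()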

File: gym/envs/toy_text/discrete.py (deleted)

@@ -1,57 +0,0 @@
-from typing import Optional
-
-import numpy as np
-
-from gym import Env, spaces
-from gym.utils import seeding
-
-
-def categorical_sample(prob_n, np_random):
-    """
-    Sample from categorical distribution
-    Each row specifies class probabilities
-    """
-    prob_n = np.asarray(prob_n)
-    csprob_n = np.cumsum(prob_n)
-    return (csprob_n > np_random.random()).argmax()
-
-
-class DiscreteEnv(Env):
-    """
-    Has the following members
-    - nS: number of states
-    - nA: number of actions
-    - P: transitions (*)
-    - isd: initial state distribution (**)
-
-    (*) dictionary of lists, where
-      P[s][a] == [(probability, nextstate, reward, done), ...]
-    (**) list or array of length nS
-    """
-
-    def __init__(self, nS, nA, P, isd):
-        self.P = P
-        self.isd = isd
-        self.lastaction = None  # for rendering
-        self.nS = nS
-        self.nA = nA
-
-        self.action_space = spaces.Discrete(self.nA)
-        self.observation_space = spaces.Discrete(self.nS)
-
-    def reset(self, seed: Optional[int] = None):
-        super().reset(seed=seed)
-        self.s = categorical_sample(self.isd, self.np_random)
-        self.lastaction = None
-        return int(self.s)
-
-    def step(self, a):
-        transitions = self.P[self.s][a]
-        i = categorical_sample([t[0] for t in transitions], self.np_random)
-        p, s, r, d = transitions[i]
-        self.s = s
-        self.lastaction = a
-        return (int(s), r, d, {"prob": p})
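
Note (not part of the diff): external code that subclassed the removed DiscreteEnv can migrate the same way the built-in envs do above, by defining P, initial_state_distrib, the spaces, and its own step/reset. A hypothetical two-state example, mirroring that pattern; the class and its dynamics are made up for illustration:

    from typing import Optional

    import numpy as np
    from gym import Env, spaces
    from gym.envs.toy_text.utils import categorical_sample


    class TwoStateEnv(Env):
        """Hypothetical migration example; not part of gym."""

        def __init__(self):
            # P[s][a] == [(probability, nextstate, reward, done), ...]
            self.P = {
                0: {0: [(0.9, 1, 0.0, False), (0.1, 0, 0.0, False)]},
                1: {0: [(1.0, 1, 1.0, True)]},
            }
            self.initial_state_distrib = np.array([1.0, 0.0])
            self.observation_space = spaces.Discrete(2)
            self.action_space = spaces.Discrete(1)

        def step(self, a):
            transitions = self.P[self.s][a]
            i = categorical_sample([t[0] for t in transitions], self.np_random)
            p, s, r, d = transitions[i]
            self.s = s
            self.lastaction = a
            return (int(s), r, d, {"prob": p})

        def reset(self, seed: Optional[int] = None):
            super().reset(seed=seed)
            self.s = categorical_sample(self.initial_state_distrib, self.np_random)
            self.lastaction = None
            return int(self.s)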

File: gym/envs/toy_text/frozen_lake.py

@@ -1,11 +1,11 @@
 import sys
 from contextlib import closing
+from io import StringIO
+from typing import Optional
 
 import numpy as np
-from io import StringIO
-
-from gym import utils
-from gym.envs.toy_text import discrete
+from gym import Env, spaces, utils
+from gym.envs.toy_text.utils import categorical_sample
 
 LEFT = 0
 DOWN = 1
@@ -63,7 +63,7 @@ def generate_random_map(size=8, p=0.8):
     return ["".join(x) for x in res]
 
 
-class FrozenLakeEnv(discrete.DiscreteEnv):
+class FrozenLakeEnv(Env):
     """
     Winter is here. You and your friends were tossing around a frisbee at the
     park when you made a wild throw that left the frisbee out in the middle of
@@ -103,10 +103,10 @@ class FrozenLakeEnv(discrete.DiscreteEnv):
         nA = 4
         nS = nrow * ncol
 
-        isd = np.array(desc == b"S").astype("float64").ravel()
-        isd /= isd.sum()
+        self.initial_state_distrib = np.array(desc == b"S").astype("float64").ravel()
+        self.initial_state_distrib /= self.initial_state_distrib.sum()
 
-        P = {s: {a: [] for a in range(nA)} for s in range(nS)}
+        self.P = {s: {a: [] for a in range(nA)} for s in range(nS)}
 
         def to_s(row, col):
             return row * ncol + col
@@ -134,7 +134,7 @@ class FrozenLakeEnv(discrete.DiscreteEnv):
             for col in range(ncol):
                 s = to_s(row, col)
                 for a in range(4):
-                    li = P[s][a]
+                    li = self.P[s][a]
                     letter = desc[row, col]
                     if letter in b"GH":
                         li.append((1.0, s, 0, True))
@@ -147,7 +147,22 @@ class FrozenLakeEnv(discrete.DiscreteEnv):
                         else:
                             li.append((1.0, *update_probability_matrix(row, col, a)))
 
-        super().__init__(nS, nA, P, isd)
+        self.observation_space = spaces.Discrete(nS)
+        self.action_space = spaces.Discrete(nA)
+
+    def step(self, a):
+        transitions = self.P[self.s][a]
+        i = categorical_sample([t[0] for t in transitions], self.np_random)
+        p, s, r, d = transitions[i]
+        self.s = s
+        self.lastaction = a
+        return (int(s), r, d, {"prob": p})
+
+    def reset(self, seed: Optional[int] = None):
+        super().reset(seed=seed)
+        self.s = categorical_sample(self.initial_state_distrib, self.np_random)
+        self.lastaction = None
+        return int(self.s)
+
     def render(self, mode="human"):
         outfile = StringIO() if mode == "ansi" else sys.stdout

File: gym/envs/toy_text/taxi.py

@@ -1,9 +1,11 @@
 import sys
 from contextlib import closing
 from io import StringIO
-from gym import utils
-from gym.envs.toy_text import discrete
+from typing import Optional
+
 import numpy as np
+from gym import Env, spaces, utils
+from gym.envs.toy_text.utils import categorical_sample
 
 MAP = [
     "+---------+",
@@ -16,7 +18,7 @@ MAP = [
 ]
 
 
-class TaxiEnv(discrete.DiscreteEnv):
+class TaxiEnv(Env):
     """
     The Taxi Problem
     from "Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition"
@@ -81,9 +83,9 @@ class TaxiEnv(discrete.DiscreteEnv):
         num_columns = 5
         max_row = num_rows - 1
         max_col = num_columns - 1
-        initial_state_distrib = np.zeros(num_states)
+        self.initial_state_distrib = np.zeros(num_states)
         num_actions = 6
-        P = {
+        self.P = {
             state: {action: [] for action in range(num_actions)}
             for state in range(num_states)
         }
@@ -93,7 +95,7 @@ class TaxiEnv(discrete.DiscreteEnv):
                     for dest_idx in range(len(locs)):
                         state = self.encode(row, col, pass_idx, dest_idx)
                         if pass_idx < 4 and pass_idx != dest_idx:
-                            initial_state_distrib[state] += 1
+                            self.initial_state_distrib[state] += 1
                         for action in range(num_actions):
                             # defaults
                             new_row, new_col, new_pass_idx = row, col, pass_idx
@@ -128,11 +130,10 @@ class TaxiEnv(discrete.DiscreteEnv):
                             new_state = self.encode(
                                 new_row, new_col, new_pass_idx, dest_idx
                             )
-                            P[state][action].append((1.0, new_state, reward, done))
-        initial_state_distrib /= initial_state_distrib.sum()
-        discrete.DiscreteEnv.__init__(
-            self, num_states, num_actions, P, initial_state_distrib
-        )
+                            self.P[state][action].append((1.0, new_state, reward, done))
+        self.initial_state_distrib /= self.initial_state_distrib.sum()
+        self.action_space = spaces.Discrete(num_actions)
+        self.observation_space = spaces.Discrete(num_states)
 
     def encode(self, taxi_row, taxi_col, pass_loc, dest_idx):
         # (5) 5, 5, 4
@@ -157,6 +158,20 @@ class TaxiEnv(discrete.DiscreteEnv):
         assert 0 <= i < 5
         return reversed(out)
 
+    def step(self, a):
+        transitions = self.P[self.s][a]
+        i = categorical_sample([t[0] for t in transitions], self.np_random)
+        p, s, r, d = transitions[i]
+        self.s = s
+        self.lastaction = a
+        return (int(s), r, d, {"prob": p})
+
+    def reset(self, seed: Optional[int] = None):
+        super().reset(seed=seed)
+        self.s = categorical_sample(self.initial_state_distrib, self.np_random)
+        self.lastaction = None
+        return int(self.s)
+
     def render(self, mode="human"):
         outfile = StringIO() if mode == "ansi" else sys.stdout

File: gym/envs/toy_text/utils.py (new)

@@ -0,0 +1,11 @@
+import numpy as np
+
+
+def categorical_sample(prob_n, np_random):
+    """
+    Sample from categorical distribution
+    Each row specifies class probabilities
+    """
+    prob_n = np.asarray(prob_n)
+    csprob_n = np.cumsum(prob_n)
+    return (csprob_n > np_random.random()).argmax()
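
Note (not part of the diff): categorical_sample compares a single uniform draw against the cumulative probabilities, so the first index whose cumulative sum exceeds the draw is returned and index i is chosen with probability prob_n[i]. A quick sanity check, with a plain numpy Generator standing in for the env's np_random:

    import numpy as np
    from gym.envs.toy_text.utils import categorical_sample

    rng = np.random.default_rng(0)
    counts = np.zeros(3, dtype=int)
    for _ in range(10_000):
        counts[categorical_sample([0.2, 0.5, 0.3], rng)] += 1
    print(counts / counts.sum())  # empirically close to [0.2, 0.5, 0.3]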