Mirror of https://github.com/Farama-Foundation/Gymnasium.git, synced 2025-08-20 05:52:03 +00:00

Algorithmic refactor (#383)

* Refactor/document algorithmic environments and add tests.
* test for 3 row addition
* Fix failing rollout test by reinserting quirk in reversedAddition env
* todo regarding addition3-v0
* Fix python 3 division issues
* typo fix
* Re-generate python3 rollout file to account for ReversedAddition bug fix
gym/envs/algorithmic/__init__.py

@@ -1,4 +1,4 @@
-from gym.envs.algorithmic.copy import CopyEnv
+from gym.envs.algorithmic.copy_ import CopyEnv
 from gym.envs.algorithmic.repeat_copy import RepeatCopyEnv
 from gym.envs.algorithmic.duplicated_input import DuplicatedInputEnv
 from gym.envs.algorithmic.reverse import ReverseEnv
gym/envs/algorithmic/algorithmic_env.py

@@ -1,3 +1,35 @@
+"""
+Algorithmic environments have the following traits in common:
+
+- A 1-d "input tape" or 2-d "input grid" of characters
+- A target string which is a deterministic function of the input characters
+
+Agents control a read head that moves over the input tape. Observations consist
+of the single character currently under the read head. The read head may fall
+off the end of the tape in any direction. When this happens, agents will observe
+a special blank character (with index=env.base) until they get back in bounds.
+
+Actions consist of 3 sub-actions:
+- Direction to move the read head (left or right, plus up and down for 2-d envs)
+- Whether to write to the output tape
+- Which character to write (ignored if the above sub-action is 0)
+
+An episode ends when:
+- The agent writes the full target string to the output tape.
+- The agent writes an incorrect character.
+- The agent runs out the time limit. (Which is fairly conservative.)
+
+Reward schedule:
+    write a correct character: +1
+    write a wrong character: -.5
+    run out the clock: -1
+    otherwise: 0
+
+In the beginning, input strings will be fairly short. After an environment has
+been consistently solved over some window of episodes, the environment will
+increase the average length of generated strings. Typical env specs require
+leveling up many times to reach their reward threshold.
+"""
 from gym import Env
 from gym.spaces import Discrete, Tuple
 from gym.utils import colorize, seeding
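Illustrative example (not part of the commit): the interface described in the new docstring can be exercised with a minimal random-agent loop. This sketch assumes the 'Copy-v0' registration that ships with gym at this time and the step()/reset() API of this era (step returns an (obs, reward, done, info) 4-tuple).

import gym

env = gym.make('Copy-v0')
obs = env.reset()
done = False
total = 0.0
while not done:
    # Action is a 3-tuple: (move direction, write-or-not, character to write)
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    total += reward   # +1 correct write, -0.5 wrong write, -1 timeout, else 0
print(total)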
@@ -5,93 +37,82 @@ import numpy as np
 from six import StringIO
 import sys
 import math
+import logging
 
-hash_base = None
-
-def ha(array):
-    return (hash_base * (array + 5)).sum()
+logger = logging.getLogger(__name__)
 
 
 class AlgorithmicEnv(Env):
 
     metadata = {'render.modes': ['human', 'ansi']}
+    # Only 'promote' the length of generated input strings if the worst of the
+    # last n episodes was no more than this far from the maximum reward
+    MIN_REWARD_SHORTFALL_FOR_PROMOTION = -1.0
 
-    def __init__(self, inp_dim=1, base=10, chars=False):
-        global hash_base
-
-        hash_base = 50 ** np.arange(inp_dim)
+    def __init__(self, base=10, chars=False, starting_min_length=2):
+        """
+        base: Number of distinct characters.
+        chars: If True, use uppercase alphabet. Otherwise, digits. Only affects
+               rendering.
+        starting_min_length: Minimum input string length. Ramps up as episodes
+                             are consistently solved.
+        """
         self.base = base
+        # Keep track of this many past episodes
        self.last = 10
-        self.total_reward = 0
-        self.sum_reward = 0
-        AlgorithmicEnv.sum_rewards = []
-        self.chars = chars
-        self.inp_dim = inp_dim
-        AlgorithmicEnv.current_length = 2
-        tape_control = []
-
-        self.action_space = Tuple(([Discrete(2 * self.inp_dim), Discrete(2), Discrete(self.base)]))
+        # Cumulative reward earned this episode
+        self.episode_total_reward = None
+        # Running tally of reward shortfalls. e.g. if there were 10 points to earn and
+        # we got 8, we'd append -2
+        AlgorithmicEnv.reward_shortfalls = []
+        if chars:
+            self.charmap = [chr(ord('A')+i) for i in range(base)]
+        else:
+            self.charmap = [str(i) for i in range(base)]
+        self.charmap.append(' ')
+        # TODO: Not clear why this is a class variable rather than instance.
+        # Could lead to some spooky action at a distance if someone is working
+        # with multiple algorithmic envs at once. Also makes testing tricky.
+        AlgorithmicEnv.min_length = starting_min_length
+        # Three sub-actions:
+        #  1. Move read head left or write (or up/down)
+        #  2. Write or not
+        #  3. Which character to write. (Ignored if should_write=0)
+        self.action_space = Tuple(
+            [Discrete(len(self.MOVEMENTS)), Discrete(2), Discrete(self.base)]
+        )
+        # Can see just what is on the input tape (one of n characters, or nothing)
         self.observation_space = Discrete(self.base + 1)
 
         self._seed()
         self.reset()
 
+    @classmethod
+    def _movement_idx(kls, movement_name):
+        return kls.MOVEMENTS.index(movement_name)
+
     def _seed(self, seed=None):
         self.np_random, seed = seeding.np_random(seed)
         return [seed]
 
     def _get_obs(self, pos=None):
-        if pos is None:
-            pos = self.x
-        assert isinstance(pos, np.ndarray) and pos.shape[0] == self.inp_dim
-        if ha(pos) not in self.content:
-            self.content[ha(pos)] = self.base
-        return self.content[ha(pos)]
+        """Return an observation corresponding to the given read head position
+        (or the current read head position, if none is given)."""
+        raise NotImplemented
 
     def _get_str_obs(self, pos=None):
         ret = self._get_obs(pos)
-        if ret == self.base:
-            return " "
-        else:
-            if self.chars:
-                return chr(ret + ord('A'))
-            return str(ret)
+        return self.charmap[ret]
 
-    def _get_str_target(self, pos=None):
-        if pos not in self.target:
+    def _get_str_target(self, pos):
+        """Return the ith character of the target string (or " " if index
+        out of bounds)."""
+        if pos < 0 or len(self.target) <= pos:
             return " "
         else:
-            ret = self.target[pos]
-            if self.chars:
-                return chr(ret + ord('A'))
-            return str(ret)
+            return self.charmap[self.target[pos]]
 
     def _render_observation(self):
-        x = self.x
-        if self.inp_dim == 1:
-            x_str = "Observation Tape : "
-            for i in range(-2, self.total_len + 2):
-                if i == x:
-                    x_str += colorize(self._get_str_obs(np.array([i])), 'green', highlight=True)
-                else:
-                    x_str += self._get_str_obs(np.array([i]))
-            x_str += "\n"
-            return x_str
-        elif self.inp_dim == 2:
-            label = "Observation Grid : "
-            x_str = ""
-            for j in range(-1, 3):
-                if j != -1:
-                    x_str += " " * len(label)
-                for i in range(-2, self.total_len + 2):
-                    if i == x[0] and j == x[1]:
-                        x_str += colorize(self._get_str_obs(np.array([i, j])), 'green', highlight=True)
-                    else:
-                        x_str += self._get_str_obs(np.array([i, j]))
-                x_str += "\n"
-            x_str = label + x_str
-            return x_str
-        else:
-            assert False
+        """Return a string representation of the input tape/grid."""
+        raise NotImplemented
 
 
     def _render(self, mode='human', close=False):
         if close:
@@ -99,34 +120,25 @@ class AlgorithmicEnv(Env):
             return
 
         outfile = StringIO() if mode == 'ansi' else sys.stdout
-        inp = "Total length of input instance: %d, step: %d\n" % (self.total_len, self.time)
+        inp = "Total length of input instance: %d, step: %d\n" % (self.input_width, self.time)
         outfile.write(inp)
-        x, y, action = self.x, self.y, self.last_action
+        x, y, action = self.read_head_position, self.write_head_position, self.last_action
         if action is not None:
             inp_act, out_act, pred = action
         outfile.write("=" * (len(inp) - 1) + "\n")
         y_str = "Output Tape : "
         target_str = "Targets : "
         if action is not None:
-            if self.chars:
-                pred_str = chr(pred + ord('A'))
-            else:
-                pred_str = str(pred)
+            pred_str = self.charmap[pred]
         x_str = self._render_observation()
-        max_len = int(self.total_reward) + 1
-        for i in range(-2, max_len):
-            if i not in self.target:
-                y_str += " "
-                continue
+        for i in range(-2, len(self.target) + 2):
             target_str += self._get_str_target(i)
             if i < y - 1:
                 y_str += self._get_str_target(i)
             elif i == (y - 1):
                 if action is not None and out_act == 1:
-                    if pred == self.target[i]:
-                        y_str += colorize(pred_str, 'green', highlight=True)
-                    else:
-                        y_str += colorize(pred_str, 'red', highlight=True)
+                    color = 'green' if pred == self.target[i] else 'red'
+                    y_str += colorize(pred_str, color, highlight=True)
                 else:
                     y_str += self._get_str_target(i)
         outfile.write(x_str)
@@ -134,77 +146,185 @@ class AlgorithmicEnv(Env):
         outfile.write(target_str + "\n\n")
 
         if action is not None:
-            outfile.write("Current reward : %.3f\n" % self.reward)
-            outfile.write("Cumulative reward : %.3f\n" % self.sum_reward)
-            move = ""
-            if inp_act == 0:
-                move = "left"
-            elif inp_act == 1:
-                move = "right"
-            elif inp_act == 2:
-                move += "up"
-            elif inp_act == 3:
-                move += "down"
+            outfile.write("Current reward : %.3f\n" % self.last_reward)
+            outfile.write("Cumulative reward : %.3f\n" % self.episode_total_reward)
+            move = self.MOVEMENTS[inp_act]
             outfile.write("Action : Tuple(move over input: %s,\n" % move)
-            if out_act == 1:
-                out_act = "True"
-            else:
-                out_act = "False"
+            out_act = out_act == 1
             outfile.write(" write to the output tape: %s,\n" % out_act)
             outfile.write(" prediction: %s)\n" % pred_str)
         else:
             outfile.write("\n" * 5)
         return outfile
 
+    @property
+    def input_width(self):
+        return len(self.input_data)
+
     def _step(self, action):
+        assert self.action_space.contains(action)
         self.last_action = action
         inp_act, out_act, pred = action
         done = False
         reward = 0.0
-        # We are outside the sample.
         self.time += 1
-        if self.y not in self.target:
-            reward = -10.0
-            done = True
-        else:
-            if out_act == 1:
-                if pred == self.target[self.y]:
-                    reward = 1.0
-                else:
-                    reward = -0.5
-                    done = True
-                self.y += 1
-                if self.y not in self.target:
-                    done = True
-            if inp_act == 0:
-                self.x[0] -= 1
-            elif inp_act == 1:
-                self.x[0] += 1
-            elif inp_act == 2:
-                self.x[1] -= 1
-            elif inp_act == 3:
-                self.x[1] += 1
-            if self.time > self.total_len + self.total_reward + 4:
-                reward = -1.0
+        assert 0 <= self.write_head_position
+        if out_act == 1:
+            try:
+                correct = pred == self.target[self.write_head_position]
+            except IndexError:
+                logger.warn("It looks like you're calling step() even though this "+
+                    "environment has already returned done=True. You should always call "+
+                    "reset() once you receive done=True. Any further steps are undefined "+
+                    "behaviour.")
+                correct = False
+            if correct:
+                reward = 1.0
+            else:
+                # Bail as soon as a wrong character is written to the tape
+                reward = -0.5
                 done = True
+            self.write_head_position += 1
+            if self.write_head_position >= len(self.target):
+                done = True
+        self._move(inp_act)
+        if self.time > self.time_limit:
+            reward = -1.0
+            done = True
         obs = self._get_obs()
-        self.reward = reward
-        self.sum_reward += reward
+        self.last_reward = reward
+        self.episode_total_reward += reward
         return (obs, reward, done, {})
 
+    @property
+    def time_limit(self):
+        """If an agent takes more than this many timesteps, end the episode
+        immediately and return a negative reward."""
+        # (Seemingly arbitrary)
+        return self.input_width + len(self.target) + 4
+
+    def _check_levelup(self):
+        """Called between episodes. Update our running record of episode rewards
+        and, if appropriate, 'level up' minimum input length."""
+        if self.episode_total_reward is None:
+            # This is before the first episode/call to reset(). Nothing to do
+            return
+        AlgorithmicEnv.reward_shortfalls.append(self.episode_total_reward - len(self.target))
+        AlgorithmicEnv.reward_shortfalls = AlgorithmicEnv.reward_shortfalls[-self.last:]
+        if len(AlgorithmicEnv.reward_shortfalls) == self.last and \
+                min(AlgorithmicEnv.reward_shortfalls) >= self.MIN_REWARD_SHORTFALL_FOR_PROMOTION and \
+                AlgorithmicEnv.min_length < 30:
+            AlgorithmicEnv.min_length += 1
+            AlgorithmicEnv.reward_shortfalls = []
+
     def _reset(self):
+        self._check_levelup()
         self.last_action = None
-        self.x = np.zeros(self.inp_dim).astype(np.int)
-        self.y = 0
-        AlgorithmicEnv.sum_rewards.append(self.sum_reward - self.total_reward)
-        AlgorithmicEnv.sum_rewards = AlgorithmicEnv.sum_rewards[-self.last:]
-        if len(AlgorithmicEnv.sum_rewards) == self.last and \
-            min(AlgorithmicEnv.sum_rewards) >= -1.0 and \
-            AlgorithmicEnv.current_length < 30:
-            AlgorithmicEnv.current_length += 1
-            AlgorithmicEnv.sum_rewards = []
-        self.sum_reward = 0.0
+        self.last_reward = 0
+        self.read_head_position = self.READ_HEAD_START
+        self.write_head_position = 0
+        self.episode_total_reward = 0.0
         self.time = 0
-        self.total_len = self.np_random.randint(3) + AlgorithmicEnv.current_length
-        self.set_data()
+        length = self.np_random.randint(3) + AlgorithmicEnv.min_length
+        self.input_data = self.generate_input_data(length)
+        self.target = self.target_from_input_data(self.input_data)
         return self._get_obs()
 
+    def generate_input_data(self, size):
+        raise NotImplemented
+
+    def target_from_input_data(self, input_data):
+        raise NotImplemented("Subclasses must implement")
+
+    def _move(self, movement):
+        raise NotImplemented
+
+class TapeAlgorithmicEnv(AlgorithmicEnv):
+    """An algorithmic env with a 1-d input tape."""
+    MOVEMENTS = ['left', 'right']
+    READ_HEAD_START = 0
+
+    def _move(self, movement):
+        named = self.MOVEMENTS[movement]
+        self.read_head_position += 1 if named == 'right' else -1
+
+    def _get_obs(self, pos=None):
+        if pos is None:
+            pos = self.read_head_position
+        if pos < 0:
+            return self.base
+        try:
+            return self.input_data[pos]
+        except IndexError:
+            return self.base
+
+    def generate_input_data(self, size):
+        return [self.np_random.randint(self.base) for _ in range(size)]
+
+    def _render_observation(self):
+        x = self.read_head_position
+        x_str = "Observation Tape : "
+        for i in range(-2, self.input_width + 2):
+            if i == x:
+                x_str += colorize(self._get_str_obs(np.array([i])), 'green', highlight=True)
+            else:
+                x_str += self._get_str_obs(np.array([i]))
+        x_str += "\n"
+        return x_str
+
+class GridAlgorithmicEnv(AlgorithmicEnv):
+    """An algorithmic env with a 2-d input grid."""
+    MOVEMENTS = ['left', 'right', 'up', 'down']
+    READ_HEAD_START = (0, 0)
+    def __init__(self, rows, *args, **kwargs):
+        self.rows = rows
+        AlgorithmicEnv.__init__(self, *args, **kwargs)
+
+    def _move(self, movement):
+        named = self.MOVEMENTS[movement]
+        x, y = self.read_head_position
+        if named == 'left':
+            x -= 1
+        elif named == 'right':
+            x += 1
+        elif named == 'up':
+            y -= 1
+        elif named == 'down':
+            y += 1
+        else:
+            raise ValueError("Unrecognized direction: {}".format(named))
+        self.read_head_position = x, y
+
+    def generate_input_data(self, size):
+        return [
+            [self.np_random.randint(self.base) for _ in range(self.rows)]
+            for __ in range(size)
+        ]
+
+    def _get_obs(self, pos=None):
+        if pos is None:
+            pos = self.read_head_position
+        x, y = pos
+        if any(idx < 0 for idx in pos):
+            return self.base
+        try:
+            return self.input_data[x][y]
+        except IndexError:
+            return self.base
+
+    def _render_observation(self):
+        x = self.read_head_position
+        label = "Observation Grid : "
+        x_str = ""
+        for j in range(-1, self.rows+1):
+            if j != -1:
+                x_str += " " * len(label)
+            for i in range(-2, self.input_width + 2):
+                if i == x[0] and j == x[1]:
+                    x_str += colorize(self._get_str_obs((i, j)), 'green', highlight=True)
+                else:
+                    x_str += self._get_str_obs((i, j))
+            x_str += "\n"
+        x_str = label + x_str
+        return x_str
gym/envs/algorithmic/copy.py (deleted in this commit)

@@ -1,22 +0,0 @@
"""
Task is to copy content from the input tape to
the output tape. http://arxiv.org/abs/1511.07275
"""
import numpy as np
from gym.envs.algorithmic import algorithmic_env
from gym.envs.algorithmic.algorithmic_env import ha

class CopyEnv(algorithmic_env.AlgorithmicEnv):
    def __init__(self, base=5):
        algorithmic_env.AlgorithmicEnv.__init__(self,
                                                 inp_dim=1,
                                                 base=base,
                                                 chars=True)
    def set_data(self):
        self.content = {}
        self.target = {}
        for i in range(self.total_len):
            val = self.np_random.randint(self.base)
            self.content[ha(np.array([i]))] = val
            self.target[i] = val
        self.total_reward = self.total_len
14 gym/envs/algorithmic/copy_.py Normal file

@@ -0,0 +1,14 @@
"""
Task is to copy content from the input tape to
the output tape. http://arxiv.org/abs/1511.07275
"""
import numpy as np
from gym.envs.algorithmic import algorithmic_env


class CopyEnv(algorithmic_env.TapeAlgorithmicEnv):
    def __init__(self, base=5, chars=True):
        super(CopyEnv, self).__init__(base=base, chars=chars)

    def target_from_input_data(self, input_data):
        return input_data
gym/envs/algorithmic/duplicated_input.py

@@ -1,26 +1,25 @@
 """
-Task is to return every second character from the input tape.
+Task is to return every nth character from the input tape.
 http://arxiv.org/abs/1511.07275
 """
+from __future__ import division
 import numpy as np
 from gym.envs.algorithmic import algorithmic_env
-from gym.envs.algorithmic.algorithmic_env import ha
 
-class DuplicatedInputEnv(algorithmic_env.AlgorithmicEnv):
+class DuplicatedInputEnv(algorithmic_env.TapeAlgorithmicEnv):
     def __init__(self, duplication=2, base=5):
         self.duplication = duplication
-        algorithmic_env.AlgorithmicEnv.__init__(self,
-                                                 inp_dim=1,
-                                                 base=base,
-                                                 chars=True)
-    def set_data(self):
-        self.content = {}
-        self.target = {}
-        copies = int(self.total_len / self.duplication)
-        for i in range(copies):
-            val = self.np_random.randint(self.base)
-            self.target[i] = val
-            for d in range(self.duplication):
-                self.content[ha(np.array([i * self.duplication + d]))] = val
-        self.total_reward = self.total_len / self.duplication
+        super(DuplicatedInputEnv, self).__init__(base=base, chars=True)
+
+    def generate_input_data(self, size):
+        res = []
+        if size < self.duplication:
+            size = self.duplication
+        for i in range(size//self.duplication):
+            char = self.np_random.randint(self.base)
+            for _ in range(self.duplication):
+                res.append(char)
+        return res
+
+    def target_from_input_data(self, input_data):
+        return [input_data[i] for i in range(0, len(input_data), self.duplication)]
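Illustrative check (not part of the commit) of the two rules introduced above: generated inputs come in runs of `duplication` identical characters, and the target keeps the first character of each run. Only names defined in the diff are used.

from gym.envs.algorithmic.duplicated_input import DuplicatedInputEnv

env = DuplicatedInputEnv(duplication=3, base=5)
tape = env.generate_input_data(6)          # e.g. [2, 2, 2, 4, 4, 4]
assert len(tape) == 6
assert env.target_from_input_data(tape) == tape[::3]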
gym/envs/algorithmic/repeat_copy.py

@@ -1,27 +1,16 @@
 """
-Task is to copy content multiple-times from the input tape to
+Task is to copy content multiple times from the input tape to
 the output tape. http://arxiv.org/abs/1511.07275
 """
 import numpy as np
 from gym.envs.algorithmic import algorithmic_env
-from gym.envs.algorithmic.algorithmic_env import ha
 
-class RepeatCopyEnv(algorithmic_env.AlgorithmicEnv):
+class RepeatCopyEnv(algorithmic_env.TapeAlgorithmicEnv):
+    MIN_REWARD_SHORTFALL_FOR_PROMOTION = -.1
     def __init__(self, base=5):
-        algorithmic_env.AlgorithmicEnv.__init__(self,
-                                                 inp_dim=1,
-                                                 base=base,
-                                                 chars=True)
+        super(RepeatCopyEnv, self).__init__(base=base, chars=True)
         self.last = 50
 
-    def set_data(self):
-        self.content = {}
-        self.target = {}
-        unique = set()
-        for i in range(self.total_len):
-            val = self.np_random.randint(self.base)
-            self.content[ha(np.array([i]))] = val
-            self.target[i] = val
-            self.target[2 * self.total_len - i - 1] = val
-            self.target[2 * self.total_len + i] = val
-        self.total_reward = 3.0 * self.total_len + 0.9
+    def target_from_input_data(self, input_data):
+        return input_data + list(reversed(input_data)) + input_data
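Illustrative check (not part of the commit) of the new target rule above, mirroring the test added later in this commit: the target is the input, then the input reversed, then the input again.

from gym.envs.algorithmic.repeat_copy import RepeatCopyEnv

env = RepeatCopyEnv(base=5)
assert env.target_from_input_data([0, 1, 2]) == [0, 1, 2, 2, 1, 0, 0, 1, 2]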
gym/envs/algorithmic/reverse.py

@@ -5,22 +5,12 @@ http://arxiv.org/abs/1511.07275
 
 import numpy as np
 from gym.envs.algorithmic import algorithmic_env
-from gym.envs.algorithmic.algorithmic_env import ha
 
-class ReverseEnv(algorithmic_env.AlgorithmicEnv):
+class ReverseEnv(algorithmic_env.TapeAlgorithmicEnv):
+    MIN_REWARD_SHORTFALL_FOR_PROMOTION = -.1
     def __init__(self, base=2):
-        algorithmic_env.AlgorithmicEnv.__init__(self,
-                                                 inp_dim=1,
-                                                 base=base,
-                                                 chars=True)
-        algorithmic_env.AlgorithmicEnv.current_length = 1
+        super(ReverseEnv, self).__init__(base=base, chars=True, starting_min_length=1)
         self.last = 50
 
-    def set_data(self):
-        self.content = {}
-        self.target = {}
-        for i in range(self.total_len):
-            val = self.np_random.randint(self.base)
-            self.content[ha(np.array([i]))] = val
-            self.target[self.total_len - i - 1] = val
-        self.total_reward = self.total_len + 0.9
+    def target_from_input_data(self, input_str):
+        return list(reversed(input_str))
gym/envs/algorithmic/reversed_addition.py

@@ -1,27 +1,30 @@
+from __future__ import division
 import numpy as np
 from gym.envs.algorithmic import algorithmic_env
-from gym.envs.algorithmic.algorithmic_env import ha
 
-class ReversedAdditionEnv(algorithmic_env.AlgorithmicEnv):
+class ReversedAdditionEnv(algorithmic_env.GridAlgorithmicEnv):
     def __init__(self, rows=2, base=3):
-        self.rows = rows
-        algorithmic_env.AlgorithmicEnv.__init__(self,
-                                                 inp_dim=2,
-                                                 base=base,
-                                                 chars=False)
-    def set_data(self):
-        self.content = {}
-        self.target = {}
+        super(ReversedAdditionEnv, self).__init__(rows=rows, base=base, chars=False)
+
+    def target_from_input_data(self, input_strings):
         curry = 0
-        for i in range(self.total_len):
-            vals = []
-            for k in range(self.rows):
-                val = self.np_random.randint(self.base)
-                self.content[ha(np.array([i, k]))] = val
-                vals.append(val)
-            total = sum(vals) + curry
-            self.target[i] = total % self.base
-            curry = total / self.base
+        target = []
+        for digits in input_strings:
+            total = sum(digits) + curry
+            target.append(total % self.base)
+            curry = total // self.base
+
         if curry > 0:
-            self.target[self.total_len] = curry
-        self.total_reward = self.total_len
+            target.append(curry)
+        return target
+
+    @property
+    def time_limit(self):
+        # Quirk preserved for the sake of consistency: add the length of the input
+        # rather than the length of the desired output (which may differ if there's
+        # an extra carried digit).
+        # TODO: It seems like this time limit is so strict as to make Addition3-v0
+        # unsolvable, since agents aren't even given enough time steps to look at
+        # all the digits. (The solutions on the scoreboard seem to only work by
+        # save-scumming.)
+        return self.input_width*2 + 4
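Illustrative check (not part of the commit) of the carry logic above, using cases taken from the new tests: each inner list is one column of digits with the least-significant column first, so [[2, 2], [0, 1]] encodes 2 + 5 = 7 in base 3, whose digits read back least-significant first are [1, 2].

from gym.envs.algorithmic.reversed_addition import ReversedAdditionEnv

env = ReversedAdditionEnv(rows=2, base=3)
assert env.target_from_input_data([[2, 2], [0, 1]]) == [1, 2]
assert env.target_from_input_data([[1, 1], [1, 1]]) == [2, 2]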
0 gym/envs/algorithmic/tests/__init__.py Normal file
239 gym/envs/algorithmic/tests/test_algorithmic.py Normal file
@@ -0,0 +1,239 @@
from gym.envs import algorithmic as alg
import unittest

# All concrete subclasses of AlgorithmicEnv
ALL_ENVS = [
    alg.copy_.CopyEnv,
    alg.duplicated_input.DuplicatedInputEnv,
    alg.repeat_copy.RepeatCopyEnv,
    alg.reverse.ReverseEnv,
    alg.reversed_addition.ReversedAdditionEnv,
]
ALL_TAPE_ENVS = [env for env in ALL_ENVS
        if issubclass(env, alg.algorithmic_env.TapeAlgorithmicEnv)]
ALL_GRID_ENVS = [env for env in ALL_ENVS
        if issubclass(env, alg.algorithmic_env.GridAlgorithmicEnv)]

def imprint(env, input_arr):
    """Monkey-patch the given environment so that when reset() is called, the
    input tape/grid will be set to the given data, rather than being randomly
    generated."""
    env.generate_input_data = lambda _: input_arr

class TestAlgorithmicEnvInteractions(unittest.TestCase):
    """Test some generic behaviour not specific to any particular algorithmic
    environment. Movement, allocation of rewards, etc."""
    CANNED_INPUT = [0, 1]
    ENV_KLS = alg.copy_.CopyEnv
    LEFT, RIGHT = ENV_KLS._movement_idx('left'), ENV_KLS._movement_idx('right')
    def setUp(self):
        self.env = self.ENV_KLS(base=2, chars=True)
        imprint(self.env, self.CANNED_INPUT)

    def test_successful_interaction(self):
        obs = self.env.reset()
        self.assertEqual(obs, 0)
        obs, reward, done, _ = self.env.step([self.RIGHT, 1, 0])
        self.assertEqual(obs, 1)
        self.assertGreater(reward, 0)
        self.assertFalse(done)
        obs, reward, done, _ = self.env.step([self.LEFT, 1, 1])
        self.assertTrue(done)
        self.assertGreater(reward, 0)

    def test_bad_output_fail_fast(self):
        obs = self.env.reset()
        obs, reward, done, _ = self.env.step([self.RIGHT, 1, 1])
        self.assertTrue(done)
        self.assertLess(reward, 0)

    def test_levelup(self):
        obs = self.env.reset()
        # Kind of a hack
        alg.algorithmic_env.AlgorithmicEnv.reward_shortfalls = []
        min_length = self.env.min_length
        for i in range(self.env.last):
            obs, reward, done, _ = self.env.step([self.RIGHT, 1, 0])
            self.assertFalse(done)
            obs, reward, done, _ = self.env.step([self.RIGHT, 1, 1])
            self.assertTrue(done)
            self.env.reset()
            if i < self.env.last-1:
                self.assertEqual(len(alg.algorithmic_env.AlgorithmicEnv.reward_shortfalls), i+1)
            else:
                # Should have leveled up on the last iteration
                self.assertEqual(self.env.min_length, min_length+1)
                self.assertEqual(len(alg.algorithmic_env.AlgorithmicEnv.reward_shortfalls), 0)

    def test_walk_off_the_end(self):
        obs = self.env.reset()
        # Walk off the end
        obs, r, done, _ = self.env.step([self.LEFT, 0, 0])
        self.assertEqual(obs, self.env.base)
        self.assertEqual(r, 0)
        self.assertFalse(done)
        # Walk further off track
        obs, r, done, _ = self.env.step([self.LEFT, 0, 0])
        self.assertEqual(obs, self.env.base)
        self.assertFalse(done)
        # Return to the first input character
        obs, r, done, _ = self.env.step([self.RIGHT, 0, 0])
        self.assertEqual(obs, self.env.base)
        self.assertFalse(done)
        obs, r, done, _ = self.env.step([self.RIGHT, 0, 0])
        self.assertEqual(obs, 0)

    def test_grid_naviation(self):
        env = alg.reversed_addition.ReversedAdditionEnv(rows=2, base=6)
        N,S,E,W = [env._movement_idx(named_dir) for named_dir in ['up', 'down', 'right', 'left']]
        # Corresponds to a grid that looks like...
        #       0 1 2
        #       3 4 5
        canned = [ [0, 3], [1, 4], [2, 5] ]
        imprint(env, canned)
        obs = env.reset()
        self.assertEqual(obs, 0)
        navigation = [
            (S, 3), (N, 0), (E, 1), (S, 4), (S, 6), (E, 6), (N, 5), (N, 2), (W, 1)
        ]
        for (movement, expected_obs) in navigation:
            obs, reward, done, _ = env.step([movement, 0, 0])
            self.assertEqual(reward, 0)
            self.assertFalse(done)
            self.assertEqual(obs, expected_obs)

    def test_grid_success(self):
        env = alg.reversed_addition.ReversedAdditionEnv(rows=2, base=3)
        canned = [ [1, 2], [1, 0], [2, 2] ]
        imprint(env, canned)
        obs = env.reset()
        target = [0, 2, 1, 1]
        self.assertEqual(env.target, target)
        self.assertEqual(obs, 1)
        for i, target_digit in enumerate(target):
            obs, reward, done, _ = env.step([0, 1, target_digit])
            self.assertGreater(reward, 0)
            self.assertEqual(done, i==len(target)-1)

    def test_sane_time_limit(self):
        obs = self.env.reset()
        self.assertLess(self.env.time_limit, 100)
        for _ in range(100):
            obs, r, done, _ = self.env.step([self.LEFT, 0, 0])
            if done:
                return
        self.fail("Time limit wasn't enforced")

    def test_rendering(self):
        env = self.env
        obs = env.reset()
        self.assertEqual(env._get_str_obs(), 'A')
        self.assertEqual(env._get_str_obs(1), 'B')
        self.assertEqual(env._get_str_obs(-1), ' ')
        self.assertEqual(env._get_str_obs(2), ' ')
        self.assertEqual(env._get_str_target(0), 'A')
        self.assertEqual(env._get_str_target(1), 'B')
        # Test numerical alphabet rendering
        env = self.ENV_KLS(base=3, chars=False)
        imprint(env, self.CANNED_INPUT)
        env.reset()
        self.assertEqual(env._get_str_obs(), '0')
        self.assertEqual(env._get_str_obs(1), '1')


class TestTargets(unittest.TestCase):
    """Test the rules mapping input strings/grids to target outputs."""
    def test_reverse_target(self):
        input_expected = [
            ([0], [0]),
            ([0, 1], [1, 0]),
            ([1, 1], [1, 1]),
            ([1, 0, 1], [1, 0, 1]),
            ([0, 0, 1, 1], [1, 1, 0, 0]),
        ]
        env = alg.reverse.ReverseEnv()
        for input_arr, expected in input_expected:
            target = env.target_from_input_data(input_arr)
            self.assertEqual(target, expected)

    def test_reversed_addition_target(self):
        env = alg.reversed_addition.ReversedAdditionEnv(base=3)
        input_expected = [
            ([[1,1], [1,1]], [2, 2]),
            ([[2,2], [0,1]], [1, 2]),
            ([[2,1], [1,1], [1,1], [1,0]], [0, 0, 0, 2]),
        ]
        for (input_grid, expected_target) in input_expected:
            self.assertEqual(env.target_from_input_data(input_grid), expected_target)

    def test_reversed_addition_3rows(self):
        env = alg.reversed_addition.ReversedAdditionEnv(base=3, rows=3)
        input_expected = [
            ([[1,1,0],[0,1,1]], [2, 2]),
            ([[1,1,2],[0,1,1]], [1,0,1]),
        ]
        for (input_grid, expected_target) in input_expected:
            self.assertEqual(env.target_from_input_data(input_grid), expected_target)

    def test_copy_target(self):
        env = alg.copy_.CopyEnv()
        self.assertEqual(env.target_from_input_data([0, 1, 2]), [0, 1, 2])

    def test_duplicated_input_target(self):
        env = alg.duplicated_input.DuplicatedInputEnv(duplication=2)
        self.assertEqual(env.target_from_input_data([0, 0, 0, 0, 1, 1]), [0, 0, 1])

    def test_repeat_copy_target(self):
        env = alg.repeat_copy.RepeatCopyEnv()
        self.assertEqual(env.target_from_input_data([0, 1, 2]), [0, 1, 2, 2, 1, 0, 0, 1, 2])

class TestInputGeneration(unittest.TestCase):
    """Test random input generation.
    """
    def test_tape_inputs(self):
        for env_kls in ALL_TAPE_ENVS:
            env = env_kls()
            for size in range(2,5):
                input_tape = env.generate_input_data(size)
                self.assertTrue(all(0<=x<=env.base for x in input_tape),
                        "Invalid input tape from env {}: {}".format(env_kls, input_tape))
                # DuplicatedInput needs to generate inputs with even length,
                # so it may be short one
                self.assertLessEqual(len(input_tape), size)

    def test_grid_inputs(self):
        for env_kls in ALL_GRID_ENVS:
            env = env_kls()
            for size in range(2, 5):
                input_grid = env.generate_input_data(size)
                # Should get "size" sublists, each of length self.rows (not the
                # opposite, as you might expect)
                self.assertEqual(len(input_grid), size)
                self.assertTrue(all(len(col) == env.rows for col in input_grid))
                self.assertTrue(all(0<=x<=env.base for x in input_grid[0]))

    def test_duplicatedinput_inputs(self):
        """The duplicated_input env needs to generate strings with the appropriate
        amount of repetiion."""
        env = alg.duplicated_input.DuplicatedInputEnv(duplication=2)
        input_tape = env.generate_input_data(4)
        self.assertEqual(len(input_tape), 4)
        self.assertEqual(input_tape[0], input_tape[1])
        self.assertEqual(input_tape[2], input_tape[3])
        # If requested input size isn't a multiple of duplication, go lower
        input_tape = env.generate_input_data(3)
        self.assertEqual(len(input_tape), 2)
        self.assertEqual(input_tape[0], input_tape[1])
        # If requested input size is *less than* duplication, go up
        input_tape = env.generate_input_data(1)
        self.assertEqual(len(input_tape), 2)
        self.assertEqual(input_tape[0], input_tape[1])

        env = alg.duplicated_input.DuplicatedInputEnv(duplication=3)
        input_tape = env.generate_input_data(6)
        self.assertEqual(len(input_tape), 6)
        self.assertEqual(input_tape[0], input_tape[1])
        self.assertEqual(input_tape[1], input_tape[2])

if __name__ == '__main__':
    unittest.main()
File diff suppressed because it is too large