2016-04-27 08:00:58 -07:00
|
|
|
import logging
|
2016-05-03 22:27:26 -04:00
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
2016-04-27 08:00:58 -07:00
|
|
|
import numpy as np
|
2016-08-11 14:45:52 -07:00
|
|
|
import weakref
|
2016-04-27 08:00:58 -07:00
|
|
|
|
|
|
|
from gym import error, monitoring
|
2016-08-05 16:30:24 -07:00
|
|
|
from gym.utils import closer, reraise
|
2016-05-27 12:16:35 -07:00
|
|
|
|
|
|
|
# Shared Closer instance: Env.__new__ registers every new environment with
# it, and Env.close() unregisters the environment again.
env_closer = closer.Closer()
|
2016-04-27 08:00:58 -07:00
|
|
|
|
|
|
|
# Env-related abstractions
|
|
|
|
|
|
|
|
class Env(object):
    """The main OpenAI Gym class. It encapsulates an environment with
    arbitrary behind-the-scenes dynamics. An environment can be
    partially or fully observed.

    The main API methods that users of this class need to know are:

        step
        reset
        render
        close
        configure
        seed

    When implementing an environment, override the following methods
    in your subclass:

        _step
        _reset
        _render
        _close
        _configure
        _seed

    And set the following attributes:

        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards

    Note: a default reward range set to [-inf,+inf] already exists. Set it
    if you want a narrower range.

    The methods are accessed publicly as "step", "reset", etc.. The
    non-underscored versions are wrapper methods to which we may add
    functionality over time.
    """

    def __new__(cls, *args, **kwargs):
        """Create the instance and attach the bookkeeping state.

        The constructor arguments are accepted but ignored here.
        """
        # We use __new__ since we want the env author to be able to
        # override __init__ without remembering to call super.
        env = super(Env, cls).__new__(cls)
        env._env_closer_id = env_closer.register(env)
        env._closed = False
        env._configured = False
        env._unwrapped = None

        # Will be automatically set when creating an environment via 'make'
        env.spec = None
        return env

    # Set this in SOME subclasses
    metadata = {'render.modes': []}
    reward_range = (-np.inf, np.inf)

    # Override in SOME subclasses
    def _close(self):
        """Cleanup hook invoked by close(); does nothing by default."""
        pass

    def _configure(self):
        """Configuration hook invoked by configure(); does nothing by default."""
        pass

    # Set these in ALL subclasses
    action_space = None
    observation_space = None

    # Override in ALL subclasses
    def _step(self, action):
        """Implementation hook for step(); must be overridden."""
        raise NotImplementedError

    def _reset(self):
        """Implementation hook for reset(); must be overridden."""
        raise NotImplementedError

    def _render(self, mode='human', close=False):
        """Implementation hook for render().

        A close request is a no-op by default; any actual rendering
        must be provided by the subclass.
        """
        if close:
            return
        raise NotImplementedError

    def _seed(self, seed=None):
        """Implementation hook for seed(); returns an empty list by default."""
        return []

    # Do not override
    _owns_render = True

    @property
    def monitor(self):
        """Lazily creates a monitor instance.

        We do this lazily rather than at environment creation time
        since when the monitor closes, we need to remove the existing
        monitor but also make it easy to start a new one. We could
        still just forcibly create a new monitor instance on old
        monitor close, but that seems less clean.
        """
        if not hasattr(self, '_monitor'):
            self._monitor = monitoring.Monitor(self)
        return self._monitor

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.

        Accepts an action and returns a tuple (observation, reward, done, info).

        Args:
            action (object): the action to apply, supplied by the caller

        Returns:
            observation (object): agent's observation of the current environment
            reward (float) : amount of reward returned after previous action
            done (boolean): whether the episode has ended, in which case further step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)
        """
        self.monitor._before_step(action)
        observation, reward, done, info = self._step(action)

        # The monitor's return value replaces the `done` flag from _step.
        done = self.monitor._after_step(observation, reward, done, info)
        return observation, reward, done, info

    def reset(self):
        """Resets the state of the environment and returns an initial
        observation. Will call 'configure()' if not already called.

        Returns: observation (object): the initial observation of the
            space. (Initial reward is assumed to be 0.)

        Raises:
            error.Error: if the metadata sets 'configure.required' and
                configure() has not been called yet.
        """
        if self.metadata.get('configure.required') and not self._configured:
            raise error.Error("{} requires manually calling 'configure()' before 'reset()'".format(self))
        elif not self._configured:
            self.configure()

        self.monitor._before_reset()
        observation = self._reset()
        self.monitor._after_reset(observation)
        return observation

    def render(self, mode='human', close=False):
        """Renders the environment.

        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.) By convention,
        if mode is:

        - human: render to the current display or terminal and
          return nothing. Usually for human consumption.
        - rgb_array: Return a numpy.ndarray with shape (x, y, 3),
          representing RGB values for an x-by-y pixel image, suitable
          for turning into a video.
        - ansi: Return a string (str) or StringIO.StringIO containing a
          terminal-style text representation. The text can include newlines
          and ANSI escape sequences (e.g. for colors).

        Note:
            Make sure that your class's metadata 'render.modes' key includes
              the list of supported modes. It's recommended to call super()
              in implementations to use the functionality of this method.

        Args:
            mode (str): the mode to render with
            close (bool): close all open renderings

        Raises:
            error.UnsupportedMode: if no render modes are declared, or
                `mode` is not among the declared ones.

        Example:

        class MyEnv(Env):
            metadata = {'render.modes': ['human', 'rgb_array']}

            def render(self, mode='human'):
                if mode == 'rgb_array':
                    return np.array(...) # return RGB frame suitable for video
                elif mode == 'human':
                    ... # pop up a window and render
                else:
                    super(MyEnv, self).render(mode=mode) # just raise an exception
        """
        # A close request skips mode validation entirely.
        if close:
            return self._render(close=close)

        # This code can be useful for calling super() in a subclass.
        modes = self.metadata.get('render.modes', [])
        if not modes:
            raise error.UnsupportedMode('{} does not support rendering (requested mode: {})'.format(self, mode))
        elif mode not in modes:
            raise error.UnsupportedMode('Unsupported rendering mode: {}. (Supported modes for {}: {})'.format(mode, self, modes))

        return self._render(mode=mode, close=close)

    def close(self):
        """Override _close in your subclass to perform any necessary cleanup.

        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        # _closed will be missing if this instance is still
        # initializing.
        if not hasattr(self, '_closed') or self._closed:
            return

        # Automatically close the monitor and any render window.
        # Checking for '_monitor' avoids lazily creating a monitor just
        # to close it.
        if hasattr(self, '_monitor'):
            self.monitor.close()
        if self._owns_render:
            self.render(close=True)

        self._close()
        env_closer.unregister(self._env_closer_id)
        # If an error occurs before this line, it's possible to
        # end up with double close.
        self._closed = True

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).

        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.

        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        return self._seed(seed)

    def configure(self, *args, **kwargs):
        """Provides runtime configuration to the environment.

        This configuration should consist of data that tells your
        environment how to run (such as an address of a remote server,
        or path to your ImageNet data). It should not affect the
        semantics of the environment.

        All arguments are forwarded to `_configure`.
        """
        # Marked as configured before _configure runs, so the flag stays
        # set even if _configure raises.
        self._configured = True

        try:
            self._configure(*args, **kwargs)
        except TypeError:
            # It can be confusing if you have the wrong environment
            # and try calling with unsupported arguments, since your
            # stack trace will only show core.py.
            if self.spec:
                reraise(suffix='(for {})'.format(self.spec.id))
            else:
                raise

    @property
    def unwrapped(self):
        """Completely unwrap this env.

        Notes:
            EXPERIMENTAL: may be removed in a later version of Gym

            This is a dynamic property in order to avoid refcycles.

        Returns:
            gym.Env: The base non-wrapped gym.Env instance
        """
        if self._unwrapped is not None:
            return self._unwrapped
        else:
            return self

    def __del__(self):
        """Close the environment when the instance is finalized."""
        self.close()

    def __str__(self):
        """Return '<ClassName instance>' for this environment."""
        return '<{} instance>'.format(type(self).__name__)
|
|
|
|
|
|
|
|
# Space-related abstractions
|
|
|
|
|
|
|
|
class Space(object):
    """Base class for observation and action spaces.

    Having a common interface lets generic code work against any Env,
    for instance to pick a random action.
    """

    def sample(self, seed=0):
        """Draw a uniformly random element of this space.

        Not implemented in the base class.
        """
        raise NotImplementedError

    def contains(self, x):
        """Tell whether `x` is a valid member of this space (boolean).

        Not implemented in the base class.
        """
        raise NotImplementedError

    def to_jsonable(self, sample_n):
        """Turn a batch of samples from this space into a JSONable data type."""
        # The base class assumes the samples are already JSONable and
        # hands them back untouched.
        return sample_n

    def from_jsonable(self, sample_n):
        """Turn a JSONable data type back into a batch of samples from this space."""
        # Mirror of to_jsonable: the base class performs no conversion.
        return sample_n
|
2016-08-11 14:45:52 -07:00
|
|
|
|
|
|
|
class Wrapper(Env):
    """An Env that holds another env in `self.env` and forwards the
    step / reset / render / close / configure / seed hooks to it.
    """

    # Start with empty metadata so that, by default, a wrapper does not
    # override any key of the wrapped env's metadata.
    metadata = {}

    # Env.close() only issues render(close=True) when this flag is true.
    _owns_render = False

    # Class-level fallback so that self.env is always defined, even if
    # things break early.
    env = None

    def __init__(self, env=None):
        """Store `env` and copy its spaces, reward range, spec and
        unwrapped reference onto this wrapper."""
        self.env = env

        # Layer this wrapper's own metadata on top of a copy of the
        # wrapped env's metadata.
        overrides = self.metadata
        combined = self.env.metadata.copy()
        combined.update(overrides)
        self.metadata = combined

        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_range = self.env.reward_range
        self._spec = self.env.spec
        self._unwrapped = self.env.unwrapped

    def _step(self, action):
        """Forward the step to the wrapped env."""
        return self.env.step(action)

    def _reset(self):
        """Forward the reset to the wrapped env."""
        return self.env.reset()

    def _render(self, mode='human', close=False):
        """Forward the render call to the wrapped env, if there is one."""
        if self.env is not None:
            return self.env.render(mode, close)
        return None

    def _close(self):
        """Close the wrapped env, if there is one."""
        if self.env is not None:
            return self.env.close()
        return None

    def _configure(self, *args, **kwargs):
        """Forward all configuration arguments to the wrapped env."""
        return self.env.configure(*args, **kwargs)

    def _seed(self, seed=None):
        """Forward the seed to the wrapped env."""
        return self.env.seed(seed)

    def __str__(self):
        """Return '<' + wrapper class name + str(wrapped env) + '>'."""
        wrapper_name = type(self).__name__
        return '<{}{}>'.format(wrapper_name, self.env)

    def __repr__(self):
        """Same text as __str__."""
        return str(self)

    @property
    def spec(self):
        """The cached spec; fetched from the wrapped env while still None."""
        if self._spec is None:
            self._spec = self.env.spec
        return self._spec

    @spec.setter
    def spec(self, spec):
        # Won't have an env attr while in the __new__ from gym.Env
        wrapped = self.env
        if wrapped is not None:
            wrapped.spec = spec
        self._spec = spec
|
2016-09-04 00:38:03 -07:00
|
|
|
|
|
|
|
class ObservationWrapper(Wrapper):
    """Wrapper that transforms every observation coming out of the
    wrapped env, both on reset and on step.

    Subclasses implement `_observation`; `observation` is the public
    entry point that delegates to it.
    """

    def _reset(self):
        """Reset the wrapped env and return the transformed observation."""
        observation = self.env.reset()
        # Go through the public hook, exactly as _step does, so that a
        # subclass overriding `observation` is honoured on reset too.
        return self.observation(observation)

    def _step(self, action):
        """Step the wrapped env and transform only the observation;
        reward, done and info pass through unchanged."""
        observation, reward, done, info = self.env.step(action)
        return self.observation(observation), reward, done, info

    def observation(self, observation):
        """Public hook: transform one observation via `_observation`."""
        return self._observation(observation)

    def _observation(self, observation):
        """Implementation hook for the transformation; must be overridden."""
        raise NotImplementedError
|
|
|
|
|
|
|
|
class RewardWrapper(Wrapper):
    """Wrapper that transforms the reward returned by each step.

    Subclasses implement `_reward`; `reward` is the public entry point
    that delegates to it.
    """

    def _step(self, action):
        """Step the wrapped env and transform only the reward."""
        obs, raw_reward, done, info = self.env.step(action)
        shaped_reward = self.reward(raw_reward)
        return obs, shaped_reward, done, info

    def reward(self, reward):
        """Public hook: transform one reward via `_reward`."""
        return self._reward(reward)

    def _reward(self, reward):
        """Implementation hook for the transformation; must be overridden."""
        raise NotImplementedError
|
|
|
|
|
|
|
|
class ActionWrapper(Wrapper):
    """Wrapper that transforms each action before it reaches the
    wrapped env.

    Subclasses implement `_action` and `_reverse_action`; `action` and
    `reverse_action` are the public entry points that delegate to them.
    """

    def _step(self, action):
        """Transform the action, then step the wrapped env with it."""
        return self.env.step(self.action(action))

    def action(self, action):
        """Public hook: transform one action via `_action`."""
        return self._action(action)

    def _action(self, action):
        """Implementation hook for the transformation; must be overridden."""
        raise NotImplementedError

    def reverse_action(self, action):
        """Public hook that delegates to `_reverse_action`."""
        return self._reverse_action(action)

    def _reverse_action(self, action):
        """Implementation hook for reverse_action; must be overridden."""
        raise NotImplementedError
|