mirror of
https://github.com/Farama-Foundation/Gymnasium.git
synced 2025-08-01 22:11:25 +00:00
234 lines
8.8 KiB
Python
234 lines
8.8 KiB
Python
from gym import error
|
|
try:
|
|
import pachi_py
|
|
except ImportError as e:
|
|
# The dependency group [pachi] should match the name in setup.py.
|
|
raise error.DependencyNotInstalled('{}. (HINT: you may need to install the Go dependencies via "pip install gym[pachi].)"'.format(e))
|
|
|
|
import numpy as np
|
|
import gym
|
|
from gym import spaces
|
|
import StringIO
|
|
import sys
|
|
|
|
|
|
# The coordinate representation of Pachi (and pachi_py) is defined on a board
|
|
# with extra rows and columns on the margin of the board, so positions on the board
|
|
# are not numbers in [0, board_size**2) as one would expect. For this Go env, we instead
|
|
# use an action representation that does fall in this more natural range.
|
|
|
|
def _coord_to_action(board, c):
    '''
    Translates a Pachi coordinate into this env's integer action.

    The two special coordinates map just past the board positions:
    pass becomes board.size**2 and resign becomes board.size**2 + 1.
    Every other coordinate is turned into (i, j) by the board and
    flattened as i*board.size + j.
    '''
    if c == pachi_py.PASS_COORD:
        # pass
        return board.size**2
    if c == pachi_py.RESIGN_COORD:
        # resign
        return board.size**2 + 1

    row, col = board.coord_to_ij(c)
    return row * board.size + col
|
|
|
|
|
|
def _action_to_coord(board, a):
    '''
    Translates this env's integer action into a Pachi coordinate.

    board.size**2 is the pass action and board.size**2 + 1 is the resign
    action; any other value is split into a (row, column) pair using
    board.size and handed to board.ij_to_coord.
    '''
    if a == board.size**2:
        return pachi_py.PASS_COORD
    if a == board.size**2 + 1:
        return pachi_py.RESIGN_COORD

    row = a // board.size
    col = a % board.size
    return board.ij_to_coord(row, col)
|
|
|
|
def str_to_action(board, s):
    '''
    Converts a move string into an action.

    The string is first parsed by board.str_to_coord and the resulting
    Pachi coordinate is then mapped through _coord_to_action.
    '''
    coord = board.str_to_coord(s)
    return _coord_to_action(board, coord)
|
|
|
|
class GoState(object):
    '''
    Snapshot of a Go game: a board plus the color whose turn it is.

    Actions taken on a state are integers in [0, num_actions), which is not
    the same as Pachi's internal "coord_t" encoding.
    '''

    def __init__(self, board, color):
        '''
        Args:
            board: current board
            color: color of the player to move (pachi_py.BLACK or pachi_py.WHITE)
        '''
        assert color in (pachi_py.BLACK, pachi_py.WHITE), 'Invalid player color'
        self.board = board
        self.color = color

    def act(self, action):
        '''
        Plays `action` for the player to move.

        This state is left as-is; the board returned by board.play is
        wrapped in a fresh state.

        Returns:
            a new GoState holding the resulting board, with the other
            player to move
        '''
        coord = _action_to_coord(self.board, action)
        # Play first: if the move is rejected, nothing else should run.
        next_board = self.board.play(coord, self.color)
        next_color = pachi_py.stone_other(self.color)
        return GoState(next_board, next_color)

    def __repr__(self):
        to_play = pachi_py.color_to_str(self.color)
        return 'To play: {}\n{}'.format(to_play, repr(self.board))
|
|
|
|
|
|
### Adversary policies ###
|
|
def random_policy(curr_state, prev_state, prev_action):
    '''
    Opponent policy that picks uniformly among the legal coordinates
    reported by the board for the player to move.

    prev_state and prev_action are not used; they are accepted so this
    function has the same call signature as the other opponent policies.

    Returns:
        the chosen move as an integer action
    '''
    board = curr_state.board
    candidates = board.get_legal_coords(curr_state.color)
    chosen_coord = np.random.choice(candidates)
    return _coord_to_action(board, chosen_coord)
|
|
|
|
def make_pachi_policy(board, engine_type='uct', threads=1, pachi_timestr=''):
    '''
    Builds an opponent policy driven by a Pachi engine created for `board`.

    Args:
        board: board the engine is constructed with
        engine_type: engine name passed to pachi_py.PyPachiEngine
        threads: thread count, passed to the engine as 'threads=<n>'
        pachi_timestr: time-setting string forwarded to engine.genmove

    Returns:
        a function (curr_state, prev_state, prev_action) -> action
    '''
    engine_args = 'threads=%d' % threads
    engine = pachi_py.PyPachiEngine(board, engine_type, engine_args)

    def pachi_policy(curr_state, prev_state, prev_action):
        # When a previous move is supplied, replay it into the engine so its
        # internal board catches up before a reply is generated.
        if prev_state is not None:
            assert engine.curr_board == prev_state.board, 'Engine internal board is inconsistent with provided board. The Pachi engine must be called consistently as the game progresses.'
            last_coord = _action_to_coord(prev_state.board, prev_action)
            engine.notify(last_coord, prev_state.color)
            engine.curr_board.play_inplace(last_coord, prev_state.color)

        # Ask the engine for its move and convert it to an action before
        # recording the move on the engine's own board.
        reply_coord = engine.genmove(curr_state.color, pachi_timestr)
        reply_action = _coord_to_action(curr_state.board, reply_coord)
        engine.curr_board.play_inplace(reply_coord, curr_state.color)
        return reply_action

    return pachi_policy
|
|
|
|
|
|
def _play(black_policy_fn, white_policy_fn, board_size=19):
    '''
    Samples a trajectory for two player policies.

    Starts from an empty board with black to move and alternates between
    the two policies until the board reports a terminal position.

    Args:
        black_policy_fn, white_policy_fn: functions called as
            fn(curr_state, prev_state, prev_action) that return the
            action (int) to play from curr_state
        board_size: side length of the board to create

    Returns:
        a list of (state, action, next_state) tuples, one per move played
    '''
    moves = []

    prev_state, prev_action = None, None
    # CreateBoard and BLACK live in pachi_py; this module only does
    # `import pachi_py`, so the names must be qualified.
    curr_state = GoState(pachi_py.CreateBoard(board_size), pachi_py.BLACK)

    while not curr_state.board.is_terminal:
        if curr_state.color == pachi_py.BLACK:
            policy_fn = black_policy_fn
        else:
            policy_fn = white_policy_fn
        a = policy_fn(curr_state, prev_state, prev_action)
        next_state = curr_state.act(a)
        moves.append((curr_state, a, next_state))

        prev_state, prev_action = curr_state, a
        curr_state = next_state

    return moves
|
|
|
|
|
|
class GoEnv(gym.Env):
    '''
    Go environment. Play against a fixed opponent.

    The agent plays one color; after each agent move the opponent policy
    replies, so every observation is taken with the agent to move (or at
    the end of the game).
    '''
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, player_color, opponent, observation_type, illegal_move_mode, board_size):
        '''
        Args:
            player_color: Stone color for the agent. Either 'black' or 'white'
            opponent: An opponent policy. Choices: 'random' or 'pachi:uct:_2400'
            observation_type: State encoding. Only 'image3c' is supported
            illegal_move_mode: What to do when the agent makes an illegal move. Choices: 'raise' or 'lose'
            board_size: Side length of the board (int >= 1)

        Raises:
            error.Error: if player_color is not 'black' or 'white'
        '''
        assert isinstance(board_size, int) and board_size >= 1, 'Invalid board size: {}'.format(board_size)
        self.board_size = board_size

        colormap = {
            'black': pachi_py.BLACK,
            'white': pachi_py.WHITE,
        }
        try:
            self.player_color = colormap[player_color]
        except KeyError:
            raise error.Error("player_color must be 'black' or 'white', not {}".format(player_color))

        # The actual policy function is (re)built in _reset_opponent on every reset.
        self.opponent_policy = None
        self.opponent = opponent

        assert observation_type in ['image3c']
        self.observation_type = observation_type

        assert illegal_move_mode in ['lose', 'raise']
        self.illegal_move_mode = illegal_move_mode

        # One action for each board position, pass, and resign
        self.action_space = spaces.Discrete(self.board_size**2 + 2)

        if self.observation_type == 'image3c':
            # The observation shape is whatever an encoded empty board looks like.
            shape = pachi_py.CreateBoard(self.board_size).encode().shape
            self.observation_space = spaces.Box(np.zeros(shape), np.ones(shape))
        else:
            raise error.Error('Unsupported observation type: {}'.format(self.observation_type))

        self.reset()

    def _reset(self):
        '''
        Starts a new game on an empty board with black to move.

        Returns:
            the encoded board, taken after the opponent's first move when
            the agent does not play black
        '''
        self.state = GoState(pachi_py.CreateBoard(self.board_size), pachi_py.BLACK)

        # (re-initialize) the opponent
        # necessary because a pachi engine is attached to a game via internal data in a board
        # so with a fresh game, we need a fresh engine
        self._reset_opponent(self.state.board)

        # Let the opponent play if it's not the agent's turn
        if self.state.color != self.player_color:
            self.state = self._exec_opponent_play(self.state, None, None)
        assert self.state.color == self.player_color

        self.done = self.state.board.is_terminal
        return self.state.board.encode()

    def _render(self, mode="human", close=False):
        '''
        Writes repr(self.state) followed by a newline to an in-memory
        buffer ('ansi' mode) or to sys.stdout (any other mode), and
        returns the stream written to. Does nothing when close is true.
        '''
        if close:
            return
        outfile = StringIO.StringIO() if mode == 'ansi' else sys.stdout
        outfile.write(repr(self.state) + '\n')
        return outfile

    def _step(self, action):
        '''
        Plays the agent's action, then lets the opponent reply.

        Args:
            action: integer action for the agent

        Returns:
            (observation, reward, done, info), where observation is the
            encoded board, info is {'state': <current GoState>} and reward
            is 0 while the game is running, 1 if the agent won, -1 if the
            agent lost (including an illegal move in 'lose' mode) and 0 for
            a game that ends with a score of exactly zero.

        Raises:
            pachi_py.IllegalMove: on an illegal move when illegal_move_mode is 'raise'
        '''
        assert self.state.color == self.player_color

        # If already terminal, then don't do anything
        if self.done:
            return self.state.board.encode(), 0., True, {'state': self.state}

        # Play
        prev_state = self.state
        try:
            self.state = self.state.act(action)
        except pachi_py.IllegalMove:
            if self.illegal_move_mode == 'raise':
                raise
            elif self.illegal_move_mode == 'lose':
                # Automatic loss on illegal move
                self.done = True
                return self.state.board.encode(), -1., True, {'state': self.state}
            else:
                raise error.Error('Unsupported illegal move action: {}'.format(self.illegal_move_mode))

        # Opponent play
        if not self.state.board.is_terminal:
            self.state = self._exec_opponent_play(self.state, prev_state, action)
            # After opponent play, we should be back to the original color
            assert self.state.color == self.player_color

        # Reward: 0 if nonterminal, 1 if won, -1 if lost
        if self.state.board.is_terminal:
            self.done = True
            # A positive official score is treated as a white win; by
            # symmetry a negative one is a black win. Both colors must be
            # checked, otherwise an agent playing black could never be
            # rewarded for winning.
            score = self.state.board.official_score
            white_wins = score > 0
            black_wins = score < 0
            player_wins = (
                (white_wins and self.player_color == pachi_py.WHITE)
                or (black_wins and self.player_color == pachi_py.BLACK)
            )
            if player_wins:
                reward = 1.
            elif white_wins or black_wins:
                reward = -1.
            else:
                # Score of exactly zero: nobody won.
                reward = 0.
        else:
            self.done = False
            reward = 0.
        return self.state.board.encode(), reward, self.done, {'state': self.state}

    def _exec_opponent_play(self, curr_state, prev_state, prev_action):
        '''
        Asks the opponent policy for a move in curr_state and returns the
        state that results from playing it.
        '''
        assert curr_state.color != self.player_color
        opponent_action = self.opponent_policy(curr_state, prev_state, prev_action)
        return curr_state.act(opponent_action)

    @property
    def _state(self):
        '''Current GoState of the game.'''
        return self.state

    def _reset_opponent(self, board):
        '''
        Installs the opponent policy selected by self.opponent.

        Raises:
            error.Error: if self.opponent is not a recognized policy name
        '''
        if self.opponent == 'random':
            self.opponent_policy = random_policy
        elif self.opponent == 'pachi:uct:_2400':
            self.opponent_policy = make_pachi_policy(board=board, engine_type='uct', pachi_timestr='_2400') # TODO: strength as argument
        else:
            raise error.Error('Unrecognized opponent policy {}'.format(self.opponent))
|