2016-04-27 08:00:58 -07:00
from gym import error
try :
import pachi_py
except ImportError as e :
# The dependency group [pachi] should match the name is setup.py.
2016-04-28 14:01:24 +02:00
raise error . DependencyNotInstalled ( ' {} . (HINT: you may need to install the Go dependencies via " pip install gym[pachi] " .) ' . format ( e ) )
2016-04-27 08:00:58 -07:00
import numpy as np
import gym
from gym import spaces
import StringIO
import sys
# The coordinate representation of Pachi (and pachi_py) is defined on a board
# with extra rows and columns on the margin of the board, so positions on the board
# are not numbers in [0, board_size**2) as one would expect. For this Go env, we instead
# use an action representation that does fall in this more natural range.
def _coord_to_action ( board , c ) :
''' Converts Pachi coordinates to actions '''
if c == pachi_py . PASS_COORD : return board . size * * 2 # pass
if c == pachi_py . RESIGN_COORD : return board . size * * 2 + 1 # resign
i , j = board . coord_to_ij ( c )
return i * board . size + j
def _action_to_coord ( board , a ) :
''' Converts actions to Pachi coordinates '''
if a == board . size * * 2 : return pachi_py . PASS_COORD
if a == board . size * * 2 + 1 : return pachi_py . RESIGN_COORD
return board . ij_to_coord ( a / / board . size , a % board . size )
def str_to_action ( board , s ) :
return _coord_to_action ( board , board . str_to_coord ( s ) )
class GoState ( object ) :
'''
Go game state . Consists of a current player and a board .
Actions are exposed as integers in [ 0 , num_actions ) , which is different
from Pachi ' s internal " coord_t " encoding.
'''
def __init__ ( self , board , color ) :
'''
Args :
board : current board
color : color of current player
'''
assert color in [ pachi_py . BLACK , pachi_py . WHITE ] , ' Invalid player color '
self . board , self . color = board , color
def act ( self , action ) :
'''
Executes an action for the current player
Returns :
a new GoState with the new board and the player switched
'''
return GoState (
self . board . play ( _action_to_coord ( self . board , action ) , self . color ) ,
pachi_py . stone_other ( self . color ) )
def __repr__ ( self ) :
return ' To play: {} \n {} ' . format ( pachi_py . color_to_str ( self . color ) , repr ( self . board ) )
### Adversary policies ###
def random_policy ( curr_state , prev_state , prev_action ) :
b = curr_state . board
legal_coords = b . get_legal_coords ( curr_state . color )
return _coord_to_action ( b , np . random . choice ( legal_coords ) )
def make_pachi_policy ( board , engine_type = ' uct ' , threads = 1 , pachi_timestr = ' ' ) :
engine = pachi_py . PyPachiEngine ( board , engine_type , ' threads= %d ' % threads )
def pachi_policy ( curr_state , prev_state , prev_action ) :
if prev_state is not None :
assert engine . curr_board == prev_state . board , ' Engine internal board is inconsistent with provided board. The Pachi engine must be called consistently as the game progresses. '
prev_coord = _action_to_coord ( prev_state . board , prev_action )
engine . notify ( prev_coord , prev_state . color )
engine . curr_board . play_inplace ( prev_coord , prev_state . color )
out_coord = engine . genmove ( curr_state . color , pachi_timestr )
out_action = _coord_to_action ( curr_state . board , out_coord )
engine . curr_board . play_inplace ( out_coord , curr_state . color )
return out_action
return pachi_policy
def _play ( black_policy_fn , white_policy_fn , board_size = 19 ) :
'''
Samples a trajectory for two player policies .
Args :
black_policy_fn , white_policy_fn : functions that maps a GoState to a move coord ( int )
'''
moves = [ ]
prev_state , prev_action = None , None
2016-04-28 21:28:58 +02:00
curr_state = GoState ( pachi_py . CreateBoard ( board_size ) , BLACK )
2016-04-27 08:00:58 -07:00
while not curr_state . board . is_terminal :
a = ( black_policy_fn if curr_state . color == BLACK else white_policy_fn ) ( curr_state , prev_state , prev_action )
next_state = curr_state . act ( a )
moves . append ( ( curr_state , a , next_state ) )
prev_state , prev_action = curr_state , a
curr_state = next_state
return moves
class GoEnv ( gym . Env ) :
'''
Go environment . Play against a fixed opponent .
'''
metadata = { " render.modes " : [ " human " , " ansi " ] }
def __init__ ( self , player_color , opponent , observation_type , illegal_move_mode , board_size ) :
'''
Args :
player_color : Stone color for the agent . Either ' black ' or ' white '
opponent : An opponent policy
observation_type : State encoding
illegal_move_mode : What to do when the agent makes an illegal move . Choices : ' raise ' or ' lose '
'''
assert isinstance ( board_size , int ) and board_size > = 1 , ' Invalid board size: {} ' . format ( board_size )
self . board_size = board_size
colormap = {
' black ' : pachi_py . BLACK ,
' white ' : pachi_py . WHITE ,
}
try :
self . player_color = colormap [ player_color ]
except KeyError :
raise error . Error ( " player_color must be ' black ' or ' white ' , not {} " . format ( player_color ) )
self . opponent_policy = None
self . opponent = opponent
assert observation_type in [ ' image3c ' ]
self . observation_type = observation_type
assert illegal_move_mode in [ ' lose ' , ' raise ' ]
self . illegal_move_mode = illegal_move_mode
# One action for each board position, pass, and resign
self . action_space = spaces . Discrete ( self . board_size * * 2 + 2 )
if self . observation_type == ' image3c ' :
shape = pachi_py . CreateBoard ( self . board_size ) . encode ( ) . shape
self . observation_space = spaces . Box ( np . zeros ( shape ) , np . ones ( shape ) )
else :
raise error . Error ( ' Unsupported observation type: {} ' . format ( self . observation_type ) )
self . reset ( )
def _reset ( self ) :
self . state = GoState ( pachi_py . CreateBoard ( self . board_size ) , pachi_py . BLACK )
# (re-initialize) the opponent
# necessary because a pachi engine is attached to a game via internal data in a board
# so with a fresh game, we need a fresh engine
self . _reset_opponent ( self . state . board )
# Let the opponent play if it's not the agent's turn
if self . state . color != self . player_color :
self . state = self . _exec_opponent_play ( self . state , None , None )
assert self . state . color == self . player_color
self . done = self . state . board . is_terminal
return self . state . board . encode ( )
def _render ( self , mode = " human " , close = False ) :
if close :
return
outfile = StringIO . StringIO ( ) if mode == ' ansi ' else sys . stdout
outfile . write ( repr ( self . state ) + ' \n ' )
return outfile
def _step ( self , action ) :
assert self . state . color == self . player_color
# If already terminal, then don't do anything
if self . done :
return self . state . board . encode ( ) , 0. , True , { ' state ' : self . state }
# Play
prev_state = self . state
try :
self . state = self . state . act ( action )
except pachi_py . IllegalMove :
if self . illegal_move_mode == ' raise ' :
raise
elif self . illegal_move_mode == ' lose ' :
# Automatic loss on illegal move
self . done = True
return self . state . board . encode ( ) , - 1. , True , { ' state ' : self . state }
else :
raise error . Error ( ' Unsupported illegal move action: {} ' . format ( self . illegal_move_mode ) )
# Opponent play
if not self . state . board . is_terminal :
2016-04-28 20:30:50 -04:00
self . state , self . opponent_resigned = self . _exec_opponent_play ( self . state , prev_state , action )
2016-04-27 08:00:58 -07:00
# After opponent play, we should be back to the original color
assert self . state . color == self . player_color
# Reward: 0 if nonterminal, 1 if won, -1 if lost
if self . state . board . is_terminal :
self . done = True
white_wins = self . state . board . official_score > 0
2016-04-28 20:30:50 -04:00
player_wins = ( white_wins and self . player_color == pachi_py . WHITE ) or ( not white_wins and self . player_color == pachi_py . BLACK )
reward = 1. if ( self . opponent_resigned or player_wins ) else - 1.
2016-04-27 08:00:58 -07:00
else :
self . done = False
reward = 0.
return self . state . board . encode ( ) , reward , self . done , { ' state ' : self . state }
def _exec_opponent_play ( self , curr_state , prev_state , prev_action ) :
assert curr_state . color != self . player_color
opponent_action = self . opponent_policy ( curr_state , prev_state , prev_action )
2016-04-28 20:30:50 -04:00
opponent_resigned = opponent_action == self . board_size * * 2 + 1
return curr_state . act ( opponent_action ) , opponent_resigned
2016-04-27 08:00:58 -07:00
@property
def _state ( self ) :
return self . state
def _reset_opponent ( self , board ) :
if self . opponent == ' random ' :
self . opponent_policy = random_policy
elif self . opponent == ' pachi:uct:_2400 ' :
self . opponent_policy = make_pachi_policy ( board = board , engine_type = ' uct ' , pachi_timestr = ' _2400 ' ) # TODO: strength as argument
else :
raise error . Error ( ' Unrecognized opponent policy {} ' . format ( self . opponent ) )