from __future__ import division

import logging

import numpy as np

from gym import envs

logger = logging.getLogger(__name__)


def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
    scores = {}
    solves = {}
    start_times = []
    end_times = []
    elapsed_times = []

    # N.B. for each env_id, our benchmark_results will have a list of scores,
    # solves, and times corresponding to the different tasks for that env_id.
    # If we don't have enough trials, we zero out the score.
    # TODO could do smarter matching of results to trials if we have extras
    # TODO for now, baked-in assumption that the number of trials is the
    # same for all tasks involving a particular env.
    for env_id in benchmark.env_ids:
        task_list = benchmark.task_specs(env_id)
        num_trials = task_list[0].trials
        benchmark_results = env_id_to_benchmark_results.get(env_id, [])
        for trial in range(num_trials):
            if trial < len(benchmark_results):
                # Okay, process this benchmark result against this trial
                benchmark_result = benchmark_results[trial]

                env_scores = scores.setdefault(env_id, [])
                env_scores.append(benchmark_result['scores'])

                # Note: solves is a list of lists - for each task for this env,
                # does each episode solve that task. We consider the env solved
                # if every episode for every task is individually solved.
                solved = solves.setdefault(env_id, True)
                solves[env_id] = solved and np.all(benchmark_result['solves'])

                # These timestamps are a list of the first/last valid timestamp
                # for each task involving this env.
                start_times.append(benchmark_result['initial_reset_timestamp'])
                end_times.append(max(benchmark_result['timestamps']))
                elapsed_times.extend(benchmark_result['elapsed_times'])
            else:
                # No matching benchmark result for this trial
                # TODOJT bug?
                env_scores = scores.setdefault(env_id, [])
                env_scores.append([benchmark.scorer.null_score for _ in task_list])
                solves[env_id] = False

    score = benchmark.score_benchmark(scores)
    num_envs_solved = len([s for s in solves.values() if s])
    start_to_finish_seconds = max(end_times) - min(start_times) if end_times and start_times else 0.0
    summed_task_wall_time = np.sum([end - start for end, start in zip(end_times, start_times)])
    summed_training_seconds = np.sum(elapsed_times)

    return dict(
        score=score,
        num_envs_solved=num_envs_solved,
        start_to_finish_seconds=start_to_finish_seconds,
        summed_task_wall_time=summed_task_wall_time,
        summed_training_seconds=summed_training_seconds,
    )
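
# The returned summary is a plain dict; a hypothetical result might look like:
#   {'score': 0.42,
#    'num_envs_solved': 1,
#    'start_to_finish_seconds': 120.0,
#    'summed_task_wall_time': 200.0,
#    'summed_training_seconds': 180.0}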


class ClipTo01ThenAverage(object):
    """Benchmark scoring rule.

    For each task, we take the last num_episodes (default: 100) evaluation
    episodes before either the max_seconds or max_timesteps limit, whichever
    is earlier. If there are not num_episodes evaluations, we fill in the rest
    with scores of reward_floor.

    For each valid evaluation episode, we clip the reward to be between the
    reward_floor and reward_ceiling for that task. The score for the task is
    the average across all episodes.

    The benchmark score is the average of all task scores.
    """

    def __init__(self, num_episodes=100):
        self.num_episodes = num_episodes

    @property
    def null_score(self):
        """
        This is used to compute benchmark scores when we are missing an evaluation
        """
        return 0.0

    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)
            if len(source_indexes) == 0:
                continue
            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

        #### 1. Select out which indexes are for evaluation and which are for training

        (t_idx,) = np.where([t == 't' for t in episode_types])  # training episodes
        (e_idx,) = np.where([t == 'e' for t in episode_types])  # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes are marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx,) = np.where([True for t in episode_types])
            (e_idx,) = np.where([True for t in episode_types])

        #### 2. Grab the data corresponding to each of evaluation/training

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]
        training_durations = np.array(durations)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]
        evaluation_durations = np.array(durations)[e_idx]

        #### 3. Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)

        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(training_durations)
2016-09-23 01:04:26 -07:00
scores = [ ]
solves = [ ]
rewards = [ ]
2017-02-13 02:03:26 -08:00
lengths = [ ]
2016-09-23 01:04:26 -07:00
_timestamps = [ ]
2016-10-27 18:29:28 -07:00
elapsed_times = [ ]
2016-09-23 01:04:26 -07:00

        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # This looks a little funny, but we want the first idx
                # greater than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])

            if np.isfinite(cutoff_idx):
                # Cutoff index in the original list of episodes (i.e. before
                # filtering into training/evaluation)
                orig_cutoff_idx = t_idx[cutoff_idx]
                (allowed_e_idx,) = np.where(e_idx < orig_cutoff_idx)  # restrict to earlier episodes
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]
            allowed_episode_lengths = np.array(episode_lengths)[allowed_e_idx]
            length = allowed_episode_lengths[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes - len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])
                length = np.concatenate([length, [0] * extra])

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling
            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)
            # Take the mean rescaled score
            score = np.mean(clipped)

            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)
            # Record the list of lengths
            lengths.append(length)

            if len(allowed_e_idx) > 0:
                if not np.isfinite(cutoff_idx):
                    cutoff_idx = len(elapsed_seconds) - 1
                last_t_idx = t_idx[cutoff_idx]
                # timestamps is full length
                last_timestamp = timestamps[last_t_idx]
                # elapsed_seconds contains only training episodes
                elapsed_time = elapsed_seconds[cutoff_idx]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'lengths': lengths,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }

    def score_benchmark(self, benchmark, episode_scores):
        all_scores = []
        for env_id, scores in episode_scores.items():
            all_scores += scores
        return np.mean(all_scores)
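
# A scorer is attached to a benchmark when the benchmark is registered. A
# rough sketch of how that wiring typically looks (the benchmark id and task
# spec below are hypothetical):
#   register_benchmark(
#       id='MyBenchmark-v0',
#       scorer=ClipTo01ThenAverage(num_episodes=100),
#       tasks=[{'env_id': 'CartPole-v0', 'trials': 2, 'max_timesteps': 10000}])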


def _compute_episode_durations(initial_reset_timestamps, data_sources, timestamps):
    # We'd like to compute the actual time taken by each episode. This should
    # be as simple as subtracting adjoining timestamps. However, all the
    # monitor timestamps are mixed together from multiple sources, so we do
    # some munging to separate them back out by source: data_sources is an
    # array of ints, the same size as timestamps, that maps each timestamp
    # back to its original source; initial_reset_timestamps is an array with
    # the initial timestamp for each source file.
    # TODO if we don't merge monitor files together at a higher level this
    # logic can be a lot simpler
    durations = np.zeros(len(timestamps))
    data_sources = np.array(data_sources)
    for source, initial_ts in enumerate(initial_reset_timestamps):
        (source_indexes,) = np.where(data_sources == source)
        if len(source_indexes) == 0:
            continue
        # Once we know the indexes corresponding to a particular
        # source (i.e. worker thread), we can just subtract
        # adjoining values
        durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
        durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]
    return durations
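
# A minimal worked example (hypothetical numbers): two workers whose episodes
# are interleaved in the merged monitor stream. Both reset at t=100; worker 0
# finishes episodes at t=103 and t=109, worker 1 at t=104 and t=110:
#   _compute_episode_durations(
#       initial_reset_timestamps=[100.0, 100.0],
#       data_sources=[0, 1, 0, 1],
#       timestamps=np.array([103.0, 104.0, 109.0, 110.0]))
#   # -> array([3., 4., 6., 6.])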


def _find_cutoffs_for_task(task, elapsed_timesteps, elapsed_seconds):
    # Apply the max_timesteps and max_seconds cutoffs. Return np.inf if no
    # cutoff is necessary.
    cutoff_idx = np.inf
    if task.max_timesteps:
        # This looks a little funny, but we want the first idx greater
        # than the cutoff
        (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
        if len(timestep_cutoff) > 0:
            cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
    if task.max_seconds:
        (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
        if len(seconds_cutoff) > 0:
            cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
    return cutoff_idx
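
# Sketch of the cutoff logic, using a namedtuple to stand in for a real task
# spec (hypothetical numbers):
#   import collections
#   FakeTask = collections.namedtuple('FakeTask', ['max_timesteps', 'max_seconds'])
#   _find_cutoffs_for_task(FakeTask(3, None), np.array([1, 2, 4, 8]), np.array([]))
#   # -> 2, the first episode index past 3 cumulative timesteps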


class BenchmarkScoringRule(object):
    """Benchmark scoring rule class.

    Takes care of munging the monitor files to identify which episodes for
    each task appear before the max_seconds or max_timesteps limit, whichever
    is earlier.

    It passes the rewards for those episodes to the score_and_solved_func
    callback given in __init__.

    The benchmark score is the average of all task scores.
    """

    def __init__(self, score_and_solved_func):
        self.score_and_solved_func = score_and_solved_func

    @property
    def null_score(self):
        return 0.0

    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        timestamps = np.array(timestamps)
        durations = _compute_episode_durations(initial_reset_timestamps, data_sources, timestamps)

        #### Grab the data corresponding to each of evaluation/training

        lengths = np.array(episode_lengths)
        rewards = np.array(episode_rewards)

        #### Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(lengths)

        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(durations)

        # List of scores for each task
        scores = []
        # List of lists of solved episodes for each task
        solves = []
        # List of lists of episode rewards for each task
        rewards = []
        # List of lists of relevant episode lengths for each task
        cutoff_lengths = []
        _timestamps = []
        elapsed_times = []

        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = _find_cutoffs_for_task(task, elapsed_timesteps, elapsed_seconds)
            if not np.isfinite(cutoff_idx):
                # All episodes are fair game
                cutoff_idx = len(lengths)

            reward = np.array(episode_rewards)[:cutoff_idx]
            score, solved = self.score_and_solved_func(task, reward, elapsed_seconds[:cutoff_idx])

            scores.append(score)
            solves.append(solved)
            rewards.append(reward)
            cutoff_lengths.append(lengths[:cutoff_idx])

            if np.any(timestamps[:cutoff_idx]):
                last_timestamp = timestamps[cutoff_idx - 1]
                elapsed_time = elapsed_seconds[cutoff_idx - 1]
            else:
                # If we don't have any valid episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'lengths': cutoff_lengths,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }

    def score_benchmark(self, benchmark, episode_scores):
        all_scores = []
        for env_id, scores in episode_scores.items():
            all_scores += scores
        return np.mean(all_scores)
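
# A custom rule is just a callable with the signature
# (task, reward, elapsed_seconds) -> (score, solved) handed to
# BenchmarkScoringRule. For instance, a hypothetical rule that scores by the
# best single episode might look like:
#   def best_episode_score(task, reward, elapsed_seconds):
#       best = np.max(reward) if len(reward) > 0 else task.reward_floor
#       score = np.clip((best - task.reward_floor) / (task.reward_ceiling - task.reward_floor), 0, 1)
#       return score, reward >= task.reward_ceiling
#
#   class BestEpisode(BenchmarkScoringRule):
#       def __init__(self):
#           super(BestEpisode, self).__init__(best_episode_score)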


def total_reward_from_episode_rewards(task, reward, elapsed_seconds):
    """TotalReward scoring takes the mean reward across episodes and linearly rescales it between reward_floor and reward_ceiling, clipping the result to [0, 1]"""
    # reward is an array containing the valid rewards for each episode
    floor = task.reward_floor
    ceiling = task.reward_ceiling

    solved = reward >= ceiling
    # Take the mean of the raw episode rewards, then linearly rescale it to
    # between 0 and 1
    score = np.clip((np.mean(reward) - floor) / (ceiling - floor), 0, 1)
    return score, solved
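
# For example (hypothetical numbers): with reward_floor=0 and
# reward_ceiling=100, episode rewards [40, 60, 110] have mean 70, giving a
# score of 0.7 and solved == [False, False, True].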


class TotalReward(BenchmarkScoringRule):
    def __init__(self):
        super(TotalReward, self).__init__(total_reward_from_episode_rewards)


def reward_per_time_from_episode_rewards(task, reward, elapsed_seconds):
    """RewardPerTime scoring takes the total reward earned across all episodes, divides it by the total elapsed time, and linearly rescales the result between reward_floor and reward_ceiling, clipping to [0, 1]"""
    floor = task.reward_floor
    ceiling = task.reward_ceiling

    # TODO actually compute solves for this
    solved = np.zeros(len(reward))

    # Sum the rewards for all episodes, divide by the total time taken for all episodes
    reward_per_second = np.sum(reward) / elapsed_seconds[-1] if np.any(elapsed_seconds) else 0.0
    score = np.clip((reward_per_second - floor) / (ceiling - floor), 0, 1)
    return score, solved
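
# For example (hypothetical numbers): with reward_floor=0 and
# reward_ceiling=10, a total reward of 60 earned over 10 elapsed seconds is
# 6 reward per second, giving a score of 0.6.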


class RewardPerTime(BenchmarkScoringRule):
    def __init__(self):
        super(RewardPerTime, self).__init__(reward_per_time_from_episode_rewards)