from __future__ import division

import logging

import numpy as np

from gym import envs

logger = logging.getLogger(__name__)

def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
    scores = {}
    solves = {}
    start_times = []
    end_times = []
    elapsed_times = []

    # N.B. for each env_id, our benchmark_results will have a list of scores,
    # solves, and times corresponding to the different tasks for that env_id. If
    # we don't have enough trials, we zero out the score.

    # TODO could do smarter matching of results to trials if we have extras
    # TODO for now, baked in assumption that the number of trials is the
    # same for all tasks involving a particular env.
    for env_id in benchmark.env_ids:
        task_list = benchmark.task_specs(env_id)
        num_trials = task_list[0].trials
        benchmark_results = env_id_to_benchmark_results.get(env_id, [])
        for trial in range(num_trials):
            if trial < len(benchmark_results):
                # okay, process this benchmark result against this trial
                benchmark_result = benchmark_results[trial]
                env_scores = scores.setdefault(env_id, [])
                env_scores.append(benchmark_result['scores'])

                # note: solves is a list of lists - for each task for this env,
                # does each episode solve that task. We consider the env solved
                # if every episode for every task is individually solved.
                solved = solves.setdefault(env_id, True)
                solves[env_id] = solved and np.all(benchmark_result['solves'])

                # these timestamps are a list of the first / last valid timestamp
                # for each task involving this env.
                start_times.append(benchmark_result['initial_reset_timestamp'])
                end_times.append(max(benchmark_result['timestamps']))
                elapsed_times.extend(benchmark_result['elapsed_times'])
            else:
                # no matching benchmark result for this trial
                env_scores = scores.setdefault(env_id, [])
                env_scores.append([benchmark.scorer.null_score() for _ in task_list])
                solves[env_id] = False

    score = benchmark.score_benchmark(scores)
    num_envs_solved = len([s for s in solves.values() if s])
    start_to_finish_seconds = max(end_times) - min(start_times) if end_times and start_times else 0.0
    summed_task_wall_time = np.sum([end - start for end, start in zip(end_times, start_times)])
    summed_training_seconds = np.sum(elapsed_times)

    return dict(
        score=score,
        num_envs_solved=num_envs_solved,
        start_to_finish_seconds=start_to_finish_seconds,
        summed_task_wall_time=summed_task_wall_time,
        summed_training_seconds=summed_training_seconds,
    )
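
# Illustrative sketch (editor's addition, not part of the original module): the
# expected shape of `env_id_to_benchmark_results` and of the dict returned by
# benchmark_aggregate_score, assuming a hypothetical benchmark object with a
# single env and a single trial. Each per-env entry is the dict produced by a
# scorer's score_evaluation().
#
#   env_id_to_benchmark_results = {
#       'CartPole-v0': [{
#           'scores': [0.72],                  # one score per task for this env
#           'solves': [[False, True]],         # per task, per episode
#           'timestamps': [1477600000.0],
#           'elapsed_times': [350.0],
#           'initial_reset_timestamp': 1477599000.0,
#       }],
#   }
#   agg = benchmark_aggregate_score(benchmark, env_id_to_benchmark_results)
#   # agg['score']                   -> benchmark.score_benchmark({'CartPole-v0': [[0.72]]})
#   # agg['num_envs_solved']         -> 0 (not every episode reached the ceiling)
#   # agg['start_to_finish_seconds'] -> 1477600000.0 - 1477599000.0 = 1000.0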


class ClipTo01ThenAverage(object):
    """Benchmark scoring rule

    For each task, we take the last num_episodes (default: 100) evaluation
    episodes before either the max_seconds or max_timesteps limit, whichever is
    earlier. If there are not num_episodes evaluations, we fill in the rest with
    scores of reward_floor.

    For each valid evaluation episode, we clip the reward to be between the
    reward_floor and reward_ceiling for that task. The score for the task is the
    average across all episodes.

    The benchmark score is the average of all task scores.
    """
    def __init__(self, num_episodes=100):
        self.num_episodes = num_episodes

    def null_score(self):
        """
        This is used to compute benchmark scores when we are missing an evaluation
        """
        return 0.0

    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)

            if len(source_indexes) == 0:
                continue
            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]
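
        # Worked example (editor's sketch, not part of the original source):
        # with a single data source whose initial reset timestamp is 100.0 and
        # episode-end timestamps [103.0, 105.0, 109.0], the loop above yields
        # durations == [103.0 - 100.0, 105.0 - 103.0, 109.0 - 105.0] == [3.0, 2.0, 4.0].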

        #### 1. Select out which indexes are for evaluation and which are for training

        (t_idx,) = np.where([t == 't' for t in episode_types]) # training episodes
        (e_idx,) = np.where([t == 'e' for t in episode_types]) # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes are marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx,) = np.where([True for t in episode_types])
            (e_idx,) = np.where([True for t in episode_types])

        #### 2. Grab the data corresponding to each of evaluation/training

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]
        training_durations = np.array(durations)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]
        evaluation_durations = np.array(durations)[e_idx]

        #### 3. Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)

        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(training_durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])

            if np.isfinite(cutoff_idx):
                orig_cutoff_idx = t_idx[cutoff_idx] # cutoff index in the original (i.e. before filtering to training/evaluation)
                # restrict to evaluation episodes that finished before the cutoff
                allowed_e_idx = e_idx[e_idx < orig_cutoff_idx]
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes - len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling
            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)

            # Take the mean rescaled score
            score = np.mean(clipped)
            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if len(allowed_e_idx) > 0:
                if not np.isfinite(cutoff_idx):
                    cutoff_idx = len(elapsed_seconds) - 1
                last_t_idx = t_idx[cutoff_idx]
                # timestamps is full length
                last_timestamp = timestamps[last_t_idx]
                # elapsed_seconds contains only training episodes
                elapsed_time = elapsed_seconds[cutoff_idx]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }

    def score_benchmark(self, benchmark, episode_scores):
        all_scores = []
        for env_id, scores in episode_scores.items():
            all_scores += scores

        return np.mean(all_scores)


class TotalReward(object):
    """Benchmark scoring rule

    For each task, we take all evaluation episodes before either the max_seconds
    or max_timesteps limit, whichever is earlier.

    We average the rewards over those episodes; for each task, the average
    reward is clipped to be between the reward_floor and reward_ceiling for the
    task and normalized so scores are between 0 and 1.

    The benchmark score is the average of all task scores.
    """
    def __init__(self):
        pass

    def null_score(self):
        return 0.0

    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        # TODO refactor code shared with the clip scoring rule above
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)

            if len(source_indexes) == 0:
                continue
            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

        #### Grab the data corresponding to each of evaluation/training

        lengths = np.array(episode_lengths)
        rewards = np.array(episode_rewards)
        durations = np.array(durations)

        #### Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(lengths)

        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if not np.isfinite(cutoff_idx):
                # All episodes are fair game
                cutoff_idx = len(lengths)

            reward = np.array(episode_rewards)[:cutoff_idx]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            solved = reward >= ceiling
            # Average the raw rewards, then linearly rescale the mean to between 0 and 1
            score = np.clip((np.mean(reward) - floor) / (ceiling - floor), 0, 1)

            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if np.any(timestamps[:cutoff_idx]):
                last_idx = cutoff_idx - 1
                last_timestamp = timestamps[last_idx]
                elapsed_time = elapsed_seconds[last_idx]
            else:
                # If we don't have any valid episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }

    def score_benchmark(self, benchmark, episode_scores):
        all_scores = []
        for env_id, scores in episode_scores.items():
            all_scores += scores

        return np.mean(all_scores)


class RewardPerTime(object):
    """Benchmark scoring rule

    For each task, we take all evaluation episodes before either the max_seconds
    or max_timesteps limit, whichever is earlier.

    The score for each task is the sum of rewards for all episodes divided by
    the total time taken for all episodes. We clip the score to be between the
    reward_floor and reward_ceiling for the task and normalize so scores are
    between 0 and 1.

    The benchmark score is the average of all task scores.
    """
    def __init__(self):
        pass

    def null_score(self):
        return 0.0

    def score_evaluation(self, benchmark, env_id, data_sources, initial_reset_timestamps, episode_lengths, episode_rewards, episode_types, timestamps):
        # TODO refactor code shared with the clip scoring rule above
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        durations = np.zeros(len(timestamps))

        data_sources = np.array(data_sources)
        timestamps = np.array(timestamps)
        for source, initial_ts in enumerate(initial_reset_timestamps):
            (source_indexes,) = np.where(data_sources == source)

            if len(source_indexes) == 0:
                continue
            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values
            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_ts
            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]

        #### Grab the data corresponding to each of evaluation/training

        lengths = np.array(episode_lengths)
        rewards = np.array(episode_rewards)
        durations = np.array(durations)

        #### Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(lengths)

        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(durations)

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        elapsed_times = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                # this looks a little funny, but we want the first idx greater
                # than the cutoff
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if not np.isfinite(cutoff_idx):
                # All episodes are fair game
                cutoff_idx = len(lengths)

            reward = np.array(episode_rewards)[:cutoff_idx]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            # skip computing this for now
            solved = [False] * len(reward)

            # Sum the rewards for all episodes, divide by total time taken for
            # all episodes
            reward_per_second = np.sum(reward) / elapsed_seconds[cutoff_idx - 1]
            score = np.clip((reward_per_second - floor) / (ceiling - floor), 0, 1)

            scores.append(score)
            # Record the list of solved episodes
            solves.append(solved)
            # Record the list of rewards
            rewards.append(reward)

            if np.any(timestamps[:cutoff_idx]):
                last_idx = cutoff_idx - 1
                last_timestamp = timestamps[last_idx]
                elapsed_time = elapsed_seconds[last_idx]
            else:
                # If we don't have any valid episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp
                elapsed_time = 0.0

            # Record the timestamp of the last episode
            _timestamps.append(last_timestamp)
            elapsed_times.append(elapsed_time)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'elapsed_times': elapsed_times,
            'initial_reset_timestamp': initial_reset_timestamp,
        }

    def score_benchmark(self, benchmark, episode_scores):
        all_scores = []
        for env_id, scores in episode_scores.items():
            all_scores += scores

        return np.mean(all_scores)
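
# Illustrative usage sketch (editor's addition; the `benchmark` object and the
# episode data below are assumptions, not part of this module). A scorer
# instance is normally attached to a registered benchmark that provides
# task_specs() and score_benchmark(); its per-env results are then aggregated
# with benchmark_aggregate_score:
#
#   scorer = ClipTo01ThenAverage(num_episodes=100)
#   result = scorer.score_evaluation(
#       benchmark, 'CartPole-v0',
#       data_sources=[0, 0, 0],
#       initial_reset_timestamps=[100.0],
#       episode_lengths=[200, 200, 200],
#       episode_rewards=[150.0, 180.0, 200.0],
#       episode_types=['t', 'e', 'e'],
#       timestamps=[103.0, 106.0, 110.0])
#   aggregate = benchmark_aggregate_score(benchmark, {'CartPole-v0': [result]})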