Fix some bugs with new partial benchmark scoring

Jie Tang
2016-10-27 12:09:49 -07:00
parent f7a45f6953
commit 71af1191e0
2 changed files with 40 additions and 18 deletions

View File

@@ -21,7 +21,7 @@ def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
     for env_id in benchmark.env_ids:
         task_list = benchmark.task_specs(env_id)
         num_trials = task_list[0].trials
-        benchmark_results = env_id_to_benchmark_results[env_id]
+        benchmark_results = env_id_to_benchmark_results.get(env_id, [])
         for trial in range(num_trials):
             if trial < len(benchmark_results):
                 # okay process this benchmark result against this trial
@@ -46,10 +46,9 @@ def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
                 env_scores.append([benchmark.scorer.null_score() for _ in task_list])
                 solves[env_id] = False
     score = benchmark.score_benchmark(scores)
     num_envs_solved = len([s for s in solves.values() if s])
-    start_to_finish_seconds = max(end_times) - min(start_times)
+    start_to_finish_seconds = max(end_times) - min(start_times) if start_times and end_times else 0.0
     summed_training_seconds = np.sum([end - start for end, start in zip(end_times, start_times)])
     return dict(
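
Taken together, the two guards above let benchmark_aggregate_score handle partial uploads: a missing env_id now yields an empty result list, so every trial falls back to a null score instead of raising KeyError, and the timing aggregate no longer calls max()/min() on empty lists. A minimal standalone sketch of that behaviour (not code from this commit; 'SomeEnv-v0' and the empty lists are placeholders assumed to match the shapes used in this function):

    # Minimal sketch, assumed names only.
    env_id_to_benchmark_results = {}  # nothing uploaded yet for this env
    benchmark_results = env_id_to_benchmark_results.get('SomeEnv-v0', [])
    # every trial index is >= len(benchmark_results) == 0, so each trial
    # records a null score rather than raising KeyError

    start_times, end_times = [], []   # no episodes ran at all
    start_to_finish_seconds = max(end_times) - min(start_times) if start_times and end_times else 0.0
    assert start_to_finish_seconds == 0.0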
@@ -77,20 +76,19 @@ class ClipTo01ThenAverage(object):
         else:
             initial_reset_timestamp = 0
         # How long each episode actually took
         durations = np.zeros(len(timestamps))
-        # (Details computing duration.)
-        data_sources = np.array(data_sources)
-        timestamps = np.array(timestamps)
-        for source in range(len(initial_reset_timestamps)):
+        for source, initial_reset_timestamp in enumerate(initial_reset_timestamps):
+            temp_data_sources = np.array([source] + data_sources)
+            temp_timestamps = np.array([initial_reset_timestamp] + timestamps)
+            (source_indexes,) = np.where(temp_data_sources == source)
             # Once we know the indexes corresponding to a particular
             # source (i.e. worker thread), we can just subtract
             # adjoining values
-            (source_indexes,) = np.where(data_sources == source)
-            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_reset_timestamps[source]
-            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]
+            durations[source_indexes[:-1]] = temp_timestamps[source_indexes[1:]] - temp_timestamps[source_indexes[:-1]]
         #### 1. Select out which indexes are for evaluation and which are for training
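
To see what the rewritten loop computes, here is a small standalone numpy example for a single worker (source 0); the timestamps are made up for illustration and this is not code from the commit. Prepending the worker's initial reset timestamp lets a single adjacent-difference cover the first episode as well:

    import numpy as np

    # Made-up single-worker data: worker 0 reset at t=100 and finished
    # episodes at t=105, 112 and 120.
    data_sources = [0, 0, 0]
    timestamps = [105.0, 112.0, 120.0]
    initial_reset_timestamps = [100.0]

    durations = np.zeros(len(timestamps))
    for source, initial_reset_timestamp in enumerate(initial_reset_timestamps):
        # Prepend the reset time so the first episode's duration is measured
        # from the reset rather than left at zero.
        temp_data_sources = np.array([source] + data_sources)
        temp_timestamps = np.array([initial_reset_timestamp] + timestamps)
        (source_indexes,) = np.where(temp_data_sources == source)
        # Adjacent differences give each episode's wall-clock duration.
        durations[source_indexes[:-1]] = temp_timestamps[source_indexes[1:]] - temp_timestamps[source_indexes[:-1]]

    print(durations)  # [5. 7. 8.]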
@@ -146,13 +144,6 @@ class ClipTo01ThenAverage(object):
                 # All episodes are fair game
                 allowed_e_idx = e_idx
-            if len(allowed_e_idx) > 0:
-                last_timestamp = timestamps[allowed_e_idx[-1]]
-            else:
-                # If we don't have any evaluation episodes, then the
-                # last valid timestamp is when we started.
-                last_timestamp = initial_reset_timestamp
             # Grab the last num_episodes evaluation episodes from
             # before the cutoff (at which point we've gathered too
             # much experience).
@@ -181,6 +172,14 @@ class ClipTo01ThenAverage(object):
             solves.append(solved)
             # Record the list of rewards
             rewards.append(reward)
+            if len(allowed_e_idx) > 0:
+                last_timestamp = timestamps[allowed_e_idx[-1]]
+            else:
+                # If we don't have any evaluation episodes, then the
+                # last valid timestamp is when we started.
+                last_timestamp = initial_reset_timestamp
             # Record the timestamp of the last episode timestamp
             _timestamps.append(last_timestamp)

View File

@@ -116,6 +116,29 @@ def test_benchmark_scoring():
     assert np.all(np.isclose(scores['score'], 0.0001)), "scores={}".format(scores)
     assert scores['num_envs_solved'] == 0, debug_str
+
+def test_benchmark_empty():
+    benchmark_results = defaultdict(list)
+    task = benchmark.tasks[0]
+    env_id = task.env_id
+    benchmark_results[env_id].append(benchmark.score_evaluation(
+        env_id,
+        data_sources=[0],
+        initial_reset_timestamps=[1],
+        episode_lengths=[1],
+        episode_rewards=[1],
+        episode_types=['t'],
+        timestamps=[2],
+    ))
+    scores = scoring.benchmark_aggregate_score(benchmark, benchmark_results)
+    debug_str = "scores={}".format(scores)
+    assert np.all(np.isclose(scores['summed_training_seconds'], 1.0)), debug_str
+    assert np.all(np.isclose(scores['start_to_finish_seconds'], 1.0)), debug_str
+    assert np.all(np.isclose(scores['score'], 0.00005)), "scores={}".format(scores)
+    assert scores['num_envs_solved'] == 0, debug_str
+    scores = scoring.benchmark_aggregate_score(benchmark, {})
 def test_benchmark_solved():
     benchmark_results = defaultdict(list)
     N = 200
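
The trailing call in test_benchmark_empty scores an entirely empty results mapping. Assuming the scoring, benchmark, and np names from the test file above, assertions along these lines would follow directly from the new guards in benchmark_aggregate_score (a sketch, not part of the commit):

    scores = scoring.benchmark_aggregate_score(benchmark, {})
    # every env falls back to null scores, so nothing counts as solved
    assert scores['num_envs_solved'] == 0
    # no start/end times were recorded, so both timing fields collapse to 0.0
    assert np.isclose(scores['start_to_finish_seconds'], 0.0)
    assert np.isclose(scores['summed_training_seconds'], 0.0)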