Fix some bugs with new partial benchmark scoring

Jie Tang
2016-10-27 12:09:49 -07:00
parent f7a45f6953
commit 71af1191e0
2 changed files with 40 additions and 18 deletions

View File

@@ -21,7 +21,7 @@ def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
     for env_id in benchmark.env_ids:
         task_list = benchmark.task_specs(env_id)
         num_trials = task_list[0].trials
-        benchmark_results = env_id_to_benchmark_results[env_id]
+        benchmark_results = env_id_to_benchmark_results.get(env_id, [])
         for trial in range(num_trials):
             if trial < len(benchmark_results):
                 # okay process this benchmark result against this trial
@@ -46,10 +46,9 @@ def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
                 env_scores.append([benchmark.scorer.null_score() for _ in task_list])
                 solves[env_id] = False
     score = benchmark.score_benchmark(scores)
     num_envs_solved = len([s for s in solves.values() if s])
-    start_to_finish_seconds = max(end_times) - min(start_times)
+    start_to_finish_seconds = max(end_times) - min(start_times) if start_times and end_times else 0.0
     summed_training_seconds = np.sum([end - start for end, start in zip(end_times, start_times)])
     return dict(
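
Taken together, the two guards above let benchmark_aggregate_score handle partial uploads: a missing env_id now yields an empty result list, so every trial falls back to a null score instead of raising KeyError, and the timing aggregate no longer calls max()/min() on empty lists. A minimal standalone sketch of that behaviour (not code from this commit; 'SomeEnv-v0' and the empty lists are placeholders assumed to match the shapes used in this function):

    # Minimal sketch, assumed names only.
    env_id_to_benchmark_results = {}  # nothing uploaded yet for this env
    benchmark_results = env_id_to_benchmark_results.get('SomeEnv-v0', [])
    # every trial index is >= len(benchmark_results) == 0, so each trial
    # records a null score rather than raising KeyError

    start_times, end_times = [], []   # no episodes ran at all
    start_to_finish_seconds = max(end_times) - min(start_times) if start_times and end_times else 0.0
    assert start_to_finish_seconds == 0.0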
@@ -77,20 +76,19 @@ class ClipTo01ThenAverage(object):
         else:
             initial_reset_timestamp = 0
         # How long each episode actually took
         durations = np.zeros(len(timestamps))
-        # (Details computing duration.)
-        data_sources = np.array(data_sources)
-        timestamps = np.array(timestamps)
-        for source in range(len(initial_reset_timestamps)):
+        for source, initial_reset_timestamp in enumerate(initial_reset_timestamps):
+            temp_data_sources = np.array([source] + data_sources)
+            temp_timestamps = np.array([initial_reset_timestamp] + timestamps)
+            (source_indexes,) = np.where(temp_data_sources == source)
             # Once we know the indexes corresponding to a particular
             # source (i.e. worker thread), we can just subtract
             # adjoining values
-            (source_indexes,) = np.where(data_sources == source)
-            durations[source_indexes[0]] = timestamps[source_indexes[0]] - initial_reset_timestamps[source]
-            durations[source_indexes[1:]] = timestamps[source_indexes[1:]] - timestamps[source_indexes[:-1]]
+            durations[source_indexes[:-1]] = temp_timestamps[source_indexes[1:]] - temp_timestamps[source_indexes[:-1]]
         #### 1. Select out which indexes are for evaluation and which are for training
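
To see what the rewritten loop computes, here is a small standalone numpy example for a single worker (source 0); the timestamps are made up for illustration and this is not code from the commit. Prepending the worker's initial reset timestamp lets a single adjacent-difference cover the first episode as well:

    import numpy as np

    # Made-up single-worker data: worker 0 reset at t=100 and finished
    # episodes at t=105, 112 and 120.
    data_sources = [0, 0, 0]
    timestamps = [105.0, 112.0, 120.0]
    initial_reset_timestamps = [100.0]

    durations = np.zeros(len(timestamps))
    for source, initial_reset_timestamp in enumerate(initial_reset_timestamps):
        # Prepend the reset time so the first episode's duration is measured
        # from the reset rather than left at zero.
        temp_data_sources = np.array([source] + data_sources)
        temp_timestamps = np.array([initial_reset_timestamp] + timestamps)
        (source_indexes,) = np.where(temp_data_sources == source)
        # Adjacent differences give each episode's wall-clock duration.
        durations[source_indexes[:-1]] = temp_timestamps[source_indexes[1:]] - temp_timestamps[source_indexes[:-1]]

    print(durations)  # [5. 7. 8.]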
@@ -146,13 +144,6 @@ class ClipTo01ThenAverage(object):
                 # All episodes are fair game
                 allowed_e_idx = e_idx
-            if len(allowed_e_idx) > 0:
-                last_timestamp = timestamps[allowed_e_idx[-1]]
-            else:
-                # If we don't have any evaluation episodes, then the
-                # last valid timestamp is when we started.
-                last_timestamp = initial_reset_timestamp
             # Grab the last num_episodes evaluation episodes from
             # before the cutoff (at which point we've gathered too
             # much experience).
@@ -181,6 +172,14 @@ class ClipTo01ThenAverage(object):
             solves.append(solved)
             # Record the list of rewards
             rewards.append(reward)
+            if len(allowed_e_idx) > 0:
+                last_timestamp = timestamps[allowed_e_idx[-1]]
+            else:
+                # If we don't have any evaluation episodes, then the
+                # last valid timestamp is when we started.
+                last_timestamp = initial_reset_timestamp
             # Record the timestamp of the last episode timestamp
             _timestamps.append(last_timestamp)

View File

@@ -116,6 +116,29 @@ def test_benchmark_scoring():
     assert np.all(np.isclose(scores['score'], 0.0001)), "scores={}".format(scores)
     assert scores['num_envs_solved'] == 0, debug_str
+
+def test_benchmark_empty():
+    benchmark_results = defaultdict(list)
+    task = benchmark.tasks[0]
+    env_id = task.env_id
+    benchmark_results[env_id].append(benchmark.score_evaluation(
+        env_id,
+        data_sources=[0],
+        initial_reset_timestamps=[1],
+        episode_lengths=[1],
+        episode_rewards=[1],
+        episode_types=['t'],
+        timestamps=[2],
+    ))
+    scores = scoring.benchmark_aggregate_score(benchmark, benchmark_results)
+    debug_str = "scores={}".format(scores)
+    assert np.all(np.isclose(scores['summed_training_seconds'], 1.0)), debug_str
+    assert np.all(np.isclose(scores['start_to_finish_seconds'], 1.0)), debug_str
+    assert np.all(np.isclose(scores['score'], 0.00005)), "scores={}".format(scores)
+    assert scores['num_envs_solved'] == 0, debug_str
+    scores = scoring.benchmark_aggregate_score(benchmark, {})
 def test_benchmark_solved():
     benchmark_results = defaultdict(list)
     N = 200
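
The trailing call in test_benchmark_empty scores an entirely empty results mapping. Assuming the scoring, benchmark, and np names from the test file above, assertions along these lines would follow directly from the new guards in benchmark_aggregate_score (a sketch, not part of the commit):

    scores = scoring.benchmark_aggregate_score(benchmark, {})
    # every env falls back to null scores, so nothing counts as solved
    assert scores['num_envs_solved'] == 0
    # no start/end times were recorded, so both timing fields collapse to 0.0
    assert np.isclose(scores['start_to_finish_seconds'], 0.0)
    assert np.isclose(scores['summed_training_seconds'], 0.0)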