# Gymnasium/gym/benchmarks/scoring.py

from __future__ import division

import logging

import numpy as np

from gym import envs

logger = logging.getLogger(__name__)


def benchmark_aggregate_score(benchmark, env_id_to_benchmark_results):
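    """Aggregate per-env benchmark results into one summary dict.

    Returns the overall ``score`` from ``benchmark.score_benchmark``, the
    number of envs solved, and wall-clock timing totals; trials with no
    matching result are filled with the scorer's null score.
    """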
    scores = {}
    solves = {}
    start_times = []
    end_times = []

    # N.B. for each env_id, our benchmark_results will have a list of scores,
    # solves, and times corresponding to the different tasks for that env_id.
    # If we don't have enough trials, we zero out the score.
    # TODO could do smarter matching of results to trials if we have extras
    # TODO for now, baked-in assumption that the number of trials is the
    # same for all tasks involving a particular env.
    for env_id in benchmark.env_ids:
        task_list = benchmark.task_specs(env_id)
        num_trials = task_list[0].trials
        benchmark_results = env_id_to_benchmark_results.get(env_id, [])
        for trial in range(num_trials):
            if trial < len(benchmark_results):
                # okay, process this benchmark result against this trial
                benchmark_result = benchmark_results[trial]
                env_scores = scores.setdefault(env_id, [])
                env_scores.append(benchmark_result['scores'])

                # note: solves is a list of lists - for each task for this
                # env, does each episode solve that task. We consider the env
                # solved if every episode for every task is individually
                # solved.
                solved = solves.setdefault(env_id, True)
                solves[env_id] = solved and np.all(benchmark_result['solves'])

                # these timestamps are a list of the first/last valid
                # timestamp for each task involving this env.
                start_times.append(benchmark_result['initial_reset_timestamp'])
                end_times.append(max(benchmark_result['timestamps']))
            else:
                # no matching benchmark result for this trial
                env_scores = scores.setdefault(env_id, [])
                env_scores.append([benchmark.scorer.null_score() for _ in task_list])
                solves[env_id] = False

    score = benchmark.score_benchmark(scores)
    num_envs_solved = len([s for s in solves.values() if s])
    start_to_finish_seconds = max(end_times) - min(start_times) if start_times and end_times else 0.0
    summed_training_seconds = np.sum([end - start for end, start in zip(end_times, start_times)])

    return dict(
        score=score,
        num_envs_solved=num_envs_solved,
        start_to_finish_seconds=start_to_finish_seconds,
        summed_training_seconds=summed_training_seconds,
    )
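
# For reference, a minimal sketch of the input benchmark_aggregate_score
# expects (illustrative values only: 'CartPole-v0' and the numbers are
# hypothetical, and in practice the inner dicts come from
# ClipTo01ThenAverage.score_evaluation below):
#
#   env_id_to_benchmark_results = {
#       'CartPole-v0': [
#           {'scores': [0.72], 'solves': [[False, True]],
#            'initial_reset_timestamp': 1000.0, 'timestamps': [1400.0]},
#       ],
#   }
#   summary = benchmark_aggregate_score(benchmark, env_id_to_benchmark_results)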


class ClipTo01ThenAverage(object):
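    """Scorer that linearly rescales each task's evaluation rewards from
    [reward_floor, reward_ceiling] to [0, 1], clips, and averages over the
    last ``num_episodes`` evaluation episodes.
    """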

    def __init__(self, num_episodes=100):
        self.num_episodes = num_episodes

    def null_score(self):
        return 0.0

    def score_evaluation(self, benchmark, env_id, data_sources,
                         initial_reset_timestamps, episode_lengths,
                         episode_rewards, episode_types, timestamps):
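        """Score one env's run against each of its tasks.

        Episodes marked 'e' in episode_types are evaluation episodes; 't'
        marks training episodes, which count only toward the per-task
        cutoffs (max_timesteps / max_seconds).
        """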
        tasks = benchmark.task_specs(env_id)
        spec = envs.spec(env_id)

        #### 0. Compute timing stats

        if len(initial_reset_timestamps) > 0:
            initial_reset_timestamp = min(initial_reset_timestamps)
        else:
            initial_reset_timestamp = 0

        # How long each episode actually took
        durations = np.zeros(len(timestamps))
        for source, source_reset_timestamp in enumerate(initial_reset_timestamps):
            temp_data_sources = np.array([source] + list(data_sources))
            temp_timestamps = np.array([source_reset_timestamp] + list(timestamps))
            (source_indexes,) = np.where(temp_data_sources == source)

            # Once we know the indexes corresponding to a particular
            # source (i.e. worker thread), we can just subtract
            # adjoining values. N.B. positions in the temp arrays are
            # shifted by one relative to `durations`, so shift back
            # when assigning.
            durations[source_indexes[1:] - 1] = temp_timestamps[source_indexes[1:]] - temp_timestamps[source_indexes[:-1]]
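
            # e.g. for a single source 0 with reset time r and timestamps
            # [t0, t1]: the temp arrays are [r, t0, t1], so durations becomes
            # [t0 - r, t1 - t0].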

        #### 1. Select out which indexes are for evaluation and which are for training

        (t_idx,) = np.where([t == 't' for t in episode_types]) # training episodes
        (e_idx,) = np.where([t == 'e' for t in episode_types]) # evaluation episodes
        if len(e_idx) == 0:
            # If no episodes marked for evaluation, consider
            # everything both a training and evaluation episode.
            (t_idx,) = np.where([True for t in episode_types])
            (e_idx,) = np.where([True for t in episode_types])
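        # e.g. episode_types ['t', 't', 'e'] -> t_idx [0, 1], e_idx [2]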

        #### 2. Grab the data corresponding to each of evaluation/training

        training_lengths = np.array(episode_lengths)[t_idx]
        training_rewards = np.array(episode_rewards)[t_idx]
        training_durations = np.array(durations)[t_idx]

        evaluation_lengths = np.array(episode_lengths)[e_idx]
        evaluation_rewards = np.array(episode_rewards)[e_idx]
        evaluation_durations = np.array(durations)[e_idx]

        #### 3. Calculate the total elapsed time (in various units)
        #### for each episode

        # How many training timesteps have elapsed by the end of each
        # episode. Not to be confused with Unix timestamps.
        elapsed_timesteps = np.cumsum(training_lengths)

        # Total number of seconds elapsed by the end of each
        # episode. Note that with n parallel workers each running for
        # m seconds, we want to count the total time as n * m.
        elapsed_seconds = np.cumsum(training_durations)
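        # e.g. training_lengths [100, 200, 50] -> elapsed_timesteps [100, 300, 350]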

        scores = []
        solves = []
        rewards = []
        _timestamps = []
        for task in tasks:
            # Find the first episode where we're over the allotted
            # training timesteps.
            cutoff_idx = np.inf
            if task.max_timesteps:
                (timestep_cutoff,) = np.where(elapsed_timesteps > task.max_timesteps)
                if len(timestep_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, timestep_cutoff[0])
            if task.max_seconds:
                (seconds_cutoff,) = np.where(elapsed_seconds > task.max_seconds)
                if len(seconds_cutoff) > 0:
                    cutoff_idx = min(cutoff_idx, seconds_cutoff[0])
            if np.isfinite(cutoff_idx):
                # cutoff index in the original list of episodes (i.e. before
                # filtering to training/evaluation)
                orig_cutoff_idx = t_idx[cutoff_idx]
                # restrict to evaluation episodes from before the cutoff,
                # keeping original episode indexes
                allowed_e_idx = e_idx[e_idx < orig_cutoff_idx]
            else:
                # All episodes are fair game
                allowed_e_idx = e_idx

            # Grab the last num_episodes evaluation episodes from
            # before the cutoff (at which point we've gathered too
            # much experience).
            #
            # This probably won't work long-term but is fine for now.
            allowed_episode_rewards = np.array(episode_rewards)[allowed_e_idx]
            reward = allowed_episode_rewards[-self.num_episodes:]

            floor = task.reward_floor
            ceiling = task.reward_ceiling

            if len(reward) < self.num_episodes:
                extra = self.num_episodes - len(reward)
                logger.info('Only %s rewards for %s; adding %s', len(reward), env_id, extra)
                reward = np.concatenate([reward, [floor] * extra])

            # Grab the indexes where we reached the ceiling
            solved = reward >= ceiling

            # Linearly rescale rewards to between 0 and 1
            clipped = np.clip((reward - floor) / (ceiling - floor), 0, 1)
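            # e.g. floor = -100, ceiling = 100, reward = 50
            #   -> (50 - (-100)) / 200 = 0.75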

            # Take the mean rescaled score
            score = np.mean(clipped)
            scores.append(score)

            # Record the list of solved episodes
            solves.append(solved)

            # Record the list of rewards
            rewards.append(reward)

            if len(allowed_e_idx) > 0:
                last_timestamp = timestamps[allowed_e_idx[-1]]
            else:
                # If we don't have any evaluation episodes, then the
                # last valid timestamp is when we started.
                last_timestamp = initial_reset_timestamp

            # Record the timestamp of the last evaluation episode
            _timestamps.append(last_timestamp)

        return {
            'rewards': rewards,
            'scores': scores,
            'solves': solves,
            'timestamps': _timestamps,
            'initial_reset_timestamp': initial_reset_timestamp,
        }

    def score_benchmark(self, benchmark, episode_scores):
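        """Average all per-task trial scores across every env in the benchmark."""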
        all_scores = []
        for env_id, scores in episode_scores.items():
            all_scores += scores

        return np.mean(all_scores)
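

# What follows is a minimal smoke test of the scorer in isolation, written
# as a sketch: `_StubTask` and `_StubBenchmark` are hypothetical stand-ins
# for gym's real benchmark spec classes, reduced to the attributes this
# module actually touches, and 'CartPole-v0' is assumed to be registered
# (it ships with gym).
if __name__ == '__main__':
    class _StubTask(object):
        trials = 1
        max_timesteps = 100
        max_seconds = None
        reward_floor = 0.0
        reward_ceiling = 100.0

    class _StubBenchmark(object):
        env_ids = ['CartPole-v0']
        scorer = ClipTo01ThenAverage(num_episodes=2)

        def task_specs(self, env_id):
            return [_StubTask()]

    benchmark = _StubBenchmark()

    # Three episodes from a single worker (source 0): two training, one
    # evaluation, each lasting one second and ten timesteps.
    result = benchmark.scorer.score_evaluation(
        benchmark, 'CartPole-v0',
        data_sources=[0, 0, 0],
        initial_reset_timestamps=[1000.0],
        episode_lengths=[10, 10, 10],
        episode_rewards=[10.0, 50.0, 90.0],
        episode_types=['t', 't', 'e'],
        timestamps=[1001.0, 1002.0, 1003.0],
    )

    # Only one evaluation episode exists, so the scorer pads with the floor:
    # rewards [90.0, 0.0] -> clipped [0.9, 0.0] -> scores [0.45].
    print('scores: %s, solves: %s' % (result['scores'], result['solves']))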