Gymnasium/gym/scoreboard/scoring.py

"""This is the actual code we use to score people's solutions
server-side. The interfaces here are not yet stable, but we include
them so that people can reproduce our scoring calculations
independently.
We correspondly do not currently import this module.
"""
import numpy as np
import requests
import gym


def score_from_remote(url):
    result = requests.get(url)
    parsed = result.json()
    episode_lengths = parsed['episode_lengths']
    episode_rewards = parsed['episode_rewards']
    episode_types = parsed.get('episode_types')
    timestamps = parsed['timestamps']
    # Handle legacy entries where initial_reset_timestamp wasn't set
    initial_reset_timestamp = parsed.get('initial_reset_timestamp', timestamps[0])
    env_id = parsed['env_id']
    spec = gym.spec(env_id)
    return score_from_merged(episode_lengths, episode_rewards, episode_types, timestamps, initial_reset_timestamp, spec.trials, spec.reward_threshold)
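

# Illustrative sketch, not part of the original module: the shape of the JSON
# payload that score_from_remote expects from the server. All values below are
# made-up placeholders.
_EXAMPLE_REMOTE_PAYLOAD = {
    'env_id': 'CartPole-v0',
    'episode_lengths': [200, 187, 200],
    'episode_rewards': [200.0, 187.0, 200.0],
    'episode_types': ['t', 't', 'e'],
    'timestamps': [1461768060.0, 1461768065.0, 1461768071.0],
    # Optional; score_from_remote falls back to the first timestamp when absent.
    'initial_reset_timestamp': 1461768058.0,
}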


def score_from_local(directory):
    """Calculate score from a local results directory"""
    results = gym.monitoring.monitor.load_results(directory)
    # No scores yet saved
    if results is None:
        return None

    episode_lengths = results['episode_lengths']
    episode_rewards = results['episode_rewards']
    episode_types = results['episode_types']
    timestamps = results['timestamps']
    initial_reset_timestamp = results['initial_reset_timestamp']
    spec = gym.spec(results['env_info']['env_id'])
    return score_from_merged(episode_lengths, episode_rewards, episode_types, timestamps, initial_reset_timestamp, spec.trials, spec.reward_threshold)
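

# Illustrative sketch, not part of the original module: scoring a local monitor
# directory. The directory path here is hypothetical.
def _example_score_from_local():
    score = score_from_local('/tmp/cartpole-monitor')
    if score is None:
        print('No episodes recorded yet')
    else:
        print(score['mean'], score['error'], score['seconds_to_solve'])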


def score_from_merged(episode_lengths, episode_rewards, episode_types, timestamps, initial_reset_timestamp, trials, reward_threshold):
    """Method to calculate the score from merged monitor files. Scores
    only a single environment; mostly legacy.
    """
    if episode_types is not None:
        # Select only the training episodes
        t_idx = np.where([e == 't' for e in episode_types])
        episode_lengths = np.array(episode_lengths)[t_idx]
        episode_rewards = np.array(episode_rewards)[t_idx]
        timestamps = np.array(timestamps)[t_idx]

    # Make sure everything is a float -- no pesky ints.
    episode_rewards = np.array(episode_rewards, dtype='float64')

    episode_t_value = timestep_t_value = mean = error = None
    seconds_to_solve = seconds_in_total = None

    if len(timestamps) > 0:
        # This is: time from the first reset to the end of the last episode
        seconds_in_total = timestamps[-1] - initial_reset_timestamp
    if len(episode_rewards) >= trials:
        means = running_mean(episode_rewards, trials)
        if reward_threshold is not None:
            # Compute t-value by finding the first index at or above
            # the threshold. It comes out as a singleton tuple.
            (indexes_above_threshold, ) = np.where(means >= reward_threshold)
            if len(indexes_above_threshold) > 0:
                # Grab the first episode index that is above the threshold value
                episode_t_value = indexes_above_threshold[0]

                # Find timestep corresponding to this episode
                cumulative_timesteps = np.cumsum(np.insert(episode_lengths, 0, 0))
                # Convert that into timesteps
                timestep_t_value = cumulative_timesteps[episode_t_value]
                # This is: time from the first reset to the end of the first solving episode
                seconds_to_solve = timestamps[episode_t_value] - initial_reset_timestamp

        # Find the window with the best mean
        best_idx = np.argmax(means)
        best_rewards = episode_rewards[best_idx:best_idx+trials]
        mean = np.mean(best_rewards)
        if trials == 1:  # avoid NaN
            error = 0.
        else:
            error = np.std(best_rewards) / (np.sqrt(trials) - 1)

    return {
        'episode_t_value': episode_t_value,
        'timestep_t_value': timestep_t_value,
        'mean': mean,
        'error': error,
        'number_episodes': len(episode_rewards),
        'number_timesteps': sum(episode_lengths),
        'seconds_to_solve': seconds_to_solve,
        'seconds_in_total': seconds_in_total,
    }
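

# Illustrative sketch, not part of the original module: scoring synthetic
# episode data directly. The trials and reward_threshold values are made up;
# real entries take them from the environment's spec.
def _example_score_from_merged():
    rewards = [0.0, 50.0, 120.0, 180.0, 200.0, 200.0]
    lengths = [200] * len(rewards)
    stamps = [10.0 * (i + 1) for i in range(len(rewards))]
    # With trials=3 and reward_threshold=150.0 the running mean first reaches
    # the threshold at index 2, so episode_t_value comes out as 2.
    return score_from_merged(lengths, rewards, None, stamps, 0.0, 3, 150.0)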


def benchmark_score_from_merged(benchmark, env_id, episode_lengths, episode_rewards, episode_types):
    """Method to calculate an environment's benchmark score from merged
    monitor files.
    """
    return benchmark.score(benchmark, env_id, episode_lengths, episode_rewards, episode_types)


def running_mean(x, N):
    x = np.array(x, dtype='float64')
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / N
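

# Illustrative sketch, not part of the original module: running_mean([1, 2, 3, 4], 2)
# builds the cumulative sum [0, 1, 3, 6, 10] and returns ([3, 6, 10] - [0, 1, 3]) / 2,
# i.e. array([1.5, 2.5, 3.5]) -- the mean of each length-2 window.
def _example_running_mean():
    return running_mean([1, 2, 3, 4], 2)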


def compute_graph_stats(episode_lengths, episode_rewards, timestamps, initial_reset_timestamp, buckets):
    """Method to compute the aggregates for the graphs."""
    # Not a dependency of OpenAI Gym generally.
    import scipy.stats

    num_episodes = len(episode_lengths)

    # Bail out if no episodes were written; scipy.stats.binned_statistic
    # errors out on empty input.
    if num_episodes == 0:
        return None

    episode_rewards = np.array(episode_rewards)
    episode_lengths = np.array(episode_lengths)

    # The index of the start of each episode
    x_timestep = np.cumsum(np.insert(episode_lengths, 0, 0))[:-1]
    assert len(x_timestep) == num_episodes

    # Delta since the beginning of time
    x_seconds = [timestamp - initial_reset_timestamp for timestamp in timestamps]

    # The index of each episode
    x_episode = range(num_episodes)

    # Calculate the appropriate x/y statistics
    x_timestep_y_reward = scipy.stats.binned_statistic(x_timestep, episode_rewards, 'median', buckets)
    x_timestep_y_length = scipy.stats.binned_statistic(x_timestep, episode_lengths, 'median', buckets)

    x_episode_y_reward = scipy.stats.binned_statistic(x_episode, episode_rewards, 'median', buckets)
    x_episode_y_length = scipy.stats.binned_statistic(x_episode, episode_lengths, 'median', buckets)

    x_seconds_y_reward = scipy.stats.binned_statistic(x_seconds, episode_rewards, 'median', buckets)
    x_seconds_y_length = scipy.stats.binned_statistic(x_seconds, episode_lengths, 'median', buckets)

    return {
        'initial_reset_timestamp': initial_reset_timestamp,
        'x_timestep_y_reward': graphable_binned_statistic(x_timestep_y_reward),
        'x_timestep_y_length': graphable_binned_statistic(x_timestep_y_length),
        'x_episode_y_reward': graphable_binned_statistic(x_episode_y_reward),
        'x_episode_y_length': graphable_binned_statistic(x_episode_y_length),
        'x_seconds_y_length': graphable_binned_statistic(x_seconds_y_length),
        'x_seconds_y_reward': graphable_binned_statistic(x_seconds_y_reward),
    }


def graphable_binned_statistic(binned):
    x = running_mean(binned.bin_edges, 2)
    y = binned.statistic
    assert len(x) == len(y)

    # Get rid of nasty NaNs
    valid = np.logical_not(np.isnan(x)) & np.logical_not(np.isnan(y))
    x = x[valid]
    y = y[valid]

    return {
        'x': x,
        'y': y,
    }
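

# Illustrative sketch, not part of the original module: exercising
# compute_graph_stats and graphable_binned_statistic on synthetic data.
# Requires scipy; all values below are made up.
if __name__ == '__main__':
    example_lengths = [100 + i for i in range(20)]
    example_rewards = [float(10 * i) for i in range(20)]
    example_stamps = [5.0 * (i + 1) for i in range(20)]
    stats = compute_graph_stats(example_lengths, example_rewards, example_stamps,
                                initial_reset_timestamp=0.0, buckets=5)
    print(stats['x_episode_y_reward'])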