* exported rl-algs
* more stuff from rl-algs
* run slow tests
* re-exported rl_algs
* re-exported rl_algs - fixed problems with serialization test and test_cartpole
* replaced atari_arg_parser with common_arg_parser
* run.py can run algos from both baselines and rl_algs
* added approximate humanoid reward with ppo2 into the README for reference
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* very dummy commit to RUN BENCHMARKS
* serialize variables as a dict, not as a list
* running_mean_std uses tensorflow variables
* fixed import in vec_normalize
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* flake8 complaints
* save all variables to make sure we save the vec_normalize normalization
* benchmarks on ppo2 only RUN BENCHMARKS
* make_atari_env compatible with mpi
* run ppo_mpi benchmarks only RUN BENCHMARKS
* hardcode names of retro environments
* add defaults
* changed default ppo2 lr schedule to linear RUN BENCHMARKS
* non-tf normalization benchmark RUN BENCHMARKS
* use ncpu=1 for mujoco sessions - gives a bit of a performance speedup
* reverted running_mean_std to use property decorators for mean, var, count
* reverted VecNormalize to use RunningMeanStd (no tf)
* reverted VecNormalize to use RunningMeanStd (no tf)
* profiling wip
* use VecNormalize with regular RunningMeanStd
* added acer runner (missing import)
* flake8 complaints
* added a note in README about TfRunningMeanStd and serialization of VecNormalize
* dummy commit to RUN BENCHMARKS
* merged benchmarks branch
import tensorflow as tf
import numpy as np
from gym.spaces import np_random
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

N_TRIALS = 10000
N_EPISODES = 100

def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    # Fix all sources of randomness so the learned policy is reproducible.
    np.random.seed(0)
    np_random.seed(0)

    env = DummyVecEnv([env_fn])

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
        tf.set_random_seed(0)

        model = learn_fn(env)

        sum_rew = 0
        done = True

        for i in range(n_trials):
            if done:
                obs = env.reset()
                state = model.initial_state

            # Recurrent models carry state between steps; feedforward models
            # expose initial_state=None and take only the observation.
            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(a)
            sum_rew += float(rew)

        print("Reward in {} trials is {}".format(n_trials, sum_rew))
        assert sum_rew > min_reward_fraction * n_trials, \
            'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)


def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
    env = DummyVecEnv([env_fn])

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
        model = learn_fn(env)

        # Roll out n_trials full episodes and compare the average episode
        # reward against the required minimum.
        observations, actions, rewards = rollout(env, model, n_trials)
        rewards = [sum(r) for r in rewards]

        avg_rew = sum(rewards) / n_trials
        print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
        assert avg_rew > min_avg_reward, \
            'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)


def rollout(env, model, n_trials):
    """Run n_trials complete episodes and return per-episode observations, actions and rewards."""
    rewards = []
    actions = []
    observations = []

    for i in range(n_trials):
        obs = env.reset()
        state = model.initial_state
        episode_rew = []
        episode_actions = []
        episode_obs = []

        while True:
            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(a)

            episode_rew.append(rew)
            episode_actions.append(a)
            episode_obs.append(obs)

            if done:
                break

        rewards.append(episode_rew)
        actions.append(episode_actions)
        observations.append(episode_obs)

    return observations, actions, rewards
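

# Example usage (a minimal sketch, not part of the original file). It illustrates
# the interface the helpers above expect from a learned model: an `initial_state`
# attribute and a `step(obs, S=..., M=...)` method returning
# (actions, values, state, extra). The RandomModel stub and the 'CartPole-v0'
# environment id are illustrative assumptions rather than baselines code; a real
# test would obtain the model from a learn function such as ppo2's.
if __name__ == '__main__':
    import gym

    class RandomModel(object):
        # Feedforward-style stand-in: no recurrent state, uniform random actions.
        initial_state = None

        def __init__(self, env):
            self.action_space = env.action_space

        def step(self, obs, S=None, M=None):
            # Mirror the (actions, values, state, extra) tuple used by the helpers above.
            actions = [self.action_space.sample() for _ in obs]
            values = [0.0 for _ in obs]
            return actions, values, S, None

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    model = RandomModel(env)

    observations, actions, rewards = rollout(env, model, n_trials=5)
    print('collected {} episodes; first episode length is {}'.format(
        len(rewards), len(rewards[0])))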