baselines/baselines/common/tests/test_fixed_sequence.py

import pytest
from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv

from baselines.common.tests.util import simple_test
from baselines.run import get_learn_function

# Arguments merged into every algorithm's kwargs by the test in this module.
common_kwargs = {
    'seed': 0,
    'total_timesteps': 50000,
}

# Per-algorithm arguments, keyed by the algorithm name that is handed to
# get_learn_function.
learn_kwargs = {
    'a2c': dict(),
    'ppo2': {'nsteps': 10, 'ent_coef': 0.0, 'nminibatches': 1},
    # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)
    # github issue: https://github.com/openai/baselines/issues/188
    # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)
}


# Parametrization axes: every configured algorithm x every recurrent network.
alg_list = learn_kwargs.keys()
rnn_list = ['lstm']

@pytest.mark.slow
@pytest.mark.parametrize("alg", alg_list)
@pytest.mark.parametrize("rnn", rnn_list)
def test_fixed_sequence(alg, rnn):
    '''
    Train algorithm `alg` with the recurrent network `rnn` on a
    FixedSequenceEnv (constructed with 10 and episode_len=5) and hand the
    result to simple_test with a threshold argument of 0.7.

    alg: key into learn_kwargs; also passed to get_learn_function
    rnn: network name forwarded to the learn function as `network`

    NOTE(review): an earlier version of this docstring spoke of learning an
    identity transformation, which looks copied from another test; presumably
    the goal here is to reproduce a fixed action sequence - confirm against
    FixedSequenceEnv.
    '''

    # Merge into a fresh dict. learn_kwargs[alg] is module-level state shared
    # by all parametrized runs, so updating it in place would permanently
    # leak common_kwargs into it. common_kwargs is applied last so it still
    # takes precedence on key clashes, exactly as before.
    kwargs = dict(learn_kwargs[alg])
    kwargs.update(common_kwargs)

    episode_len = 5
    env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
    learn = lambda e: get_learn_function(alg)(
        env=e,
        network=rnn,
        **kwargs
    )

    simple_test(env_fn, learn, 0.7)


# Running this file as a script executes a single configuration directly,
# without going through the pytest runner.
if __name__ == '__main__':
    test_fixed_sequence(alg='ppo2', rnn='lstm')
refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490) * exported rl-algs * more stuff from rl-algs * run slow tests * re-exported rl_algs * re-exported rl_algs - fixed problems with serialization test and test_cartpole * replaced atari_arg_parser with common_arg_parser * run.py can run algos from both baselines and rl_algs * added approximate humanoid reward with ppo2 into the README for reference * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * very dummy commit to RUN BENCHMARKS * serialize variables as a dict, not as a list * running_mean_std uses tensorflow variables * fixed import in vec_normalize * dummy commit to RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * flake8 complaints * save all variables to make sure we save the vec_normalize normalization * benchmarks on ppo2 only RUN BENCHMARKS * make_atari_env compatible with mpi * run ppo_mpi benchmarks only RUN BENCHMARKS * hardcode names of retro environments * add defaults * changed default ppo2 lr schedule to linear RUN BENCHMARKS * non-tf normalization benchmark RUN BENCHMARKS * use ncpu=1 for mujoco sessions - gives a bit of a performance speedup * reverted running_mean_std to user property decorators for mean, var, count * reverted VecNormalize to use RunningMeanStd (no tf) * reverted VecNormalize to use RunningMeanStd (no tf) * profiling wip * use VecNormalize with regular RunningMeanStd * added acer runner (missing import) * flake8 complaints * added a note in README about TfRunningMeanStd and serialization of VecNormalize * dummy commit to RUN BENCHMARKS * merged benchmarks branch 2018-08-13 09:56:44 -07:00			`import pytest`
			`from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv`

			`from baselines.common.tests.util import simple_test`
			`from baselines.run import get_learn_function`

			`common_kwargs = dict(`
			`seed=0,`
			`total_timesteps=50000,`
			`)`

			`learn_kwargs = {`
			`'a2c': {},`
			`'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),`
			`# TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)`
			`# github issue: https://github.com/openai/baselines/issues/188`
			`# 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)`
			`}`


			`alg_list = learn_kwargs.keys()`
			`rnn_list = ['lstm']`

			`@pytest.mark.slow`
			`@pytest.mark.parametrize("alg", alg_list)`
			`@pytest.mark.parametrize("rnn", rnn_list)`
			`def test_fixed_sequence(alg, rnn):`
			`'''`
			`Test if the algorithm (with a given policy)`
			`can learn an identity transformation (i.e. return observation as an action)`
			`'''`

			`kwargs = learn_kwargs[alg]`
			`kwargs.update(common_kwargs)`

			`episode_len = 5`
			`env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)`
			`learn = lambda e: get_learn_function(alg)(`
			`env=e,`
			`network=rnn,`
			`**kwargs`
			`)`

			`simple_test(env_fn, learn, 0.7)`


			`if __name__ == '__main__':`
			`test_fixed_sequence('ppo2', 'lstm')`