* Initial implementation of ppo2_rnn.
* Set LSTM memory as tf.GraphKeys.LOCAL_VARIABLES.
* Replace dones with tf.placeholder_with_default.
* Improvements for the 'play' option.
* Remove an unnecessary TODO.
* Improve the LSTM code.
* Move the learning-rate placeholder to the optimizer scope.
* Support the microbatched model.
* Sync the CNN-LSTM layer with the originals.
* Add a cnn_lnlstm layer.
* Fix the case when `states` is None.
* Add an initial_state variable to help testing.
* Make the ppo2 RNN test available.
* Rename 'obs' to 'observations' and 'transition' to 'transitions'; fix `dones` being dropped from the replay buffer; fix a misuse of `states` and `next_states` in the replay buffer.
* Run the initialization once; make `test_fixed_sequence` compatible with ppo2.
* Adjust the input shape.
* Fix the check of model input args in the `simple_test` function.
* Disable a warning on purpose.
* Support play.
* Improve scopes to be compatible with multiple models (i.e. other TensorFlow global/local variables).
* Clean up the scope of the ppo2 policy model.
* Name the memory variable of the PPO RNNs more descriptively.
* Wrap the initializations in ppo2.
* Remove redundant lines.
* Update `README.md`.
* Add RNN layers.
* Add the result of the HalfCheetah-v2 env experiment.
* Correct a typo.
* Add an RNN class.
* Rename `nlstm` to `num_units` in the RNN builder functions.
* Remove state saving.
* Reuse the RNNs in a2c.utils.
* Revert baselines/run.py.
* Replace `ppo2.step()` with the original interface.
* Revert `baselines/common/tests/util.py`.
* Remove redundant lines.
* Revert `baselines/common/tests/util.py` to b875fb7.
* Remove the `states` variable.
* Move the RNN class to `baselines/ppo2/layers.py` and revert `baselines/common/models.py` to 858afa8.
* Rename `model.step_as_dict` to `model.step_with_dict`.
* Remove `ppo_lstm_mlp`.
* Fix 02e26fd.
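For orientation, here is a minimal sketch of how the recurrent ppo2 policy from these changes might be trained and then rolled out, mirroring what the serialization test below does. It assumes the `ppo_lstm` network builder registered on this branch (upstream baselines would use `network='lstm'`), and the `S`/`M` keyword interface plus the `(actions, values, states, neglogpacs)` return of `model.step` that the test's `_get_action_stats` helper relies on; the environment and hyperparameters are illustrative only.

```python
import gym

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.run import get_learn_function

# Single-environment setup, as in the tests below.
env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
learn = get_learn_function('ppo2')

# 'ppo_lstm' is assumed to be the recurrent builder added on this branch;
# nsteps/nminibatches values are illustrative, not tuned.
model = learn(network='ppo_lstm', env=env, total_timesteps=1000,
              nsteps=10, nminibatches=1, seed=0)

# Roll the policy out, threading the LSTM state through each step.
obs = env.reset()
state = model.initial_state
dones = [False]
for _ in range(10):
    actions, values, state, _ = model.step(obs, S=state, M=dones)
    obs, rewards, dones, _ = env.step(actions)
```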
import os
import tempfile
from functools import partial

import gym
import numpy as np
import pytest
import tensorflow as tf

from baselines.common.tests.envs.mnist_env import MnistEnv
from baselines.common.tf_util import make_session, get_session
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.run import get_learn_function

learn_kwargs = {
    'deepq': {},
    'a2c': {},
    'acktr': {},
    'acer': {},
    'ppo2': {'nminibatches': 1, 'nsteps': 10},
    'trpo_mpi': {},
}

network_kwargs = {
    'mlp': {},
    'cnn': {'pad': 'SAME'},
    'lstm': {},
    'cnn_lnlstm': {'pad': 'SAME'}
}


@pytest.mark.parametrize("learn_fn", learn_kwargs.keys())
@pytest.mark.parametrize("network_fn", network_kwargs.keys())
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    _network_kwargs = network_kwargs[network_fn]

    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return
    elif network_fn.endswith('lstm') and learn_fn == 'ppo2':
        # On this branch ppo2 provides its own recurrent network builders,
        # selected via the 'ppo_' prefix (e.g. 'ppo_lstm').
        network_fn = 'ppo_' + network_fn

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(_network_kwargs)
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                                       err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)


@pytest.mark.parametrize("learn_fn", learn_kwargs.keys())
@pytest.mark.parametrize("network_fn", ['mlp'])
def test_coexistence(learn_fn, network_fn):
    '''
    Test if more than one model can exist at a time
    '''

    if learn_fn == 'deepq':
        # TODO enable multiple DQN models to be useable at the same time
        # github issue https://github.com/openai/baselines/issues/656
        return

    if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    env = DummyVecEnv([lambda: gym.make('CartPole-v0')])
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs)
    make_session(make_default=True, graph=tf.Graph())
    model1 = learn(seed=1)
    make_session(make_default=True, graph=tf.Graph())
    model2 = learn(seed=2)

    model1.step(env.observation_space.sample())
    model2.step(env.observation_space.sample())


def _serialize_variables():
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}


def _get_action_stats(model, ob):
    ntrials = 1000
    if model.initial_state is None or model.initial_state == []:
        # Feedforward policies only need the observation.
        actions = np.array([model.step(ob)[0] for _ in range(ntrials)])
    else:
        # Recurrent policies also need an initial state (S) and a done mask (M).
        actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)])

    mean = np.mean(actions, axis=0)
    std = np.std(actions, axis=0)

    return mean, std