Compare commits

..

2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Peter Zhokhov | 2ebdc28791 | lstm network builders using tf lstm | 2018-08-10 14:21:30 -07:00 |
| Peter Zhokhov | 217b111c88 | merged refactor | 2018-08-10 14:14:46 -07:00 |
18 changed files with 53 additions and 18048 deletions

View File

@@ -1 +1 @@
ppo2

View File

@@ -112,6 +112,10 @@ This should get to the mean reward per episode about 5k. To load and visualize t
*NOTE:* At the moment, Mujoco training uses the VecNormalize wrapper for the environment, which is not being saved correctly; as a result, loading models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd with TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12) (see the sketch after this file's diff). That way, the mean and std of the environment-normalizing wrapper are saved in tensorflow variables and included in the model file; however, training is slower that way, which is why it is not enabled by default
## Subpackages
- [A2C](baselines/a2c)
@@ -121,19 +125,10 @@ This should get to the mean reward per episode about 5k. To load and visualize t
- [DQN](baselines/deepq)
- [GAIL](baselines/gail)
- [HER](baselines/her)
- [PPO1](baselines/ppo1) (obsolete version, left here temporarily)
- [PPO2](baselines/ppo2)
- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
- [PPO2](baselines/ppo2) (Optimized for GPU)
- [TRPO](baselines/trpo_mpi)
## Benchmarks
Results of benchmarks on Mujoco (1M timesteps) and Atari (10M timesteps) are available
[here for Mujoco](https://htmlpreview.github.com/?https://github.com/openai/baselines/blob/master/benchmarks_mujoco1M.htm)
and
[here for Atari](https://htmlpreview.github.com/?https://github.com/openai/baselines/blob/master/benchmarks_atari10M.htm)
respectively. Note that these results may not have been obtained with the latest version of the code; the particular commit hash with which the results were obtained is specified on the benchmarks page.
To cite this repository in publications:
@misc{baselines,
@@ -144,4 +139,3 @@ To cite this repository in publications:
journal = {GitHub repository},
howpublished = {\url{https://github.com/openai/baselines}},
}
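
The Mujoco note earlier in this README diff suggests swapping RunningMeanStd for TfRunningMeanStd so that normalization statistics live in TensorFlow variables and are saved together with the model. The snippet below is a minimal sketch of that idea only; `TfRunningStats` is an illustrative stand-in, not the actual `baselines.common.running_mean_std.TfRunningMeanStd`, and its variable names and constructor arguments are assumptions.

```python
import numpy as np
import tensorflow as tf

class TfRunningStats:
    """Illustrative stand-in for TfRunningMeanStd: the statistics are stored as
    non-trainable TF variables, so a tf.train.Saver checkpoint includes them
    alongside the policy weights."""
    def __init__(self, shape, scope, epsilon=1e-4):
        with tf.variable_scope(scope):
            self.mean = tf.get_variable('mean', initializer=np.zeros(shape, np.float64), trainable=False)
            self.var = tf.get_variable('var', initializer=np.ones(shape, np.float64), trainable=False)
            self.count = tf.get_variable('count', initializer=np.float64(epsilon), trainable=False)

with tf.Session() as sess:
    stats = TfRunningStats(shape=(3,), scope='ob_rms')
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()  # picks up ob_rms/mean, ob_rms/var, ob_rms/count as well
    print(sess.run(stats.mean))
```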

View File

@@ -2,5 +2,4 @@
- Original paper: https://arxiv.org/abs/1602.01783
- Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
- `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options
- also refer to the repo-wide [README.md](../../README.md#training-models)
- `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.

View File

@@ -1,6 +1,4 @@
# ACER
- Original paper: https://arxiv.org/abs/1611.01224
- `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
- also refer to the repo-wide [README.md](../../README.md#training-models)
- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.

View File

@@ -2,7 +2,4 @@
- Original paper: https://arxiv.org/abs/1708.05144
- Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
- `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
- also refer to the repo-wide [README.md](../../README.md#training-models)
- `python -m baselines.acktr.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.

View File

@@ -156,7 +156,7 @@ class FrameStack(gym.Wrapper):
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=env.observation_space.dtype)
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8)
def reset(self):
ob = self.env.reset()
@@ -176,7 +176,6 @@ class FrameStack(gym.Wrapper):
class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
def observation(self, observation):
# careful! This undoes the memory optimization, use
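
For context on the truncated comment above: FrameStack in this file returns LazyFrames objects to share memory between stacked observations, and converting them to a float array defeats that optimization. A minimal sketch of the ScaledFloatFrame wrapper shown in this hunk, with the conversion made explicit, might look like the following; it mirrors the wrapper in the diff rather than adding new behavior.

```python
import numpy as np
import gym

class ScaledFloatFrame(gym.ObservationWrapper):
    """Convert uint8 pixel observations in [0, 255] to float32 in [0, 1]."""
    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)

    def observation(self, observation):
        # np.array(...) copies (and materializes LazyFrames), which is why the
        # comment above warns that this undoes the memory optimization.
        return np.array(observation).astype(np.float32) / 255.0
```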

View File

@@ -138,7 +138,7 @@ def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs):
'''
def network_fn(X):
out = tf.cast(X, tf.float32) / 255.
out = X
with tf.variable_scope("convnet"):
for num_outputs, kernel_size, stride in convs:
out = layers.convolution2d(out,

View File

@@ -6,7 +6,8 @@ from baselines.run import get_learn_function
common_kwargs = dict(
seed=0,
total_timesteps=50000,
total_timesteps=20000,
nlstm=64
)
learn_kwargs = {
@@ -19,7 +20,7 @@ learn_kwargs = {
alg_list = learn_kwargs.keys()
rnn_list = ['lstm']
rnn_list = ['lstm', 'tflstm', 'tflstm_static']
@pytest.mark.slow
@pytest.mark.parametrize("alg", alg_list)
@@ -41,11 +42,11 @@ def test_fixed_sequence(alg, rnn):
**kwargs
)
simple_test(env_fn, learn, 0.7)
simple_test(env_fn, learn, 0.3)
if __name__ == '__main__':
test_fixed_sequence('ppo2', 'lstm')
test_fixed_sequence('ppo2', 'tflstm')

View File

@@ -2,6 +2,7 @@ import tensorflow as tf
import numpy as np
from gym.spaces import np_random
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.bench.monitor import Monitor
N_TRIALS = 10000
N_EPISODES = 100
@@ -10,7 +11,7 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
np.random.seed(0)
np_random.seed(0)
env = DummyVecEnv([env_fn])
env = DummyVecEnv([lambda: Monitor(env_fn(), None, allow_early_resets=True)])
with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():

View File

@@ -9,29 +9,44 @@ Here's a list of commands to run to quickly get a working example:
```bash
# Train model and save the results to cartpole_model.pkl
python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5
python -m baselines.deepq.experiments.train_cartpole
# Load the model saved in cartpole_model.pkl and visualize the learned policy
python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play
python -m baselines.deepq.experiments.enjoy_cartpole
```
Be sure to check out the source code of [both](experiments/train_cartpole.py) [files](experiments/enjoy_cartpole.py)!
## If you wish to apply DQN to solve a problem.
Check out our simple agent trained with the one-stop-shop `deepq.learn` function.
- [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent.
- [baselines/deepq/experiments/train_pong.py](experiments/train_pong.py) - train a Pong agent using convolutional neural networks.
In particular, notice that once `deepq.learn` finishes training it returns an `act` function which can be used to select actions in the environment. Once trained, you can easily save it and load it at a later time. The complementary file `enjoy_cartpole.py` loads and visualizes the learned policy.
In particular, notice that once `deepq.learn` finishes training it returns an `act` function which can be used to select actions in the environment. Once trained, you can easily save it and load it at a later time. For both of the files listed above there are complementary files, `enjoy_cartpole.py` and `enjoy_pong.py` respectively, that load and visualize the learned policy (see the usage sketch after this diff).
## If you wish to experiment with the algorithm
##### Check out the examples
- [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine-grained control over the internals of the DQN algorithm.
- [baselines/deepq/defaults.py](defaults.py) - settings for training on atari. Run
- [baselines/deepq/experiments/atari/train.py](experiments/atari/train.py) - more robust setup for training at scale.
##### Download a pretrained Atari agent
For some research projects it is sometimes useful to have an already trained agent handy. There's a variety of models to choose from. You can list them all by running:
```bash
python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4
python -m baselines.deepq.experiments.atari.download_model
```
to train on Atari Pong (see more in repo-wide [README.md](../../README.md#training-models))
Once you pick a model, you can download it and visualize the learned policy. Be sure to pass `--dueling` flag to visualization script when using dueling models.
```bash
python -m baselines.deepq.experiments.atari.download_model --blob model-atari-duel-pong-1 --model-dir /tmp/models
python -m baselines.deepq.experiments.atari.enjoy --model-dir /tmp/models/model-atari-duel-pong-1 --env Pong --dueling
```
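
As noted earlier in this README, `deepq.learn` returns an `act` function that maps observations to actions and can be saved and reloaded. Below is a minimal sketch of the load-and-visualize side, modeled on the pre-refactor `enjoy_cartpole.py`; the `deepq.load` helper and the `act(obs[None])[0]` call pattern are tied to that version of the API and may differ in your checkout.

```python
import gym
from baselines import deepq

def main():
    env = gym.make("CartPole-v0")
    # Load a previously saved policy; the returned object is callable:
    # given a batch of observations it returns a batch of actions.
    act = deepq.load("cartpole_model.pkl")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            # add a batch dimension, take the single action back out
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)

if __name__ == '__main__':
    main()
```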

View File

@@ -309,7 +309,7 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
outputs=output_actions,
givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
updates=updates)
def act(ob, reset=False, update_param_noise_threshold=False, update_param_noise_scale=False, stochastic=True, update_eps=-1):
def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
return act

View File

@@ -27,7 +27,7 @@ class ActWrapper(object):
self.initial_state = None
@staticmethod
def load_act(path):
def load_act(self, path):
with open(path, "rb") as f:
model_data, act_params = cloudpickle.load(f)
act = deepq.build_act(**act_params)
@@ -70,7 +70,6 @@ class ActWrapper(object):
def save(self, path):
save_state(path)
self.save_act(path+".pickle")
def load_act(path):
@@ -195,9 +194,8 @@ def learn(env,
# capture the shape outside the closure so that the env object is not serialized
# by cloudpickle when serializing make_obs_ph
observation_space = env.observation_space
def make_obs_ph(name):
return ObservationInput(observation_space, name=name)
return ObservationInput(env.observation_space, name=name)
act, train, update_target, debug = deepq.build_train(
make_obs_ph=make_obs_ph,
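
The comment in this hunk explains that `observation_space` is captured outside the closure so that cloudpickle does not have to serialize the whole `env` object when `make_obs_ph` is pickled. Here is a small, self-contained illustration of that pattern; `FakeEnv` and the tuple-returning closures are hypothetical, not baselines code.

```python
import cloudpickle

class FakeEnv:
    """Stand-in for a gym env that cannot be pickled."""
    observation_space = (84, 84, 4)
    def __getstate__(self):
        raise RuntimeError("env is not picklable")

env = FakeEnv()

# BAD: the closure captures `env`, so pickling it drags the env along.
make_obs_ph_bad = lambda name: (env.observation_space, name)

# GOOD: capture only the (small, picklable) attribute outside the closure.
observation_space = env.observation_space
make_obs_ph_good = lambda name: (observation_space, name)

cloudpickle.dumps(make_obs_ph_good)      # fine
try:
    cloudpickle.dumps(make_obs_ph_bad)   # fails: would have to serialize env
except RuntimeError as err:
    print("as expected:", err)
```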

View File

@@ -11,11 +11,12 @@ def callback(lcl, _glb):
def main():
env = gym.make("CartPole-v0")
model = deepq.models.mlp([64])
act = deepq.learn(
env,
network='mlp',
q_func=model,
lr=1e-3,
total_timesteps=100000,
max_timesteps=100000,
buffer_size=50000,
exploration_fraction=0.1,
exploration_final_eps=0.02,

View File

@@ -2,7 +2,5 @@
- Original paper: https://arxiv.org/abs/1707.06347
- Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
- `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
- `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M frames on a Mujoco Ant environment.
- also refer to the repo-wide [README.md](../../README.md#training-models)
- `python -m baselines.ppo2.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
- `python -m baselines.ppo2.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.

View File

@@ -123,18 +123,14 @@ def build_env(args, render=False):
env = bench.Monitor(env, logger.get_dir())
env = retro_wrappers.wrap_deepmind_retro(env)
elif env_type == 'classic_control':
elif env_type == 'classic':
def make_env():
e = gym.make(env_id)
e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
e.seed(seed)
return e
env = DummyVecEnv([make_env])
else:
raise ValueError('Unknown env_type {}'.format(env_type))
return env
@@ -153,7 +149,7 @@ def get_env_type(env_id):
return env_type, env_id
def get_default_network(env_type):
if env_type == 'mujoco' or env_type == 'classic_control':
if env_type == 'mujoco' or env_type=='classic':
return 'mlp'
if env_type == 'atari':
return 'cnn'

View File

@@ -2,6 +2,5 @@
- Original paper: https://arxiv.org/abs/1502.05477
- Baselines blog post https://blog.openai.com/openai-baselines-ppo/
- `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
- `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment.
- also refer to the repo-wide [README.md](../../README.md#training-models)
- `mpirun -np 16 python -m baselines.trpo_mpi.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
- `python -m baselines.trpo_mpi.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment.

File diff suppressed because it is too large

File diff suppressed because it is too large