From e2da7cd42f04d8952b4faa2906a29a9e29a09a0b Mon Sep 17 00:00:00 2001
From: Pim de Haan
Date: Thu, 16 Aug 2018 12:08:53 -0700
Subject: [PATCH 1/3] Several bugfixes for #504, #505, #506 related to Classic Control and deepq (#507)

* Several bugfixes

* Fixed ActWrapper.step bug
---
 baselines/deepq/build_graph.py | 2 +-
 baselines/deepq/deepq.py | 6 ++++--
 baselines/run.py | 10 +++++++---
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/baselines/deepq/build_graph.py b/baselines/deepq/build_graph.py
index e9ff1a4..dd96f0e 100644
--- a/baselines/deepq/build_graph.py
+++ b/baselines/deepq/build_graph.py
@@ -309,7 +309,7 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq",
                          outputs=output_actions,
                          givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
                          updates=updates)
-    def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
+    def act(ob, reset=False, update_param_noise_threshold=False, update_param_noise_scale=False, stochastic=True, update_eps=-1):
         return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
     return act

diff --git a/baselines/deepq/deepq.py b/baselines/deepq/deepq.py
index 7d44acf..01921bb 100644
--- a/baselines/deepq/deepq.py
+++ b/baselines/deepq/deepq.py
@@ -27,7 +27,7 @@ class ActWrapper(object):
         self.initial_state = None

     @staticmethod
-    def load_act(self, path):
+    def load_act(path):
         with open(path, "rb") as f:
             model_data, act_params = cloudpickle.load(f)
         act = deepq.build_act(**act_params)
@@ -70,6 +70,7 @@ class ActWrapper(object):

     def save(self, path):
         save_state(path)
+        self.save_act(path+".pickle")


 def load_act(path):
@@ -194,8 +195,9 @@ def learn(env,

     # capture the shape outside the closure so that the env object is not serialized
     # by cloudpickle when serializing make_obs_ph
+    observation_space = env.observation_space
     def make_obs_ph(name):
-        return ObservationInput(env.observation_space, name=name)
+        return ObservationInput(observation_space, name=name)

     act, train, update_target, debug = deepq.build_train(
         make_obs_ph=make_obs_ph,
diff --git a/baselines/run.py b/baselines/run.py
index cba8515..1491a5e 100644
--- a/baselines/run.py
+++ b/baselines/run.py
@@ -123,14 +123,18 @@ def build_env(args, render=False):
         env = bench.Monitor(env, logger.get_dir())
         env = retro_wrappers.wrap_deepmind_retro(env)

-    elif env_type == 'classic':
+    elif env_type == 'classic_control':
         def make_env():
             e = gym.make(env_id)
+            e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
             e.seed(seed)
             return e

         env = DummyVecEnv([make_env])
-
+
+    else:
+        raise ValueError('Unknown env_type {}'.format(env_type))
+
     return env


@@ -149,7 +153,7 @@ def get_env_type(env_id):
     return env_type, env_id

 def get_default_network(env_type):
-    if env_type == 'mujoco' or env_type=='classic':
+    if env_type == 'mujoco' or env_type == 'classic_control':
         return 'mlp'
     if env_type == 'atari':
         return 'cnn'

From cd375ab20985eddab5736a4ff871ffeca8888b88 Mon Sep 17 00:00:00 2001
From: pzhokhov
Date: Thu, 16 Aug 2018 14:53:49 -0700
Subject: [PATCH 2/3] update readmes (#514)

* update per-algorithm READMEs to reflect new way of running algorithms

* adding a link to repo-wide README

* updated README files and deepq.train_cartpole example
---
 baselines/a2c/README.md | 3 ++-
 baselines/acer/README.md | 4 ++-
 baselines/acktr/README.md | 5 +++-
 baselines/deepq/README.md | 27 +++++--------------
 baselines/deepq/experiments/train_cartpole.py | 5 ++--
 baselines/ppo2/README.md | 6 +++--
 baselines/trpo_mpi/README.md | 5 ++--
 7 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/baselines/a2c/README.md b/baselines/a2c/README.md
index 2df6eb2..915852b 100644
--- a/baselines/a2c/README.md
+++ b/baselines/a2c/README.md
@@ -2,4 +2,5 @@

 - Original paper: https://arxiv.org/abs/1602.01783
 - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
-- `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
\ No newline at end of file
+- `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
+- also refer to the repo-wide [README.md](../../README.md#training-models)
diff --git a/baselines/acer/README.md b/baselines/acer/README.md
index 7a53d75..d1ef98c 100644
--- a/baselines/acer/README.md
+++ b/baselines/acer/README.md
@@ -1,4 +1,6 @@
 # ACER

 - Original paper: https://arxiv.org/abs/1611.01224
-- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
\ No newline at end of file
+- `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
+- also refer to the repo-wide [README.md](../../README.md#training-models)
+
diff --git a/baselines/acktr/README.md b/baselines/acktr/README.md
index e8a806d..93692e8 100644
--- a/baselines/acktr/README.md
+++ b/baselines/acktr/README.md
@@ -2,4 +2,7 @@

 - Original paper: https://arxiv.org/abs/1708.05144
 - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
-- `python -m baselines.acktr.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
\ No newline at end of file
+- `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
+- also refer to the repo-wide [README.md](../../README.md#training-models)
+
+
diff --git a/baselines/deepq/README.md b/baselines/deepq/README.md
index 8fa19ab..7b18c90 100644
--- a/baselines/deepq/README.md
+++ b/baselines/deepq/README.md
@@ -9,44 +9,29 @@ Here's a list of commands to run to quickly get a working example:
 ```bash
 # Train model and save the results to cartpole_model.pkl
-python -m baselines.deepq.experiments.train_cartpole
+python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5
 # Load the model saved in cartpole_model.pkl and visualize the learned policy
-python -m baselines.deepq.experiments.enjoy_cartpole
+python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play
 ```
-
-Be sure to check out the source code of [both](experiments/train_cartpole.py) [files](experiments/enjoy_cartpole.py)!
-
 ## If you wish to apply DQN to solve a problem.

 Check out our simple agent trained with one stop shop `deepq.learn` function.

 - [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent.
-- [baselines/deepq/experiments/train_pong.py](experiments/train_pong.py) - train a Pong agent using convolutional neural networks.
-In particular notice that once `deepq.learn` finishes training it returns `act` function which can be used to select actions in the environment. Once trained you can easily save it and load at later time. For both of the files listed above there are complimentary files `enjoy_cartpole.py` and `enjoy_pong.py` respectively, that load and visualize the learned policy.
+In particular, notice that once `deepq.learn` finishes training it returns an `act` function which can be used to select actions in the environment. Once trained, you can easily save it and load it at a later time. The complementary file `enjoy_cartpole.py` loads and visualizes the learned policy.

 ## If you wish to experiment with the algorithm

 ##### Check out the examples

 - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine grained control over the internals of DQN algorithm.
-- [baselines/deepq/experiments/run_atari.py](experiments/run_atari.py) - more robust setup for training at scale.
-
-
-##### Download a pretrained Atari agent
-
-For some research projects it is sometimes useful to have an already trained agent handy. There's a variety of models to choose from. You can list them all by running:
+- [baselines/deepq/defaults.py](defaults.py) - settings for training on Atari. Run

 ```bash
-python -m baselines.deepq.experiments.atari.download_model
+python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4
 ```
+to train on Atari Pong (see more in repo-wide [README.md](../../README.md#training-models))

-Once you pick a model, you can download it and visualize the learned policy. Be sure to pass `--dueling` flag to visualization script when using dueling models.

-```bash
-python -m baselines.deepq.experiments.atari.download_model --blob model-atari-duel-pong-1 --model-dir /tmp/models
-python -m baselines.deepq.experiments.atari.enjoy --model-dir /tmp/models/model-atari-duel-pong-1 --env Pong --dueling
-
-```
diff --git a/baselines/deepq/experiments/train_cartpole.py b/baselines/deepq/experiments/train_cartpole.py
index a50c242..cfbbdc9 100644
--- a/baselines/deepq/experiments/train_cartpole.py
+++ b/baselines/deepq/experiments/train_cartpole.py
@@ -11,12 +11,11 @@ def callback(lcl, _glb):

 def main():
     env = gym.make("CartPole-v0")
-    model = deepq.models.mlp([64])
     act = deepq.learn(
         env,
-        q_func=model,
+        network='mlp',
         lr=1e-3,
-        max_timesteps=100000,
+        total_timesteps=100000,
         buffer_size=50000,
         exploration_fraction=0.1,
         exploration_final_eps=0.02,
diff --git a/baselines/ppo2/README.md b/baselines/ppo2/README.md
index 4c262ad..4d431bc 100644
--- a/baselines/ppo2/README.md
+++ b/baselines/ppo2/README.md
@@ -2,5 +2,7 @@

 - Original paper: https://arxiv.org/abs/1707.06347
 - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
-- `python -m baselines.ppo2.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
-- `python -m baselines.ppo2.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
+
+- `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
+- `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M frames on a Mujoco Ant environment.
+- also refer to the repo-wide [README.md](../../README.md#training-models)
diff --git a/baselines/trpo_mpi/README.md b/baselines/trpo_mpi/README.md
index b3d9b9d..4cdbb5a 100644
--- a/baselines/trpo_mpi/README.md
+++ b/baselines/trpo_mpi/README.md
@@ -2,5 +2,6 @@

 - Original paper: https://arxiv.org/abs/1502.05477
 - Baselines blog post https://blog.openai.com/openai-baselines-ppo/
-- `mpirun -np 16 python -m baselines.trpo_mpi.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
-- `python -m baselines.trpo_mpi.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment.
+- `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
+- `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment.
+- also refer to the repo-wide [README.md](../../README.md#training-models)

From 5edcd6886e9538401004cb2bb43cc7b876e10105 Mon Sep 17 00:00:00 2001
From: Youngjin Kim
Date: Fri, 17 Aug 2018 06:55:57 +0900
Subject: [PATCH 3/3] Fix argument error in deepq (#508)

* Fix argument error in deepq

* Fix argument error in deepq
---
 baselines/deepq/experiments/run_atari.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/baselines/deepq/experiments/run_atari.py b/baselines/deepq/experiments/run_atari.py
index b6b427b..aa60001 100644
--- a/baselines/deepq/experiments/run_atari.py
+++ b/baselines/deepq/experiments/run_atari.py
@@ -23,17 +23,15 @@ def main():
     env = make_atari(args.env)
     env = bench.Monitor(env, logger.get_dir())
     env = deepq.wrap_atari_dqn(env)
-    model = deepq.models.cnn_to_mlp(
-        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
-        hiddens=[256],
-        dueling=bool(args.dueling),
-    )

     deepq.learn(
         env,
-        q_func=model,
+        "conv_only",
+        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
+        hiddens=[256],
+        dueling=bool(args.dueling),
         lr=1e-4,
-        max_timesteps=args.num_timesteps,
+        total_timesteps=args.num_timesteps,
         buffer_size=10000,
         exploration_fraction=0.1,
         exploration_final_eps=0.01,
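
Taken together, patches 2 and 3 move callers from the old `q_func=model` / `max_timesteps` interface onto a network-name string plus `total_timesteps`. For reference, here is a minimal sketch of a CartPole training script written against the patched API, following the `train_cartpole.py` hunk above; the save path is illustrative, and the callback from the original script is omitted for brevity:

```python
# Minimal sketch (not part of the patches): trains CartPole through the
# new-style deepq.learn signature -- a network name string plus
# total_timesteps instead of q_func/max_timesteps.
import gym

from baselines import deepq


def main():
    env = gym.make("CartPole-v0")
    act = deepq.learn(
        env,
        network='mlp',            # replaces q_func=deepq.models.mlp([64])
        lr=1e-3,
        total_timesteps=100000,   # renamed from max_timesteps
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
    )
    # ActWrapper.save now also writes the act parameters alongside the
    # TensorFlow state (see the save_act(path + ".pickle") change in patch 1).
    act.save("cartpole_model.pkl")
    env.close()


if __name__ == '__main__':
    main()
```

The command-line equivalent, per the updated deepq README, is `python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5`.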
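
One detail from the first patch deserves a note: `make_obs_ph` is serialized by cloudpickle as part of the act parameters, so the closure must capture `env.observation_space` rather than `env` itself, otherwise the whole environment object gets pulled into the pickle. A small sketch of the pattern follows; the `ObservationInput` import path is assumed from the surrounding `deepq.py`, and the factory function is purely illustrative:

```python
# Sketch of the closure-capture fix from patch 1: pull observation_space out
# of env before defining make_obs_ph, so cloudpickle never tries to pickle
# the environment object itself.
from baselines.deepq.utils import ObservationInput  # assumed import path


def make_obs_ph_factory(env):
    # Capture only the (picklable) observation space, not the env.
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    return make_obs_ph
```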