From a07fad9066c353a98c7f177c261cf56a7d4dd9c6 Mon Sep 17 00:00:00 2001
From: pzhokhov <peterz@openai.com>
Date: Fri, 26 Apr 2019 16:14:21 -0700
Subject: [PATCH] change rms 2 tfrms switch in vec_normalize to be more
 explicit (#886)

* change rms 2 tfrms switch in vec_normalize to be more explicit

* modify the vec_normalize / use_tf logic a little bit

* typo

* use_tf = False by default
---
 README.md                                 |  2 +-
 baselines/common/vec_env/vec_normalize.py | 13 ++++++++-----
 baselines/run.py                          |  2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 9b6500b..dc3c8b6 100644
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ This should get to the mean reward per episode about 20. To load and visualize t
 python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
 ```
 
-*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
+*NOTE:* Mujoco environments require normalization to work properly, so we wrap them with the VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training), the normalization coefficients are saved as TensorFlow variables. This can decrease performance somewhat, so if you require high-throughput steps with Mujoco and do not need to save/restore the models, it may make sense to use numpy normalization instead. To do that, set `use_tf=False` in [baselines/run.py](baselines/run.py#L116). 
 
 ## Loading and vizualizing learning curves and other training metrics
 See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data. 
diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py
index f3255e9..40bd04b 100644
--- a/baselines/common/vec_env/vec_normalize.py
+++ b/baselines/common/vec_env/vec_normalize.py
@@ -1,18 +1,21 @@
 from . import VecEnvWrapper
-from baselines.common.running_mean_std import RunningMeanStd
+from baselines.common.running_mean_std import TfRunningMeanStd, RunningMeanStd
 import numpy as np
 
-
 class VecNormalize(VecEnvWrapper):
     """
     A vectorized wrapper that normalizes the observations
     and returns from an environment.
     """
 
-    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
+    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False):
         VecEnvWrapper.__init__(self, venv)
-        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
-        self.ret_rms = RunningMeanStd(shape=()) if ret else None
+        if use_tf:
+            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
+            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
+        else:
+            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
+            self.ret_rms = RunningMeanStd(shape=()) if ret else None
         self.clipob = clipob
         self.cliprew = cliprew
         self.ret = np.zeros(self.num_envs)
diff --git a/baselines/run.py b/baselines/run.py
index 4f9ed15..a295873 100644
--- a/baselines/run.py
+++ b/baselines/run.py
@@ -113,7 +113,7 @@ def build_env(args):
         env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations)
 
         if env_type == 'mujoco':
-            env = VecNormalize(env)
+            env = VecNormalize(env, use_tf=True)
 
     return env