change rms 2 tfrms switch in vec_normalize to be more explicit (#886)

* change rms 2 tfrms switch in vec_normalize to be more explicit * modify the vec_normalize / use_tf logic a little bit * typo * use_tf = False by default
2019-04-26 16:14:21 -07:00
parent 5d8041d18e
commit a07fad9066
3 changed files with 10 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ This should get to the mean reward per episode about 20. To load and visualize t
 python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=0 --load_path=~/models/pong_20M_ppo2 --play
 ```

-*NOTE:* At the moment Mujoco training uses VecNormalize wrapper for the environment which is not being saved correctly; so loading the models trained on Mujoco will not work well if the environment is recreated. If necessary, you can work around that by replacing RunningMeanStd by TfRunningMeanStd in [baselines/common/vec_env/vec_normalize.py](baselines/common/vec_env/vec_normalize.py#L12). This way, mean and std of environment normalizing wrapper will be saved in tensorflow variables and included in the model file; however, training is slower that way - hence not including it by default
+*NOTE:* Mujoco environments require normalization to work properly, so we wrap them with VecNormalize wrapper. Currently, to ensure the models are saved with normalization (so that trained models can be restored and run without further training) the normalization coefficients are saved as tensorflow variables. This can decrease the performance somewhat, so if you require high-throughput steps with Mujoco and do not need saving/restoring the models, it may make sense to use numpy normalization instead. To do that, set 'use_tf=False` in [baselines/run.py](baselines/run.py#L116). 

 ## Loading and vizualizing learning curves and other training metrics
 See [here](docs/viz/viz.ipynb) for instructions on how to load and display the training data. 
--- a/baselines/common/vec_env/vec_normalize.py
+++ b/baselines/common/vec_env/vec_normalize.py
@@ -1,18 +1,21 @@
 from . import VecEnvWrapper
-from baselines.common.running_mean_std import RunningMeanStd
+from baselines.common.running_mean_std import TfRunningMeanStd, RunningMeanStd
 import numpy as np

-
 class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

-    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
+    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False):
        VecEnvWrapper.__init__(self, venv)
-        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
-        self.ret_rms = RunningMeanStd(shape=()) if ret else None
+        if use_tf:
+            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
+            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
+        else:
+            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
+            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
--- a/baselines/run.py
+++ b/baselines/run.py
@@ -113,7 +113,7 @@ def build_env(args):
        env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations)

        if env_type == 'mujoco':
-            env = VecNormalize(env)
+            env = VecNormalize(env, use_tf=True)

    return env