Lots of cleanups

Fixes for new gym version
Add @olegklimov and @unixpickle to authors list
John Schulman
2018-01-25 18:33:48 -08:00
parent b5be53dc92
commit 9fa8e1baf1
62 changed files with 989 additions and 1604 deletions

baselines/ddpg/ddpg.py

@@ -9,8 +9,7 @@ from baselines import logger
from baselines.common.mpi_adam import MpiAdam
import baselines.common.tf_util as U
from baselines.common.mpi_running_mean_std import RunningMeanStd
from baselines.ddpg.util import reduce_std, mpi_mean
from mpi4py import MPI
def normalize(x, stats):
if stats is None:
@@ -23,6 +22,13 @@ def denormalize(x, stats):
return x
return x * stats.std + stats.mean
def reduce_std(x, axis=None, keepdims=False):
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
def reduce_var(x, axis=None, keepdims=False):
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
devs_squared = tf.square(x - m)
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
def get_target_updates(vars, target_vars, tau):
logger.info('setting up target updates ...')
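The commit inlines reduce_var/reduce_std into ddpg.py instead of importing them from the (now deleted) util module. As a sanity check, here is a small standalone sketch, assuming a TF 1.x environment where keep_dims is still the accepted argument name, comparing the helpers against NumPy's population standard deviation:

import numpy as np
import tensorflow as tf

def reduce_var(x, axis=None, keepdims=False):
    # mean with the reduced axis kept so it broadcasts against x
    m = tf.reduce_mean(x, axis=axis, keep_dims=True)
    return tf.reduce_mean(tf.square(x - m), axis=axis, keep_dims=keepdims)

def reduce_std(x, axis=None, keepdims=False):
    return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))

data = np.random.randn(64, 4).astype(np.float32)
with tf.Session() as sess:
    tf_std = sess.run(reduce_std(tf.constant(data), axis=0))
np.testing.assert_allclose(tf_std, data.std(axis=0), rtol=1e-4)  # np.std defaults to the population std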
@@ -198,7 +204,7 @@ class DDPG(object):
new_std = self.ret_rms.std
self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
new_mean = self.ret_rms.mean
self.renormalize_Q_outputs_op = []
for vs in [self.critic.output_vars, self.target_critic.output_vars]:
assert len(vs) == 2
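The two output_vars per critic are presumably the final layer's kernel and bias (an assumption, not something shown in this hunk). If the critic predicts returns normalized by ret_rms, the standard rescaling that keeps the denormalized prediction unchanged when the running stats move from (old_mean, old_std) to (new_mean, new_std) can be checked numerically:

import numpy as np

old_mean, old_std = 1.0, 2.0     # illustrative values only
new_mean, new_std = 1.5, 3.0
w, b, x = 0.7, -0.2, 0.4         # hypothetical output weight, bias and feature

q_before = (w * x + b) * old_std + old_mean         # denormalized prediction with the old stats
w2 = w * old_std / new_std                          # rescaled weight
b2 = (b * old_std + old_mean - new_mean) / new_std  # rescaled bias
q_after = (w2 * x + b2) * new_std + new_mean        # identical prediction with the new stats
assert np.isclose(q_before, q_after)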
@@ -213,15 +219,15 @@ class DDPG(object):
def setup_stats(self):
ops = []
names = []
if self.normalize_returns:
ops += [self.ret_rms.mean, self.ret_rms.std]
names += ['ret_rms_mean', 'ret_rms_std']
if self.normalize_observations:
ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
names += ['obs_rms_mean', 'obs_rms_std']
ops += [tf.reduce_mean(self.critic_tf)]
names += ['reference_Q_mean']
ops += [reduce_std(self.critic_tf)]
@@ -231,7 +237,7 @@ class DDPG(object):
names += ['reference_actor_Q_mean']
ops += [reduce_std(self.critic_with_actor_tf)]
names += ['reference_actor_Q_std']
ops += [tf.reduce_mean(self.actor_tf)]
names += ['reference_action_mean']
ops += [reduce_std(self.actor_tf)]
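setup_stats only builds two parallel lists, tensor ops and display names. A hypothetical sketch (the helper name run_stats is mine, not from the commit) of how such a pair turns into the stats dict that training.py later reads via agent.get_stats():

def run_stats(sess, ops, names, feed_dict=None):
    values = sess.run(ops, feed_dict=feed_dict)   # evaluate every stat tensor in one pass
    assert len(values) == len(names)
    return dict(zip(names, values))               # e.g. {'reference_Q_mean': 0.12, ...}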
@@ -347,7 +353,7 @@ class DDPG(object):
def adapt_param_noise(self):
if self.param_noise is None:
return 0.
# Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
batch = self.memory.sample(batch_size=self.batch_size)
self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
@@ -358,7 +364,7 @@ class DDPG(object):
self.param_noise_stddev: self.param_noise.current_stddev,
})
mean_distance = mpi_mean(distance)
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
self.param_noise.adapt(mean_distance)
return mean_distance
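The removed mpi_mean(distance) is replaced by a direct mpi4py collective: sum the per-worker distances, then divide by the world size, so every rank ends up with the same mean and adapts its noise scale identically. A minimal standalone sketch of the pattern, assuming mpi4py and a launch under mpirun:

from mpi4py import MPI

comm = MPI.COMM_WORLD
local_distance = float(comm.Get_rank() + 1)   # stand-in for the per-worker distance
mean_distance = comm.allreduce(local_distance, op=MPI.SUM) / comm.Get_size()
print(comm.Get_rank(), mean_distance)         # every rank prints the same value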

baselines/ddpg/main.py

@@ -25,7 +25,6 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
# Create envs.
env = gym.make(env_id)
env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
gym.logger.setLevel(logging.WARN)
if evaluation and rank==0:
eval_env = gym.make(env_id)
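The dropped gym.logger.setLevel(logging.WARN) call is the part tied to "Fixes for new gym version": newer gym releases replaced the stdlib-logging based logger, so setLevel is no longer available on it. If quieter gym output is still wanted, the newer API is roughly the following (hedged; check the installed gym version):

import gym
gym.logger.set_level(gym.logger.WARN)   # WARN == 30 in the newer gym.logger module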

baselines/ddpg/training.py

@@ -4,7 +4,6 @@ from collections import deque
import pickle
from baselines.ddpg.ddpg import DDPG
from baselines.ddpg.util import mpi_mean, mpi_std, mpi_max, mpi_sum
import baselines.common.tf_util as U
from baselines import logger
@@ -35,7 +34,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
saver = tf.train.Saver()
else:
saver = None
step = 0
episode = 0
eval_episode_rewards_history = deque(maxlen=100)
@@ -138,42 +137,46 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa
eval_episode_rewards_history.append(eval_episode_reward)
eval_episode_reward = 0.
mpi_size = MPI.COMM_WORLD.Get_size()
# Log stats.
epoch_train_duration = time.time() - epoch_start_time
# XXX shouldn't call np.mean on variable length lists
duration = time.time() - start_time
stats = agent.get_stats()
combined_stats = {}
for key in sorted(stats.keys()):
combined_stats[key] = mpi_mean(stats[key])
# Rollout statistics.
combined_stats['rollout/return'] = mpi_mean(epoch_episode_rewards)
combined_stats['rollout/return_history'] = mpi_mean(np.mean(episode_rewards_history))
combined_stats['rollout/episode_steps'] = mpi_mean(epoch_episode_steps)
combined_stats['rollout/episodes'] = mpi_sum(epoch_episodes)
combined_stats['rollout/actions_mean'] = mpi_mean(epoch_actions)
combined_stats['rollout/actions_std'] = mpi_std(epoch_actions)
combined_stats['rollout/Q_mean'] = mpi_mean(epoch_qs)
# Train statistics.
combined_stats['train/loss_actor'] = mpi_mean(epoch_actor_losses)
combined_stats['train/loss_critic'] = mpi_mean(epoch_critic_losses)
combined_stats['train/param_noise_distance'] = mpi_mean(epoch_adaptive_distances)
combined_stats = stats.copy()
combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
combined_stats['total/duration'] = duration
combined_stats['total/steps_per_second'] = float(t) / float(duration)
combined_stats['total/episodes'] = episodes
combined_stats['rollout/episodes'] = epoch_episodes
combined_stats['rollout/actions_std'] = np.std(epoch_actions)
# Evaluation statistics.
if eval_env is not None:
combined_stats['eval/return'] = mpi_mean(eval_episode_rewards)
combined_stats['eval/return_history'] = mpi_mean(np.mean(eval_episode_rewards_history))
combined_stats['eval/Q'] = mpi_mean(eval_qs)
combined_stats['eval/episodes'] = mpi_mean(len(eval_episode_rewards))
combined_stats['eval/return'] = eval_episode_rewards
combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
combined_stats['eval/Q'] = eval_qs
combined_stats['eval/episodes'] = len(eval_episode_rewards)
def as_scalar(x):
if isinstance(x, np.ndarray):
assert x.size == 1
return x[0]
elif np.isscalar(x):
return x
else:
raise ValueError('expected scalar, got %s'%x)
combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)}
# Total statistics.
combined_stats['total/duration'] = mpi_mean(duration)
combined_stats['total/steps_per_second'] = mpi_mean(float(t) / float(duration))
combined_stats['total/episodes'] = mpi_mean(episodes)
combined_stats['total/epochs'] = epoch + 1
combined_stats['total/steps'] = t
for key in sorted(combined_stats.keys()):
logger.record_tabular(key, combined_stats[key])
logger.dump_tabular()
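Instead of one MPI call per statistic, the new code packs every scalar into a single vector, does one allreduce, and divides by the world size; as_scalar guards against accidentally averaging non-scalar entries. A self-contained sketch of that pattern (the helper name mpi_average_dict is mine, not from the commit):

import numpy as np
from mpi4py import MPI

def mpi_average_dict(stats):
    comm = MPI.COMM_WORLD
    keys = sorted(stats.keys())                        # fix an ordering shared by all ranks
    local = np.array([float(stats[k]) for k in keys])  # pack the scalars into one vector
    summed = comm.allreduce(local, op=MPI.SUM)         # one collective for all statistics
    return {k: s / comm.Get_size() for k, s in zip(keys, summed)}

The committed code keeps the dict's own key order rather than sorting; either works as long as every rank builds the dictionary identically.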

baselines/ddpg/util.py (deleted)

@@ -1,44 +0,0 @@
import numpy as np
import tensorflow as tf
from mpi4py import MPI
from baselines.common.mpi_moments import mpi_moments
def reduce_var(x, axis=None, keepdims=False):
m = tf.reduce_mean(x, axis=axis, keep_dims=True)
devs_squared = tf.square(x - m)
return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims)
def reduce_std(x, axis=None, keepdims=False):
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
def mpi_mean(value):
if value == []:
value = [0.]
if not isinstance(value, list):
value = [value]
return mpi_moments(np.array(value))[0][0]
def mpi_std(value):
if value == []:
value = [0.]
if not isinstance(value, list):
value = [value]
return mpi_moments(np.array(value))[1][0]
def mpi_max(value):
global_max = np.zeros(1, dtype='float64')
local_max = np.max(value).astype('float64')
MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX)
return global_max[0]
def mpi_sum(value):
global_sum = np.zeros(1, dtype='float64')
local_sum = np.sum(np.array(value)).astype('float64')
MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM)
return global_sum[0]
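One behavioural difference worth noting: mpi_sum and mpi_max above use the buffer-based Reduce, which delivers the aggregate only on the root rank (the other ranks keep their zero-initialized buffer), whereas the replacements introduced by this commit use allreduce, so every worker sees the same value. A small sketch of the contrast, assuming mpi4py and an mpirun launch:

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
local = np.array([float(comm.Get_rank() + 1)])

total_on_root = np.zeros(1)
comm.Reduce(local, total_on_root, op=MPI.SUM, root=0)    # result lands only on rank 0

total_everywhere = comm.allreduce(local[0], op=MPI.SUM)  # every rank gets the sum

if comm.Get_rank() == 0:
    assert total_on_root[0] == total_everywhere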