diff --git a/baselines/common/models.py b/baselines/common/models.py index 0798916..a6fe467 100644 --- a/baselines/common/models.py +++ b/baselines/common/models.py @@ -30,8 +30,17 @@ def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): Model used in the paper "IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures" https://arxiv.org/abs/1802.01561 """ + + layer_num = 0 + + def get_layer_num_str(): + nonlocal layer_num + num_str = str(layer_num) + layer_num += 1 + return num_str + def conv_layer(out, depth): - return tf.layers.conv2d(out, depth, 3, padding='same') + return tf.layers.conv2d(out, depth, 3, padding='same', name='layer_' + get_layer_num_str()) def residual_block(inputs): depth = inputs.get_shape()[-1].value @@ -57,7 +66,7 @@ def build_impala_cnn(unscaled_images, depths=[16,32,32], **conv_kwargs): out = tf.layers.flatten(out) out = tf.nn.relu(out) - out = tf.layers.dense(out, 256, activation=tf.nn.relu) + out = tf.layers.dense(out, 256, activation=tf.nn.relu, name='layer_' + get_layer_num_str()) return out diff --git a/baselines/ppo2/microbatched_model.py b/baselines/ppo2/microbatched_model.py index 8d8b688..a35b830 100644 --- a/baselines/ppo2/microbatched_model.py +++ b/baselines/ppo2/microbatched_model.py @@ -8,7 +8,7 @@ class MicrobatchedModel(Model): on the entire minibatch causes some overflow """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, microbatch_size): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, comm, microbatch_size): self.nmicrobatches = nbatch_train // microbatch_size self.microbatch_size = microbatch_size @@ -24,7 +24,8 @@ class MicrobatchedModel(Model): ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, - mpi_rank_weight=mpi_rank_weight) + mpi_rank_weight=mpi_rank_weight, + comm=comm) self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads] grads_ph_and_vars = list(zip(self.grads_ph, self.var)) diff --git a/baselines/ppo2/model.py b/baselines/ppo2/model.py index 9370d5c..35a883d 100644 --- a/baselines/ppo2/model.py +++ b/baselines/ppo2/model.py @@ -25,9 +25,12 @@ class Model(object): - Save load the model """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, microbatch_size=None): + nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None): self.sess = sess = get_session() + if MPI is not None and comm is None: + comm = MPI.COMM_WORLD + with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling @@ -91,8 +94,8 @@ class Model(object): # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer - if MPI is not None: - self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) + if comm is not None and comm.Get_size() > 1: + self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 09bc933..f3a69d8 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -21,7 +21,7 @@ def constfn(val): def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, **network_kwargs): + save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) @@ -105,7 +105,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm, mpi_rank_weight=mpi_rank_weight) + max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path)