Baselines for TensorFlow 2.0. (#978)

* Baselines for TensorFlow 2.0.

Please note that:
1. ACER, ACKTR, and GAIL are still under development by external contributors.
2. HER is still under development by tanzheny@google.com.

* Some cleanup.

* Addressing some comments.
tanzhenyu
2019-08-08 11:03:17 -07:00
committed by pzhokhov
parent c57528573e
commit d1a05a0dd2
138 changed files with 1216 additions and 8215 deletions

View File

@@ -1,4 +1,5 @@
import os
import os.path as osp
import time
from collections import deque
import pickle
@@ -8,9 +9,9 @@ from baselines.ddpg.models import Actor, Critic
from baselines.ddpg.memory import Memory
from baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise
from baselines.common import set_global_seeds
import baselines.common.tf_util as U
from baselines import logger
import tensorflow as tf
import numpy as np
try:
@@ -42,6 +43,7 @@ def learn(network, env,
tau=0.01,
eval_env=None,
param_noise_adaption_interval=50,
load_path=None,
**network_kwargs):
set_global_seeds(seed)
@@ -61,8 +63,8 @@ def learn(network, env,
assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions.
memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
critic = Critic(network=network, **network_kwargs)
actor = Actor(nb_actions, network=network, **network_kwargs)
critic = Critic(nb_actions, ob_shape=env.observation_space.shape, network=network, **network_kwargs)
actor = Actor(nb_actions, ob_shape=env.observation_space.shape, network=network, **network_kwargs)
action_noise = None
param_noise = None
@@ -94,12 +96,18 @@ def learn(network, env,
logger.info('Using agent with the following configuration:')
logger.info(str(agent.__dict__.items()))
if load_path is not None:
load_path = osp.expanduser(load_path)
ckpt = tf.train.Checkpoint(model=agent)
manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
ckpt.restore(manager.latest_checkpoint)
print("Restoring from {}".format(manager.latest_checkpoint))
eval_episode_rewards_history = deque(maxlen=100)
episode_rewards_history = deque(maxlen=100)
sess = U.get_session()
# Prepare everything.
agent.initialize(sess)
sess.graph.finalize()
agent.initialize()
agent.reset()
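
For reference, the restore block above uses the TF2 object-based checkpoint API in place of TF1's tf.train.Saver. A minimal sketch of the same pattern; `net` and `./model_dir` are hypothetical stand-ins, not names from this commit:

import tensorflow as tf

net = tf.keras.layers.Dense(4)
net(tf.zeros((1, 2)))  # build variables before saving
ckpt = tf.train.Checkpoint(model=net)
manager = tf.train.CheckpointManager(ckpt, './model_dir', max_to_keep=3)
manager.save()                           # write a numbered checkpoint
ckpt.restore(manager.latest_checkpoint)  # restore the newest one, as learn() does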
@@ -133,7 +141,8 @@ def learn(network, env,
agent.reset()
for t_rollout in range(nb_rollout_steps):
# Predict next action.
action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
action, q, _, _ = agent.step(tf.constant(obs), apply_noise=True, compute_Q=True)
action, q = action.numpy(), q.numpy()
# Execute next action.
if rank == 0 and render:
@@ -170,7 +179,6 @@ def learn(network, env,
agent.reset()
# Train.
epoch_actor_losses = []
epoch_critic_losses = []
@@ -178,7 +186,9 @@ def learn(network, env,
for t_train in range(nb_train_steps):
# Adapt param noise, if necessary.
if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
distance = agent.adapt_param_noise()
batch = agent.memory.sample(batch_size=batch_size)
obs0 = tf.constant(batch['obs0'])
distance = agent.adapt_param_noise(obs0)
epoch_adaptive_distances.append(distance)
cl, al = agent.train()
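
Note the new call convention in the rollout and noise-adaptation code above: NumPy arrays from the environment cross into @tf.function methods as tf.constant tensors and come back out via .numpy(). A minimal sketch of that boundary; `policy` is an illustrative stand-in for agent.step:

import numpy as np
import tensorflow as tf

@tf.function
def policy(obs):              # stand-in for agent.step
    return tf.tanh(obs)

obs = np.zeros((1, 3), dtype=np.float32)
action = policy(tf.constant(obs)).numpy()   # tensor in, NumPy back out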

View File

@@ -1,16 +1,15 @@
from copy import copy
from functools import reduce
import numpy as np
import tensorflow as tf
import tensorflow.contrib as tc
from baselines import logger
from baselines.common.mpi_adam import MpiAdam
import baselines.common.tf_util as U
from baselines.ddpg.models import Actor, Critic
from baselines.common.mpi_running_mean_std import RunningMeanStd
try:
from mpi4py import MPI
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer
from baselines.common.mpi_util import sync_from_root
except ImportError:
MPI = None
@@ -25,6 +24,7 @@ def denormalize(x, stats):
return x
return x * stats.std + stats.mean
@tf.function
def reduce_std(x, axis=None, keepdims=False):
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
@@ -33,49 +33,21 @@ def reduce_var(x, axis=None, keepdims=False):
devs_squared = tf.square(x - m)
return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)
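
As a sanity check, the reduce_std/reduce_var helpers above agree with TF2's built-in tf.math.reduce_std:

import tensorflow as tf

x = tf.constant([[1., 2.], [3., 4.]])
builtin = tf.math.reduce_std(x)
manual = tf.sqrt(tf.reduce_mean(tf.square(x - tf.reduce_mean(x))))
assert abs(float(builtin) - float(manual)) < 1e-6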
def get_target_updates(vars, target_vars, tau):
logger.info('setting up target updates ...')
soft_updates = []
init_updates = []
assert len(vars) == len(target_vars)
for var, target_var in zip(vars, target_vars):
logger.info(' {} <- {}'.format(target_var.name, var.name))
init_updates.append(tf.assign(target_var, var))
soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
assert len(init_updates) == len(vars)
assert len(soft_updates) == len(vars)
return tf.group(*init_updates), tf.group(*soft_updates)
@tf.function
def update_perturbed_actor(actor, perturbed_actor, param_noise_stddev):
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev):
assert len(actor.vars) == len(perturbed_actor.vars)
assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)
updates = []
for var, perturbed_var in zip(actor.vars, perturbed_actor.vars):
for var, perturbed_var in zip(actor.variables, perturbed_actor.variables):
if var in actor.perturbable_vars:
logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name))
updates.append(tf.assign(perturbed_var, var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev)))
perturbed_var.assign(var + tf.random.normal(shape=tf.shape(var), mean=0., stddev=param_noise_stddev))
else:
logger.info(' {} <- {}'.format(perturbed_var.name, var.name))
updates.append(tf.assign(perturbed_var, var))
assert len(updates) == len(actor.vars)
return tf.group(*updates)
perturbed_var.assign(var)
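
The TF1 version built tf.assign ops and grouped them with tf.group for a later sess.run; in TF2 the same perturbation is a direct Variable.assign executed inside a @tf.function. A minimal sketch with hypothetical variables v and w:

import tensorflow as tf

v = tf.Variable([1.0, 2.0])
w = tf.Variable([0.0, 0.0])

@tf.function
def perturb(stddev):
    # copy v into w with additive Gaussian noise, no session needed
    w.assign(v + tf.random.normal(shape=tf.shape(v), mean=0., stddev=stddev))

perturb(0.1)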
class DDPG(object):
class DDPG(tf.Module):
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
# Inputs.
self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')
# Parameters.
self.gamma = gamma
@@ -88,128 +60,103 @@ class DDPG(object):
self.action_range = action_range
self.return_range = return_range
self.observation_range = observation_range
self.observation_shape = observation_shape
self.critic = critic
self.actor = actor
self.actor_lr = actor_lr
self.critic_lr = critic_lr
self.clip_norm = clip_norm
self.enable_popart = enable_popart
self.reward_scale = reward_scale
self.batch_size = batch_size
self.stats_sample = None
self.critic_l2_reg = critic_l2_reg
self.actor_lr = tf.constant(actor_lr)
self.critic_lr = tf.constant(critic_lr)
# Observation normalization.
if self.normalize_observations:
with tf.variable_scope('obs_rms'):
with tf.name_scope('obs_rms'):
self.obs_rms = RunningMeanStd(shape=observation_shape)
else:
self.obs_rms = None
normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
self.observation_range[0], self.observation_range[1])
normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
self.observation_range[0], self.observation_range[1])
# Return normalization.
if self.normalize_returns:
with tf.variable_scope('ret_rms'):
with tf.name_scope('ret_rms'):
self.ret_rms = RunningMeanStd()
else:
self.ret_rms = None
# Create target networks.
target_actor = copy(actor)
target_actor.name = 'target_actor'
self.target_actor = target_actor
target_critic = copy(critic)
target_critic.name = 'target_critic'
self.target_critic = target_critic
# Create networks and core TF parts that are shared across setup parts.
self.actor_tf = actor(normalized_obs0)
self.normalized_critic_tf = critic(normalized_obs0, self.actions)
self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1
self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs)
self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs)
# Set up parts.
if self.param_noise is not None:
self.setup_param_noise(normalized_obs0)
self.setup_actor_optimizer()
self.setup_critic_optimizer()
if self.normalize_returns and self.enable_popart:
self.setup_popart()
self.setup_stats()
self.setup_target_network_updates()
self.setup_param_noise()
self.initial_state = None # recurrent architectures not supported yet
if MPI is not None:
comm = MPI.COMM_WORLD
self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
else:
self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
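
The single-process branch above pairs tf.keras.optimizers.Adam with gradients computed by tf.GradientTape (the MPI branch mirrors this with baselines' MpiAdamOptimizer). A minimal sketch of that pairing:

import tensorflow as tf

opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
var = tf.Variable(2.0)
with tf.GradientTape() as tape:
    loss = tf.square(var)
grads = tape.gradient(loss, [var])
opt.apply_gradients(zip(grads, [var]))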
def setup_target_network_updates(self):
actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
self.target_init_updates = [actor_init_updates, critic_init_updates]
self.target_soft_updates = [actor_soft_updates, critic_soft_updates]
def setup_param_noise(self, normalized_obs0):
assert self.param_noise is not None
# Configure perturbed actor.
param_noise_actor = copy(self.actor)
param_noise_actor.name = 'param_noise_actor'
self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
logger.info('setting up param noise')
self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)
# Configure separate copy for stddev adaptation.
adaptive_param_noise_actor = copy(self.actor)
adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
def setup_actor_optimizer(self):
logger.info('setting up actor optimizer')
self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
logger.info(' actor shapes: {}'.format(actor_shapes))
logger.info(' actor params: {}'.format(actor_nb_params))
self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
beta1=0.9, beta2=0.999, epsilon=1e-08)
def setup_critic_optimizer(self):
logger.info('setting up critic optimizer')
normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
if self.critic_l2_reg > 0.:
critic_reg_vars = [var for var in self.critic.trainable_vars if var.name.endswith('/w:0') and 'output' not in var.name]
for var in critic_reg_vars:
logger.info(' regularizing: {}'.format(var.name))
logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg))
critic_reg = tc.layers.apply_regularization(
tc.layers.l2_regularizer(self.critic_l2_reg),
weights_list=critic_reg_vars
)
self.critic_loss += critic_reg
critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
logger.info(' critic shapes: {}'.format(critic_shapes))
logger.info(' critic params: {}'.format(critic_nb_params))
self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
beta1=0.9, beta2=0.999, epsilon=1e-08)
if self.critic_l2_reg > 0.:
critic_reg_vars = []
for layer in self.critic.network_builder.layers[1:]:
critic_reg_vars.append(layer.kernel)
for var in critic_reg_vars:
logger.info(' regularizing: {}'.format(var.name))
logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg))
logger.info('setting up critic target updates ...')
for var, target_var in zip(self.critic.variables, self.target_critic.variables):
logger.info(' {} <- {}'.format(target_var.name, var.name))
logger.info('setting up actor target updates ...')
for var, target_var in zip(self.actor.variables, self.target_actor.variables):
logger.info(' {} <- {}'.format(target_var.name, var.name))
if self.param_noise:
logger.info('setting up param noise')
for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
if var in actor.perturbable_vars:
logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name))
else:
logger.info(' {} <- {}'.format(perturbed_var.name, var.name))
for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
if var in actor.perturbable_vars:
logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name))
else:
logger.info(' {} <- {}'.format(perturbed_var.name, var.name))
if self.normalize_returns and self.enable_popart:
self.setup_popart()
self.initial_state = None # recurrent architectures not supported yet
def setup_param_noise(self):
assert self.param_noise is not None
# Configure perturbed actor.
self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)
# Configure separate copy for stddev adaptation.
self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs)
def setup_popart(self):
# See https://arxiv.org/pdf/1602.07714.pdf for details.
self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
new_std = self.ret_rms.std
self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
new_mean = self.ret_rms.mean
self.renormalize_Q_outputs_op = []
for vs in [self.critic.output_vars, self.target_critic.output_vars]:
assert len(vs) == 2
M, b = vs
@@ -217,63 +164,26 @@ class DDPG(object):
assert 'bias' in b.name
assert M.get_shape()[-1] == 1
assert b.get_shape()[-1] == 1
self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]
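
The rescaling above is the Pop-Art output-preserving update (https://arxiv.org/abs/1602.07714): when the return statistics move from (old_mean, old_std) to (new_mean, new_std), setting M <- M * old_std / new_std and b <- (b * old_std + old_mean - new_mean) / new_std leaves the denormalized output std * (M x + b) + mean unchanged. A scalar check with made-up numbers:

old_mean, old_std, new_mean, new_std = 1.0, 2.0, 1.5, 3.0
M, b, x = 0.7, 0.3, 5.0
y_before = old_std * (M * x + b) + old_mean
M2 = M * old_std / new_std
b2 = (b * old_std + old_mean - new_mean) / new_std
y_after = new_std * (M2 * x + b2) + new_mean
assert abs(y_before - y_after) < 1e-9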
def setup_stats(self):
ops = []
names = []
if self.normalize_returns:
ops += [self.ret_rms.mean, self.ret_rms.std]
names += ['ret_rms_mean', 'ret_rms_std']
if self.normalize_observations:
ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
names += ['obs_rms_mean', 'obs_rms_std']
ops += [tf.reduce_mean(self.critic_tf)]
names += ['reference_Q_mean']
ops += [reduce_std(self.critic_tf)]
names += ['reference_Q_std']
ops += [tf.reduce_mean(self.critic_with_actor_tf)]
names += ['reference_actor_Q_mean']
ops += [reduce_std(self.critic_with_actor_tf)]
names += ['reference_actor_Q_std']
ops += [tf.reduce_mean(self.actor_tf)]
names += ['reference_action_mean']
ops += [reduce_std(self.actor_tf)]
names += ['reference_action_std']
if self.param_noise:
ops += [tf.reduce_mean(self.perturbed_actor_tf)]
names += ['reference_perturbed_action_mean']
ops += [reduce_std(self.perturbed_actor_tf)]
names += ['reference_perturbed_action_std']
self.stats_ops = ops
self.stats_names = names
@tf.function
def step(self, obs, apply_noise=True, compute_Q=True):
normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1])
actor_tf = self.actor(normalized_obs)
if self.param_noise is not None and apply_noise:
actor_tf = self.perturbed_actor_tf
action = self.perturbed_actor(normalized_obs)
else:
actor_tf = self.actor_tf
feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
action = actor_tf
if compute_Q:
action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf)
q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
else:
action = self.sess.run(actor_tf, feed_dict=feed_dict)
q = None
if self.action_noise is not None and apply_noise:
noise = self.action_noise()
assert noise.shape == action[0].shape
action += noise
action = np.clip(action, self.action_range[0], self.action_range[1])
action = tf.clip_by_value(action, self.action_range[0], self.action_range[1])
return action, q, None, None
@@ -287,79 +197,130 @@ class DDPG(object):
self.obs_rms.update(np.array([obs0[b]]))
def train(self):
# Get a batch.
batch = self.memory.sample(batch_size=self.batch_size)
obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1'])
actions, rewards, terminals1 = tf.constant(batch['actions']), tf.constant(batch['rewards']), tf.constant(batch['terminals1'], dtype=tf.float32)
normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(obs0, obs1, rewards, terminals1)
if self.normalize_returns and self.enable_popart:
old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
self.obs1: batch['obs1'],
self.rewards: batch['rewards'],
self.terminals1: batch['terminals1'].astype('float32'),
})
old_mean = self.ret_rms.mean
old_std = self.ret_rms.std
self.ret_rms.update(target_Q.flatten())
self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
self.old_std : np.array([old_std]),
self.old_mean : np.array([old_mean]),
})
# renormalize Q outputs
new_mean = self.ret_rms.mean
new_std = self.ret_rms.std
for vs in [self.critic.output_vars, self.target_critic.output_vars]:
kernel, bias = vs
kernel.assign(kernel * old_std / new_std)
bias.assign((bias * old_std + old_mean - new_mean) / new_std)
# Run sanity check. Disabled by default since it slows things down considerably.
# print('running sanity check')
# target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
# self.obs1: batch['obs1'],
# self.rewards: batch['rewards'],
# self.terminals1: batch['terminals1'].astype('float32'),
# })
# print(target_Q_new, target_Q, new_mean, new_std)
# assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
actor_grads, actor_loss = self.get_actor_grads(normalized_obs0)
critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q)
if MPI is not None:
self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr)
self.critic_optimizer.apply_gradients(critic_grads, self.critic_lr)
else:
target_Q = self.sess.run(self.target_Q, feed_dict={
self.obs1: batch['obs1'],
self.rewards: batch['rewards'],
self.terminals1: batch['terminals1'].astype('float32'),
})
# Get all gradients and perform a synced update.
ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
self.obs0: batch['obs0'],
self.actions: batch['actions'],
self.critic_target: target_Q,
})
self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
return critic_loss, actor_loss
def initialize(self, sess):
self.sess = sess
self.sess.run(tf.global_variables_initializer())
self.actor_optimizer.sync()
self.critic_optimizer.sync()
self.sess.run(self.target_init_updates)
@tf.function
def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1):
normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1])
Q_obs1 = denormalize(self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms)
target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1
return normalized_obs0, target_Q
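
target_Q above is the standard DDPG Bellman backup, target_Q = r + (1 - done) * gamma * Q'(s', mu'(s')); with made-up scalars:

gamma, r, done, q_next = 0.99, 1.0, 0.0, 5.0
target_q = r + (1.0 - done) * gamma * q_next   # 5.95; a terminal step (done=1) keeps only r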
@tf.function
def get_actor_grads(self, normalized_obs0):
with tf.GradientTape() as tape:
actor_tf = self.actor(normalized_obs0)
normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
actor_loss = -tf.reduce_mean(critic_with_actor_tf)
actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
if self.clip_norm:
actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads]
if MPI is not None:
actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0)
return actor_grads, actor_loss
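
The MPI branch flattens per-variable gradients into a single vector, the eager analogue of the removed tf_util.flatgrad; a minimal sketch:

import tensorflow as tf

grads = [tf.ones((2, 3)), tf.ones((4,))]
flat = tf.concat([tf.reshape(g, (-1,)) for g in grads], axis=0)
assert flat.shape == (10,)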
@tf.function
def get_critic_grads(self, normalized_obs0, actions, target_Q):
with tf.GradientTape() as tape:
normalized_critic_tf = self.critic(normalized_obs0, actions)
normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1])
critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf))
if self.critic_l2_reg > 0.:
# The first layer is the input layer, so it is skipped here.
for layer in self.critic.network_builder.layers[1:]:
# The original l2_regularizer takes half of the sum of squares.
critic_loss += (self.critic_l2_reg / 2.) * tf.reduce_sum(tf.square(layer.kernel))
critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables)
if self.clip_norm:
critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads]
if MPI is not None:
critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0)
return critic_grads, critic_loss
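
With tf.contrib removed in TF2, the L2 penalty is applied by hand; the 1/2 factor matches tf.contrib.layers.l2_regularizer's convention of scale * sum(x**2) / 2:

import tensorflow as tf

kernel = tf.ones((3, 3))
l2_scale = 0.01
penalty = (l2_scale / 2.) * tf.reduce_sum(tf.square(kernel))   # 0.01 * 9 / 2 = 0.045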
def initialize(self):
if MPI is not None:
sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables)
self.target_actor.set_weights(self.actor.get_weights())
self.target_critic.set_weights(self.critic.get_weights())
@tf.function
def update_target_net(self):
self.sess.run(self.target_soft_updates)
for var, target_var in zip(self.actor.variables, self.target_actor.variables):
target_var.assign((1. - self.tau) * target_var + self.tau * var)
for var, target_var in zip(self.critic.variables, self.target_critic.variables):
target_var.assign((1. - self.tau) * target_var + self.tau * var)
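
update_target_net is the usual Polyak (soft) update, target <- (1 - tau) * target + tau * online, applied per variable pair:

import tensorflow as tf

tau = 0.001
online = tf.Variable(1.0)
target = tf.Variable(0.0)
target.assign((1. - tau) * target + tau * online)   # target becomes 0.001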
def get_stats(self):
if self.stats_sample is None:
# Get a sample and keep that fixed for all further computations.
# This allows us to estimate the change in value for the same set of inputs.
self.stats_sample = self.memory.sample(batch_size=self.batch_size)
values = self.sess.run(self.stats_ops, feed_dict={
self.obs0: self.stats_sample['obs0'],
self.actions: self.stats_sample['actions'],
})
obs0 = self.stats_sample['obs0']
actions = self.stats_sample['actions']
normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
normalized_critic_tf = self.critic(normalized_obs0, actions)
critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
actor_tf = self.actor(normalized_obs0)
normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf)
critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
names = self.stats_names[:]
assert len(names) == len(values)
stats = dict(zip(names, values))
if self.param_noise is not None:
stats = {**stats, **self.param_noise.get_stats()}
stats = {}
if self.normalize_returns:
stats['ret_rms_mean'] = self.ret_rms.mean
stats['ret_rms_std'] = self.ret_rms.std
if self.normalize_observations:
stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean)
stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std)
stats['reference_Q_mean'] = tf.reduce_mean(critic_tf)
stats['reference_Q_std'] = reduce_std(critic_tf)
stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf)
stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf)
stats['reference_action_mean'] = tf.reduce_mean(actor_tf)
stats['reference_action_std'] = reduce_std(actor_tf)
if self.param_noise:
perturbed_actor_tf = self.perturbed_actor(normalized_obs0)
stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf)
stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf)
stats.update(self.param_noise.get_stats())
return stats
def adapt_param_noise(self):
def adapt_param_noise(self, obs0):
try:
from mpi4py import MPI
except ImportError:
@@ -368,34 +329,28 @@ class DDPG(object):
if self.param_noise is None:
return 0.
# Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
batch = self.memory.sample(batch_size=self.batch_size)
self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
self.param_noise_stddev: self.param_noise.current_stddev,
})
distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
self.obs0: batch['obs0'],
self.param_noise_stddev: self.param_noise.current_stddev,
})
mean_distance = self.get_mean_distance(obs0).numpy()
if MPI is not None:
mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
else:
mean_distance = distance
mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
self.param_noise.adapt(mean_distance)
return mean_distance
@tf.function
def get_mean_distance(self, obs0):
# Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
update_perturbed_actor(self.actor, self.perturbed_adaptive_actor, self.param_noise.current_stddev)
normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1])
actor_tf = self.actor(normalized_obs0)
adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0)
mean_distance = tf.sqrt(tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf)))
return mean_distance
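
For context, the adapt call feeds this distance into baselines' AdaptiveParamNoiseSpec, which roughly shrinks the perturbation stddev when the induced action distance overshoots the desired one and grows it otherwise. A hedged sketch of that rule; names and defaults are illustrative, not the exact implementation:

def adapt(current_stddev, distance, desired_action_stddev=0.2, adoption_coefficient=1.01):
    # shrink noise if actions moved too far, grow it if too little
    if distance > desired_action_stddev:
        return current_stddev / adoption_coefficient
    return current_stddev * adoption_coefficient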
def reset(self):
# Reset internal state after an episode is complete.
if self.action_noise is not None:
self.action_noise.reset()
if self.param_noise is not None:
self.sess.run(self.perturb_policy_ops, feed_dict={
self.param_noise_stddev: self.param_noise.current_stddev,
})
update_perturbed_actor(self.actor, self.perturbed_actor, self.param_noise.current_stddev)

View File

@@ -2,50 +2,48 @@ import tensorflow as tf
from baselines.common.models import get_network_builder
class Model(object):
class Model(tf.keras.Model):
def __init__(self, name, network='mlp', **network_kwargs):
self.name = name
self.network_builder = get_network_builder(network)(**network_kwargs)
@property
def vars(self):
return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
@property
def trainable_vars(self):
return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)
super(Model, self).__init__(name=name)
self.network = network
self.network_kwargs = network_kwargs
@property
def perturbable_vars(self):
return [var for var in self.trainable_vars if 'LayerNorm' not in var.name]
return [var for var in self.trainable_variables if 'layer_normalization' not in var.name]
class Actor(Model):
def __init__(self, nb_actions, name='actor', network='mlp', **network_kwargs):
def __init__(self, nb_actions, ob_shape, name='actor', network='mlp', **network_kwargs):
super().__init__(name=name, network=network, **network_kwargs)
self.nb_actions = nb_actions
self.network_builder = get_network_builder(network)(**network_kwargs)(ob_shape)
self.output_layer = tf.keras.layers.Dense(units=self.nb_actions,
activation=tf.keras.activations.tanh,
kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
_ = self.output_layer(self.network_builder.outputs[0])
def __call__(self, obs, reuse=False):
with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
x = self.network_builder(obs)
x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
x = tf.nn.tanh(x)
return x
@tf.function
def call(self, obs):
return self.output_layer(self.network_builder(obs))
class Critic(Model):
def __init__(self, name='critic', network='mlp', **network_kwargs):
def __init__(self, nb_actions, ob_shape, name='critic', network='mlp', **network_kwargs):
super().__init__(name=name, network=network, **network_kwargs)
self.layer_norm = True
self.network_builder = get_network_builder(network)(**network_kwargs)((ob_shape[0] + nb_actions,))
self.output_layer = tf.keras.layers.Dense(units=1,
kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3),
name='output')
_ = self.output_layer(self.network_builder.outputs[0])
def __call__(self, obs, action, reuse=False):
with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
x = tf.concat([obs, action], axis=-1) # this assumes observation and action can be concatenated
x = self.network_builder(x)
x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3), name='output')
return x
@tf.function
def call(self, obs, actions):
x = tf.concat([obs, actions], axis=-1) # this assumes observation and action can be concatenated
x = self.network_builder(x)
return self.output_layer(x)
@property
def output_vars(self):
output_vars = [var for var in self.trainable_vars if 'output' in var.name]
return output_vars
return self.output_layer.trainable_variables
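
Model now subclasses tf.keras.Model, so variables are created and tracked on the first call instead of being fetched from graph collections by scope name. A minimal sketch of the same pattern, independent of this repo:

import tensorflow as tf

class TinyActor(tf.keras.Model):
    def __init__(self, nb_actions):
        super().__init__()
        self.hidden = tf.keras.layers.Dense(64, activation='relu')
        self.out = tf.keras.layers.Dense(
            nb_actions, activation='tanh',
            kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))

    @tf.function
    def call(self, obs):
        return self.out(self.hidden(obs))

actor = TinyActor(2)
action = actor(tf.zeros((1, 8)))   # first call builds and tracks variables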

View File

@@ -1,16 +0,0 @@
from baselines.common.tests.util import smoketest
def _run(argstr):
smoketest('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr)
def test_popart():
_run('--normalize_returns=True --popart=True')
def test_noise_normal():
_run('--noise_type=normal_0.1')
def test_noise_ou():
_run('--noise_type=ou_0.1')
def test_noise_adaptive():
_run('--noise_type=adaptive-param_0.2,normal_0.1')