# Shared numpy/TensorFlow helper utilities.
import os
|
|
import numpy as np
|
|
import tensorflow as tf
|
|
import baselines.common.tf_util as U
|
|
from collections import deque
|
|
|
|
def sample(logits):
    """Sample one index per row from a categorical distribution given by
    (unnormalized) `logits`, using the Gumbel-max trick."""
    uniform = tf.random_uniform(tf.shape(logits))
    gumbel = tf.log(-tf.log(uniform))
    return tf.argmax(logits - gumbel, 1)
|
|
|
|
def std(x):
    """Population standard deviation over all elements of `x`."""
    mu = tf.reduce_mean(x)
    variance = tf.reduce_mean(tf.square(x - mu))
    return tf.sqrt(variance)
|
|
|
|
def cat_entropy(logits):
    """Entropy of the categorical distribution defined by (unnormalized) `logits`.

    Uses the max-subtraction (log-sum-exp) trick so `exp` cannot overflow.
    Returns one entropy value per row of `logits`.
    """
    # Shift each row so its max is 0; softmax is invariant to this shift.
    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)  # per-row partition function
    p0 = ea0 / z0  # softmax probabilities
    # H = -sum p*log p, with log p = a0 - log z0.
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
|
|
|
|
def cat_entropy_softmax(p0):
    """Entropy of already-normalized probabilities `p0`, one value per row.

    1e-6 keeps the log finite when a probability is exactly zero.
    """
    log_p = tf.log(p0 + 1e-6)
    return -tf.reduce_sum(p0 * log_p, axis=1)
|
|
|
|
def mse(pred, target):
    """Elementwise half squared error (no reduction): (pred - target)^2 / 2."""
    diff = pred - target
    return tf.square(diff) / 2.
|
|
|
|
def ortho_init(scale=1.0):
    """Build a TF-compatible initializer that returns (scaled) orthogonal weights.

    2-D shapes are orthogonalized directly; 4-D conv-kernel shapes (assumed
    NHWC, i.e. [rf, rf, nin, nf]) are flattened to (rf*rf*nin, nf) first.
    Any other rank raises NotImplementedError.
    """
    def _ortho_init(shape, dtype, partition_info=None):
        # lasagne-style orthogonal init, ported to tf's initializer signature
        shape = tuple(shape)
        if len(shape) == 2:
            flat_shape = shape
        elif len(shape) == 4:  # assumes NHWC conv kernel
            flat_shape = (np.prod(shape[:-1]), shape[-1])
        else:
            raise NotImplementedError
        gaussian = np.random.normal(0.0, 1.0, flat_shape)
        u, _, v = np.linalg.svd(gaussian, full_matrices=False)
        # u and v are both orthogonal; keep whichever matches flat_shape
        q = u if u.shape == flat_shape else v
        q = q.reshape(shape)
        return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
    return _ortho_init
|
|
|
|
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
    """2-D convolution layer over NHWC input `x`: act(conv(x, w) + b).

    `nf` output filters, square receptive field `rf`, orthogonally
    initialized weights under variable scope `scope`.
    """
    with tf.variable_scope(scope):
        in_channels = x.get_shape()[3].value
        kernel = tf.get_variable("w", [rf, rf, in_channels, nf], initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
        pre_act = tf.nn.conv2d(x, kernel, strides=[1, stride, stride, 1], padding=pad) + bias
        return act(pre_act)
|
|
|
|
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
    """Fully connected layer: act(x @ w + b), `nh` output units,
    orthogonally initialized weights under variable scope `scope`."""
    with tf.variable_scope(scope):
        n_in = x.get_shape()[1].value
        weights = tf.get_variable("w", [n_in, nh], initializer=ortho_init(init_scale))
        bias = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
        pre_act = tf.matmul(x, weights) + bias
        return act(pre_act)
|
|
|
|
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
    """Fully connected layer returning x @ w + b (no activation).

    When `weight_loss_dict` is given, records the weight-decay coefficient for
    `w` (and 0.0 for `b`) in it, and adds an L2 weight-decay term to the
    '<top-scope>_losses' collection.

    Fix: the original nested a second, redundant `if weight_loss_dict is not
    None:` inside the first — collapsed into a single check (same behavior).
    """
    with tf.variable_scope(name, reuse=reuse):
        # expect exactly one enclosing scope level above this layer's scope
        assert (len(U.scope_name().split('/')) == 2)

        w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
        b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
        weight_decay_fc = 3e-4

        if weight_loss_dict is not None:
            weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
            weight_loss_dict[w] = weight_decay_fc
            weight_loss_dict[b] = 0.0
            tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)

        return tf.nn.bias_add(tf.matmul(x, w), b)
|
|
|
|
def conv_to_fc(x):
    """Flatten conv feature maps to (batch, features), keeping the batch dim."""
    n_features = np.prod([dim.value for dim in x.get_shape()[1:]])
    return tf.reshape(x, [-1, n_features])
|
|
|
|
def kl_div(action_dist1, action_dist2, action_size):
    """KL(dist1 || dist2) for diagonal Gaussian action distributions.

    Each dist tensor packs [means, stds] along axis 1: the first
    `action_size` columns are means, the remainder standard deviations.
    Returns one KL value per row.
    """
    mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
    mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]

    # Closed-form KL between diagonal Gaussians, summed over action dims:
    # (sigma1^2 + (mu1-mu2)^2 - sigma2^2) / (2 sigma2^2) + log(sigma2/sigma1).
    # 1e-8 guards the division when std2 is zero.
    numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
    denominator = 2 * tf.square(std2) + 1e-8
    return tf.reduce_sum(
        numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
|
|
|
|
def discount_with_dones(rewards, dones, gamma):
    """Discounted returns over a trajectory, resetting the running return
    wherever `dones` is set (episode boundary).

    Returns a list the same length as `rewards`, where element t is
    reward_t + gamma * return_{t+1}, with the accumulator zeroed after a done.
    """
    out = []
    running = 0
    # walk backwards so each step can reuse the return of the step after it
    for reward, done in zip(reversed(rewards), reversed(dones)):
        running = reward + gamma * running * (1. - done)
        out.append(running)
    out.reverse()
    return out
|
|
|
|
def find_trainable_variables(key):
    """Return the trainable variables whose names fall under scope `key`.

    Bug fix: `tf.trainable_variables()` is not filtered by an enclosing
    `tf.variable_scope`, so the original returned every trainable variable
    in the graph regardless of `key`. Querying the collection with the
    `scope` argument applies the intended name filter.
    """
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)
|
|
|
|
def make_path(f):
    """Create directory `f` (including parents); no-op if it already exists."""
    os.makedirs(f, exist_ok=True)
|
|
|
|
def constant(p):
    """Schedule multiplier: always 1, ignoring training progress `p`."""
    return 1
|
|
|
|
def linear(p):
    """Schedule multiplier decaying linearly from 1 to 0 as progress p goes 0 -> 1."""
    return 1 - p
|
|
|
|
|
|
def middle_drop(p):
    """Schedule multiplier: linear decay (1 - p) until it falls below 0.75,
    then flat at 0.075 (= 0.75 * 0.1)."""
    eps = 0.75
    remaining = 1 - p
    if remaining < eps:
        return eps * 0.1
    return remaining
|
|
|
|
def double_linear_con(p):
    """Schedule multiplier: linear decay at double speed (1 - 2p), floored at 0.125."""
    progress = p * 2
    eps = 0.125
    remaining = 1 - progress
    return eps if remaining < eps else remaining
|
|
|
|
|
|
def double_middle_drop(p):
    """Schedule multiplier: linear decay (1 - p) with two flat floors —
    0.075 once below 0.75, then 0.125 once below 0.25."""
    eps1 = 0.75
    eps2 = 0.25
    remaining = 1 - p
    if remaining < eps1:
        if remaining < eps2:
            return eps2 * 0.5
        return eps1 * 0.1
    return remaining
|
|
|
|
|
|
# Maps schedule names (as passed to Scheduler) to multiplier functions of
# training progress p in [0, 1].
schedules = {
    'linear': linear,
    'constant': constant,
    'double_linear_con': double_linear_con,
    'middle_drop': middle_drop,
    'double_middle_drop': double_middle_drop,
}
|
|
|
|
class Scheduler(object):
    """Anneals a base value `v` over `nvalues` steps using a named schedule
    (a key into the module-level `schedules` dict)."""

    def __init__(self, v, nvalues, schedule):
        self.n = 0.          # steps consumed so far by value()
        self.v = v           # base (initial) value
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        """Return the value for the current step, then advance the counter."""
        out = self.v * self.schedule(self.n / self.nvalues)
        self.n += 1.
        return out

    def value_steps(self, steps):
        """Value at an externally supplied step count; does not touch state."""
        return self.v * self.schedule(steps / self.nvalues)
|
|
|
|
|
|
class EpisodeStats:
    """Accumulates per-environment episode statistics from batched rollouts,
    keeping rolling windows (last 40 episodes) of lengths and returns."""

    def __init__(self, nsteps, nenvs):
        # one in-progress reward list per environment
        self.episode_rewards = [[] for _ in range(nenvs)]
        self.lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
        self.nsteps = nsteps
        self.nenvs = nenvs

    def feed(self, rewards, masks):
        """Consume one rollout batch (nenvs * nsteps entries).

        A truthy mask at step j closes that environment's current episode:
        its length and summed reward are pushed into the rolling buffers.
        """
        rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
        masks = np.reshape(masks, [self.nenvs, self.nsteps])
        for env_idx in range(self.nenvs):
            for step in range(self.nsteps):
                self.episode_rewards[env_idx].append(rewards[env_idx][step])
                if masks[env_idx][step]:
                    finished = self.episode_rewards[env_idx]
                    self.lenbuffer.append(len(finished))
                    self.rewbuffer.append(sum(finished))
                    self.episode_rewards[env_idx] = []

    def mean_length(self):
        """Mean length of the last up-to-40 finished episodes (0 before any finish)."""
        return np.mean(self.lenbuffer) if self.lenbuffer else 0

    def mean_reward(self):
        """Mean return of the last up-to-40 finished episodes (0 before any finish)."""
        return np.mean(self.rewbuffer) if self.rewbuffer else 0
|