Files
baselines/baselines/acktr/utils.py
John Schulman 3f676f7d1e ACKTR + A2C
2017-08-18 09:25:39 -07:00

201 lines
6.2 KiB
Python

import os
import numpy as np
import tensorflow as tf
import baselines.common.tf_util as U
from collections import deque
def sample(logits):
noise = tf.random_uniform(tf.shape(logits))
return tf.argmax(logits - tf.log(-tf.log(noise)), 1)
def std(x):
mean = tf.reduce_mean(x)
var = tf.reduce_mean(tf.square(x-mean))
return tf.sqrt(var)
def cat_entropy(logits):
a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)
def cat_entropy_softmax(p0):
return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)
def mse(pred, target):
return tf.square(pred-target)/2.
def ortho_init(scale=1.0):
def _ortho_init(shape, dtype, partition_info=None):
#lasagne ortho init for tf
shape = tuple(shape)
if len(shape) == 2:
flat_shape = shape
elif len(shape) == 4: # assumes NHWC
flat_shape = (np.prod(shape[:-1]), shape[-1])
else:
raise NotImplementedError
a = np.random.normal(0.0, 1.0, flat_shape)
u, _, v = np.linalg.svd(a, full_matrices=False)
q = u if u.shape == flat_shape else v # pick the one with the correct shape
q = q.reshape(shape)
return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
return _ortho_init
def conv(x, scope, nf, rf, stride, pad='VALID', act=tf.nn.relu, init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[3].value
w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
z = tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
h = act(z)
return h
def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0):
with tf.variable_scope(scope):
nin = x.get_shape()[1].value
w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0))
z = tf.matmul(x, w)+b
h = act(z)
return h
def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
with tf.variable_scope(name, reuse=reuse):
assert (len(U.scope_name().split('/')) == 2)
w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
weight_decay_fc = 3e-4
if weight_loss_dict is not None:
weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
if weight_loss_dict is not None:
weight_loss_dict[w] = weight_decay_fc
weight_loss_dict[b] = 0.0
tf.add_to_collection(U.scope_name().split('/')[0] + '_' + 'losses', weight_decay)
return tf.nn.bias_add(tf.matmul(x, w), b)
def conv_to_fc(x):
nh = np.prod([v.value for v in x.get_shape()[1:]])
x = tf.reshape(x, [-1, nh])
return x
def kl_div(action_dist1, action_dist2, action_size):
mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
denominator = 2 * tf.square(std2) + 1e-8
return tf.reduce_sum(
numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
def discount_with_dones(rewards, dones, gamma):
discounted = []
r = 0
for reward, done in zip(rewards[::-1], dones[::-1]):
r = reward + gamma*r*(1.-done) # fixed off by one bug
discounted.append(r)
return discounted[::-1]
def find_trainable_variables(key):
with tf.variable_scope(key):
return tf.trainable_variables()
def make_path(f):
return os.makedirs(f, exist_ok=True)
def constant(p):
return 1
def linear(p):
return 1-p
def middle_drop(p):
eps = 0.75
if 1-p<eps:
return eps*0.1
return 1-p
def double_linear_con(p):
p *= 2
eps = 0.125
if 1-p<eps:
return eps
return 1-p
def double_middle_drop(p):
eps1 = 0.75
eps2 = 0.25
if 1-p<eps1:
if 1-p<eps2:
return eps2*0.5
return eps1*0.1
return 1-p
schedules = {
'linear':linear,
'constant':constant,
'double_linear_con':double_linear_con,
'middle_drop':middle_drop,
'double_middle_drop':double_middle_drop
}
class Scheduler(object):
def __init__(self, v, nvalues, schedule):
self.n = 0.
self.v = v
self.nvalues = nvalues
self.schedule = schedules[schedule]
def value(self):
current_value = self.v*self.schedule(self.n/self.nvalues)
self.n += 1.
return current_value
def value_steps(self, steps):
return self.v*self.schedule(steps/self.nvalues)
class EpisodeStats:
def __init__(self, nsteps, nenvs):
self.episode_rewards = []
for i in range(nenvs):
self.episode_rewards.append([])
self.lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
self.rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards
self.nsteps = nsteps
self.nenvs = nenvs
def feed(self, rewards, masks):
rewards = np.reshape(rewards, [self.nenvs, self.nsteps])
masks = np.reshape(masks, [self.nenvs, self.nsteps])
for i in range(0, self.nenvs):
for j in range(0, self.nsteps):
self.episode_rewards[i].append(rewards[i][j])
if masks[i][j]:
l = len(self.episode_rewards[i])
s = sum(self.episode_rewards[i])
self.lenbuffer.append(l)
self.rewbuffer.append(s)
self.episode_rewards[i] = []
def mean_length(self):
if self.lenbuffer:
return np.mean(self.lenbuffer)
else:
return 0 # on the first params dump, no episodes are finished
def mean_reward(self):
if self.rewbuffer:
return np.mean(self.rewbuffer)
else:
return 0