refactor a2c, acer, acktr, ppo2, deepq, and trpo_mpi (#490)
* exported rl-algs
* more stuff from rl-algs
* run slow tests
* re-exported rl_algs
* re-exported rl_algs - fixed problems with serialization test and test_cartpole
* replaced atari_arg_parser with common_arg_parser
* run.py can run algos from both baselines and rl_algs
* added approximate humanoid reward with ppo2 into the README for reference
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* very dummy commit to RUN BENCHMARKS
* serialize variables as a dict, not as a list
* running_mean_std uses tensorflow variables
* fixed import in vec_normalize
* dummy commit to RUN BENCHMARKS
* dummy commit to RUN BENCHMARKS
* flake8 complaints
* save all variables to make sure we save the vec_normalize normalization
* benchmarks on ppo2 only RUN BENCHMARKS
* make_atari_env compatible with mpi
* run ppo_mpi benchmarks only RUN BENCHMARKS
* hardcode names of retro environments
* add defaults
* changed default ppo2 lr schedule to linear RUN BENCHMARKS
* non-tf normalization benchmark RUN BENCHMARKS
* use ncpu=1 for mujoco sessions - gives a bit of a performance speedup
* reverted running_mean_std to use property decorators for mean, var, count (see the sketch after this list)
* reverted VecNormalize to use RunningMeanStd (no tf)
* reverted VecNormalize to use RunningMeanStd (no tf)
* profiling wip
* use VecNormalize with regular RunningMeanStd
* added acer runner (missing import)
* flake8 complaints
* added a note in README about TfRunningMeanStd and serialization of VecNormalize
* dummy commit to RUN BENCHMARKS
* merged benchmarks branch
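For context on the running_mean_std and VecNormalize bullets above, here is a minimal sketch of a non-TF running mean/std tracker that exposes `mean`, `var`, and `count` as properties. It is an illustrative reconstruction using the standard parallel-variance update, not the code added in this commit:

```python
import numpy as np

class RunningMeanStd:
    """Tracks running mean/var/count over batches (non-TF sketch, not the commit's code)."""

    def __init__(self, shape=()):
        self._mean = np.zeros(shape, dtype=np.float64)
        self._var = np.ones(shape, dtype=np.float64)
        self._count = 1e-4  # tiny prior count avoids division by zero on the first update

    @property
    def mean(self):
        return self._mean

    @property
    def var(self):
        return self._var

    @property
    def count(self):
        return self._count

    def update(self, x):
        """Fold a batch of samples (first axis = batch) into the running statistics."""
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self._mean
        total = self._count + batch_count
        # Parallel-variance (Chan et al.) combination of the existing stats and the batch stats.
        new_mean = self._mean + delta * batch_count / total
        m2 = self._var * self._count + batch_var * batch_count + delta**2 * self._count * batch_count / total
        self._mean, self._var, self._count = new_mean, m2 / total, total
```

A VecNormalize-style wrapper would then normalize observations as `(obs - rms.mean) / np.sqrt(rms.var + 1e-8)`.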
@@ -1,8 +1,6 @@
 import os
 import gym
 import numpy as np
 import tensorflow as tf
 from gym import spaces
 from collections import deque

 def sample(logits):
@@ -10,18 +8,15 @@ def sample(logits):
     return tf.argmax(logits - tf.log(-tf.log(noise)), 1)

 def cat_entropy(logits):
-    a0 = logits - tf.reduce_max(logits, 1, keep_dims=True)
+    a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
     ea0 = tf.exp(a0)
-    z0 = tf.reduce_sum(ea0, 1, keep_dims=True)
+    z0 = tf.reduce_sum(ea0, 1, keepdims=True)
     p0 = ea0 / z0
     return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)

 def cat_entropy_softmax(p0):
     return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1)

 def mse(pred, target):
     return tf.square(pred-target)/2.

 def ortho_init(scale=1.0):
     def _ortho_init(shape, dtype, partition_info=None):
         #lasagne ortho init for tf
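The `keep_dims` → `keepdims` pairs above follow TensorFlow's argument rename; the entropy computation itself is unchanged. As a sanity check, here is a small NumPy sketch (hypothetical helper, not part of the diff) showing that the max-shifted form computes the ordinary softmax entropy while keeping `exp()` from overflowing:

```python
import numpy as np

def cat_entropy_np(logits):
    # Shift by the row max so exp() cannot overflow; the entropy is unchanged by the shift.
    a0 = logits - logits.max(axis=1, keepdims=True)
    ea0 = np.exp(a0)
    z0 = ea0.sum(axis=1, keepdims=True)
    p0 = ea0 / z0
    # p_i * (log Z - a_i) equals -p_i * log p_i, since log p_i = a_i - log Z.
    return (p0 * (np.log(z0) - a0)).sum(axis=1)

logits = np.array([[1.0, 2.0, 3.0]])
p = np.exp(logits) / np.exp(logits).sum()
assert np.allclose(cat_entropy_np(logits), -(p * np.log(p)).sum())
```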
@@ -58,7 +53,7 @@ def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='
         b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0))
         if not one_dim_bias and data_format == 'NHWC':
             b = tf.reshape(b, bshape)
-        return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)
+        return tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + b

 def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
     with tf.variable_scope(scope):
@@ -85,7 +80,6 @@ def seq_to_batch(h, flat = False):

 def lstm(xs, ms, s, scope, nh, init_scale=1.0):
     nbatch, nin = [v.value for v in xs[0].get_shape()]
     nsteps = len(xs)
     with tf.variable_scope(scope):
         wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
         wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale))
@@ -115,7 +109,6 @@ def _ln(x, g, b, e=1e-5, axes=[1]):

 def lnlstm(xs, ms, s, scope, nh, init_scale=1.0):
     nbatch, nin = [v.value for v in xs[0].get_shape()]
     nsteps = len(xs)
     with tf.variable_scope(scope):
         wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale))
         gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0))
@@ -160,8 +153,7 @@ def discount_with_dones(rewards, dones, gamma):
     return discounted[::-1]

 def find_trainable_variables(key):
-    with tf.variable_scope(key):
-        return tf.trainable_variables()
+    return tf.trainable_variables(key)

 def make_path(f):
     return os.makedirs(f, exist_ok=True)
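The `find_trainable_variables` hunk is a behavioral fix, not just a cleanup: entering a `variable_scope` block does not filter what `tf.trainable_variables()` returns, so the old body returned every trainable variable in the graph. Passing the scope name does the filtering (in TF 1.x, `tf.trainable_variables(scope)` matches variable names against the scope prefix). A minimal TF 1.x sketch with illustrative variable names:

```python
import tensorflow as tf

with tf.variable_scope("model"):
    tf.get_variable("w", shape=[2, 2])
with tf.variable_scope("other"):
    tf.get_variable("w", shape=[2, 2])

with tf.variable_scope("model"):
    everything = tf.trainable_variables()     # scope block does not filter: returns both variables

model_only = tf.trainable_variables("model")  # filters by name prefix: only "model/w:0"

print(len(everything), len(model_only))  # -> 2 1
```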