continuous action spaces for codegen + some benchmarking (#82)
* add some docstrings * start making big changes * state machine redesign * sampling seems to work * some reorg * fixed sampling of real vals * json conversion * made it possible to register new commands got nontrivial version of Pred working * consolidate command definitions * add more macro blocks * revived visualization * rename Userdata -> CmdInterpreter make AlgoSmInstance subclass of SmInstance that uses appropriate userdata argument * replace userdata by ci when appropriate * minor test fixes * revamped handmade dir, can run ppo_metal * seed to avoid random test failure * implement AlgoAgent * Autogenerated object that performs all ops and macros * more CmdRecorder changes * move files around * move MatchProb and JtftProb * remove obsolete * fix tests involving AlgoAgent (pending the next commit on ppo_metal code) * ppo_metal: reduce duplication in policy_gen, make sess an attribute of PpoAgent and StochasticPolicy instead of using get_default_session everywhere. * maze_env reformatting, move algo_search script (but stil broken) * move agent.py * fix test on handcrafted agents * tuning/fixing ppo_metal baseline * minor * Fix ppo_metal baseline * Don’t set epcount, tcount unless they’re being used * get rid of old ppo_metal baseline * fixes for handmade/run.py tuning * fix codegen ppo * fix handmade ppo hps * fix test, go back to safe_div * switch to more complex filtering * make sure all handcrafted algos have finite probability * train to maximize logprob of provided samples Trex changes to avoid segfault * AlgoSm also includes global hyperparams * don’t duplicate global hyperparam defaults * create generic_ob_ac_space function * use sorted list of outkeys * revive tsne * todo changes * determinism test * todo + test fix * remove a few deprecated files, rename other tests so they don’t run automatically, fix real test failure * continuous control with codegen * continuous control with codegen * implement continuous action space algodistr * ppo with trex RUN 
BENCHMARKS * wrap trex in a monitor * dummy commit to RUN BENCHMARKS * adding monitor to trex env RUN BENCHMARKS * adding monitor to trex RUN BENCHMARKS * include monitor into trex env RUN BENCHMARKS * generate nll and predmean using Distribution node * dummy commit to RUN BENCHMARKS * include pybullet into baselines optional dependencies * dummy commit to RUN BENCHMARKS * install games for cron rcall user RUN BENCHMARKS * add --yes flag to install.py in rcall config for cron user RUN BENCHMARKS * both continuous and discrete versions seem to run * fixes to monitor to work with vecenv-like info and rewards RUN BENCHMARKS * dummy commit to RUN BENCHMARKS * removed shape check from one-hot encoding logic in distributions.CategoricalPd * reset logger configuration in codegen/handmade/run.py to be in-line with baselines RUN BENCHMARKS * merged peterz_codegen_benchmarks RUN BENCHMARKS * skip tests RUN BENCHMARKS * working on test failures * save benchmark dicts RUN BENCHMARK * merged peterz_codegen_benchmark RUN BENCHMARKS * add get_git_commit_message to the baselines.common.console_util * dummy commit to RUN BENCHMARKS * merged fixes from peterz_codegen_benchmark RUN BENCHMARKS * fixing failure in test_algo_nll WIP * test_algo_nll passes with both ppo and softq * re-enabled tests * run trex on gpus for 100k total (horizon=100k / 16) RUN BENCHMARKS * merged latest peterz_codegen_benchmarks RUN BENCHMARKS * fixing codegen test failures (logging-related) * fixed name collision in run-benchmarks-new.py RUN BENCHMARKS * fixed name collision in run-benchmarks-new.py RUN BENCHMARKS * fixed import in node_filters.py * test_algo_search passes * some cleanup * dummy commit to RUN BENCHMARKS * merge fast fail for subprocvecenv RUN BENCHMARKS * use SubprocVecEnv in sonic_prob * added deprecation note to shmem_vec_env * allow indexing of distributions * add timeout to pipeline.yaml * typo in pipeline.yml * run tests with --forked option * resolved merge conflict in 
rl_algs.bench.benchmarks * re-enable parallel tests * fix remaining merge conflicts and syntax * Update trex_prob.py * fixes to ResultsWriter * take baselines/run.py from peterz_codegen branch * actually save stuff to file in VecMonitor RUN BENCHMARKS * enable parallel tests * merge stricter flake8 * merge peterz_codegen_benchmark, resolve conflicts * autopep8 * remove traces of Monitor from trex env, check shapes before encoding in CategoricalPd * asserts and warnings to make q -> distribution change more explicit * fixed assert in CategoricalPd * add header to vec_monitor output file RUN BENCHMARKS * make VecMonitor write header to the output file * remove deprecation message from shmem_vec_env RUN BENCHMARKS * autopep8 * proper shape test in distributions.py * ResultsWriter can take dict headers * dummy commit to RUN BENCHMARKS * replace assert len(qs)==1 with warning RUN BENCHMARKS * removed pdb from ppo2 RUN BENCHMARKS
This commit is contained in:
@@ -97,6 +97,19 @@ register_benchmark({
|
||||
]
|
||||
})
|
||||
|
||||
# Bullet
# Base names of the bullet environments; the gym id suffix is appended below.
_bulletsmall = [
    'InvertedDoublePendulum', 'InvertedPendulum', 'HalfCheetah', 'Reacher', 'Walker2D', 'Hopper', 'Ant'
]
_bulletsmall = [e + 'BulletEnv-v0' for e in _bulletsmall]

register_benchmark({
    'name': 'Bullet1M',
    # The task count is derived from the list so the text cannot drift from it
    # (the literal previously said 6 while seven environments are listed).
    'description': '{} mujoco-like tasks from bullet, 1M steps'.format(len(_bulletsmall)),
    'tasks': [{'env_id': e, 'trials': 6, 'num_timesteps': int(1e6)} for e in _bulletsmall]
})
|
||||
|
||||
|
||||
# Roboschool
|
||||
|
||||
register_benchmark({
|
||||
|
@@ -16,21 +16,11 @@ class Monitor(Wrapper):
|
||||
def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()):
|
||||
Wrapper.__init__(self, env=env)
|
||||
self.tstart = time.time()
|
||||
if filename is None:
|
||||
self.f = None
|
||||
self.logger = None
|
||||
else:
|
||||
if not filename.endswith(Monitor.EXT):
|
||||
if osp.isdir(filename):
|
||||
filename = osp.join(filename, Monitor.EXT)
|
||||
else:
|
||||
filename = filename + "." + Monitor.EXT
|
||||
self.f = open(filename, "wt")
|
||||
self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id}))
|
||||
self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords)
|
||||
self.logger.writeheader()
|
||||
self.f.flush()
|
||||
|
||||
self.results_writer = ResultsWriter(
|
||||
filename,
|
||||
header={"t_start": time.time(), 'env_id' : env.spec and env.spec.id},
|
||||
extra_keys=reset_keywords + info_keywords
|
||||
)
|
||||
self.reset_keywords = reset_keywords
|
||||
self.info_keywords = info_keywords
|
||||
self.allow_early_resets = allow_early_resets
|
||||
@@ -43,10 +33,7 @@ class Monitor(Wrapper):
|
||||
self.current_reset_info = {} # extra info about the current episode, that was passed in during reset()
|
||||
|
||||
def reset(self, **kwargs):
|
||||
if not self.allow_early_resets and not self.needs_reset:
|
||||
raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
|
||||
self.rewards = []
|
||||
self.needs_reset = False
|
||||
self.reset_state()
|
||||
for k in self.reset_keywords:
|
||||
v = kwargs.get(k)
|
||||
if v is None:
|
||||
@@ -54,10 +41,21 @@ class Monitor(Wrapper):
|
||||
self.current_reset_info[k] = v
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
def reset_state(self):
    """Clear the per-episode bookkeeping at the start of a new episode.

    Raises RuntimeError when early resets are not allowed and the current
    episode has not been flagged as needing a reset. Otherwise empties
    ``self.rewards`` and clears ``self.needs_reset``.
    """
    reset_permitted = self.allow_early_resets or self.needs_reset
    if not reset_permitted:
        raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)")
    self.needs_reset = False
    self.rewards = []
|
||||
|
||||
|
||||
def step(self, action):
    """Step the wrapped env with ``action`` and record the transition.

    Raises RuntimeError if the episode is flagged as needing a reset.
    The transition is handed to ``self.update`` and then returned as an
    ``(ob, rew, done, info)`` tuple.
    """
    if self.needs_reset:
        raise RuntimeError("Tried to step environment that needs reset")
    transition = self.env.step(action)
    ob, rew, done, info = transition
    self.update(ob, rew, done, info)
    return (ob, rew, done, info)
|
||||
|
||||
def update(self, ob, rew, done, info):
|
||||
self.rewards.append(rew)
|
||||
if done:
|
||||
self.needs_reset = True
|
||||
@@ -70,12 +68,12 @@ class Monitor(Wrapper):
|
||||
self.episode_lengths.append(eplen)
|
||||
self.episode_times.append(time.time() - self.tstart)
|
||||
epinfo.update(self.current_reset_info)
|
||||
if self.logger:
|
||||
self.logger.writerow(epinfo)
|
||||
self.f.flush()
|
||||
info['episode'] = epinfo
|
||||
self.results_writer.write_row(epinfo)
|
||||
|
||||
if isinstance(info, dict):
|
||||
info['episode'] = epinfo
|
||||
|
||||
self.total_steps += 1
|
||||
return (ob, rew, done, info)
|
||||
|
||||
def close(self):
|
||||
if self.f is not None:
|
||||
@@ -96,6 +94,34 @@ class Monitor(Wrapper):
|
||||
class LoadMonitorResultsError(Exception):
    """Exception type for monitor-result loading failures.

    NOTE(review): no raise site is visible in this view; the purpose is
    inferred from the class name only.
    """
|
||||
|
||||
|
||||
class ResultsWriter(object):
    """Append per-episode rows to a CSV file, or act as a no-op without a file.

    When ``filename`` is None nothing is opened and ``write_row`` does nothing.
    Otherwise the file is opened for writing, an optional header is written,
    then the CSV column header, and every ``write_row`` call is flushed
    immediately.
    """

    def __init__(self, filename=None, header='', extra_keys=()):
        """
        filename: target path, or None to disable writing. A path that does
            not end with ``Monitor.EXT`` is extended with it (joined as a file
            name when the path is an existing directory).
        header: text written verbatim before the CSV header; a dict is
            serialized as a single ``# <json>`` line.
        extra_keys: additional CSV columns after 'r', 'l', 't'.
        """
        self.extra_keys = extra_keys
        if filename is None:
            self.f = None
            self.logger = None
        else:
            if not filename.endswith(Monitor.EXT):
                if osp.isdir(filename):
                    filename = osp.join(filename, Monitor.EXT)
                else:
                    filename = filename + "." + Monitor.EXT
            self.f = open(filename, "wt")
            if isinstance(header, dict):
                header = '# {} \n'.format(json.dumps(header))
            self.f.write(header)
            self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+tuple(extra_keys))
            self.logger.writeheader()
            self.f.flush()

    def write_row(self, epinfo):
        """Write one episode-info dict as a CSV row and flush; no-op without a file."""
        if self.logger:
            self.logger.writerow(epinfo)
            self.f.flush()

    def close(self):
        """Release the underlying file handle, if any.

        Safe to call repeatedly. After closing, ``write_row`` becomes a no-op
        because the CSV writer is dropped along with the file.
        """
        if self.f is not None:
            self.f.close()
            self.f = None
            self.logger = None
|
||||
|
||||
|
||||
|
||||
def get_monitor_files(dir):
    """Return the paths directly inside ``dir`` whose names end with ``Monitor.EXT``."""
    pattern = osp.join(dir, "*" + Monitor.EXT)
    return glob(pattern)
|
||||
|
||||
|
@@ -121,11 +121,18 @@ def parse_unknown_args(args):
|
||||
Parse arguments not consumed by arg parser into a dictionary
|
||||
"""
|
||||
retval = {}
|
||||
preceded_by_key = False
|
||||
for arg in args:
|
||||
assert arg.startswith('--')
|
||||
assert '=' in arg, 'cannot parse arg {}'.format(arg)
|
||||
key = arg.split('=')[0][2:]
|
||||
value = arg.split('=')[1]
|
||||
retval[key] = value
|
||||
if arg.startswith('--'):
|
||||
if '=' in arg:
|
||||
key = arg.split('=')[0][2:]
|
||||
value = arg.split('=')[1]
|
||||
retval[key] = value
|
||||
else:
|
||||
key = arg[2:]
|
||||
preceded_by_key = True
|
||||
elif preceded_by_key:
|
||||
retval[key] = arg
|
||||
preceded_by_key = False
|
||||
|
||||
return retval
|
||||
|
@@ -58,6 +58,9 @@ def print_cmd(cmd, dry=False):
|
||||
def get_git_commit(cwd=None):
    """Return the output of ``git rev-parse --short HEAD`` run in ``cwd``.

    The bytes printed by git are decoded as UTF-8 and returned unmodified
    (no stripping is done here). Errors from ``subprocess.check_output``
    propagate to the caller.
    """
    command = ['git', 'rev-parse', '--short', 'HEAD']
    raw_output = subprocess.check_output(command, cwd=cwd)
    return raw_output.decode('utf8')
|
||||
|
||||
def get_git_commit_message(cwd=None):
    """Return the output of ``git show -s --format=%B HEAD`` run in ``cwd``.

    The bytes printed by git are decoded as UTF-8 and returned unmodified.
    Errors from ``subprocess.check_output`` propagate to the caller.
    """
    command = ['git', 'show', '-s', '--format=%B', 'HEAD']
    raw_output = subprocess.check_output(command, cwd=cwd)
    return raw_output.decode('utf8')
|
||||
|
||||
def ccap(cmd, dry=False, env=None, **kwargs):
|
||||
print_cmd(cmd, dry)
|
||||
if not dry:
|
||||
|
@@ -23,6 +23,11 @@ class Pd(object):
|
||||
raise NotImplementedError
|
||||
def logp(self, x):
    """Return the negation of ``self.neglogp(x)``."""
    negative_log_prob = self.neglogp(x)
    return -negative_log_prob
|
||||
def get_shape(self):
    """Return the ``shape`` attribute of the value produced by ``self.flatparam()``."""
    flat = self.flatparam()
    return flat.shape
|
||||
@property
def shape(self):
    """Attribute-style access to the result of ``self.get_shape()``."""
    dims = self.get_shape()
    return dims
|
||||
|
||||
class PdType(object):
|
||||
"""
|
||||
@@ -145,10 +150,22 @@ class CategoricalPd(Pd):
|
||||
# return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x)
|
||||
# Note: we can't use sparse_softmax_cross_entropy_with_logits because
|
||||
# the implementation does not allow second-order derivatives...
|
||||
one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
|
||||
if x.dtype in {tf.uint8, tf.int32, tf.int64}:
|
||||
# one-hot encoding
|
||||
x_shape_list = x.shape.as_list()
|
||||
logits_shape_list = self.logits.get_shape().as_list()[:-1]
|
||||
for xs, ls in zip(x_shape_list, logits_shape_list):
|
||||
if xs is not None and ls is not None:
|
||||
assert xs == ls, 'shape mismatch: {} in x vs {} in logits'.format(xs, ls)
|
||||
|
||||
x = tf.one_hot(x, self.logits.get_shape().as_list()[-1])
|
||||
else:
|
||||
# already encoded
|
||||
assert x.shape.as_list() == self.logits.shape.as_list()
|
||||
|
||||
return tf.nn.softmax_cross_entropy_with_logits_v2(
|
||||
logits=self.logits,
|
||||
labels=one_hot_actions)
|
||||
labels=x)
|
||||
def kl(self, other):
|
||||
a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keepdims=True)
|
||||
a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keepdims=True)
|
||||
@@ -216,13 +233,19 @@ class DiagGaussianPd(Pd):
|
||||
@classmethod
def fromflat(cls, flat):
    """Alternate constructor: build an instance of ``cls`` from ``flat``."""
    instance = cls(flat)
    return instance
|
||||
def __getitem__(self, idx):
    """Return a new ``DiagGaussianPd`` built from ``self.flat[idx]``."""
    selected = self.flat[idx]
    return DiagGaussianPd(selected)
|
||||
|
||||
|
||||
class BernoulliPd(Pd):
|
||||
def __init__(self, logits):
    """Store ``logits`` and their sigmoid (``self.ps``)."""
    self.logits = logits
    probabilities = tf.sigmoid(logits)
    self.ps = probabilities
|
||||
def flatparam(self):
    """Return the flat parameters of this distribution, i.e. its logits.

    The constructor stores its argument as ``self.logits``; no ``logit``
    attribute is ever assigned, so that is the only name read here.
    """
    return self.logits
|
||||
@property
def mean(self):
    """The distribution mean, which is the stored ``self.ps``."""
    probabilities = self.ps
    return probabilities
|
||||
def mode(self):
    """Return ``self.ps`` rounded with ``tf.round``."""
    probabilities = self.ps
    return tf.round(probabilities)
|
||||
def neglogp(self, x):
|
||||
|
@@ -1,12 +1,16 @@
|
||||
from . import VecEnvWrapper
|
||||
from baselines.bench.monitor import ResultsWriter
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
|
||||
class VecMonitor(VecEnvWrapper):
|
||||
def __init__(self, venv, filename=None):
    """Wrap ``venv`` and set up episode bookkeeping plus an optional results file.

    venv: the vectorized environment passed on to ``VecEnvWrapper.__init__``.
    filename: forwarded to ``ResultsWriter``; None (the default) keeps the
        original one-argument call form working.

    The body reads ``filename``, so this is the only valid signature; the
    body-less ``def __init__(self, venv):`` line that preceded it is dropped.
    """
    VecEnvWrapper.__init__(self, venv)
    # Per-env episode returns/lengths; left as None here and assigned elsewhere in the class.
    self.eprets = None
    self.eplens = None
    self.tstart = time.time()
    self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart})
|
||||
|
||||
def reset(self):
|
||||
obs = self.venv.reset()
|
||||
@@ -22,8 +26,12 @@ class VecMonitor(VecEnvWrapper):
|
||||
for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)):
|
||||
info = info.copy()
|
||||
if done:
|
||||
info['episode'] = {'r': ret, 'l': eplen}
|
||||
epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)}
|
||||
info['episode'] = epinfo
|
||||
self.eprets[i] = 0
|
||||
self.eplens[i] = 0
|
||||
self.results_writer.write_row(epinfo)
|
||||
|
||||
newinfos.append(info)
|
||||
|
||||
return obs, rews, dones, newinfos
|
||||
|
@@ -30,6 +30,5 @@ def main():
|
||||
model.save('pong_model.pkl')
|
||||
env.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@@ -154,9 +154,6 @@ def get_default_network(env_type):
|
||||
else:
|
||||
return 'mlp'
|
||||
|
||||
raise ValueError('Unknown env_type {}'.format(env_type))
|
||||
|
||||
|
||||
def get_alg_module(alg, submodule=None):
|
||||
submodule = submodule or alg
|
||||
try:
|
||||
@@ -182,16 +179,21 @@ def get_learn_function_defaults(alg, env_type):
|
||||
return kwargs
|
||||
|
||||
|
||||
def parse(v):
    '''
    convert value of a command-line arg to a python object if possible, otherwise, keep as string

    v: the raw string value taken from the command line
    returns: the evaluated python object, or v unchanged when evaluation fails
    '''

    assert isinstance(v, str)
    # NOTE(security): eval() executes arbitrary expressions. That is tolerable
    # only because the input is the invoking user's own command line; never
    # feed this function untrusted text.
    try:
        return eval(v)
    except Exception:
        # eval can fail in many ways besides NameError/SyntaxError, e.g.
        # 'len.pkl' raises AttributeError and '1/0' raises ZeroDivisionError.
        # Any such failure means "not a python object": keep the string.
        return v
|
||||
def parse_cmdline_kwargs(args):
    '''
    convert a list of '='-spaced command-line arguments to a dictionary, evaluating python objects when possible

    args: the leftover command-line tokens, as accepted by parse_unknown_args
    returns: dict mapping each key to its evaluated value, or to the raw string when evaluation fails
    '''
    def parse(v):

        assert isinstance(v, str)
        # NOTE(security): eval() executes arbitrary expressions. That is
        # tolerable only because the input is the invoking user's own command
        # line; never feed this function untrusted text.
        try:
            return eval(v)
        except Exception:
            # eval can fail in many ways besides NameError/SyntaxError, e.g.
            # 'len.pkl' raises AttributeError. Any failure means the value is
            # not a python object, so it is kept as a string.
            return v

    return {k: parse(v) for k, v in parse_unknown_args(args).items()}
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
@@ -199,7 +201,7 @@ def main():
|
||||
|
||||
arg_parser = common_arg_parser()
|
||||
args, unknown_args = arg_parser.parse_known_args()
|
||||
extra_args = {k: parse(v) for k, v in parse_unknown_args(unknown_args).items()}
|
||||
extra_args = parse_cmdline_kwargs(unknown_args)
|
||||
|
||||
if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
|
||||
rank = 0
|
||||
|
Reference in New Issue
Block a user