dummy commit to RUN BENCHMARKS

GAIL: bugfix in dataset loading (#447)
2018-07-25 18:09:30 -07:00 · 2018-07-25 18:07:56 -07:00 · 2018-07-06 16:12:14 -07:00 · 2018-06-08 09:41:45 -07:00 · 2018-06-06 11:39:13 -07:00 · 2018-05-21 15:24:00 -07:00
31 changed files with 500 additions and 147 deletions
--- a/README.md
+++ b/README.md
@@ -52,6 +52,8 @@ Install baselines package
 ```bash
 pip install -e .
 ```
+### MuJoCo
+Some of the baselines examples use the [MuJoCo](http://www.mujoco.org) (multi-joint dynamics in contact) physics simulator, which is proprietary and requires binaries and a license (a temporary 30-day license can be obtained from [www.mujoco.org](http://www.mujoco.org)). Instructions on setting up MuJoCo can be found [here](https://github.com/openai/mujoco-py).

 ## Testing the installation
 All unit tests in baselines can be run using pytest runner:
--- a/baselines/a2c/a2c.py
+++ b/baselines/a2c/a2c.py
@@ -131,7 +131,6 @@ class Runner(AbstractEnvRunner):
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values

 def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
-    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
@@ -158,3 +157,4 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
+    return model
--- a/baselines/a2c/policies.py
+++ b/baselines/a2c/policies.py
@@ -2,6 +2,7 @@ import numpy as np
 import tensorflow as tf
 from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
 from baselines.common.distributions import make_pdtype
+from baselines.common.input import observation_input

 def nature_cnn(unscaled_images, **conv_kwargs):
    """
@@ -19,14 +20,12 @@ def nature_cnn(unscaled_images, **conv_kwargs):
 class LnLstmPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X)
+            h = nature_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
@@ -56,11 +55,9 @@ class LstmPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
-
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
+
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
@@ -93,12 +90,10 @@ class LstmPolicy(object):
 class CnnPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X, **conv_kwargs)
+            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

@@ -120,15 +115,14 @@ class CnnPolicy(object):

 class MlpPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
-        ob_shape = (nbatch,) + ob_space.shape
        self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
+            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
-            flatten = tf.layers.flatten
-            pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+            processed_x = tf.layers.flatten(processed_x)
+            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
-            vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

--- a/baselines/bench/benchmarks.py
+++ b/baselines/bench/benchmarks.py
@@ -9,6 +9,8 @@ _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye
 _BENCHMARKS = []

 remove_version_re = re.compile(r'-v\d+$')
+
+
 def register_benchmark(benchmark):
    for b in _BENCHMARKS:
        if b['name'] == benchmark['name']:
@@ -138,3 +140,11 @@ register_benchmark({
    'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50]
 })

+# HER DDPG
+
+register_benchmark({
+    'name': 'HerDdpg',
+    'description': 'Smoke-test only benchmark of HER',
+    'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}]
+})
+
--- a/baselines/common/cmd_util.py
+++ b/baselines/common/cmd_util.py
@@ -3,6 +3,7 @@ Helpers for scripts like run_atari.py.
 """

 import os
+from mpi4py import MPI
 import gym
 from gym.wrappers import FlattenDictWrapper
 from baselines import logger
@@ -30,9 +31,10 @@ def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
-    set_global_seeds(seed)
+    rank = MPI.COMM_WORLD.Get_rank()
+    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
-    env = Monitor(env, logger.get_dir())
+    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env

@@ -74,6 +76,7 @@ def mujoco_arg_parser():
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+    parser.add_argument('--play', default=False, action='store_true')
    return parser

 def robotics_arg_parser():
--- a/baselines/common/identity_env.py
+++ b/baselines/common/identity_env.py
@@ -0,0 +1,30 @@
+from gym import Env
+from gym.spaces import Discrete
+
+
+class IdentityEnv(Env):
+    def __init__(
+            self,
+            dim,
+            ep_length=100,
+    ):
+
+        self.action_space = Discrete(dim)
+        self.reset()
+
+    def reset(self):
+        self._choose_next_state()
+        self.observation_space = self.action_space
+
+        return self.state
+
+    def step(self, actions):
+        rew = self._get_reward(actions)
+        self._choose_next_state()
+        return self.state, rew, False, {}
+
+    def _choose_next_state(self):
+        self.state = self.action_space.sample()
+
+    def _get_reward(self, actions):
+        return 1 if self.state == actions else 0
--- a/baselines/common/input.py
+++ b/baselines/common/input.py
@@ -0,0 +1,30 @@
+import tensorflow as tf
+from gym.spaces import Discrete, Box
+
+def observation_input(ob_space, batch_size=None, name='Ob'):
+    '''
+    Build observation input with encoding depending on the 
+    observation space type
+    Params:
+    
+    ob_space: observation space (should be one of gym.spaces)
+    batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size)
+    name: tensorflow variable name for input placeholder
+
+    returns: tuple (input_placeholder, processed_input_tensor)
+    '''
+    if isinstance(ob_space, Discrete):
+        input_x  = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name)
+        processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n))
+        return input_x, processed_x
+
+    elif isinstance(ob_space, Box):
+        input_shape = (batch_size,) + ob_space.shape
+        input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name)
+        processed_x = tf.to_float(input_x)
+        return input_x, processed_x
+
+    else:
+        raise NotImplementedError
+
+ 
--- a/baselines/common/runners.py
+++ b/baselines/common/runners.py
@@ -7,7 +7,7 @@ class AbstractEnvRunner(ABC):
        self.model = model
        nenv = env.num_envs
        self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
-        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=model.train_model.X.dtype.name)
+        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
        self.obs[:] = env.reset()
        self.nsteps = nsteps
        self.states = model.initial_state
--- a/baselines/common/test_identity.py
+++ b/baselines/common/test_identity.py
@@ -0,0 +1,44 @@
+import pytest
+import tensorflow as tf
+import random
+import numpy as np
+from gym.spaces import np_random
+
+from baselines.a2c import a2c
+from baselines.ppo2 import ppo2
+from baselines.common.identity_env import IdentityEnv
+from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
+from baselines.ppo2.policies import MlpPolicy
+
+
+learn_func_list = [
+    lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000),
+    lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01)
+]
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("learn_func", learn_func_list)
+def test_identity(learn_func):
+    '''
+    Test if the algorithm (with a given policy) 
+    can learn an identity transformation (i.e. return observation as an action)
+    '''
+    np.random.seed(0)
+    np_random.seed(0)
+    random.seed(0)
+
+    env = DummyVecEnv([lambda: IdentityEnv(10)])
+
+    with tf.Graph().as_default(), tf.Session().as_default():
+        tf.set_random_seed(0)
+        model = learn_func(env)
+
+        N_TRIALS = 1000
+        sum_rew = 0
+        obs = env.reset()
+        for i in range(N_TRIALS):
+            obs, rew, done, _ = env.step(model.step(obs)[0])
+            sum_rew += rew
+
+        assert sum_rew > 0.9 * N_TRIALS
--- a/baselines/common/tf_util.py
+++ b/baselines/common/tf_util.py
@@ -55,7 +55,6 @@ def make_session(num_cpu=None, make_default=False, graph=None):
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu)
-    tf_config.gpu_options.allocator_type = 'BFC'
    if make_default:
        return tf.InteractiveSession(config=tf_config, graph=graph)
    else:
@@ -279,3 +278,27 @@ def display_var_info(vars):
        logger.info("   %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape)))

    logger.info("Total model parameters: %0.2f million" % (count_params*1e-6))
+
+
+def get_available_gpus():
+    # recipe from here:
+    # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
+ 
+    from tensorflow.python.client import device_lib
+    local_device_protos = device_lib.list_local_devices()
+    return [x.name for x in local_device_protos if x.device_type == 'GPU']
+
+# ================================================================
+# Saving variables
+# ================================================================
+
+def load_state(fname):
+    saver = tf.train.Saver()
+    saver.restore(tf.get_default_session(), fname)
+
+def save_state(fname):
+    os.makedirs(os.path.dirname(fname), exist_ok=True)
+    saver = tf.train.Saver()
+    saver.save(tf.get_default_session(), fname)
+
+
--- a/baselines/common/tile_images.py
+++ b/baselines/common/tile_images.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+def tile_images(img_nhwc):
+    """
+    Tile N images into one big PxQ image
+    (P,Q) are chosen to be as close as possible, and if N
+    is square, then P=Q.
+
+    input: img_nhwc, list or array of images, ndim=4 once turned into array
+        n = batch index, h = height, w = width, c = channel
+    returns:
+        bigim_HWc, ndarray with ndim=3
+    """
+    img_nhwc = np.asarray(img_nhwc)
+    N, h, w, c = img_nhwc.shape
+    H = int(np.ceil(np.sqrt(N)))
+    W = int(np.ceil(float(N)/H))
+    img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
+    img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
+    img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
+    img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
+    return img_Hh_Ww_c
+
--- a/baselines/common/vec_env/init.py
+++ b/baselines/common/vec_env/init.py
@@ -77,7 +77,7 @@ class VecEnv(ABC):
        self.step_async(actions)
        return self.step_wait()

-    def render(self):
+    def render(self, mode='human'):
        logger.warn('Render not defined for %s'%self)

    @property
--- a/baselines/common/vec_env/dummy_vec_env.py
+++ b/baselines/common/vec_env/dummy_vec_env.py
@@ -11,18 +11,18 @@ class DummyVecEnv(VecEnv):
        shapes, dtypes = {}, {}
        self.keys = []
        obs_space = env.observation_space
+
        if isinstance(obs_space, spaces.Dict):
            assert isinstance(obs_space.spaces, OrderedDict)
-            for key, box in obs_space.spaces.items():
-                assert isinstance(box, spaces.Box)
-                shapes[key] = box.shape
-                dtypes[key] = box.dtype
-                self.keys.append(key)
+            subspaces = obs_space.spaces
        else:
-            box = obs_space
-            assert isinstance(box, spaces.Box)
-            self.keys = [None]
-            shapes, dtypes = { None: box.shape }, { None: box.dtype }
+            subspaces = {None: obs_space}
+
+        for key, box in subspaces.items():
+            shapes[key] = box.shape
+            dtypes[key] = box.dtype
+            self.keys.append(key)
+        
        self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
        self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
        self.buf_rews  = np.zeros((self.num_envs,), dtype=np.float32)
@@ -50,6 +50,9 @@ class DummyVecEnv(VecEnv):
    def close(self):
        return

+    def render(self, mode='human'):
+        return [e.render(mode=mode) for e in self.envs]
+
    def _save_obs(self, e, obs):
        for k in self.keys:
            if k is None:
--- a/baselines/common/vec_env/subproc_vec_env.py
+++ b/baselines/common/vec_env/subproc_vec_env.py
@@ -1,6 +1,7 @@
 import numpy as np
 from multiprocessing import Process, Pipe
 from baselines.common.vec_env import VecEnv, CloudpickleWrapper
+from baselines.common.tile_images import tile_images


 def worker(remote, parent_remote, env_fn_wrapper):
@@ -16,9 +17,8 @@ def worker(remote, parent_remote, env_fn_wrapper):
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
-        elif cmd == 'reset_task':
-            ob = env.reset_task()
-            remote.send(ob)
+        elif cmd == 'render':
+            remote.send(env.render(mode='rgb_array'))
        elif cmd == 'close':
            remote.close()
            break
@@ -81,3 +81,17 @@ class SubprocVecEnv(VecEnv):
        for p in self.ps:
            p.join()
        self.closed = True
+
+    def render(self, mode='human'):
+        for pipe in self.remotes:
+            pipe.send(('render', None))
+        imgs = [pipe.recv() for pipe in self.remotes]
+        bigimg = tile_images(imgs)
+        if mode == 'human':
+            import cv2
+            cv2.imshow('vecenv', bigimg[:,:,::-1])
+            cv2.waitKey(1)
+        elif mode == 'rgb_array':
+            return bigimg
+        else:
+            raise NotImplementedError
--- a/baselines/deepq/experiments/custom_cartpole.py
+++ b/baselines/deepq/experiments/custom_cartpole.py
@@ -9,7 +9,7 @@ import baselines.common.tf_util as U
 from baselines import logger
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer
-from baselines.deepq.utils import BatchInput
+from baselines.deepq.utils import ObservationInput
 from baselines.common.schedules import LinearSchedule


@@ -28,7 +28,7 @@ if __name__ == '__main__':
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
-            make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
+            make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
            q_func=model,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
--- a/baselines/deepq/experiments/run_atari.py
+++ b/baselines/deepq/experiments/run_atari.py
@@ -14,6 +14,9 @@ def main():
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
+    parser.add_argument('--checkpoint-freq', type=int, default=10000)
+    parser.add_argument('--checkpoint-path', type=str, default=None)
+
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
@@ -39,7 +42,9 @@ def main():
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
-        prioritized_replay_alpha=args.prioritized_replay_alpha
+        prioritized_replay_alpha=args.prioritized_replay_alpha,
+        checkpoint_freq=args.checkpoint_freq,
+        checkpoint_path=args.checkpoint_path,
    )

    env.close()
--- a/baselines/deepq/simple.py
+++ b/baselines/deepq/simple.py
@@ -6,13 +6,15 @@ import zipfile
 import cloudpickle
 import numpy as np

-import gym
 import baselines.common.tf_util as U
+from baselines.common.tf_util import load_state, save_state
 from baselines import logger
 from baselines.common.schedules import LinearSchedule
+from baselines.common.input import observation_input
+
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
-from baselines.deepq.utils import BatchInput, load_state, save_state
+from baselines.deepq.utils import ObservationInput


 class ActWrapper(object):
@@ -88,6 +90,7 @@ def learn(env,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
+          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
@@ -170,9 +173,9 @@ def learn(env,

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
-    observation_space_shape = env.observation_space.shape
+
    def make_obs_ph(name):
-        return BatchInput(observation_space_shape, name=name)
+        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
@@ -216,9 +219,17 @@ def learn(env,
    saved_mean_reward = None
    obs = env.reset()
    reset = True
+
    with tempfile.TemporaryDirectory() as td:
-        model_saved = False
+        td = checkpoint_path or td
+
        model_file = os.path.join(td, "model")
+        model_saved = False
+        if tf.train.latest_checkpoint(td) is not None:
+            load_state(model_file)
+            logger.log('Loaded model from {}'.format(model_file))
+            model_saved = True
+
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
--- a/baselines/deepq/test_identity.py
+++ b/baselines/deepq/test_identity.py
@@ -0,0 +1,43 @@
+import tensorflow as tf
+import random
+
+from baselines import deepq
+from baselines.common.identity_env import IdentityEnv
+
+
+def test_identity():
+
+    with tf.Graph().as_default():
+        env = IdentityEnv(10)
+        random.seed(0)
+
+        tf.set_random_seed(0)
+
+        param_noise = False
+        model = deepq.models.mlp([32])
+        act = deepq.learn(
+            env,
+            q_func=model,
+            lr=1e-3,
+            max_timesteps=10000,
+            buffer_size=50000,
+            exploration_fraction=0.1,
+            exploration_final_eps=0.02,
+            print_freq=10,
+            param_noise=param_noise,
+        )
+
+        tf.set_random_seed(0)
+
+        N_TRIALS = 1000
+        sum_rew = 0
+        obs = env.reset()
+        for i in range(N_TRIALS):
+            obs, rew, done, _ = env.step(act([obs]))
+            sum_rew += rew
+
+        assert sum_rew > 0.9 * N_TRIALS
+
+
+if __name__ == '__main__':
+    test_identity()
--- a/baselines/deepq/utils.py
+++ b/baselines/deepq/utils.py
@@ -1,24 +1,12 @@
-import os
+from baselines.common.input import observation_input

 import tensorflow as tf

-# ================================================================
-# Saving variables
-# ================================================================
-
-def load_state(fname):
-    saver = tf.train.Saver()
-    saver.restore(tf.get_default_session(), fname)
-
-def save_state(fname):
-    os.makedirs(os.path.dirname(fname), exist_ok=True)
-    saver = tf.train.Saver()
-    saver.save(tf.get_default_session(), fname)
-
 # ================================================================
 # Placeholders
 # ================================================================

+
 class TfInput(object):
    def __init__(self, name="(unnamed)"):
        """Generalized Tensorflow placeholder. The main differences are:
@@ -50,20 +38,6 @@ class PlaceholderTfInput(TfInput):
    def make_feed_dict(self, data):
        return {self._placeholder: data}

-class BatchInput(PlaceholderTfInput):
-    def __init__(self, shape, dtype=tf.float32, name=None):
-        """Creates a placeholder for a batch of tensors of a given shape and dtype
-
-        Parameters
-        ----------
-        shape: [int]
-            shape of a single elemenet of the batch
-        dtype: tf.dtype
-            number representation used for tensor contents
-        name: str
-            name of the underlying placeholder
-        """
-        super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))

 class Uint8Input(PlaceholderTfInput):
    def __init__(self, shape, name=None):
@@ -85,4 +59,25 @@ class Uint8Input(PlaceholderTfInput):
        self._output = tf.cast(super().get(), tf.float32) / 255.0

    def get(self):
-        return self._output
+        return self._output
+
+
+class ObservationInput(PlaceholderTfInput):
+    def __init__(self, observation_space, name=None):
+        """Creates an input placeholder tailored to a specific observation space
+        
+        Parameters
+        ----------
+
+        observation_space: 
+                observation space of the environment. Should be one of the gym.spaces types
+        name: str 
+                tensorflow name of the underlying placeholder
+        """
+        inpt, self.processed_inpt = observation_input(observation_space, name=name)
+        super().__init__(inpt)
+
+    def get(self):
+        return self.processed_inpt
+    
+    
--- a/baselines/gail/dataset/mujoco_dset.py
+++ b/baselines/gail/dataset/mujoco_dset.py
@@ -47,18 +47,12 @@ class Mujoco_Dset(object):
        obs = traj_data['obs'][:traj_limitation]
        acs = traj_data['acs'][:traj_limitation]

-        def flatten(x):
-            # x.shape = (E,), or (E, L, D)
-            _, size = x[0].shape
-            episode_length = [len(i) for i in x]
-            y = np.zeros((sum(episode_length), size))
-            start_idx = 0
-            for l, x_i in zip(episode_length, x):
-                y[start_idx:(start_idx+l)] = x_i
-                start_idx += l
-                return y
-        self.obs = np.array(flatten(obs))
-        self.acs = np.array(flatten(acs))
+        # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
+        # and S is the environment observation/action space.
+        # Flatten to (N * L, prod(S))
+        self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
+        self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
+
        self.rets = traj_data['ep_rets'][:traj_limitation]
        self.avg_ret = sum(self.rets)/len(self.rets)
        self.std_ret = np.std(np.array(self.rets))
--- a/baselines/her/experiment/config.py
+++ b/baselines/her/experiment/config.py
@@ -1,7 +1,4 @@
-from copy import deepcopy
 import numpy as np
-import json
-import os
 import gym

 from baselines import logger
@@ -10,7 +7,7 @@ from baselines.her.her import make_sample_her_transitions


 DEFAULT_ENV_PARAMS = {
-    'FetchReach-v0': {
+    'FetchReach-v1': {
        'n_cycles': 10,
    },
 }
@@ -51,6 +48,8 @@ DEFAULT_PARAMS = {


 CACHED_ENVS = {}
+
+
 def cached_make_env(make_env):
    """
    Only creates a new environment from the provided function if one has not yet already been
@@ -68,6 +67,7 @@ def prepare_params(kwargs):
    ddpg_params = dict()

    env_name = kwargs['env_name']
+
    def make_env():
        return gym.make(env_name)
    kwargs['make_env'] = make_env
@@ -75,7 +75,7 @@ def prepare_params(kwargs):
    assert hasattr(tmp_env, '_max_episode_steps')
    kwargs['T'] = tmp_env._max_episode_steps
    tmp_env.reset()
-    kwargs['max_u'] = np.array(kwargs['max_u']) if type(kwargs['max_u']) == list else kwargs['max_u']
+    kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
    kwargs['gamma'] = 1. - 1. / kwargs['T']
    if 'lr' in kwargs:
        kwargs['pi_lr'] = kwargs['lr']
@@ -83,7 +83,7 @@ def prepare_params(kwargs):
        del kwargs['lr']
    for name in ['buffer_size', 'hidden', 'layers',
                 'network_class',
-                 'polyak', 
+                 'polyak',
                 'batch_size', 'Q_lr', 'pi_lr',
                 'norm_eps', 'norm_clip', 'max_u',
                 'action_l2', 'clip_obs', 'scope', 'relative_goals']:
@@ -103,6 +103,7 @@ def log_params(params, logger=logger):
 def configure_her(params):
    env = cached_make_env(params['make_env'])
    env.reset()
+
    def reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

--- a/baselines/her/experiment/train.py
+++ b/baselines/her/experiment/train.py
@@ -13,6 +13,8 @@ import baselines.her.experiment.config as config
 from baselines.her.rollout import RolloutWorker
 from baselines.her.util import mpi_fork

+from subprocess import CalledProcessError
+

 def mpi_average(value):
    if value == []:
@@ -81,12 +83,17 @@ def train(policy, rollout_worker, evaluator,


 def launch(
-    env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
+    env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
    override_params={}, save_policies=True
 ):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
-        whoami = mpi_fork(num_cpu)
+        try:
+            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
+        except CalledProcessError:
+            # fancy version of mpi call failed, try simple version
+            whoami = mpi_fork(num_cpu)
+
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
@@ -109,10 +116,10 @@ def launch(

    # Prepare params.
    params = config.DEFAULT_PARAMS
-    params['env_name'] = env_name
+    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
-    if env_name in config.DEFAULT_ENV_PARAMS:
-        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
+    if env in config.DEFAULT_ENV_PARAMS:
+        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
@@ -126,7 +133,7 @@ def launch(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
-            'are looking to reproduce those results, be aware of this. Please also refer to ' + 
+            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()
@@ -168,7 +175,7 @@ def launch(


@click.command()
-@click.option('--env_name', type=str, default='FetchReach-v0', help='the name of the OpenAI Gym environment that you want to train on')
+@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on')
@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/')
@click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run')
@click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)')
--- a/baselines/her/util.py
+++ b/baselines/her/util.py
@@ -58,12 +58,12 @@ def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
    """Creates a simple neural network
    """
    for i, size in enumerate(layers_sizes):
-        activation = tf.nn.relu if i < len(layers_sizes)-1 else None
+        activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
        input = tf.layers.dense(inputs=input,
                                units=size,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                reuse=reuse,
-                                name=name+'_'+str(i))
+                                name=name + '_' + str(i))
        if activation:
            input = activation(input)
    if flatten:
@@ -85,7 +85,7 @@ def install_mpi_excepthook():
    sys.excepthook = new_hook


-def mpi_fork(n):
+def mpi_fork(n, extra_mpi_args=[]):
    """Re-launches the current script with workers
    Returns "parent" for original parent, "child" for MPI children
    """
@@ -99,14 +99,10 @@ def mpi_fork(n):
            IN_MPI="1"
        )
        # "-bind-to core" is crucial for good performance
-        args = [
-            "mpirun",
-            "-np",
-            str(n),
-            "-bind-to",
-            "core",
-            sys.executable
-        ]
+        args = ["mpirun", "-np", str(n)] + \
+            extra_mpi_args + \
+            [sys.executable]
+
        args += sys.argv
        subprocess.check_call(args, env=env)
        return "parent"
@@ -140,5 +136,5 @@ def reshape_for_broadcasting(source, target):
    before broadcasting it with MPI.
    """
    dim = len(target.get_shape())
-    shape = ([1] * (dim-1)) + [-1]
+    shape = ([1] * (dim - 1)) + [-1]
    return tf.reshape(tf.cast(source, target.dtype), shape)
--- a/baselines/logger.py
+++ b/baselines/logger.py
@@ -8,10 +8,6 @@ import datetime
 import tempfile
 from collections import defaultdict

-LOG_OUTPUT_FORMATS     = ['stdout', 'log', 'csv']
-LOG_OUTPUT_FORMATS_MPI = ['log']
-# Also valid: json, tensorboard
-
 DEBUG = 10
 INFO = 20
 WARN = 30
@@ -75,8 +71,11 @@ class HumanOutputFormat(KVWriter, SeqWriter):
        return s[:20] + '...' if len(s) > 23 else s

    def writeseq(self, seq):
-        for arg in seq:
-            self.file.write(arg)
+        seq = list(seq)
+        for (i, elem) in enumerate(seq):
+            self.file.write(elem)
+            if i < len(seq) - 1: # add space unless this is the last one
+                self.file.write(' ')
        self.file.write('\n')
        self.file.flush()

@@ -363,13 +362,11 @@ def configure(dir=None, format_strs=None):
        log_suffix = "-rank%03i" % rank

    if format_strs is None:
-        strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI')
-        format_strs = strs_mpi if rank>0 else strs
-        if format_strs is not None:
-            format_strs = format_strs.split(',')
+        if rank == 0:
+            format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',')
        else:
-            format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS
-
+            format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',')
+    format_strs = filter(None, format_strs)
    output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
--- a/baselines/ppo1/README.md
+++ b/baselines/ppo1/README.md
@@ -5,3 +5,5 @@
 - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
 - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.

+- Train the MuJoCo 3D humanoid (with well-tuned, though not necessarily optimal, hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
+- Render the trained 3D humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`
--- a/baselines/ppo1/pposgd_simple.py
+++ b/baselines/ppo1/pposgd_simple.py
@@ -212,5 +212,7 @@ def learn(env, policy_fn, *,
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

+    return pi
+
 def flatten_lists(listoflists):
    return [el for list_ in listoflists for el in list_]
--- a/baselines/ppo1/run_humanoid.py
+++ b/baselines/ppo1/run_humanoid.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+import os
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
+from baselines.common import tf_util as U
+from baselines import logger
+
+import gym
+
+def train(num_timesteps, seed, model_path=None):
+    env_id = 'Humanoid-v2'
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    U.make_session(num_cpu=1).__enter__()
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=64, num_hid_layers=2)
+    env = make_mujoco_env(env_id, seed)
+
+    # The parameters below were the best found in a simple random search.
+    # They are good enough to make the humanoid walk, but they are not
+    # guaranteed to be optimal.
+    env = RewScale(env, 0.1)
+    pi = pposgd_simple.learn(env, policy_fn,
+            max_timesteps=num_timesteps,
+            timesteps_per_actorbatch=2048,
+            clip_param=0.2, entcoeff=0.0,
+            optim_epochs=10, 
+            optim_stepsize=3e-4, 
+            optim_batchsize=64, 
+            gamma=0.99, 
+            lam=0.95,
+            schedule='linear',
+        )
+    env.close()
+    if model_path:
+        U.save_state(model_path)
+        
+    return pi
+
+class RewScale(gym.RewardWrapper):
+    def __init__(self, env, scale):
+        gym.RewardWrapper.__init__(self, env)
+        self.scale = scale
+    def reward(self, r):
+        return r * self.scale
+
+def main():
+    logger.configure()
+    parser = mujoco_arg_parser()
+    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
+    parser.set_defaults(num_timesteps=int(2e7))
+   
+    args = parser.parse_args()
+    
+    if not args.play:
+        # train the model
+        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
+    else:       
+        # construct the model object, load pre-trained model and render
+        pi = train(num_timesteps=1, seed=args.seed)
+        U.load_state(args.model_path)
+        env = make_mujoco_env('Humanoid-v2', seed=0)
+
+        ob = env.reset()        
+        while True:
+            action = pi.act(stochastic=False, ob=ob)[0]
+            ob, _, done, _ =  env.step(action)
+            env.render()
+            if done:
+                ob = env.reset()
+        
+        
+    
+
+if __name__ == '__main__':
+    main()
--- a/baselines/ppo1/run_robotics.py
+++ b/baselines/ppo1/run_robotics.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+from mpi4py import MPI
+from baselines.common import set_global_seeds
+from baselines import logger
+from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
+import mujoco_py
+
+
+def train(env_id, num_timesteps, seed):
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    import baselines.common.tf_util as U
+    rank = MPI.COMM_WORLD.Get_rank()
+    sess = U.single_threaded_session()
+    sess.__enter__()
+    mujoco_py.ignore_mujoco_warnings().__enter__()
+    workerseed = seed + 10000 * rank
+    set_global_seeds(workerseed)
+    env = make_robotics_env(env_id, workerseed, rank=rank)
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=256, num_hid_layers=3)
+
+    pposgd_simple.learn(env, policy_fn,
+            max_timesteps=num_timesteps,
+            timesteps_per_actorbatch=2048,
+            clip_param=0.2, entcoeff=0.0,
+            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
+            gamma=0.99, lam=0.95, schedule='linear',
+        )
+    env.close()
+
+
+def main():
+    args = robotics_arg_parser().parse_args()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+
+if __name__ == '__main__':
+    main()
--- a/baselines/ppo2/policies.py
+++ b/baselines/ppo2/policies.py
@@ -2,6 +2,7 @@ import numpy as np
 import tensorflow as tf
 from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
 from baselines.common.distributions import make_pdtype
+from baselines.common.input import observation_input

 def nature_cnn(unscaled_images, **conv_kwargs):
    """
@@ -19,14 +20,12 @@ def nature_cnn(unscaled_images, **conv_kwargs):
 class LnLstmPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X)
+            h = nature_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
@@ -56,11 +55,9 @@ class LstmPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
-
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
+
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
@@ -93,12 +90,10 @@ class LstmPolicy(object):
 class CnnPolicy(object):

    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X, **conv_kwargs)
+            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

@@ -120,15 +115,14 @@ class CnnPolicy(object):

 class MlpPolicy(object):
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
-        ob_shape = (nbatch,) + ob_space.shape
        self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
+            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
-            flatten = tf.layers.flatten
-            pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+            processed_x = tf.layers.flatten(processed_x)
+            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
-            vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

--- a/baselines/ppo2/ppo2.py
+++ b/baselines/ppo2/ppo2.py
@@ -236,6 +236,7 @@ def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
+    return model

 def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)
--- a/baselines/ppo2/run_mujoco.py
+++ b/baselines/ppo2/run_mujoco.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
-import argparse
+import numpy as np
 from baselines.common.cmd_util import mujoco_arg_parser
 from baselines import bench, logger

+
 def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
@@ -16,27 +17,40 @@ def train(env_id, num_timesteps, seed):
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()
+
    def make_env():
        env = gym.make(env_id)
-        env = bench.Monitor(env, logger.get_dir())
+        env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
        return env
+
    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
-    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
-        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
-        ent_coef=0.0,
-        lr=3e-4,
-        cliprange=0.2,
-        total_timesteps=num_timesteps)
+    model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
+                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
+                       ent_coef=0.0,
+                       lr=3e-4,
+                       cliprange=0.2,
+                       total_timesteps=num_timesteps)
+
+    return model, env


 def main():
    args = mujoco_arg_parser().parse_args()
    logger.configure()
-    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+    model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+    if args.play:
+        logger.log("Running trained model")
+        obs = np.zeros((env.num_envs,) + env.observation_space.shape)
+        obs[:] = env.reset()
+        while True:
+            actions = model.step(obs)[0]
+            obs[:]  = env.step(actions)[0]
+            env.render()


 if __name__ == '__main__':
Author	SHA1	Message	Date
Peter Zhokhov	2c818245d6	dummy commit to RUN BENCHMARKS	2018-07-25 18:09:30 -07:00
Peter Zhokhov	ae8e7fd16b	dummy commit to RUN BENCHMARKS	2018-07-25 18:07:56 -07:00
Adam Gleave	f272969325	GAIL: bugfix in dataset loading (#447 ) * Fix silly typo * Replace ad-hoc function with NumPy code	2018-07-06 16:12:14 -07:00
pzhokhov	a6b1bc70f1	re-import internal; fix missing tile_images.py (#427 ) * import rl-algs from 2e3a166 commit * extra import of the baselines badge * exported commit with identity test * proper rng seeding in the test_identity * import internal * adding missing tile_images.py	2018-06-08 09:41:45 -07:00
pzhokhov	36ee5d1707	Import internal changes (#422 ) * import rl-algs from 2e3a166 commit * extra import of the baselines badge * exported commit with identity test * proper rng seeding in the test_identity * import internal	2018-06-06 11:39:13 -07:00
pzhokhov	24fe3d6576	Import internal repo (#409 ) * import rl-algs from 2e3a166 commit * extra import of the baselines badge * exported commit with identity test * proper rng seeding in the test_identity	2018-05-21 15:24:00 -07:00