Compare commits

...

5 Commits

Author · SHA1 · Message · Date

Peter Zhokhov · 2c818245d6 · dummy commit to RUN BENCHMARKS · 2018-07-25 18:09:30 -07:00

Peter Zhokhov · ae8e7fd16b · dummy commit to RUN BENCHMARKS · 2018-07-25 18:07:56 -07:00

Adam Gleave · f272969325 · GAIL: bugfix in dataset loading (#447) · 2018-07-06 16:12:14 -07:00
  * Fix silly typo
  * Replace ad-hoc function with NumPy code

pzhokhov · a6b1bc70f1 · re-import internal; fix missing tile_images.py (#427) · 2018-06-08 09:41:45 -07:00
  * import rl-algs from 2e3a166 commit
  * extra import of the baselines badge
  * exported commit with identity test
  * proper rng seeding in the test_identity
  * import internal
  * adding missing tile_images.py

pzhokhov · 36ee5d1707 · Import internal changes (#422) · 2018-06-06 11:39:13 -07:00
  * import rl-algs from 2e3a166 commit
  * extra import of the baselines badge
  * exported commit with identity test
  * proper rng seeding in the test_identity
  * import internal
12 changed files with 181 additions and 33 deletions

View File

@@ -3,6 +3,7 @@ Helpers for scripts like run_atari.py.
"""
import os
from mpi4py import MPI
import gym
from gym.wrappers import FlattenDictWrapper
from baselines import logger
@@ -30,9 +31,10 @@ def make_mujoco_env(env_id, seed):
"""
Create a wrapped, monitored gym.Env for MuJoCo.
"""
set_global_seeds(seed)
rank = MPI.COMM_WORLD.Get_rank()
set_global_seeds(seed + 10000 * rank)
env = gym.make(env_id)
env = Monitor(env, logger.get_dir())
env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
env.seed(seed)
return env
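
For reference, a minimal sketch of what the per-rank seeding above produces; the base seed, log directory, and worker count here are only illustrative:

    import os
    base_seed, logdir = 0, '/tmp/logs'                  # illustrative values
    for rank in range(4):                               # pretend 4 MPI workers
        worker_seed = base_seed + 10000 * rank          # 0, 10000, 20000, 30000
        monitor_path = os.path.join(logdir, str(rank))  # one Monitor file per rank

Each MPI worker therefore gets a distinct global seed and writes its own monitor file instead of all ranks sharing one.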

View File

@@ -55,7 +55,6 @@ def make_session(num_cpu=None, make_default=False, graph=None):
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=num_cpu,
        intra_op_parallelism_threads=num_cpu)
-   tf_config.gpu_options.allocator_type = 'BFC'
    if make_default:
        return tf.InteractiveSession(config=tf_config, graph=graph)
    else:

View File

@@ -0,0 +1,23 @@
import numpy as np

def tile_images(img_nhwc):
    """
    Tile N images into one big PxQ image
    (P,Q) are chosen to be as close as possible, and if N
    is square, then P=Q.

    input: img_nhwc, list or array of images, ndim=4 once turned into array
        n = batch index, h = height, w = width, c = channel
    returns:
        bigim_HWc, ndarray with ndim=3
    """
    img_nhwc = np.asarray(img_nhwc)
    N, h, w, c = img_nhwc.shape
    H = int(np.ceil(np.sqrt(N)))
    W = int(np.ceil(float(N)/H))
    img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
    img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
    img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
    img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
    return img_Hh_Ww_c
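
A quick shape check for the helper above; the frame count and resolution are made-up values for illustration:

    import numpy as np
    from baselines.common.tile_images import tile_images

    # 5 RGB frames of 64x64: N=5 gives a 3x2 grid (H=3, W=2), with one
    # all-zero padding tile filling the unused cell.
    frames = np.random.randint(0, 255, size=(5, 64, 64, 3), dtype=np.uint8)
    big = tile_images(frames)
    print(big.shape)   # (192, 128, 3) == (H*h, W*w, c)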

View File

@@ -77,7 +77,7 @@ class VecEnv(ABC):
        self.step_async(actions)
        return self.step_wait()

-   def render(self):
+   def render(self, mode='human'):
        logger.warn('Render not defined for %s'%self)

    @property

View File

@@ -50,8 +50,8 @@ class DummyVecEnv(VecEnv):
    def close(self):
        return

-   def render(self):
-       return [e.render() for e in self.envs]
+   def render(self, mode='human'):
+       return [e.render(mode=mode) for e in self.envs]

    def _save_obs(self, e, obs):
        for k in self.keys:

View File

@@ -1,6 +1,7 @@
import numpy as np
from multiprocessing import Process, Pipe
from baselines.common.vec_env import VecEnv, CloudpickleWrapper
+ from baselines.common.tile_images import tile_images

def worker(remote, parent_remote, env_fn_wrapper):
@@ -16,9 +17,8 @@ def worker(remote, parent_remote, env_fn_wrapper):
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
-       elif cmd == 'reset_task':
-           ob = env.reset_task()
-           remote.send(ob)
+       elif cmd == 'render':
+           remote.send(env.render(mode='rgb_array'))
        elif cmd == 'close':
            remote.close()
            break
@@ -81,3 +81,17 @@ class SubprocVecEnv(VecEnv):
        for p in self.ps:
            p.join()
        self.closed = True

+   def render(self, mode='human'):
+       for pipe in self.remotes:
+           pipe.send(('render', None))
+       imgs = [pipe.recv() for pipe in self.remotes]
+       bigimg = tile_images(imgs)
+       if mode == 'human':
+           import cv2
+           cv2.imshow('vecenv', bigimg[:,:,::-1])
+           cv2.waitKey(1)
+       elif mode == 'rgb_array':
+           return bigimg
+       else:
+           raise NotImplementedError
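
A minimal usage sketch of the new vectorized render path; the environment id and worker count are assumptions for illustration, and a rendering backend/display is assumed to be available:

    import gym
    from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

    # two worker processes; each remote answers the 'render' command with an
    # rgb_array, and the parent tiles them into one image
    venv = SubprocVecEnv([lambda: gym.make('CartPole-v0') for _ in range(2)])
    venv.reset()
    frame = venv.render(mode='rgb_array')   # ndarray of shape (H*h, W*w, 3)
    venv.close()

With mode='human' the same tiled image is instead shown in a cv2 window.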

View File

@@ -47,18 +47,12 @@ class Mujoco_Dset(object):
        obs = traj_data['obs'][:traj_limitation]
        acs = traj_data['acs'][:traj_limitation]

-       def flatten(x):
-           # x.shape = (E,), or (E, L, D)
-           _, size = x[0].shape
-           episode_length = [len(i) for i in x]
-           y = np.zeros((sum(episode_length), size))
-           start_idx = 0
-           for l, x_i in zip(episode_length, x):
-               y[start_idx:(start_idx+l)] = x_i
-               start_idx += l
-           return y
-       self.obs = np.array(flatten(obs))
-       self.acs = np.array(flatten(acs))
+       # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
+       # and S is the environment observation/action space.
+       # Flatten to (N * L, prod(S))
+       self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
+       self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])

        self.rets = traj_data['ep_rets'][:traj_limitation]
        self.avg_ret = sum(self.rets)/len(self.rets)
        self.std_ret = np.std(np.array(self.rets))
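
The new NumPy reshape assumes every episode in the expert data has the same length L, so the (N, L) + S arrays collapse directly to (N * L, prod(S)). A tiny illustration with made-up shapes:

    import numpy as np

    N, L, D = 3, 5, 11                                    # 3 episodes, 5 steps, 11-dim obs
    obs = np.zeros((N, L, D))
    flat = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
    print(flat.shape)                                     # (15, 11) == (N * L, D)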

View File

@@ -8,10 +8,6 @@ import datetime
import tempfile
from collections import defaultdict

- LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
- LOG_OUTPUT_FORMATS_MPI = ['log']
- # Also valid: json, tensorboard

DEBUG = 10
INFO = 20
WARN = 30
@@ -75,8 +71,11 @@ class HumanOutputFormat(KVWriter, SeqWriter):
        return s[:20] + '...' if len(s) > 23 else s

    def writeseq(self, seq):
-       for arg in seq:
-           self.file.write(arg)
+       seq = list(seq)
+       for (i, elem) in enumerate(seq):
+           self.file.write(elem)
+           if i < len(seq) - 1: # add space unless this is the last one
+               self.file.write(' ')
        self.file.write('\n')
        self.file.flush()
@@ -363,13 +362,11 @@ def configure(dir=None, format_strs=None):
        log_suffix = "-rank%03i" % rank

    if format_strs is None:
-       strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI')
-       format_strs = strs_mpi if rank>0 else strs
-       if format_strs is not None:
-           format_strs = format_strs.split(',')
+       if rank == 0:
+           format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',')
        else:
-           format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS
+           format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',')
    format_strs = filter(None, format_strs)
    output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
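
With this change the output formats are resolved purely from environment variables, with rank-dependent defaults. A short sketch of the resolution logic; the values mirror the defaults in the diff:

    import os

    rank = 0   # MPI rank of the current process; workers have rank > 0
    if rank == 0:
        format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',')
    else:
        format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',')
    # e.g. OPENAI_LOG_FORMAT="stdout,csv,tensorboard" enables TensorBoard output on rank 0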

View File

@@ -5,3 +5,5 @@
- `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
- `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
+ - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
+ - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`

View File

@@ -212,5 +212,7 @@ def learn(env, policy_fn, *,
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

+   return pi

def flatten_lists(listoflists):
    return [el for list_ in listoflists for el in list_]

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
import os
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
from baselines.common import tf_util as U
from baselines import logger

import gym

def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi

class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale
    def reward(self, r):
        return r * self.scale

def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3

from mpi4py import MPI
from baselines.common import set_global_seeds
from baselines import logger
from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
import mujoco_py

def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=256, num_hid_layers=3)

    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
            gamma=0.99, lam=0.95, schedule='linear',
        )
    env.close()

def main():
    args = robotics_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()