Import internal changes (#422)
* import rl-algs from 2e3a166 commit
* extra import of the baselines badge
* exported commit with identity test
* proper rng seeding in the test_identity
* import internal
baselines/common/cmd_util.py
@@ -3,6 +3,7 @@ Helpers for scripts like run_atari.py.
 """
 
 import os
+from mpi4py import MPI
 import gym
 from gym.wrappers import FlattenDictWrapper
 from baselines import logger
@@ -30,9 +31,10 @@ def make_mujoco_env(env_id, seed):
     """
     Create a wrapped, monitored gym.Env for MuJoCo.
     """
-    set_global_seeds(seed)
+    rank = MPI.COMM_WORLD.Get_rank()
+    set_global_seeds(seed + 10000 * rank)
     env = gym.make(env_id)
-    env = Monitor(env, logger.get_dir())
+    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
     env.seed(seed)
     return env
 
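The seeding change above is the "proper rng seeding" the commit message refers to: when the script runs under `mpirun`, every rank previously seeded identically and produced correlated rollouts. A minimal sketch of the idea, assuming `mpi4py` is installed (`worker_seed` and the `stride` parameter are illustrative names; the diff hardcodes a stride of 10000):

```python
from mpi4py import MPI

def worker_seed(base_seed, stride=10000):
    # Derive a distinct, reproducible seed for each MPI rank from one
    # shared base seed, so parallel workers explore independently.
    rank = MPI.COMM_WORLD.Get_rank()
    return base_seed + stride * rank
```

The rank-suffixed Monitor path serves the same end: each worker records its episode statistics to its own path instead of clobbering a shared one.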
baselines/common/tf_util.py
@@ -55,7 +55,6 @@ def make_session(num_cpu=None, make_default=False, graph=None):
     tf_config = tf.ConfigProto(
         inter_op_parallelism_threads=num_cpu,
         intra_op_parallelism_threads=num_cpu)
-    tf_config.gpu_options.allocator_type = 'BFC'
     if make_default:
         return tf.InteractiveSession(config=tf_config, graph=graph)
     else:
baselines/common/vec_env/__init__.py
@@ -77,7 +77,7 @@ class VecEnv(ABC):
         self.step_async(actions)
         return self.step_wait()
 
-    def render(self):
+    def render(self, mode='human'):
         logger.warn('Render not defined for %s'%self)
 
     @property
baselines/common/vec_env/dummy_vec_env.py
@@ -50,8 +50,8 @@ class DummyVecEnv(VecEnv):
     def close(self):
         return
 
-    def render(self):
-        return [e.render() for e in self.envs]
+    def render(self, mode='human'):
+        return [e.render(mode=mode) for e in self.envs]
 
     def _save_obs(self, e, obs):
         for k in self.keys:
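The new `mode` keyword brings the vec-env API in line with `gym.Env.render(mode=...)`. A quick smoke test of the `DummyVecEnv` behavior, assuming `baselines` and a gym of this commit's era (pre-0.26 render API) are installed; CartPole is used only as a cheap example:

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
venv.reset()
frames = venv.render(mode='rgb_array')  # one frame per wrapped env
print(len(frames), frames[0].shape)     # e.g. 1 (400, 600, 3)
```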
baselines/common/vec_env/subproc_vec_env.py
@@ -1,6 +1,7 @@
 import numpy as np
 from multiprocessing import Process, Pipe
 from baselines.common.vec_env import VecEnv, CloudpickleWrapper
+from baselines.common.tile_images import tile_images
 
 
 def worker(remote, parent_remote, env_fn_wrapper):
@@ -16,9 +17,8 @@ def worker(remote, parent_remote, env_fn_wrapper):
         elif cmd == 'reset':
             ob = env.reset()
             remote.send(ob)
-        elif cmd == 'reset_task':
-            ob = env.reset_task()
-            remote.send(ob)
+        elif cmd == 'render':
+            remote.send(env.render(mode='rgb_array'))
         elif cmd == 'close':
             remote.close()
             break
@@ -81,3 +81,17 @@ class SubprocVecEnv(VecEnv):
         for p in self.ps:
             p.join()
         self.closed = True
+
+    def render(self, mode='human'):
+        for pipe in self.remotes:
+            pipe.send(('render', None))
+        imgs = [pipe.recv() for pipe in self.remotes]
+        bigimg = tile_images(imgs)
+        if mode == 'human':
+            import cv2
+            cv2.imshow('vecenv', bigimg[:,:,::-1])
+            cv2.waitKey(1)
+        elif mode == 'rgb_array':
+            return bigimg
+        else:
+            raise NotImplementedError
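Each worker answers the `'render'` message with an RGB frame, and `tile_images` packs the per-env frames into one grid image before display (the `[:,:,::-1]` reverses the channel order because OpenCV expects BGR). The diff does not show `tile_images` itself; a sketch of what such a helper plausibly does, with no claim to match the real `baselines.common.tile_images` in detail:

```python
import numpy as np

def tile_images(imgs):
    # Pack N HxWxC frames into a near-square grid, padding with
    # black frames so the grid is rectangular.
    imgs = np.asarray(imgs)
    n, h, w, c = imgs.shape
    rows = int(np.ceil(np.sqrt(n)))
    cols = int(np.ceil(n / rows))
    pad = rows * cols - n
    imgs = np.concatenate([imgs, np.zeros((pad, h, w, c), imgs.dtype)])
    grid = imgs.reshape(rows, cols, h, w, c)
    # Interleave grid rows with image rows, then flatten to one image.
    return grid.transpose(0, 2, 1, 3, 4).reshape(rows * h, cols * w, c)
```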
baselines/logger.py
@@ -8,10 +8,6 @@ import datetime
 import tempfile
 from collections import defaultdict
 
-LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
-LOG_OUTPUT_FORMATS_MPI = ['log']
-# Also valid: json, tensorboard
-
 DEBUG = 10
 INFO = 20
 WARN = 30
@@ -75,8 +71,11 @@ class HumanOutputFormat(KVWriter, SeqWriter):
         return s[:20] + '...' if len(s) > 23 else s
 
     def writeseq(self, seq):
-        for arg in seq:
-            self.file.write(arg)
+        seq = list(seq)
+        for (i, elem) in enumerate(seq):
+            self.file.write(elem)
+            if i < len(seq) - 1: # add space unless this is the last one
+                self.file.write(' ')
         self.file.write('\n')
         self.file.flush()
 
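The rewrite materializes `seq` (which may be a generator) so its length is known, then separates elements with single spaces instead of butting them together, with no trailing space before the newline. A self-contained illustration of the new behavior; the sample values are made up:

```python
import io

buf = io.StringIO()
seq = [str(x) for x in ('loss:', 0.25, 'step:', 100)]
for i, elem in enumerate(seq):
    buf.write(elem)
    if i < len(seq) - 1:  # space between elements, none at the end
        buf.write(' ')
buf.write('\n')
assert buf.getvalue() == 'loss: 0.25 step: 100\n'
```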
@@ -363,13 +362,11 @@ def configure(dir=None, format_strs=None):
         log_suffix = "-rank%03i" % rank
 
     if format_strs is None:
-        strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI')
-        format_strs = strs_mpi if rank>0 else strs
-        if format_strs is not None:
-            format_strs = format_strs.split(',')
+        if rank == 0:
+            format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',')
         else:
-            format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS
+            format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',')
+    format_strs = filter(None, format_strs)
     output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
 
     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
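With the module-level `LOG_OUTPUT_FORMATS` constants gone, the defaults now live inline in the `os.getenv` calls: rank 0 honors `OPENAI_LOG_FORMAT` (default `stdout,log,csv`), while nonzero MPI ranks honor `OPENAI_LOG_FORMAT_MPI` (default `log`, so workers stay quiet). The `filter(None, ...)` drops empty entries, so setting a variable to the empty string disables output for that rank. A usage sketch, assuming `baselines` is installed:

```python
import os

# Illustrative override: human-readable and CSV output only.
os.environ['OPENAI_LOG_FORMAT'] = 'stdout,csv'

from baselines import logger

logger.configure()          # picks up the env var on rank 0
logger.logkv('loss', 0.25)  # buffer a key/value pair
logger.dumpkvs()            # flush the row to every output format
```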
baselines/ppo1/README.md
@@ -5,3 +5,5 @@
 - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
 - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
 
+- Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
+- Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`
baselines/ppo1/pposgd_simple.py
@@ -212,5 +212,7 @@ def learn(env, policy_fn, *,
         if MPI.COMM_WORLD.Get_rank()==0:
             logger.dump_tabular()
 
+    return pi
+
 def flatten_lists(listoflists):
     return [el for list_ in listoflists for el in list_]
baselines/ppo1/run_humanoid.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+import os
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
+from baselines.common import tf_util as U
+from baselines import logger
+
+import gym
+
+def train(num_timesteps, seed, model_path=None):
+    env_id = 'Humanoid-v2'
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    U.make_session(num_cpu=1).__enter__()
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=64, num_hid_layers=2)
+    env = make_mujoco_env(env_id, seed)
+
+    # parameters below were the best found in a simple random search
+    # these are good enough to make humanoid walk, but whether those are
+    # an absolute best or not is not certain
+    env = RewScale(env, 0.1)
+    pi = pposgd_simple.learn(env, policy_fn,
+            max_timesteps=num_timesteps,
+            timesteps_per_actorbatch=2048,
+            clip_param=0.2, entcoeff=0.0,
+            optim_epochs=10,
+            optim_stepsize=3e-4,
+            optim_batchsize=64,
+            gamma=0.99,
+            lam=0.95,
+            schedule='linear',
+        )
+    env.close()
+    if model_path:
+        U.save_state(model_path)
+
+    return pi
+
+class RewScale(gym.RewardWrapper):
+    def __init__(self, env, scale):
+        gym.RewardWrapper.__init__(self, env)
+        self.scale = scale
+    def reward(self, r):
+        return r * self.scale
+
+def main():
+    logger.configure()
+    parser = mujoco_arg_parser()
+    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
+    parser.set_defaults(num_timesteps=int(2e7))
+
+    args = parser.parse_args()
+
+    if not args.play:
+        # train the model
+        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
+    else:
+        # construct the model object, load pre-trained model and render
+        pi = train(num_timesteps=1, seed=args.seed)
+        U.load_state(args.model_path)
+        env = make_mujoco_env('Humanoid-v2', seed=0)
+
+        ob = env.reset()
+        while True:
+            action = pi.act(stochastic=False, ob=ob)[0]
+            ob, _, done, _ = env.step(action)
+            env.render()
+            if done:
+                ob = env.reset()
+
+
+
+
+if __name__ == '__main__':
+    main()
baselines/ppo1/run_robotics.py (new file, 40 lines)
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+from mpi4py import MPI
+from baselines.common import set_global_seeds
+from baselines import logger
+from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
+import mujoco_py
+
+
+def train(env_id, num_timesteps, seed):
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    import baselines.common.tf_util as U
+    rank = MPI.COMM_WORLD.Get_rank()
+    sess = U.single_threaded_session()
+    sess.__enter__()
+    mujoco_py.ignore_mujoco_warnings().__enter__()
+    workerseed = seed + 10000 * rank
+    set_global_seeds(workerseed)
+    env = make_robotics_env(env_id, workerseed, rank=rank)
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=256, num_hid_layers=3)
+
+    pposgd_simple.learn(env, policy_fn,
+            max_timesteps=num_timesteps,
+            timesteps_per_actorbatch=2048,
+            clip_param=0.2, entcoeff=0.0,
+            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
+            gamma=0.99, lam=0.95, schedule='linear',
+        )
+    env.close()
+
+
+def main():
+    args = robotics_arg_parser().parse_args()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+
+if __name__ == '__main__':
+    main()
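Both new scripts call `.__enter__()` on the TF session (and on `mujoco_py.ignore_mujoco_warnings()`) without a matching exit, deliberately holding those contexts open for the life of the process. For reference, the equivalent scoped form, a sketch assuming the same baselines `tf_util` module:

```python
import baselines.common.tf_util as U

with U.single_threaded_session():
    # build the policy and run training here; the TF session is
    # closed automatically when the block exits
    pass
```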