Compare commits


6 Commits

Author SHA1 Message Date
Peter Zhokhov
2c818245d6 dummy commit to RUN BENCHMARKS 2018-07-25 18:09:30 -07:00
Peter Zhokhov
ae8e7fd16b dummy commit to RUN BENCHMARKS 2018-07-25 18:07:56 -07:00
Adam Gleave
f272969325 GAIL: bugfix in dataset loading (#447)
* Fix silly typo

* Replace ad-hoc function with NumPy code
2018-07-06 16:12:14 -07:00
pzhokhov
a6b1bc70f1 re-import internal; fix missing tile_images.py (#427)
* import rl-algs from 2e3a166 commit

* extra import of the baselines badge

* exported commit with identity test

* proper rng seeding in the test_identity

* import internal

* adding missing tile_images.py
2018-06-08 09:41:45 -07:00
pzhokhov
36ee5d1707 Import internal changes (#422)
* import rl-algs from 2e3a166 commit

* extra import of the baselines badge

* exported commit with identity test

* proper rng seeding in the test_identity

* import internal
2018-06-06 11:39:13 -07:00
pzhokhov
24fe3d6576 Import internal repo (#409)
* import rl-algs from 2e3a166 commit

* extra import of the baselines badge

* exported commit with identity test

* proper rng seeding in the test_identity
2018-05-21 15:24:00 -07:00
24 changed files with 408 additions and 99 deletions

View File

@@ -1,4 +1,4 @@
-<img src="data/logo.jpg" width=25% align="right" />
+<img src="data/logo.jpg" width=25% align="right" /> [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines)
 
 # Baselines

View File

@@ -131,7 +131,6 @@ class Runner(AbstractEnvRunner):
         return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
 
 def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
-    tf.reset_default_graph()
     set_global_seeds(seed)
 
     nenvs = env.num_envs
@@ -158,3 +157,4 @@ def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, e
             logger.record_tabular("explained_variance", float(ev))
             logger.dump_tabular()
     env.close()
+    return model
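
Illustrative only, not part of the diff: with learn() now returning the trained model, a caller (like the new identity test below) can keep using the policy after training. Names env and obs are assumed to be a VecEnv and its reset observation.

    # Sketch: learn() now returns the model instead of discarding it.
    model = learn(policy=CnnPolicy, env=env, seed=0, total_timesteps=10000)
    actions = model.step(obs)[0]  # first element of step() is the action batch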

View File

@@ -2,6 +2,7 @@ import numpy as np
 import tensorflow as tf
 from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
 from baselines.common.distributions import make_pdtype
+from baselines.common.input import observation_input
 
 def nature_cnn(unscaled_images, **conv_kwargs):
     """
@@ -19,14 +20,12 @@ def nature_cnn(unscaled_images, **conv_kwargs):
 class LnLstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
         nenv = nbatch // nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         self.pdtype = make_pdtype(ac_space)
         with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X)
+            h = nature_cnn(processed_x)
             xs = batch_to_seq(h, nenv, nsteps)
             ms = batch_to_seq(M, nenv, nsteps)
             h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
@@ -56,11 +55,9 @@ class LstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
         nenv = nbatch // nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
         self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         with tf.variable_scope("model", reuse=reuse):
@@ -93,12 +90,10 @@ class LstmPolicy(object):
 class CnnPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
         self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
         with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X, **conv_kwargs)
+            h = nature_cnn(processed_x, **conv_kwargs)
             vf = fc(h, 'v', 1)[:,0]
             self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
@@ -120,15 +115,14 @@ class CnnPolicy(object):
 class MlpPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
-        ob_shape = (nbatch,) + ob_space.shape
         self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
         with tf.variable_scope("model", reuse=reuse):
+            X, processed_x = observation_input(ob_space, nbatch)
             activ = tf.tanh
-            flatten = tf.layers.flatten
-            pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+            processed_x = tf.layers.flatten(processed_x)
+            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
             pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
-            vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
             vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
             vf = fc(vf_h2, 'vf', 1)[:,0]

View File

@@ -3,6 +3,7 @@ Helpers for scripts like run_atari.py.
 """
 import os
+from mpi4py import MPI
 import gym
 from gym.wrappers import FlattenDictWrapper
 from baselines import logger
@@ -30,9 +31,10 @@ def make_mujoco_env(env_id, seed):
     """
     Create a wrapped, monitored gym.Env for MuJoCo.
     """
-    set_global_seeds(seed)
+    rank = MPI.COMM_WORLD.Get_rank()
+    set_global_seeds(seed + 10000 * rank)
     env = gym.make(env_id)
-    env = Monitor(env, logger.get_dir())
+    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
     env.seed(seed)
     return env

View File

@@ -0,0 +1,30 @@
+from gym import Env
+from gym.spaces import Discrete
+
+
+class IdentityEnv(Env):
+    def __init__(
+            self,
+            dim,
+            ep_length=100,
+    ):
+        self.action_space = Discrete(dim)
+        self.reset()
+
+    def reset(self):
+        self._choose_next_state()
+        self.observation_space = self.action_space
+        return self.state
+
+    def step(self, actions):
+        rew = self._get_reward(actions)
+        self._choose_next_state()
+        return self.state, rew, False, {}
+
+    def _choose_next_state(self):
+        self.state = self.action_space.sample()
+
+    def _get_reward(self, actions):
+        return 1 if self.state == actions else 0
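
Quick interaction sketch (illustrative, not part of the diff): the optimal policy simply echoes the current observation back as the action, since reward is computed against the state before it is resampled.

    # Echoing the observation earns reward 1 on every step.
    env = IdentityEnv(10)
    ob = env.reset()
    for _ in range(5):
        ob, rew, done, _ = env.step(ob)  # action == previous observation
        assert rew == 1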

baselines/common/input.py (new file, +30 lines)
View File

@@ -0,0 +1,30 @@
+import tensorflow as tf
+from gym.spaces import Discrete, Box
+
+def observation_input(ob_space, batch_size=None, name='Ob'):
+    '''
+    Build observation input with encoding depending on the
+    observation space type
+
+    Params:
+
+    ob_space: observation space (should be one of gym.spaces)
+    batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size)
+    name: tensorflow variable name for input placeholder
+
+    returns: tuple (input_placeholder, processed_input_tensor)
+    '''
+    if isinstance(ob_space, Discrete):
+        input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name)
+        processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n))
+        return input_x, processed_x
+
+    elif isinstance(ob_space, Box):
+        input_shape = (batch_size,) + ob_space.shape
+        input_x = tf.placeholder(shape=input_shape, dtype=ob_space.dtype, name=name)
+        processed_x = tf.to_float(input_x)
+        return input_x, processed_x
+
+    else:
+        raise NotImplementedError
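
For illustration only (not part of the diff), a sketch of what the helper returns for each supported space type; the concrete spaces below are made up:

    from gym.spaces import Discrete, Box
    import numpy as np

    # Discrete(4): int32 placeholder of shape (None,), one-hot float32 tensor of shape (None, 4)
    X, processed_x = observation_input(Discrete(4))

    # Box: placeholder with the space's own dtype, cast to float32 for the network
    X, processed_x = observation_input(
        Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8), batch_size=32)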

View File

@@ -0,0 +1,18 @@
+import numpy as np
+from abc import ABC, abstractmethod
+
+class AbstractEnvRunner(ABC):
+    def __init__(self, *, env, model, nsteps):
+        self.env = env
+        self.model = model
+        nenv = env.num_envs
+        self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
+        self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
+        self.obs[:] = env.reset()
+        self.nsteps = nsteps
+        self.states = model.initial_state
+        self.dones = [False for _ in range(nenv)]
+
+    @abstractmethod
+    def run(self):
+        raise NotImplementedError
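
Hypothetical subclass sketch (not in the diff): concrete runners implement run() to collect nsteps of experience from self.env using self.model; here random actions stand in for a policy.

    class RandomRunner(AbstractEnvRunner):
        """Toy runner: steps the vec-env with random actions for nsteps."""
        def run(self):
            minibatch = []
            for _ in range(self.nsteps):
                actions = [self.env.action_space.sample() for _ in range(self.env.num_envs)]
                self.obs[:], rewards, self.dones, _ = self.env.step(actions)
                minibatch.append((self.obs.copy(), rewards, self.dones))
            return minibatch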

View File

@@ -0,0 +1,44 @@
+import pytest
+import tensorflow as tf
+import random
+import numpy as np
+from gym.spaces import np_random
+
+from baselines.a2c import a2c
+from baselines.ppo2 import ppo2
+from baselines.common.identity_env import IdentityEnv
+from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
+from baselines.ppo2.policies import MlpPolicy
+
+learn_func_list = [
+    lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000),
+    lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01)
+]
+
+@pytest.mark.slow
+@pytest.mark.parametrize("learn_func", learn_func_list)
+def test_identity(learn_func):
+    '''
+    Test if the algorithm (with a given policy)
+    can learn an identity transformation (i.e. return observation as an action)
+    '''
+    np.random.seed(0)
+    np_random.seed(0)
+    random.seed(0)
+
+    env = DummyVecEnv([lambda: IdentityEnv(10)])
+
+    with tf.Graph().as_default(), tf.Session().as_default():
+        tf.set_random_seed(0)
+        model = learn_func(env)
+
+        N_TRIALS = 1000
+        sum_rew = 0
+        obs = env.reset()
+        for i in range(N_TRIALS):
+            obs, rew, done, _ = env.step(model.step(obs)[0])
+            sum_rew += rew
+
+        assert sum_rew > 0.9 * N_TRIALS

View File

@@ -55,7 +55,6 @@ def make_session(num_cpu=None, make_default=False, graph=None):
     tf_config = tf.ConfigProto(
         inter_op_parallelism_threads=num_cpu,
         intra_op_parallelism_threads=num_cpu)
-    tf_config.gpu_options.allocator_type = 'BFC'
     if make_default:
         return tf.InteractiveSession(config=tf_config, graph=graph)
     else:

View File

@@ -0,0 +1,23 @@
+import numpy as np
+
+def tile_images(img_nhwc):
+    """
+    Tile N images into one big PxQ image
+    (P,Q) are chosen to be as close as possible, and if N
+    is square, then P=Q.
+
+    input: img_nhwc, list or array of images, ndim=4 once turned into array
+        n = batch index, h = height, w = width, c = channel
+    returns:
+        bigim_HWc, ndarray with ndim=3
+    """
+    img_nhwc = np.asarray(img_nhwc)
+    N, h, w, c = img_nhwc.shape
+    H = int(np.ceil(np.sqrt(N)))
+    W = int(np.ceil(float(N)/H))
+    img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
+    img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
+    img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
+    img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
+    return img_Hh_Ww_c
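
Shape sketch with made-up sizes (illustrative, not part of the diff):

    import numpy as np
    imgs = np.zeros((10, 64, 64, 3))   # 10 RGB frames
    big = tile_images(imgs)
    # H = ceil(sqrt(10)) = 4, W = ceil(10/4) = 3; two all-zero padding tiles are appended
    assert big.shape == (4 * 64, 3 * 64, 3)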

View File

@@ -77,7 +77,7 @@ class VecEnv(ABC):
         self.step_async(actions)
         return self.step_wait()
 
-    def render(self):
+    def render(self, mode='human'):
         logger.warn('Render not defined for %s'%self)
 
     @property

View File

@@ -11,18 +11,18 @@ class DummyVecEnv(VecEnv):
         shapes, dtypes = {}, {}
         self.keys = []
         obs_space = env.observation_space
         if isinstance(obs_space, spaces.Dict):
             assert isinstance(obs_space.spaces, OrderedDict)
-            for key, box in obs_space.spaces.items():
-                assert isinstance(box, spaces.Box)
-                shapes[key] = box.shape
-                dtypes[key] = box.dtype
-                self.keys.append(key)
+            subspaces = obs_space.spaces
         else:
-            box = obs_space
-            assert isinstance(box, spaces.Box)
-            self.keys = [None]
-            shapes, dtypes = { None: box.shape }, { None: box.dtype }
+            subspaces = {None: obs_space}
+        for key, box in subspaces.items():
+            shapes[key] = box.shape
+            dtypes[key] = box.dtype
+            self.keys.append(key)
         self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
         self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
         self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
@@ -50,8 +50,8 @@ class DummyVecEnv(VecEnv):
     def close(self):
         return
 
-    def render(self):
-        return [e.render() for e in self.envs]
+    def render(self, mode='human'):
+        return [e.render(mode=mode) for e in self.envs]
 
     def _save_obs(self, e, obs):
         for k in self.keys:

View File

@@ -1,6 +1,7 @@
 import numpy as np
 from multiprocessing import Process, Pipe
 from baselines.common.vec_env import VecEnv, CloudpickleWrapper
+from baselines.common.tile_images import tile_images
 
 def worker(remote, parent_remote, env_fn_wrapper):
@@ -16,9 +17,8 @@ def worker(remote, parent_remote, env_fn_wrapper):
         elif cmd == 'reset':
             ob = env.reset()
             remote.send(ob)
-        elif cmd == 'reset_task':
-            ob = env.reset_task()
-            remote.send(ob)
+        elif cmd == 'render':
+            remote.send(env.render(mode='rgb_array'))
         elif cmd == 'close':
            remote.close()
            break
@@ -81,3 +81,17 @@ class SubprocVecEnv(VecEnv):
         for p in self.ps:
             p.join()
         self.closed = True
+
+    def render(self, mode='human'):
+        for pipe in self.remotes:
+            pipe.send(('render', None))
+        imgs = [pipe.recv() for pipe in self.remotes]
+        bigimg = tile_images(imgs)
+        if mode == 'human':
+            import cv2
+            cv2.imshow('vecenv', bigimg[:,:,::-1])
+            cv2.waitKey(1)
+        elif mode == 'rgb_array':
+            return bigimg
+        else:
+            raise NotImplementedError
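
Usage sketch (illustrative; env_fns and the cv2 dependency are assumptions, not part of the diff):

    # Tile live frames from 4 subprocess envs into one image.
    venv = SubprocVecEnv(env_fns)          # env_fns: list of 4 thunks returning gym envs
    venv.reset()
    venv.render(mode='human')              # pops a tiled 'vecenv' window via cv2
    frame = venv.render(mode='rgb_array')  # or get the tiled frame as an ndarray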

View File

@@ -9,7 +9,7 @@ import baselines.common.tf_util as U
 from baselines import logger
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer
-from baselines.deepq.utils import BatchInput
+from baselines.deepq.utils import ObservationInput
 from baselines.common.schedules import LinearSchedule
@@ -28,7 +28,7 @@ if __name__ == '__main__':
     env = gym.make("CartPole-v0")
     # Create all the functions necessary to train the model
     act, train, update_target, debug = deepq.build_train(
-        make_obs_ph=lambda name: BatchInput(env.observation_space.shape, name=name),
+        make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
         q_func=model,
         num_actions=env.action_space.n,
         optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),

View File

@@ -10,9 +10,11 @@ import baselines.common.tf_util as U
 from baselines.common.tf_util import load_state, save_state
 from baselines import logger
 from baselines.common.schedules import LinearSchedule
+from baselines.common.input import observation_input
+
 from baselines import deepq
 from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
-from baselines.deepq.utils import BatchInput
+from baselines.deepq.utils import ObservationInput
 
 class ActWrapper(object):
@@ -171,10 +173,9 @@ def learn(env,
     # capture the shape outside the closure so that the env object is not serialized
     # by cloudpickle when serializing make_obs_ph
-    observation_space_shape = env.observation_space.shape
     def make_obs_ph(name):
-        return BatchInput(observation_space_shape, name=name)
+        return ObservationInput(env.observation_space, name=name)
 
     act, train, update_target, debug = deepq.build_train(
         make_obs_ph=make_obs_ph,

View File

@@ -0,0 +1,43 @@
+import tensorflow as tf
+import random
+
+from baselines import deepq
+from baselines.common.identity_env import IdentityEnv
+
+def test_identity():
+    with tf.Graph().as_default():
+        env = IdentityEnv(10)
+        random.seed(0)
+
+        tf.set_random_seed(0)
+
+        param_noise = False
+        model = deepq.models.mlp([32])
+
+        act = deepq.learn(
+            env,
+            q_func=model,
+            lr=1e-3,
+            max_timesteps=10000,
+            buffer_size=50000,
+            exploration_fraction=0.1,
+            exploration_final_eps=0.02,
+            print_freq=10,
+            param_noise=param_noise,
+        )
+
+        tf.set_random_seed(0)
+
+        N_TRIALS = 1000
+        sum_rew = 0
+        obs = env.reset()
+        for i in range(N_TRIALS):
+            obs, rew, done, _ = env.step(act([obs]))
+            sum_rew += rew
+
+        assert sum_rew > 0.9 * N_TRIALS
+
+if __name__ == '__main__':
+    test_identity()

View File

@@ -1,3 +1,5 @@
+from baselines.common.input import observation_input
+
 import tensorflow as tf
 
 # ================================================================
@@ -37,22 +39,6 @@ class PlaceholderTfInput(TfInput):
         return {self._placeholder: data}
 
-class BatchInput(PlaceholderTfInput):
-    def __init__(self, shape, dtype=tf.float32, name=None):
-        """Creates a placeholder for a batch of tensors of a given shape and dtype
-
-        Parameters
-        ----------
-        shape: [int]
-            shape of a single elemenet of the batch
-        dtype: tf.dtype
-            number representation used for tensor contents
-        name: str
-            name of the underlying placeholder
-        """
-        super().__init__(tf.placeholder(dtype, [None] + list(shape), name=name))
-
 class Uint8Input(PlaceholderTfInput):
     def __init__(self, shape, name=None):
         """Takes input in uint8 format which is cast to float32 and divided by 255
@@ -74,3 +60,24 @@ class Uint8Input(PlaceholderTfInput):
     def get(self):
         return self._output
+
+class ObservationInput(PlaceholderTfInput):
+    def __init__(self, observation_space, name=None):
+        """Creates an input placeholder tailored to a specific observation space
+
+        Parameters
+        ----------
+        observation_space:
+            observation space of the environment. Should be one of the gym.spaces types
+        name: str
+            tensorflow name of the underlying placeholder
+        """
+        inpt, self.processed_inpt = observation_input(observation_space, name=name)
+        super().__init__(inpt)
+
+    def get(self):
+        return self.processed_inpt

View File

@@ -47,18 +47,12 @@ class Mujoco_Dset(object):
         obs = traj_data['obs'][:traj_limitation]
         acs = traj_data['acs'][:traj_limitation]
 
-        def flatten(x):
-            # x.shape = (E,), or (E, L, D)
-            _, size = x[0].shape
-            episode_length = [len(i) for i in x]
-            y = np.zeros((sum(episode_length), size))
-            start_idx = 0
-            for l, x_i in zip(episode_length, x):
-                y[start_idx:(start_idx+l)] = x_i
-                start_idx += l
-            return y
-        self.obs = np.array(flatten(obs))
-        self.acs = np.array(flatten(acs))
+        # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
+        # and S is the environment observation/action space.
+        # Flatten to (N * L, prod(S))
+        self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
+        self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
+
         self.rets = traj_data['ep_rets'][:traj_limitation]
         self.avg_ret = sum(self.rets)/len(self.rets)
         self.std_ret = np.std(np.array(self.rets))
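
The two implementations agree when episodes are all the same length (the reshape assumes a rectangular array, unlike the old per-episode loop). A minimal numpy check with made-up shapes, for illustration only:

    import numpy as np
    obs = np.arange(2 * 3 * 4).reshape(2, 3, 4)           # 2 episodes, length 3, 4-dim obs
    flat = np.reshape(obs, [-1, np.prod(obs.shape[2:])])  # episodes stacked in order
    assert flat.shape == (6, 4)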

View File

@@ -8,10 +8,6 @@ import datetime
 import tempfile
 from collections import defaultdict
 
-LOG_OUTPUT_FORMATS = ['stdout', 'log', 'csv']
-LOG_OUTPUT_FORMATS_MPI = ['log']
-# Also valid: json, tensorboard
-
 DEBUG = 10
 INFO = 20
 WARN = 30
@@ -75,8 +71,11 @@ class HumanOutputFormat(KVWriter, SeqWriter):
         return s[:20] + '...' if len(s) > 23 else s
 
     def writeseq(self, seq):
-        for arg in seq:
-            self.file.write(arg)
+        seq = list(seq)
+        for (i, elem) in enumerate(seq):
+            self.file.write(elem)
+            if i < len(seq) - 1: # add space unless this is the last one
+                self.file.write(' ')
         self.file.write('\n')
         self.file.flush()
@@ -363,13 +362,11 @@ def configure(dir=None, format_strs=None):
         log_suffix = "-rank%03i" % rank
     if format_strs is None:
-        strs, strs_mpi = os.getenv('OPENAI_LOG_FORMAT'), os.getenv('OPENAI_LOG_FORMAT_MPI')
-        format_strs = strs_mpi if rank>0 else strs
-        if format_strs is not None:
-            format_strs = format_strs.split(',')
-        else:
-            format_strs = LOG_OUTPUT_FORMATS_MPI if rank>0 else LOG_OUTPUT_FORMATS
+        if rank == 0:
+            format_strs = os.getenv('OPENAI_LOG_FORMAT', 'stdout,log,csv').split(',')
+        else:
+            format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',')
     format_strs = filter(None, format_strs)
     output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
     Logger.CURRENT = Logger(dir=dir, output_formats=output_formats)
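
Behavior sketch of the new configure() logic (illustrative values, not part of the diff): format selection now comes from the two environment variables, with per-rank defaults.

    import os
    os.environ['OPENAI_LOG_FORMAT'] = 'stdout,csv'  # rank 0; default would be 'stdout,log,csv'
    os.environ['OPENAI_LOG_FORMAT_MPI'] = 'log'     # ranks > 0; default is 'log'
    from baselines import logger
    logger.configure()  # rank 0 writes stdout+csv; other ranks write log files with a -rankNNN suffix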

View File

@@ -5,3 +5,5 @@
 - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
 - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
+- Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
+- Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`

View File

@@ -212,5 +212,7 @@ def learn(env, policy_fn, *,
         if MPI.COMM_WORLD.Get_rank()==0:
             logger.dump_tabular()
     return pi
 
+def flatten_lists(listoflists):
+    return [el for list_ in listoflists for el in list_]

View File

@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+import os
+from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
+from baselines.common import tf_util as U
+from baselines import logger
+
+import gym
+
+def train(num_timesteps, seed, model_path=None):
+    env_id = 'Humanoid-v2'
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    U.make_session(num_cpu=1).__enter__()
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=64, num_hid_layers=2)
+    env = make_mujoco_env(env_id, seed)
+
+    # parameters below were the best found in a simple random search
+    # these are good enough to make humanoid walk, but whether those are
+    # an absolute best or not is not certain
+    env = RewScale(env, 0.1)
+    pi = pposgd_simple.learn(env, policy_fn,
+            max_timesteps=num_timesteps,
+            timesteps_per_actorbatch=2048,
+            clip_param=0.2, entcoeff=0.0,
+            optim_epochs=10,
+            optim_stepsize=3e-4,
+            optim_batchsize=64,
+            gamma=0.99,
+            lam=0.95,
+            schedule='linear',
+        )
+    env.close()
+    if model_path:
+        U.save_state(model_path)
+    return pi
+
+class RewScale(gym.RewardWrapper):
+    def __init__(self, env, scale):
+        gym.RewardWrapper.__init__(self, env)
+        self.scale = scale
+    def reward(self, r):
+        return r * self.scale
+
+def main():
+    logger.configure()
+    parser = mujoco_arg_parser()
+    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
+    parser.set_defaults(num_timesteps=int(2e7))
+
+    args = parser.parse_args()
+
+    if not args.play:
+        # train the model
+        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
+    else:
+        # construct the model object, load pre-trained model and render
+        pi = train(num_timesteps=1, seed=args.seed)
+        U.load_state(args.model_path)
+        env = make_mujoco_env('Humanoid-v2', seed=0)
+
+        ob = env.reset()
+        while True:
+            action = pi.act(stochastic=False, ob=ob)[0]
+            ob, _, done, _ = env.step(action)
+            env.render()
+            if done:
+                ob = env.reset()
+
+if __name__ == '__main__':
+    main()

View File

@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+from mpi4py import MPI
+from baselines.common import set_global_seeds
+from baselines import logger
+from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
+import mujoco_py
+
+def train(env_id, num_timesteps, seed):
+    from baselines.ppo1 import mlp_policy, pposgd_simple
+    import baselines.common.tf_util as U
+    rank = MPI.COMM_WORLD.Get_rank()
+    sess = U.single_threaded_session()
+    sess.__enter__()
+    mujoco_py.ignore_mujoco_warnings().__enter__()
+    workerseed = seed + 10000 * rank
+    set_global_seeds(workerseed)
+    env = make_robotics_env(env_id, workerseed, rank=rank)
+    def policy_fn(name, ob_space, ac_space):
+        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
+            hid_size=256, num_hid_layers=3)
+
+    pposgd_simple.learn(env, policy_fn,
+        max_timesteps=num_timesteps,
+        timesteps_per_actorbatch=2048,
+        clip_param=0.2, entcoeff=0.0,
+        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
+        gamma=0.99, lam=0.95, schedule='linear',
+    )
+    env.close()
+
+def main():
+    args = robotics_arg_parser().parse_args()
+    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
+
+if __name__ == '__main__':
+    main()

View File

@@ -2,6 +2,7 @@ import numpy as np
 import tensorflow as tf
 from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm
 from baselines.common.distributions import make_pdtype
+from baselines.common.input import observation_input
 
 def nature_cnn(unscaled_images, **conv_kwargs):
     """
@@ -19,14 +20,12 @@ def nature_cnn(unscaled_images, **conv_kwargs):
 class LnLstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
         nenv = nbatch // nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         self.pdtype = make_pdtype(ac_space)
         with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X)
+            h = nature_cnn(processed_x)
             xs = batch_to_seq(h, nenv, nsteps)
             ms = batch_to_seq(M, nenv, nsteps)
             h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
@@ -56,11 +55,9 @@ class LstmPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
         nenv = nbatch // nsteps
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
         self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
         M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
         S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
         with tf.variable_scope("model", reuse=reuse):
@@ -93,12 +90,10 @@ class LstmPolicy(object):
 class CnnPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
-        nh, nw, nc = ob_space.shape
-        ob_shape = (nbatch, nh, nw, nc)
         self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.uint8, ob_shape) #obs
+        X, processed_x = observation_input(ob_space, nbatch)
         with tf.variable_scope("model", reuse=reuse):
-            h = nature_cnn(X, **conv_kwargs)
+            h = nature_cnn(processed_x, **conv_kwargs)
             vf = fc(h, 'v', 1)[:,0]
             self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
@@ -120,15 +115,14 @@ class CnnPolicy(object):
 class MlpPolicy(object):
     def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
-        ob_shape = (nbatch,) + ob_space.shape
         self.pdtype = make_pdtype(ac_space)
-        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
         with tf.variable_scope("model", reuse=reuse):
+            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
-            flatten = tf.layers.flatten
-            pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
+            processed_x = tf.layers.flatten(processed_x)
+            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
             pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
-            vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
+            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
             vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
             vf = fc(vf_h2, 'vf', 1)[:,0]