baselines/baselines/her/experiment/config.py

Commit 8513d73355 by Rishabh Jangir (2018-10-22 19:04:40 -07:00): HER: new functionality, enables demo based training (#474)

* Add, initialize, normalize and sample from a demo buffer
* Modify losses and add cloning loss
* Add demo file parameter to train.py
* Introduce new params in config.py for demo based training
* Change logger.warning to logger.warn in rollout.py; bug fix
* Add data generation file for Fetch environments
* Update README file


import numpy as np
import gym

from baselines import logger
from baselines.her.ddpg import DDPG
from baselines.her.her import make_sample_her_transitions


DEFAULT_ENV_PARAMS = {
    'FetchReach-v1': {
        'n_cycles': 10,
    },
}

DEFAULT_PARAMS = {
    # env
    'max_u': 1.,  # max absolute value of actions on different coordinates
    # ddpg
    'layers': 3,  # number of layers in the critic/actor networks
    'hidden': 256,  # number of neurons in each hidden layer
    'network_class': 'baselines.her.actor_critic:ActorCritic',
    'Q_lr': 0.001,  # critic learning rate
    'pi_lr': 0.001,  # actor learning rate
    'buffer_size': int(1E6),  # for experience replay
    'polyak': 0.95,  # polyak averaging coefficient
    'action_l2': 1.0,  # quadratic penalty on actions (before rescaling by max_u)
    'clip_obs': 200.,
    'scope': 'ddpg',  # can be tweaked for testing
    'relative_goals': False,
    # training
    'n_cycles': 50,  # per epoch
    'rollout_batch_size': 2,  # per mpi thread
    'n_batches': 40,  # training batches per cycle
    'batch_size': 256,  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length
    'n_test_rollouts': 10,  # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    'test_with_polyak': False,  # run test episodes with the target network
    # exploration
    'random_eps': 0.3,  # percentage of time a random action is taken
    'noise_eps': 0.2,  # std of gaussian noise added to not-completely-random actions as a percentage of max_u
    # HER
    'replay_strategy': 'future',  # supported modes: future, none
    'replay_k': 4,  # number of additional goals used for replay, only used if replay_strategy=future
    # normalization
    'norm_eps': 0.01,  # epsilon used for observation normalization
    'norm_clip': 5,  # normalized observations are clipped to this value
    # demo-based training (behavior cloning)
    'bc_loss': 0,  # whether or not to use the behavior cloning loss as an auxiliary loss
    'q_filter': 0,  # whether or not a Q value filter should be used on the actor outputs
    'num_demo': 100,  # number of expert demo episodes
    'demo_batch_size': 128,  # number of samples to be used from the demonstrations buffer, per mpi thread (128/1024 or 32/256)
    'prm_loss_weight': 0.001,  # weight corresponding to the primary loss
    'aux_loss_weight': 0.0078,  # weight corresponding to the auxiliary (behavior cloning) loss
}
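
# Illustrative sketch of how these dicts are meant to be combined (mirrors train.py-style
# usage; the override values below are only examples):
#
#   params = DEFAULT_PARAMS.copy()
#   params['env_name'] = 'FetchReach-v1'
#   params.update(DEFAULT_ENV_PARAMS.get(params['env_name'], {}))  # env-specific tweaks
#   params.update(n_cycles=20)                                     # manual overrides
#
# i.e. the intended precedence is DEFAULT_PARAMS < DEFAULT_ENV_PARAMS < explicit overrides.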


CACHED_ENVS = {}


def cached_make_env(make_env):
    """
    Only creates a new environment from the provided function if one has not yet already been
    created. This is useful here because we need to infer certain properties of the env, e.g.
    its observation and action spaces, without any intent of actually using it.
    """
    if make_env not in CACHED_ENVS:
        env = make_env()
        CACHED_ENVS[make_env] = env
    return CACHED_ENVS[make_env]
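
# Illustrative usage sketch: the function object itself serves as the cache key, so
# repeated calls with the same callable return the same environment instance, e.g.:
#
#   make_reach = lambda: gym.make('FetchReach-v1')
#   assert cached_make_env(make_reach) is cached_make_env(make_reach)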


def prepare_params(kwargs):
    # DDPG params
    ddpg_params = dict()

    env_name = kwargs['env_name']

    def make_env():
        return gym.make(env_name)
    kwargs['make_env'] = make_env
    tmp_env = cached_make_env(kwargs['make_env'])
    assert hasattr(tmp_env, '_max_episode_steps')
    kwargs['T'] = tmp_env._max_episode_steps
    tmp_env.reset()
    kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u']
    kwargs['gamma'] = 1. - 1. / kwargs['T']
    if 'lr' in kwargs:
        kwargs['pi_lr'] = kwargs['lr']
        kwargs['Q_lr'] = kwargs['lr']
        del kwargs['lr']
    for name in ['buffer_size', 'hidden', 'layers',
                 'network_class',
                 'polyak',
                 'batch_size', 'Q_lr', 'pi_lr',
                 'norm_eps', 'norm_clip', 'max_u',
                 'action_l2', 'clip_obs', 'scope', 'relative_goals']:
        ddpg_params[name] = kwargs[name]
        kwargs['_' + name] = kwargs[name]
        del kwargs[name]
    kwargs['ddpg_params'] = ddpg_params

    return kwargs
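
# Illustrative sketch of the transformation performed above (values assume the standard
# Fetch registration with _max_episode_steps == 50; exact numbers depend on the env):
#
#   params = DEFAULT_PARAMS.copy()
#   params['env_name'] = 'FetchReach-v1'
#   params = prepare_params(params)
#   params['T']                        # -> 50, taken from the env's _max_episode_steps
#   params['gamma']                    # -> 1. - 1. / 50 = 0.98
#   params['ddpg_params']['hidden']    # -> 256, moved into the DDPG-specific sub-dict
#   params['_hidden']                  # -> 256, kept under a '_' prefix for logging
#   'hidden' in params                 # -> False, the original key was deleted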


def log_params(params, logger=logger):
    for key in sorted(params.keys()):
        logger.info('{}: {}'.format(key, params[key]))


def configure_her(params):
    env = cached_make_env(params['make_env'])
    env.reset()

    def reward_fun(ag_2, g, info):  # vectorized
        return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info)

    # Prepare configuration for HER.
    her_params = {
        'reward_fun': reward_fun,
    }
    for name in ['replay_strategy', 'replay_k']:
        her_params[name] = params[name]
        params['_' + name] = her_params[name]
        del params[name]
    sample_her_transitions = make_sample_her_transitions(**her_params)

    return sample_her_transitions
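
# Sketch of the resulting sampler's behaviour (based on make_sample_her_transitions):
# with replay_strategy='future' and replay_k=4, a sampled transition has its goal
# replaced by a future achieved goal with probability 1 - 1 / (1 + replay_k) = 0.8,
# i.e. roughly four relabelled transitions for every original one; replay_strategy='none'
# disables relabelling entirely.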


def simple_goal_subtract(a, b):
    assert a.shape == b.shape
    return a - b


def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True):
    sample_her_transitions = configure_her(params)
    # Extract relevant parameters.
    gamma = params['gamma']
    rollout_batch_size = params['rollout_batch_size']
    ddpg_params = params['ddpg_params']

    input_dims = dims.copy()

    # DDPG agent
    env = cached_make_env(params['make_env'])
    env.reset()
    ddpg_params.update({'input_dims': input_dims,  # dimensions of the agent's observations, actions and goals
                        'T': params['T'],
                        'clip_pos_returns': True,  # clip positive returns
                        'clip_return': (1. / (1. - gamma)) if clip_return else np.inf,  # max abs of return
                        'rollout_batch_size': rollout_batch_size,
                        'subtract_goals': simple_goal_subtract,
                        'sample_transitions': sample_her_transitions,
                        'gamma': gamma,
                        'bc_loss': params['bc_loss'],
                        'q_filter': params['q_filter'],
                        'num_demo': params['num_demo'],
                        'demo_batch_size': params['demo_batch_size'],
                        'prm_loss_weight': params['prm_loss_weight'],
                        'aux_loss_weight': params['aux_loss_weight'],
                        })
    ddpg_params['info'] = {
        'env_name': params['env_name'],
    }
    policy = DDPG(reuse=reuse, **ddpg_params, use_mpi=use_mpi)
    return policy
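
# Worked example for the clip_return bound above: prepare_params sets gamma = 1 - 1/T,
# so 1. / (1. - gamma) == T (e.g. 50 for the Fetch tasks). With the sparse {-1, 0}
# rewards of those environments, |return| <= T, so this bound never truncates a
# legitimate return; clip_pos_returns asks the DDPG agent to also cap the critic's
# target from above at 0.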


def configure_dims(params):
    env = cached_make_env(params['make_env'])
    env.reset()
    obs, _, _, info = env.step(env.action_space.sample())

    dims = {
        'o': obs['observation'].shape[0],
        'u': env.action_space.shape[0],
        'g': obs['desired_goal'].shape[0],
    }
    for key, value in info.items():
        value = np.array(value)
        if value.ndim == 0:
            value = value.reshape(1)
        dims['info_{}'.format(key)] = value.shape[0]
    return dims
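

if __name__ == '__main__':
    # Minimal usage sketch, illustrating how train.py is expected to wire these helpers
    # together (assumes gym and the baselines dependencies are installed; the specific
    # environment name is only an example).
    example_params = DEFAULT_PARAMS.copy()
    example_params['env_name'] = 'FetchReach-v1'
    example_params.update(DEFAULT_ENV_PARAMS.get(example_params['env_name'], {}))
    example_params = prepare_params(example_params)
    log_params(example_params)
    dims = configure_dims(example_params)
    policy = configure_ddpg(dims=dims, params=example_params, clip_return=True)
    print('Configured DDPG+HER policy for {} with dims {}'.format(
        example_params['env_name'], dims))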