Runtime: More progress towards cuBLAS integration
This commit is contained in:
@@ -84,6 +84,7 @@ public:
|
||||
virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
|
||||
virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0;
|
||||
virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0;
|
||||
virtual expression_type type() const = 0;
|
||||
std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device);
|
||||
std::shared_ptr<base> getptr();
|
||||
};
|
||||
|
@@ -38,6 +38,7 @@ public:
|
||||
elementwise_1d(unsigned int vwidth, unsigned int ls, unsigned int ng, fetch_type fetch);
|
||||
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
|
||||
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
|
||||
expression_type type() const;
|
||||
private:
|
||||
unsigned int ng_;
|
||||
fetch_type fetch_;
|
||||
|
@@ -39,6 +39,7 @@ public:
|
||||
elementwise_2d(unsigned int vwidth, unsigned int ls0, unsigned int ls1, unsigned int ng0, unsigned int ng1, fetch_type fetch);
|
||||
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
|
||||
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
|
||||
expression_type type() const;
|
||||
private:
|
||||
unsigned int ng0_;
|
||||
unsigned int ng1_;
|
||||
|
@@ -39,6 +39,7 @@ public:
|
||||
int is_invalid(expression_tree const &, driver::Device const &) const;
|
||||
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
|
||||
void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
|
||||
expression_type type() const;
|
||||
private:
|
||||
const char A_trans_;
|
||||
const char B_trans_;
|
||||
@@ -62,6 +63,7 @@ public:
|
||||
, int_t lf0, int_t lf1, char A_trans, char B_trans);
|
||||
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
|
||||
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & h);
|
||||
expression_type type() const;
|
||||
|
||||
private:
|
||||
//Parameters
|
||||
|
@@ -43,6 +43,8 @@ public:
|
||||
reduce_1d(unsigned int vwidth, unsigned int ls, unsigned int ng, fetch_type fetch);
|
||||
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
|
||||
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
|
||||
expression_type type() const;
|
||||
|
||||
private:
|
||||
unsigned int ng_;
|
||||
fetch_type fetch_;
|
||||
|
@@ -44,6 +44,7 @@ private:
|
||||
public:
|
||||
virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const;
|
||||
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
|
||||
expression_type type() const;
|
||||
private:
|
||||
unsigned int ng0_;
|
||||
unsigned int ng1_;
|
||||
|
@@ -53,7 +53,7 @@ public:
|
||||
|
||||
public:
|
||||
value_type(expression_type, numeric_type, predictors::random_forest const &, std::vector< std::shared_ptr<templates::base> > const &, driver::CommandQueue const &);
|
||||
value_type(expression_type, numeric_type, std::shared_ptr<templates::base> const &, driver::CommandQueue const &);
|
||||
value_type(numeric_type, std::shared_ptr<templates::base> const &, driver::CommandQueue const &);
|
||||
void execute(runtime::execution_handler const &);
|
||||
templates_container const & templates() const;
|
||||
|
||||
|
@@ -43,6 +43,9 @@ int elementwise_1d::is_invalid_impl(driver::Device const &, expression_tree cons
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
expression_type elementwise_1d::type() const
|
||||
{ return ELEMENTWISE_1D; }
|
||||
|
||||
std::string elementwise_1d::generate_impl(std::string const & suffix, expression_tree const & tree, driver::Device const & device, symbolic::symbols_table const & symbols) const
|
||||
{
|
||||
driver::backend_type backend = device.backend();
|
||||
|
@@ -42,6 +42,9 @@ int elementwise_2d::is_invalid_impl(driver::Device const &, expression_tree cons
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
expression_type elementwise_2d::type() const
|
||||
{ return ELEMENTWISE_2D; }
|
||||
|
||||
std::string elementwise_2d::generate_impl(std::string const & suffix, expression_tree const & tree, driver::Device const & device, symbolic::symbols_table const & symbols) const
|
||||
{
|
||||
std::string init0, upper_bound0, inc0, init1, upper_bound1, inc1;
|
||||
|
@@ -50,16 +50,11 @@ std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::a
|
||||
}
|
||||
|
||||
/* ------------------ CUBLAS ------------------ */
|
||||
bool cublas_gemm::init()
|
||||
{
|
||||
return driver::dispatch::cublasinit();
|
||||
}
|
||||
|
||||
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(driver::dispatch::cublasinit())
|
||||
{ }
|
||||
|
||||
int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const
|
||||
{ return init_ && device.backend()==driver::CUDA; }
|
||||
{ return (init_ && device.backend()==driver::CUDA)?0:-1; }
|
||||
|
||||
std::vector<int_t> cublas_gemm::input_sizes(expression_tree const & expressions) const
|
||||
{
|
||||
@@ -67,9 +62,21 @@ std::vector<int_t> cublas_gemm::input_sizes(expression_tree const & expressions)
|
||||
return infos((expression_tree&)expressions, dummy, A_trans_);
|
||||
}
|
||||
|
||||
expression_type cublas_gemm::type() const
|
||||
{
|
||||
if(A_trans_=='N' && B_trans_=='N')
|
||||
return GEMM_NN;
|
||||
else if(A_trans_=='T' && B_trans_=='N')
|
||||
return GEMM_TN;
|
||||
else if(A_trans_=='N' && B_trans_=='T')
|
||||
return GEMM_NT;
|
||||
else
|
||||
return GEMM_TT;
|
||||
}
|
||||
|
||||
void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & control)
|
||||
{
|
||||
namespace drv = driver;;
|
||||
namespace drv = driver;
|
||||
//Get GEMM info
|
||||
symbolic::preset::gemm::args args;
|
||||
std::vector<int_t> MNK = infos(control.x(), args, A_trans_);
|
||||
@@ -115,6 +122,19 @@ unsigned int gemm::lmem_usage(expression_tree const & expression) const
|
||||
return N*size_of(expression.dtype());
|
||||
}
|
||||
|
||||
expression_type gemm::type() const
|
||||
{
|
||||
if(A_trans_=='N' && B_trans_=='N')
|
||||
return GEMM_NN;
|
||||
else if(A_trans_=='T' && B_trans_=='N')
|
||||
return GEMM_TN;
|
||||
else if(A_trans_=='N' && B_trans_=='T')
|
||||
return GEMM_NT;
|
||||
else
|
||||
return GEMM_TT;
|
||||
}
|
||||
|
||||
|
||||
unsigned int gemm::registers_usage(expression_tree const & expression) const
|
||||
{
|
||||
unsigned int N = mS_ * nS_ + mS_ * kS_ + kS_ * nS_;
|
||||
|
@@ -55,6 +55,9 @@ unsigned int reduce_1d::temporary_workspace(expression_tree const &) const
|
||||
return 0;
|
||||
}
|
||||
|
||||
expression_type reduce_1d::type() const
|
||||
{ return REDUCE_1D; }
|
||||
|
||||
inline void reduce_1d::reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<symbolic::reduce_1d*> exprs,
|
||||
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type) const
|
||||
{
|
||||
|
@@ -290,6 +290,14 @@ std::vector<int_t> reduce_2d::input_sizes(expression_tree const & tree) const
|
||||
return {shape[0], shape[1]};
|
||||
}
|
||||
|
||||
expression_type reduce_2d::type() const
|
||||
{
|
||||
if(reduction_type_==REDUCE_ROWS)
|
||||
return REDUCE_2D_ROWS;
|
||||
else
|
||||
return REDUCE_2D_COLS;
|
||||
}
|
||||
|
||||
void reduce_2d::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & control)
|
||||
{
|
||||
expression_tree const & tree = control.x();
|
||||
|
@@ -77,7 +77,7 @@ profiles::value_type::value_type(expression_type etype, numeric_type dtype, pred
|
||||
}
|
||||
|
||||
|
||||
profiles::value_type::value_type(expression_type etype, numeric_type dtype, std::shared_ptr<templates::base> const & tp, driver::CommandQueue const & queue) : templates_(1,tp), queue_(queue), cache_(driver::backend::programs::get(queue,etype,dtype))
|
||||
profiles::value_type::value_type(numeric_type dtype, std::shared_ptr<templates::base> const & tp, driver::CommandQueue const & queue) : templates_(1,tp), queue_(queue), cache_(driver::backend::programs::get(queue,tp->type(),dtype))
|
||||
{
|
||||
cache_.clear();
|
||||
}
|
||||
@@ -197,7 +197,7 @@ void profiles::import(std::string const & str, driver::CommandQueue const & queu
|
||||
result[{etype, dtype}] = std::make_shared<value_type>(etype, dtype, predictor, templates, queue);
|
||||
}
|
||||
else
|
||||
result[{etype, dtype}] = std::make_shared<value_type>(etype, dtype, templates[0], queue);
|
||||
result[{etype, dtype}] = std::make_shared<value_type>(dtype, templates[0], queue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -86,31 +86,5 @@ namespace tools
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
inline sc::expression_type extract_template_type(bp::object const & odtype)
|
||||
{
|
||||
std::string name = bp::extract<std::string>(odtype.attr("__class__").attr("__name__"))();
|
||||
if(name=="class")
|
||||
name = bp::extract<std::string>(odtype.attr("__name__"))();
|
||||
else
|
||||
name = bp::extract<std::string>(odtype.attr("__class__").attr("__name__"))();
|
||||
|
||||
if(name=="elementwise_1d") return sc::ELEMENTWISE_1D;
|
||||
else if(name=="elementwise_2d") return sc::ELEMENTWISE_2D;
|
||||
else if(name=="reduce_1d") return sc::REDUCE_1D;
|
||||
else if(name=="reduce_2d_rows") return sc::REDUCE_2D_ROWS;
|
||||
else if(name=="reduce_2d_cols") return sc::REDUCE_2D_COLS;
|
||||
else if(name=="gemm_nn") return sc::GEMM_NN;
|
||||
else if(name=="gemm_tn") return sc::GEMM_TN;
|
||||
else if(name=="gemm_nt") return sc::GEMM_NT;
|
||||
else if(name=="gemm_tt") return sc::GEMM_TT;
|
||||
else
|
||||
{
|
||||
PyErr_SetString(PyExc_TypeError, "Template type not understood");
|
||||
bp::throw_error_already_set();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@@ -106,7 +106,7 @@ namespace detail
|
||||
std::shared_ptr<rt::profiles::value_type> construct_model(bp::object const & tp, bp::object dtype, sc::driver::CommandQueue & queue)
|
||||
{
|
||||
tpt::base* raw = bp::extract<tpt::base*>(tp);
|
||||
return std::make_shared<rt::profiles::value_type>(tools::extract_template_type(tp), tools::extract_dtype(dtype), raw->getptr(), queue);
|
||||
return std::make_shared<rt::profiles::value_type>(tools::extract_dtype(dtype), raw->getptr(), queue);
|
||||
}
|
||||
|
||||
std::shared_ptr<sc::array>
|
||||
@@ -219,9 +219,9 @@ namespace detail
|
||||
{
|
||||
static rt::profiles::value_type& get_item(rt::profiles::map_type& container, bp::tuple i_)
|
||||
{
|
||||
sc::expression_type expression = tools::extract_template_type(i_[0]);
|
||||
tpt::base* tpt = bp::extract<tpt::base*>(i_[0]);
|
||||
sc::numeric_type dtype = tools::extract_dtype(i_[1]);
|
||||
rt::profiles::map_type::iterator i = container.find(std::make_pair(expression, dtype));
|
||||
rt::profiles::map_type::iterator i = container.find(std::make_pair(tpt->type(), dtype));
|
||||
if (i == container.end())
|
||||
{
|
||||
PyErr_SetString(PyExc_KeyError, "Invalid key");
|
||||
@@ -232,9 +232,9 @@ namespace detail
|
||||
|
||||
static void set_item(rt::profiles::map_type& container, bp::tuple i_, rt::profiles::value_type const & v)
|
||||
{
|
||||
sc::expression_type expression = tools::extract_template_type(i_[0]);
|
||||
tpt::base* tpt = bp::extract<tpt::base*>(i_[0]);
|
||||
sc::numeric_type dtype = tools::extract_dtype(i_[1]);
|
||||
container[std::make_pair(expression, dtype)].reset(new rt::profiles::value_type(v));
|
||||
container[std::make_pair(tpt->type(), dtype)].reset(new rt::profiles::value_type(v));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@@ -39,28 +39,6 @@ fetch_types = [sc.templates.fetch_type.FETCH_FROM_GLOBAL_CONTIGUOUS,
|
||||
sc.templates.fetch_type.FETCH_FROM_LOCAL,
|
||||
sc.templates.fetch_type.FETCH_FROM_LOCAL]
|
||||
|
||||
def exhaustive(template, sizes, context):
|
||||
tree, _ = tools.tree_of(template, sizes, context)
|
||||
metric = tools.metric_of(template)
|
||||
nbits = tools.genetic_infos_of(template)['nbits']
|
||||
categorical = tools.genetic_infos_of(template)['categorical']
|
||||
ranges = [range(2**x) for x in nbits]
|
||||
ranges = list(product(*ranges))
|
||||
timings = {}
|
||||
best = None
|
||||
for idx, r in enumerate(ranges):
|
||||
parameters = tuple([fetch_types[x] if i in categorical else 2**x for i,x in enumerate(r)])
|
||||
try:
|
||||
time = tools.benchmark(template, parameters, tree)
|
||||
if not best or time < best[1]:
|
||||
best = parameters, time
|
||||
except profile_execution_failure:
|
||||
pass
|
||||
if best:
|
||||
stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0]))
|
||||
return best[0]
|
||||
|
||||
|
||||
class GeneticOptimizer:
|
||||
|
||||
def __init__(self, logger, naccept=500, niter=1000, cxpb=.4, mutpb=.4, popsize=10, progress_bar = None):
|
||||
@@ -105,7 +83,10 @@ class GeneticOptimizer:
|
||||
def evaluate(genome):
|
||||
idx = tuple(genome)
|
||||
if idx not in cache:
|
||||
cache[idx] = tools.benchmark(template, decode(genome), tree)
|
||||
time = tools.benchmark(template, template(*decode(genome)), tree)
|
||||
if time == float('inf'):
|
||||
return time,
|
||||
cache[idx] = time
|
||||
self.progress_bar.update(max(len(cache), it), self.niter, decode(min(cache, key=cache.get)), metric(sizes, min(cache.values())))
|
||||
return cache[idx],
|
||||
|
||||
@@ -132,11 +113,9 @@ class GeneticOptimizer:
|
||||
genome = encode(prior if prior else list(initializer.next()))
|
||||
while len(population) < self.popsize:
|
||||
individual = creator.Individual(genome)
|
||||
try:
|
||||
individual.fitness.values = toolbox.evaluate(genome)
|
||||
individual.fitness.values = toolbox.evaluate(genome)
|
||||
if max(individual.fitness.values) != float('inf'):
|
||||
population += [individual]
|
||||
except profile_execution_failure:
|
||||
pass
|
||||
genome = encode(list(initializer.next()))
|
||||
hof.update(population)
|
||||
|
||||
@@ -146,26 +125,25 @@ class GeneticOptimizer:
|
||||
#Generate offspring
|
||||
offspring = []
|
||||
while len(offspring) < self.popsize:
|
||||
try:
|
||||
op_choice = random.random()
|
||||
#Cross-over
|
||||
if op_choice < self.cxpb:
|
||||
ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
|
||||
ind1, ind2 = toolbox.mate(ind1, ind2)
|
||||
ind = ind1
|
||||
toolbox.evaluate(ind)
|
||||
op_choice = random.random()
|
||||
#Cross-over
|
||||
if op_choice < self.cxpb:
|
||||
ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
|
||||
ind1, ind2 = toolbox.mate(ind1, ind2)
|
||||
ind = ind1
|
||||
toolbox.evaluate(ind)
|
||||
if max(ind.fitness.values) != float('inf'):
|
||||
offspring += [ind]
|
||||
#Mutation
|
||||
elif op_choice < self.cxpb + self.mutpb:
|
||||
ind = toolbox.clone(random.choice(population))
|
||||
ind, = toolbox.mutate(ind, 1.0/offsets[-1])
|
||||
toolbox.evaluate(ind)
|
||||
#Mutation
|
||||
elif op_choice < self.cxpb + self.mutpb:
|
||||
ind = toolbox.clone(random.choice(population))
|
||||
ind, = toolbox.mutate(ind, 1.0/offsets[-1])
|
||||
toolbox.evaluate(ind)
|
||||
if max(ind.fitness.values) != float('inf'):
|
||||
offspring += [ind]
|
||||
#Reproduction
|
||||
else:
|
||||
offspring += [random.choice(population)]
|
||||
except profile_execution_failure:
|
||||
pass
|
||||
#Reproduction
|
||||
else:
|
||||
offspring += [random.choice(population)]
|
||||
|
||||
#Update fitnesses
|
||||
fitnesses = toolbox.map(toolbox.evaluate, offspring)
|
||||
@@ -195,9 +173,8 @@ def is_local_optimum(parameters, template, sizes, context):
|
||||
sweep_over = [0,1,2,3,4]
|
||||
|
||||
#Evaluate the provided parameters guess
|
||||
try:
|
||||
reference = tools.benchmark(template, parameters, tree)
|
||||
except profile_execution_failure:
|
||||
reference = tools.benchmark(template, template(*parameters), tree)
|
||||
if isinf(reference):
|
||||
return False
|
||||
|
||||
#Latency bound -- ignore
|
||||
@@ -210,12 +187,9 @@ def is_local_optimum(parameters, template, sizes, context):
|
||||
for x in product(*domain):
|
||||
if x==parameters:
|
||||
pass
|
||||
try:
|
||||
time = tools.benchmark(template, x, tree)
|
||||
if time/reference < .98:
|
||||
return False
|
||||
except profile_execution_failure:
|
||||
pass
|
||||
time = tools.benchmark(template, template(*x), tree)
|
||||
if time/reference < .98:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
@@ -40,15 +40,18 @@ def linspace(a, b, n=100):
|
||||
def expspace(a,b,N,r=128):
|
||||
return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
|
||||
|
||||
def benchmark(template, setting, tree):
|
||||
def benchmark(operation, template, tree):
|
||||
queue = tree.context.queues[0]
|
||||
queue.profiles[template, sc.float32] = sc.profile(template(*setting), sc.float32, queue)
|
||||
queue.profiles[template, sc.float32] = sc.profile(template, sc.float32, queue)
|
||||
times = []
|
||||
total = 0
|
||||
i = 0
|
||||
#Warm-up
|
||||
z, events = sc.driver.enqueue(tree)
|
||||
tree.context.queues[0].synchronize()
|
||||
try:
|
||||
z, events = sc.driver.enqueue(tree)
|
||||
tree.context.queues[0].synchronize()
|
||||
except profile_execution_failure:
|
||||
return float("inf")
|
||||
#Time
|
||||
while total < 1e-1:
|
||||
start = time()
|
||||
@@ -119,6 +122,16 @@ def metric_name_of(template):
|
||||
return 'GFLOPS'
|
||||
return 'GB/S'
|
||||
|
||||
def external_profiles(template):
|
||||
if template is sc.templates.gemm_nn:
|
||||
return [sc.templates.cublas_gemm('N', 'N')]
|
||||
elif template is sc.templates.gemm_tn:
|
||||
return [sc.templates.cublas_gemm('T', 'N')]
|
||||
elif template is sc.templates.gemm_nt:
|
||||
return [sc.templates.cublas_gemm('N', 'T')]
|
||||
elif template is sc.templates.gemm_tt:
|
||||
return [sc.templates.cublas_gemm('T', 'T')]
|
||||
|
||||
def genetic_infos_of(template):
|
||||
if issubclass(template, sc.templates.elementwise_1d):
|
||||
return {'categorical': [3], 'nbits': [3,4,4,2] }
|
||||
|
@@ -69,73 +69,52 @@ class Tuner:
|
||||
|
||||
#BLAS1 training sizes
|
||||
if operation in [sc.templates.elementwise_1d, sc.templates.reduce_1d]:
|
||||
if level=='simple':
|
||||
sizes = [(10000000,)]
|
||||
elif level=='intermediate':
|
||||
sizes = [(x,) for x in tools.expspace(1e3, 1e8, 10)]
|
||||
else:
|
||||
sizes = [(x,) for x in tools.expspace(1e3, 1e8, 100)]
|
||||
sizes = [(x,) for x in tools.expspace(1e3, 1e8, 20)]
|
||||
|
||||
#BLAS2 training sizes
|
||||
if operation in [sc.templates.elementwise_2d, sc.templates.reduce_2d_rows, sc.templates.reduce_2d_cols]:
|
||||
if level=='simple':
|
||||
sizes = [(1536, 1536)]
|
||||
elif level=='intermediate':
|
||||
sizes = []
|
||||
#Square
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
sizes += [(N, N)]
|
||||
#Tall and Skinny
|
||||
for M in [16, 32, 64, 128]:
|
||||
for N in [1024, 4096, 16384, 65536, 262144]:
|
||||
sizes += [(M, N)]
|
||||
sizes += [(N, M)]
|
||||
else:
|
||||
sizes = product(pow2range(4,17), pow2range(4,17))
|
||||
sizes = []
|
||||
#Square
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
sizes += [(N, N)]
|
||||
#Tall and Skinny
|
||||
for M in [16, 32, 64, 128]:
|
||||
for N in [1024, 4096, 16384, 65536, 262144]:
|
||||
sizes += [(M, N)]
|
||||
sizes += [(N, M)]
|
||||
|
||||
#BLAS3 training sizes
|
||||
if operation in [sc.templates.gemm_nn, sc.templates.gemm_nt, sc.templates.gemm_tn, sc.templates.gemm_tt]:
|
||||
if level=='simple':
|
||||
sizes = [(2560,2560,2560)]
|
||||
elif level=='intermediate':
|
||||
sizes = []
|
||||
#Square
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
sizes += [(N, N, N)]
|
||||
#LaPack
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
for K in [16, 32, 64, 128]:
|
||||
sizes += [(N, N, K)]
|
||||
#Covariance
|
||||
for N in [16, 32, 64, 128]:
|
||||
for K in [16000,32000,64000,128000]:
|
||||
sizes += [(N, N, K)]
|
||||
#DeepSpeech
|
||||
for M in [1760, 2048, 2560]:
|
||||
for N in [16, 32, 64, 128, M]:
|
||||
sizes += [(M, N, M)]
|
||||
elif level=='full':
|
||||
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
|
||||
|
||||
#Remove duplicates and or too small/big tuples
|
||||
sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1]
|
||||
sizes = []
|
||||
#Square
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
sizes += [(N, N, N)]
|
||||
#LaPack
|
||||
for N in [896, 1760, 2048, 2560]:
|
||||
for K in [16, 32, 64, 128]:
|
||||
sizes += [(N, N, K)]
|
||||
#Covariance
|
||||
for N in [16, 32, 64, 128]:
|
||||
for K in [16000,32000,64000,128000]:
|
||||
sizes += [(N, N, K)]
|
||||
#DeepSpeech
|
||||
for M in [1760, 2048, 2560]:
|
||||
for N in [16, 32, 64, 128, M]:
|
||||
sizes += [(M, N, M)]
|
||||
|
||||
#Training data
|
||||
performance = tools.metric_of(operation)
|
||||
profiles, X, Y = [], [], []
|
||||
|
||||
#Restore previous run
|
||||
#Restore progress
|
||||
savepath = os.path.join('save', operation.__name__)
|
||||
if not os.path.exists(savepath):
|
||||
os.makedirs(savepath)
|
||||
|
||||
try:
|
||||
with open(os.path.join(savepath, 'X.csv')) as f:
|
||||
X = [tuple(map(int, row)) for row in csv.reader(f, delimiter=',')]
|
||||
|
||||
with open(os.path.join(savepath, 'Y.csv')) as f:
|
||||
Y = [map(float, row) for row in csv.reader(f, delimiter=',')]
|
||||
|
||||
with open(os.path.join(savepath, 'profiles.csv')) as f:
|
||||
def mmap(x):
|
||||
if x=='FETCH_FROM_LOCAL':
|
||||
@@ -149,94 +128,69 @@ class Tuner:
|
||||
except:
|
||||
pass
|
||||
|
||||
##### Exploration #####
|
||||
#Tuning
|
||||
for idx, x in enumerate(sizes):
|
||||
#Create new line on log
|
||||
if idx>0:
|
||||
self.progress_bar.set_finished()
|
||||
|
||||
self.progress_bar.set_finished()
|
||||
self.progress_bar.set_prefix(', '.join(map(str, x)))
|
||||
#Skip if saved
|
||||
#Skip if already saved
|
||||
if x in X:
|
||||
row = Y[X.index(x)]
|
||||
self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
|
||||
continue
|
||||
|
||||
#Check if the current best prediction is not a local optimum
|
||||
idx = len(X)
|
||||
nparams = len(profiles)
|
||||
tree, operands = tools.tree_of(operation, x, context)
|
||||
if idx==0:
|
||||
retune = True
|
||||
predicted = None
|
||||
else:
|
||||
if nparams==1:
|
||||
predicted = profiles[0]
|
||||
else:
|
||||
clf = RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
|
||||
#clf, nrmse = model.train(X, Y, profiles)
|
||||
predperf = clf.predict(x)[0]
|
||||
best = (-predperf).argsort()
|
||||
perf = []
|
||||
for b in best:
|
||||
try:
|
||||
perf += [performance(x, tools.benchmark(operation, profiles[b], tree))]
|
||||
break
|
||||
except profile_execution_failure:
|
||||
pass
|
||||
if perf:
|
||||
predicted = profiles[best[argmax(perf)]]
|
||||
retune = not optimize.is_local_optimum(predicted, operation, x, context)
|
||||
else:
|
||||
retune = True
|
||||
predicted = None
|
||||
|
||||
#Check if GA needs to run (i.e., current best prediction is not a local optimum)
|
||||
tune = True
|
||||
best = None
|
||||
if idx > 0:
|
||||
dim = min(10, idx+1)
|
||||
model = RandomForestRegressor(dim, dim).fit(X, Y)
|
||||
predictions = model.predict(x)[0]
|
||||
for idx in (-predictions).argsort():
|
||||
ts = tools.benchmark(operation, operation(*profiles[idx]), tree)
|
||||
if np.isfinite(ts):
|
||||
break
|
||||
if np.isfinite(ts):
|
||||
best = profiles[idx]
|
||||
tune = not optimize.is_local_optimum(predicted, operation, x, context)
|
||||
#Retune if necessary
|
||||
if retune:
|
||||
if tune:
|
||||
optimizer = optimize.GeneticOptimizer(self.logger, naccept=1000, niter=1000, cxpb=.4, mutpb=.4, popsize=20, progress_bar = self.progress_bar)
|
||||
new = optimizer.run(operation, x, context, prior=predicted)[0]
|
||||
if new not in profiles:
|
||||
profiles.append(new)
|
||||
if idx > 0:
|
||||
for xx,yy in zip(X, Y):
|
||||
_tree, _operands = tools.tree_of(operation, xx, context)
|
||||
try:
|
||||
time = tools.benchmark(operation, new, _tree)
|
||||
perf = performance(xx, time)
|
||||
except profile_execution_failure:
|
||||
perf = 0
|
||||
yy.append(0 if isinf(perf) else perf)
|
||||
|
||||
|
||||
##### Training #####
|
||||
y = []
|
||||
fastest = max(predperf) if nparams > 1 else None
|
||||
for ip, p in enumerate(profiles):
|
||||
try:
|
||||
perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x,tools.benchmark(operation, p, tree))
|
||||
except profile_execution_failure:
|
||||
perf = 0
|
||||
y.append(0 if isinf(perf) else perf)
|
||||
best = optimizer.run(operation, x, context, prior=best)[0]
|
||||
if best not in profiles:
|
||||
profiles.append(best)
|
||||
for xx,yy in zip(X, Y):
|
||||
tree, _operands = tools.tree_of(operation, xx, context)
|
||||
time = tools.benchmark(operation, best, _tree)
|
||||
yy.append(performance(xx, time))
|
||||
#Update dataset
|
||||
X.append(x)
|
||||
y = [performance(x,tools.benchmark(operation, prf, tree)) for prf in profiles]
|
||||
Y.append(y)
|
||||
|
||||
#Save data
|
||||
for (fname, data) in zip(['X.csv', 'Y.csv', 'profiles.csv'], [X, Y, profiles]):
|
||||
with open(os.path.join(savepath, fname), 'wb') as f:
|
||||
csv.writer(f).writerows(data)
|
||||
|
||||
#print performance info in case no tuning was done
|
||||
if not retune:
|
||||
if not tune:
|
||||
row = Y[X.index(x)]
|
||||
self.progress_bar.update(1, 1, profiles[argmax(row)], max(row))
|
||||
self.progress_bar.set_finished()
|
||||
|
||||
#Remove unused profiles
|
||||
#Adding external profiles
|
||||
#~ for prf in tools.external_profiles(operation):
|
||||
#~ x = [1024, 1024, 1024]
|
||||
#~ tree, operands = tools.tree_of(operation, x, context)
|
||||
#~ print performance(x,tools.benchmark(operation, prf, tree))
|
||||
|
||||
#Pruning of useless profiles
|
||||
if len(Y[0]) > 1:
|
||||
unused = np.where(np.bincount(np.argmax(Y, 1))==0)[0]
|
||||
profiles = [p for ip,p in enumerate(profiles) if ip not in unused]
|
||||
Y = np.delete(Y, np.where(np.bincount(np.argmax(Y, 1))==0), axis=1).tolist()
|
||||
|
||||
##### Exportation #####
|
||||
#Exporting to JSON
|
||||
json_path = tools.sanitize(device.name) + '.json' if not self.json_path else self.json_path
|
||||
if os.path.isfile(json_path):
|
||||
json_data = json.load(open(json_path, 'r'))
|
||||
|
Reference in New Issue
Block a user