Inference: now explicit tuning among top-5 kernel predictions
@@ -1,16 +0,0 @@
-
-set terminal pdf
-set output 'bench.pdf'
-
-set xlabel 'N'
-set ylabel 'Bandwidth (GB/s)'
-set key top left
-stats "out.dat" nooutput
-
-set logscale x
-do for [i=1:STATS_blocks]{
-plot "out.dat" index (i-1) using 1:2 with lines title 'ViennaCL', \
-     "out.dat" index (i-1) using 1:3 with lines title 'Model', \
-     "out.dat" index (i-1) using 1:4 with lines title 'Optimal', \
-     "out.dat" index (i-1) using 1:5 with lines title 'CuBLAS'
-}
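The deleted script produced one plot per data block of out.dat, reading N from column 1 and bandwidths for ViennaCL, the model's pick, the best measured kernel, and cuBLAS from columns 2-5; gnuplot's `index` and `STATS_blocks` iterate over blocks separated by two blank lines. A minimal sketch of a writer producing that layout (the struct and file handling here are illustrative, not part of the repository):

// Illustrative only: emit "out.dat" in the layout the removed gnuplot script
// consumed -- one block per benchmarked configuration, blocks separated by two
// blank lines, columns: N, ViennaCL, Model, Optimal, CuBLAS (GB/s).
#include <fstream>
#include <vector>

struct sample { long N; double viennacl, model, optimal, cublas; };

int main()
{
  std::vector<std::vector<sample>> blocks; // one inner vector per benchmark sweep
  // ... fill `blocks` from the benchmark harness ...
  std::ofstream out("out.dat");
  for(std::vector<sample> const & block : blocks)
  {
    for(sample const & s : block)
      out << s.N << ' ' << s.viennacl << ' ' << s.model << ' ' << s.optimal << ' ' << s.cublas << '\n';
    out << "\n\n"; // two blank records terminate a gnuplot data block
  }
  return 0;
}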
@@ -60,7 +60,7 @@ public:
 private:
   templates_container templates_;
   std::shared_ptr<predictors::random_forest> predictor_;
-  std::map<std::vector<int_t>, int> hardcoded_;
+  std::map<std::vector<int_t>, int> labels_;
   driver::CommandQueue queue_;
   driver::ProgramCache & cache_;
 };

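The only change to the class itself is the rename of hardcoded_ to labels_: a map from an expression's input-size vector to the index of the template to run, now used as a plain lookup-or-insert cache rather than a user-supplied override. That usage pattern, stripped of the surrounding machinery (class and method names here are illustrative):

// Illustrative stand-in for the labels_ member: caches, per input-size key,
// the index of the kernel template found fastest for those sizes.
#include <cstdint>
#include <map>
#include <vector>

using int_t = std::int64_t;

class kernel_label_cache
{
public:
  // Returns the cached template index for these sizes, or -1 when nothing is cached yet.
  int find(std::vector<int_t> const & sizes) const
  {
    auto it = labels_.find(sizes);
    return it == labels_.end() ? -1 : it->second;
  }

  // Records the template index selected after benchmarking the top predictions.
  void insert(std::vector<int_t> const & sizes, int label)
  {
    labels_.insert({sizes, label});
  }

private:
  std::map<std::vector<int_t>, int> labels_;
};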
@@ -84,54 +84,44 @@ profiles::value_type::value_type(numeric_type dtype, std::shared_ptr<templates::
 
 void profiles::value_type::execute(runtime::execution_handler const & expr)
 {
+  static const int MAX_TEMPORARY_WORKSPACE = 1e6;
   driver::Program const & program = init(expr);
   std::vector<int_t> x = templates_[0]->input_sizes(expr.x());
-  static const int MAX_TEMPORARY_WORKSPACE = 1e6;
 
-  //Specific tuning if requested
-  if(expr.dispatcher_options().tune && hardcoded_.find(x)==hardcoded_.end())
-  {
-    std::vector<double> timings(templates_.size());
-    for(unsigned int i = 0 ; i < templates_.size() ; ++i)
-    {
-      if(templates_[i]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE){
-        timings[i] = INFINITY;
-        continue;
-      }
-      std::list<driver::Event> events;
-      try{
-        templates_[i]->enqueue(queue_, program, tools::to_string(i), runtime::execution_handler(expr.x(), runtime::execution_options_type(0, &events)));
-        queue_.synchronize();
-        timings[i] = 1e-9*std::accumulate(events.begin(), events.end(), 0, &time_event);
-      }catch(...){
-        timings[i] = INFINITY;
-      }
-    }
-    //Fill the override
-    std::vector<int_t> x = templates_[0]->input_sizes(expr.x());
-    hardcoded_[x] = std::distance(timings.begin(),std::min_element(timings.begin(), timings.end()));
-  }
-
-  //Prediction
-  int label = 0;
-  if(expr.dispatcher_options().label>=0)
-    label = expr.dispatcher_options().label;
-  else if(hardcoded_.find(x)!=hardcoded_.end())
-    label = hardcoded_.at(x);
-  else if(predictor_.get())
-  {
-    std::vector<float> predictions = predictor_->predict(x);
-    do{
-      label = std::distance(predictions.begin(),std::max_element(predictions.begin(), predictions.end()));
-      predictions[label] = 0;
-    }while(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE);
-  }
-
-  //Execution
-  if(templates_[label]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE)
-    throw operation_not_supported_exception("Running this operation would require an overly large temporary.");
-
-  return templates_[label]->enqueue(queue_, program, tools::to_string(label), expr);
+  //Cached
+  auto it = labels_.find(x);
+  if(it!=labels_.end()){
+    templates_[it->second]->enqueue(queue_, program, tools::to_string(it->second), expr);
+    return;
+  }
+
+  //Not cached
+  std::vector<double> times;
+  std::vector<float> perf = predictor_->predict(x);
+  std::vector<size_t> idx(perf.size());
+  std::iota(idx.begin(), idx.end(), 0);
+  std::sort(idx.begin(), idx.end(), [&perf](size_t i1, size_t i2) {return perf[i1] > perf[i2];});
+  bool valid_found = false;
+  for(size_t k = 0 ; k < std::min<size_t>(idx.size(), 5) || !valid_found ; k++){
+    size_t i = idx[k];
+    if(templates_[i]->temporary_workspace(expr.x()) > MAX_TEMPORARY_WORKSPACE){
+      times.push_back(INFINITY);
+      continue;
+    }
+    std::list<driver::Event> events;
+    try{
+      templates_[i]->enqueue(queue_, program, tools::to_string(i), runtime::execution_handler(expr.x(), runtime::execution_options_type(0, &events)));
+      queue_.synchronize();
+      times.push_back(1e-9*std::accumulate(events.begin(), events.end(), 0, &time_event));
+      valid_found = true;
+    }catch(...){
+      times.push_back(INFINITY);
+    }
+  }
+
+  //Fill the override
+  size_t label = idx[std::distance(times.begin(),std::min_element(times.begin(), times.end()))];
+  labels_.insert({x, label});
+  templates_[label]->enqueue(queue_, program, tools::to_string(label), expr);
 }
 
 profiles::value_type::templates_container const & profiles::value_type::templates() const
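The rewritten execute() first checks labels_; on a miss it asks the random forest for a score per template, ranks the template indices by descending score, times the top five candidates that fit the temporary-workspace budget (continuing further down the ranking until at least one ran), caches the fastest, and enqueues it. The selection step in isolation, with the benchmarking abstracted behind a callback (function and parameter names here are illustrative, not the library's API):

// Illustrative sketch of the new selection logic, not the ISAAC API.
// benchmark(i) returns the measured execution time of template i, or INFINITY
// if that template cannot run for these sizes. Assumes predicted_perf is non-empty.
#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <vector>

size_t select_fastest_of_top_k(std::vector<float> const & predicted_perf,
                               std::function<double(size_t)> const & benchmark,
                               size_t top_k = 5)
{
  // Rank template indices by predicted performance, best first.
  std::vector<size_t> idx(predicted_perf.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(),
            [&](size_t a, size_t b){ return predicted_perf[a] > predicted_perf[b]; });

  // Time the top-k candidates; keep going past k until at least one of them ran.
  std::vector<double> times;
  bool valid_found = false;
  for(size_t k = 0; k < idx.size() && (k < std::min(idx.size(), top_k) || !valid_found); ++k)
  {
    double t = benchmark(idx[k]);
    times.push_back(t);
    if(std::isfinite(t))
      valid_found = true;
  }

  // The fastest measured candidate, mapped back to its template index.
  size_t best = std::distance(times.begin(), std::min_element(times.begin(), times.end()));
  return idx[best];
}

Unlike the loop in the diff, the sketch also bounds k by idx.size(), so the fallback scan cannot run past the end of the ranking when every candidate fails.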