Fix bug in operation-specific tuning
@@ -92,7 +92,8 @@ void bench(ad::numeric_type dtype){
{\
std::vector<long> times;\
double total_time = 0;\
while(total_time*1e-9 < 1e-2){\
queue.finish();\
while(total_time*1e-9 < 1e-3){\
std::list<cl::Event> events;\
OP;\
queue.finish();\
@@ -107,11 +108,11 @@ void bench(ad::numeric_type dtype){
{\
std::vector<long> times;\
double total_time = 0;\
while(total_time*1e-9 < 1e-2){\
while(total_time*1e-9 < 1e-3){\
cl::Event event;\
OP;\
queue.finish();\
times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
times.push_back(time_event(0, event));\
total_time+=times.back();\
}\
double t = median(times);\
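
The two hunks above rework the benchmark macros: instead of timestamping on the host, each run now records an OpenCL event and keeps its device-side duration, taking the median once roughly a millisecond of kernel time has accumulated. A minimal standalone sketch of that pattern, assuming a queue created with CL_QUEUE_PROFILING_ENABLE (the names event_ns and median_ns are illustrative, not from the patch):

#include <CL/cl.hpp>
#include <algorithm>
#include <vector>

// Device-side duration of one kernel launch, in nanoseconds.
static long event_ns(cl::Event const & e)
{
  return e.getProfilingInfo<CL_PROFILING_COMMAND_END>()
       - e.getProfilingInfo<CL_PROFILING_COMMAND_START>();
}

// Re-run an operation until ~1 ms of device time accumulates, then
// report the median sample, mirroring the macro bodies above.
template<class EnqueueOp>
long median_ns(cl::CommandQueue & queue, EnqueueOp op)
{
  std::vector<long> times;
  double total_time = 0;
  queue.finish();                      // drain pending work first
  while(total_time*1e-9 < 1e-3)
  {
    cl::Event event;
    op(event);                         // the OP; must record into event
    queue.finish();
    times.push_back(event_ns(event));
    total_time += times.back();
  }
  std::sort(times.begin(), times.end());
  return times[times.size()/2];
}

Reading CL_PROFILING_COMMAND_START/END measures only device execution, so queue submission overhead no longer pollutes the samples.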
@@ -167,66 +168,34 @@ cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
// /*---------*/
// /*--BLAS1--*/
// /*---------*/
// std::cout << "#AXPY" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// std::cout << N;
// ad::array x(N, dtype), y(N, dtype);
// /* ATIDLAS */
// y = x + y; queue.finish();
// BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
//#endif
// /* CuBLAS */
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;

std::cout << "#DOT" << std::endl;
std::cout << "#AXPY" << std::endl;
for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
{
int_t N = BLAS1_N[i];
std::cout << N;
/* ATIDLAS */
ad::array x(N, dtype), y(N, dtype);
ad::array scratch(N, dtype);
ad::scalar s(dtype);
s = dot(x,y); queue.finish();
BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
/* ATIDLAS */
std::list<cl::Event> events;\
y = x + y;
queue.finish();
BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
/* clAmdBlas */
#ifdef BENCH_CLAMDBLAS
BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
T *cux, *cuy;
T result;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
@@ -234,6 +203,41 @@ cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
}
std::cout << "\n\n" << std::flush;
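
Throughput note: the second argument of each BENCHMARK_* invocation is the figure reported per operation. With t the measured time in nanoseconds and dtsize the element size in bytes, bytes/ns is numerically GB/s; AXPY touches 3*N elements (read x, read y, write y) while DOT reads 2*N, hence 3*N*dtsize/t and 2*N*dtsize/t. A one-liner making the unit explicit (the helper name is illustrative):

// bytes moved / nanoseconds == GB/s.
// AXPY: 3*N elements (read x, read y, write y); DOT: 2*N reads.
double bandwidth_gbs(long elements, long dtsize, double t_ns)
{ return elements*dtsize/t_ns; }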

// std::cout << "#DOT" << std::endl;
// for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
// {
// int_t N = BLAS1_N[i];
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// s = dot(x,y); queue.finish();
// BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
//#endif
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// T result;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;

// /*---------*/
// /*--BLAS2--*/
// /*---------*/

@@ -2,6 +2,7 @@
#include <fstream>
#include <stdexcept>
#include <algorithm>
#include <numeric>

#include "rapidjson/document.h"
#include "atidlas/backend/parse.h"
@@ -21,6 +22,9 @@
namespace atidlas
{

static double time_event(unsigned long sum, cl::Event const & e)
{ return sum + e.getProfilingInfo<CL_PROFILING_COMMAND_END>() - e.getProfilingInfo<CL_PROFILING_COMMAND_START>();}
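
time_event is deliberately shaped as a fold step, (accumulator, event) -> accumulator, so it can be handed straight to std::accumulate over a list of profiling events; model::execute below relies on exactly that. A minimal sketch of the intended use (total_seconds is an illustrative name, not from the patch; note that an initial value of 0. keeps the fold in double, whereas the patch's literal 0 deduces an int accumulator):

#include <list>
#include <numeric>

// Sum the device-side durations of a batch of kernels, in seconds.
static double total_seconds(std::list<cl::Event> const & events)
{ return 1e-9*std::accumulate(events.begin(), events.end(), 0., &time_event); }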

std::string model::define_extension(std::string const & extensions, std::string const & ext)
{
@@ -92,32 +96,31 @@ model::model(std::vector< tools::shared_ptr<base> > const & templates, cl::Comma
model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
{}

void model::execute(controller<expressions_tuple> const & expressions)
void model::execute(controller<expressions_tuple> const & expr)
{
std::vector<cl_ext::lazy_compiler> & compilers = init(expressions);
std::vector<int_t> x = templates_[0]->input_sizes(expressions.x());
std::vector<cl_ext::lazy_compiler> & compilers = init(expr);
std::vector<int_t> x = templates_[0]->input_sizes(expr.x());

//Specific tuning if requested
if(expressions.dispatcher_options().tune && hardcoded_.find(x)==hardcoded_.end())
if(expr.dispatcher_options().tune && hardcoded_.find(x)==hardcoded_.end())
{
std::vector<float> timings(templates_.size());
tools::timer timer;
std::vector<double> timings(templates_.size());
for(size_t i = 0 ; i < templates_.size() ; ++i)
{
timer.start();
templates_[i]->enqueue(queue_, compilers, i, expressions);
std::list<cl::Event> events;
templates_[i]->enqueue(queue_, compilers, i, control(expr.x(), execution_options_type(0, &events)));
queue_.finish();
timings[i] = timer.get();
timings[i] = 1e-9*std::accumulate(events.begin(), events.end(), 0, &time_event);
}
//Fill the override
std::vector<int_t> x = templates_[0]->input_sizes(expressions.x());
std::vector<int_t> x = templates_[0]->input_sizes(expr.x());
hardcoded_[x] = std::distance(timings.begin(),std::min_element(timings.begin(), timings.end()));
}

//Prediction
int label = 0;
if(expressions.dispatcher_options().label>=0)
label = expressions.dispatcher_options().label;
if(expr.dispatcher_options().label>=0)
label = expr.dispatcher_options().label;
else if(hardcoded_.find(x)!=hardcoded_.end())
label = hardcoded_.at(x);
else if(predictor_.get())
@@ -127,7 +130,7 @@ void model::execute(controller<expressions_tuple> const & expressions)
}

//Execution
return templates_[label]->enqueue(queue_, compilers, label, expressions);
return templates_[label]->enqueue(queue_, compilers, label, expr);
}

model::templates_container const & model::templates() const
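
This is the bug named in the commit title: the old tuning path wrapped the enqueue call in a host-side tools::timer, so it measured submission overhead rather than kernel execution, and it enqueued without collecting any events. The fix re-enqueues each template through a controller carrying an events list and scores it by the summed event durations. The resulting tune-and-cache dispatch, restated as a standalone sketch with hypothetical types (Key, hardcoded, tuned_label are illustrative, not the library's API):

#include <algorithm>
#include <map>
#include <vector>

using Key = std::vector<long>;        // input sizes of the expression
static std::map<Key, int> hardcoded;  // size -> index of fastest template

// Time every candidate once with device events, cache the argmin for
// this input size, and reuse the cached winner on later calls.
template<class Candidates, class TimeOne>
int tuned_label(Key const & x, Candidates const & c, TimeOne time_one)
{
  auto it = hardcoded.find(x);
  if(it != hardcoded.end())
    return it->second;
  std::vector<double> timings(c.size());
  for(size_t i = 0 ; i < c.size() ; ++i)
    timings[i] = time_one(c[i]);      // summed event durations, seconds
  int best = (int)std::distance(timings.begin(), std::min_element(timings.begin(), timings.end()));
  hardcoded[x] = best;
  return best;
}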