diff --git a/bench/blas.cpp b/bench/blas.cpp
index 63ff8d498..312ca0e7b 100644
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -92,7 +92,8 @@ void bench(ad::numeric_type dtype){
 {\
   std::vector<double> times;\
   double total_time = 0;\
-  while(total_time*1e-9 < 1e-2){\
+  queue.finish();\
+  while(total_time*1e-9 < 1e-3){\
     std::list<cl::Event> events;\
     OP;\
     queue.finish();\
@@ -107,11 +108,11 @@ void bench(ad::numeric_type dtype){
 {\
   std::vector<double> times;\
   double total_time = 0;\
-  while(total_time*1e-9 < 1e-2){\
+  while(total_time*1e-9 < 1e-3){\
     cl::Event event;\
     OP;\
     queue.finish();\
-    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
+    times.push_back(time_event(0, event));\
     total_time+=times.back();\
   }\
   double t = median(times);\
@@ -167,66 +168,34 @@ cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
 //  /*---------*/
 //  /*--BLAS1--*/
 //  /*---------*/
-//  std::cout << "#AXPY" << std::endl;
-//  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
-//  {
-//    std::cout << N;
-//    ad::array x(N, dtype), y(N, dtype);
-//    /* ATIDLAS */
-//    y = x + y; queue.finish();
-//    BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
-//    /* clAmdBlas */
-//#ifdef BENCH_CLAMDBLAS
-//    BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
-//#endif
-//    /* BLAS */
-//#ifdef BENCH_CBLAS
-//    std::vector<float> cx(N), cy(N);
-//    ad::copy(x, cx);
-//    ad::copy(y, cy);
-//    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
-//#endif
-//    /* CuBLAS */
-//#ifdef BENCH_CUBLAS
-//    T *cux, *cuy;
-//    cudaMalloc((void**) &cux, N * sizeof(T));
-//    cudaMalloc((void**) &cuy, N * sizeof(T));
-//    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
-//    cudaFree(cux);
-//    cudaFree(cuy);
-//#endif
-//    std::cout << std::endl;
-//  }
-//  std::cout << "\n\n" << std::flush;
-
-  std::cout << "#DOT" << std::endl;
+  std::cout << "#AXPY" << std::endl;
   for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
   {
     int_t N = BLAS1_N[i];
     std::cout << N;
-    /* ATIDLAS */
     ad::array x(N, dtype), y(N, dtype);
-    ad::array scratch(N, dtype);
-    ad::scalar s(dtype);
-    s = dot(x,y); queue.finish();
-    BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
+    /* ATIDLAS */
+    std::list<cl::Event> events;
+    y = x + y;
+    queue.finish();
+    BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
     /* clAmdBlas */
 #ifdef BENCH_CLAMDBLAS
-    BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
+    BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 #endif
     /* BLAS */
 #ifdef BENCH_CBLAS
     std::vector<float> cx(N), cy(N);
     ad::copy(x, cx);
     ad::copy(y, cy);
-    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 #endif
+    /* CuBLAS */
 #ifdef BENCH_CUBLAS
     T *cux, *cuy;
-    T result;
     cudaMalloc((void**) &cux, N * sizeof(T));
     cudaMalloc((void**) &cuy, N * sizeof(T));
-    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
+    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
     cudaFree(cux);
     cudaFree(cuy);
 #endif
@@ -234,6 +203,41 @@ cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
   }
   std::cout << "\n\n" << std::flush;
 
+//  std::cout << "#DOT" << std::endl;
+//  for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
+//  {
+//    int_t N = BLAS1_N[i];
+//    std::cout << N;
+//    /* ATIDLAS */
+//    ad::array x(N, dtype), y(N, dtype);
+//    ad::array scratch(N, dtype);
+//    ad::scalar s(dtype);
+//    s = dot(x,y); queue.finish();
+//    BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
+//    /* clAmdBlas */
+//#ifdef BENCH_CLAMDBLAS
+//    BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
+//#endif
+//    /* BLAS */
+//#ifdef BENCH_CBLAS
+//    std::vector<float> cx(N), cy(N);
+//    ad::copy(x, cx);
+//    ad::copy(y, cy);
+//    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+//#endif
+//#ifdef BENCH_CUBLAS
+//    T *cux, *cuy;
+//    T result;
+//    cudaMalloc((void**) &cux, N * sizeof(T));
+//    cudaMalloc((void**) &cuy, N * sizeof(T));
+//    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
+//    cudaFree(cux);
+//    cudaFree(cuy);
+//#endif
+//    std::cout << std::endl;
+//  }
+//  std::cout << "\n\n" << std::flush;
+
 //  /*---------*/
 //  /*--BLAS2--*/
 //  /*---------*/
diff --git a/lib/model/model.cpp b/lib/model/model.cpp
index d42f4a967..2b167aea9 100644
--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <numeric>
 
 #include "rapidjson/document.h"
 #include "atidlas/backend/parse.h"
@@ -21,6 +22,9 @@
 namespace atidlas
 {
 
+static double time_event(unsigned long sum, cl::Event const & e)
+{ return sum + e.getProfilingInfo<CL_PROFILING_COMMAND_END>() - e.getProfilingInfo<CL_PROFILING_COMMAND_START>(); }
+
 std::string model::define_extension(std::string const & extensions, std::string const & ext)
 {
@@ -92,32 +96,31 @@ model::model(std::vector< tools::shared_ptr > const & templates, cl::Comma
 model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
 {}
 
-void model::execute(controller const & expressions)
+void model::execute(controller const & expr)
 {
-  std::vector & compilers = init(expressions);
-  std::vector x = templates_[0]->input_sizes(expressions.x());
+  std::vector & compilers = init(expr);
+  std::vector x = templates_[0]->input_sizes(expr.x());
 
   //Specific tuning if requested
-  if(expressions.dispatcher_options().tune && hardcoded_.find(x)==hardcoded_.end())
+  if(expr.dispatcher_options().tune && hardcoded_.find(x)==hardcoded_.end())
   {
-    std::vector timings(templates_.size());
-    tools::timer timer;
+    std::vector timings(templates_.size());
     for(size_t i = 0 ; i < templates_.size() ; ++i)
     {
-      timer.start();
-      templates_[i]->enqueue(queue_, compilers, i, expressions);
+      std::list<cl::Event> events;
+      templates_[i]->enqueue(queue_, compilers, i, control(expr.x(), execution_options_type(0, &events)));
       queue_.finish();
-      timings[i] = timer.get();
+      timings[i] = 1e-9*std::accumulate(events.begin(), events.end(), 0, &time_event);
     }
     //Fill the override
-    std::vector x = templates_[0]->input_sizes(expressions.x());
+    std::vector x = templates_[0]->input_sizes(expr.x());
     hardcoded_[x] = std::distance(timings.begin(),std::min_element(timings.begin(), timings.end()));
   }
 
   //Prediction
   int label = 0;
-  if(expressions.dispatcher_options().label>=0)
-    label = expressions.dispatcher_options().label;
+  if(expr.dispatcher_options().label>=0)
+    label = expr.dispatcher_options().label;
   else if(hardcoded_.find(x)!=hardcoded_.end())
     label = hardcoded_.at(x);
   else
     if(predictor_.get())
@@ -127,7 +130,7 @@ void model::execute(controller const & expressions)
   }
 
   //Execution
-  return templates_[label]->enqueue(queue_, compilers, label, expressions);
+  return templates_[label]->enqueue(queue_, compilers, label, expr);
 }
 
 model::templates_container const & model::templates() const
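
For readers unfamiliar with the OpenCL event-profiling pattern this patch switches to (summing per-kernel CL_PROFILING_COMMAND_END minus CL_PROFILING_COMMAND_START over a list of events instead of reading a host timer), here is a minimal, self-contained sketch of the same idea outside the atidlas code base. It is an illustration only: `profile_seconds` and its `events` argument are hypothetical placeholders for whatever kernels the caller enqueued, the fold seed here is a double 0. (the patch itself folds from an `unsigned long` seed of 0), and the queue is assumed to have been created with CL_QUEUE_PROFILING_ENABLE.

#include <CL/cl.hpp>
#include <list>
#include <numeric>

// Adds one event's device time, in nanoseconds, to the running sum.
static double time_event(double sum, cl::Event const & e)
{
  return sum + e.getProfilingInfo<CL_PROFILING_COMMAND_END>()
             - e.getProfilingInfo<CL_PROFILING_COMMAND_START>();
}

// Total device time (in seconds) of all kernels recorded in `events`.
double profile_seconds(cl::CommandQueue & queue, std::list<cl::Event> const & events)
{
  queue.finish();  // profiling info is only valid once the commands have completed
  return 1e-9 * std::accumulate(events.begin(), events.end(), 0., &time_event);
}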
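Similarly, the modified BENCHMARK_* macros in bench/blas.cpp now repeat the operation until roughly one millisecond of measured time has accumulated and report the median run rather than a single sample. A rough stand-alone equivalent, with the macro's OP argument replaced by a hypothetical callable `op` that runs the operation once and returns its duration in nanoseconds, might look like this:

#include <algorithm>
#include <functional>
#include <vector>

// Repeat `op` until ~1e-3 s of measured time has accumulated
// (the new `while(total_time*1e-9 < 1e-3)` bound in the macros above),
// then return the median single-run time in nanoseconds.
double bench_median_ns(std::function<double()> const & op)
{
  std::vector<double> times;
  double total_time = 0;
  while(total_time*1e-9 < 1e-3)
  {
    times.push_back(op());
    total_time += times.back();
  }
  std::sort(times.begin(), times.end());
  return times[times.size()/2];
}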