From b32de3ac76d8ae0c8efe50cf402ae933e63f0b0e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 25 Jun 2015 08:12:16 -0700 Subject: [PATCH] C++: More clBLAS routines --- bench/blas.cpp | 262 +++++++++++++++++++++--------------------- include/isaac/array.h | 8 +- lib/array.cpp | 35 ++++-- lib/wrap/clBLAS.cpp | 183 +++++++++++++++++++++++------ 4 files changed, 310 insertions(+), 178 deletions(-) diff --git a/bench/blas.cpp b/bench/blas.cpp index 4959697e2..c263d3077 100644 --- a/bench/blas.cpp +++ b/bench/blas.cpp @@ -99,6 +99,7 @@ void bench(ad::numeric_type dtype, std::string operation) double total_time = 0;\ while(total_time*1e-9 < 1e-3){\ cl::Event event;\ + flush = ad::zeros(1e6, 1, dtype);\ OP;\ queue.synchronize();\ times.push_back(event.getProfilingInfo() - event.getProfilingInfo());\ @@ -146,8 +147,7 @@ void bench(ad::numeric_type dtype, std::string operation) ad::array flush(1e6, dtype); std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl; std::cout << "N"; - std::cout << "\tISAAC (predictive)"; - std::cout << "\tISAAC (optimal)"; + std::cout << "\tISAAC"; #ifdef BENCH_CLBLAS std::cout << "\tclBLAS"; #endif @@ -162,142 +162,143 @@ void bench(ad::numeric_type dtype, std::string operation) // RUN BENCHMARKS // -// /*---------*/ -// /*--BLAS1--*/ -// /*---------*/ + /*---------*/ + /*--BLAS1--*/ + /*---------*/ -// if(operation=="axpy") -// { -// for(int_t N: create_log_range(1e3, 2e7, 50, 64)) -// { -// std::cout << N; -// ad::array x(N, dtype), y(N, dtype); -// /* ISAAC */ -// std::list events;\ -// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t) -// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t) -// /* clblas */ -// #ifdef BENCH_CLBLAS -// BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t) -// #endif -// /* BLAS */ -// #ifdef BENCH_CBLAS -// std::vector cx(N), cy(N); -// ad::copy(x, cx); -// ad::copy(y, cy); -// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t); -// #endif -// /* CuBLAS */ -// #ifdef BENCH_CUBLAS -// T *cux, *cuy; -// cudaMalloc((void**) &cux, N * sizeof(T)); -// cudaMalloc((void**) &cuy, N * sizeof(T)); -// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t) -// cudaFree(cux); -// cudaFree(cuy); -// #endif -// std::cout << std::endl; -// } -// } + if(operation=="axpy") + { + float alpha = 1; + for(int_t N: create_log_range(1e3, 2e7, 50, 64)) + { + std::cout << N; + ad::array x(N, dtype), y(N, dtype); + /* ISAAC */ + std::list events;\ + BENCHMARK_ISAAC(y = ad::control(x + alpha*y, ad::execution_options_type(0, &events)), 3*N*dtsize/t) + /* clblas */ + #ifdef BENCH_CLBLAS + BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t) + #endif + /* BLAS */ + #ifdef BENCH_CBLAS + std::vector cx(N), cy(N); + ad::copy(x, cx); + ad::copy(y, cy); + BENCHMARK_HOST(cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t); + #endif + /* CuBLAS */ + #ifdef BENCH_CUBLAS + T *cux, *cuy; + cudaMalloc((void**) &cux, N * sizeof(T)); + cudaMalloc((void**) &cuy, N * sizeof(T)); + BENCHMARK_CUDA(cublasSaxpy(N, alpha, cux, 1, cuy, 1), 3*N*dtsize/t) + cudaFree(cux); + cudaFree(cuy); + #endif + std::cout << std::endl; + } + } -// if(operation=="dot") -// { -// for(int_t N: create_log_range(1e3, 2e7, 50, 64)) -// { -// std::cout << N; -// /* ISAAC */ -// ad::array x(N, dtype), y(N, dtype); -// ad::array scratch(N, dtype); -// ad::scalar s(dtype); -// s = dot(x,y); queue.synchronize(); -// BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t) -// /* clblas */ -// #ifdef BENCH_CLBLAS -// BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t) -// #endif -// /* BLAS */ -// #ifdef BENCH_CBLAS -// std::vector cx(N), cy(N); -// ad::copy(x, cx); -// ad::copy(y, cy); -// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t); -// #endif -// #ifdef BENCH_CUBLAS -// T *cux, *cuy; -// T result; -// cudaMalloc((void**) &cux, N * sizeof(T)); -// cudaMalloc((void**) &cuy, N * sizeof(T)); -// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t) -// cudaFree(cux); -// cudaFree(cuy); -// #endif -// std::cout << std::endl; -// } -// std::cout << "\n\n" << std::flush; -// } + if(operation=="dot") + { + for(int_t N: create_log_range(1e3, 2e7, 50, 64)) + { + std::cout << N; + /* ISAAC */ + ad::array x(N, dtype), y(N, dtype); + ad::array scratch(N, dtype); + ad::scalar s(dtype); + s = dot(x,y); queue.synchronize(); + BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events)), 2*N*dtsize/t) + /* clblas */ + #ifdef BENCH_CLBLAS + BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t) + #endif + /* BLAS */ + #ifdef BENCH_CBLAS + std::vector cx(N), cy(N); + ad::copy(x, cx); + ad::copy(y, cy); + BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t); + #endif + #ifdef BENCH_CUBLAS + T *cux, *cuy; + T result; + cudaMalloc((void**) &cux, N * sizeof(T)); + cudaMalloc((void**) &cuy, N * sizeof(T)); + BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t) + cudaFree(cux); + cudaFree(cuy); + #endif + std::cout << std::endl; + } + std::cout << "\n\n" << std::flush; + } -// if(operation.substr(0, 4)=="gemv") -// { -// std::vector > MNs; -// MNs.push_back(std::make_tuple(896,896)); -// MNs.push_back(std::make_tuple(3072,3072)); -// MNs.push_back(std::make_tuple(64,32000)); -// MNs.push_back(std::make_tuple(896,32000)); -// MNs.push_back(std::make_tuple(32000, 64)); -// MNs.push_back(std::make_tuple(32000, 896)); + if(operation.substr(0, 4)=="gemv") + { + std::vector > MNs; + MNs.push_back(std::make_tuple(896,896)); + MNs.push_back(std::make_tuple(3072,3072)); + MNs.push_back(std::make_tuple(64,32000)); + MNs.push_back(std::make_tuple(896,32000)); + MNs.push_back(std::make_tuple(32000, 64)); + MNs.push_back(std::make_tuple(32000, 896)); -// /*---------*/ -// /*--BLAS2--*/ -// /*---------*/ -// //T-layout -// for(std::tuple MN: MNs) -// { -// int_t M = std::get<0>(MN); -// int_t N = std::get<1>(MN); -// std::cout << M << "," << N; -// /* ISAAC */ -// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype); -// #if HAS_A_BLAS -// int_t lda = A.ld(); -// #endif -// y = dot(trans(A),x); queue.synchronize(); -// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t); -// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t); -// #ifdef BENCH_CLBLAS -// BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t) -// #endif -// #ifdef BENCH_CBLAS -// std::vector cA(N*M), cx(N), cy(M); -// ad::copy(x, cx); -// ad::copy(y, cy); -// ad::copy(A, cA); -// BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t); -// #endif -// #ifdef BENCH_CUBLAS -// T *cuA, *cux, *cuy; -// cudaMalloc((void**) &cuA, N * M * sizeof(T)); -// cudaMalloc((void**) &cux, N * sizeof(T)); -// cudaMalloc((void**) &cuy, M * sizeof(T)); -// BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t) -// cudaFree(cuA); -// cudaFree(cux); -// cudaFree(cuy); -// #endif -// std::cout << std::endl; -// } -// std::cout << "\n\n" << std::flush; -// } + /*---------*/ + /*--BLAS2--*/ + /*---------*/ + //T-layout + for(std::tuple MN: MNs) + { + int_t M = std::get<0>(MN); + int_t N = std::get<1>(MN); + std::cout << M << "," << N; + /* ISAAC */ + ad::array A(N, M, dtype), y(M, dtype), x(N, dtype); + #if HAS_A_BLAS + int_t lda = A.ld(); + #endif + y = dot(trans(A),x); queue.synchronize(); + BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t); + #ifdef BENCH_CLBLAS + BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t) + #endif + #ifdef BENCH_CBLAS + std::vector cA(N*M), cx(N), cy(M); + ad::copy(x, cx); + ad::copy(y, cy); + ad::copy(A, cA); + BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t); + #endif + #ifdef BENCH_CUBLAS + T *cuA, *cux, *cuy; + cudaMalloc((void**) &cuA, N * M * sizeof(T)); + cudaMalloc((void**) &cux, N * sizeof(T)); + cudaMalloc((void**) &cuy, M * sizeof(T)); + BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t) + cudaFree(cuA); + cudaFree(cux); + cudaFree(cuy); + #endif + std::cout << std::endl; + } + std::cout << "\n\n" << std::flush; + } if(operation.substr(0,4)=="gemm") { std::vector > MNKs; - MNKs.push_back(std::make_tuple(896,896,896)); - MNKs.push_back(std::make_tuple(3072,3072,3072)); - MNKs.push_back(std::make_tuple(1024,64,768)); - MNKs.push_back(std::make_tuple(768,64,128)); - MNKs.push_back(std::make_tuple(64,64,32000)); - MNKs.push_back(std::make_tuple(1024,1024,32000)); +// MNKs.push_back(std::make_tuple(896,896,896)); +// MNKs.push_back(std::make_tuple(3072,3072,3072)); +// MNKs.push_back(std::make_tuple(1024,64,768)); +// MNKs.push_back(std::make_tuple(768,64,128)); +// MNKs.push_back(std::make_tuple(64,64,32000)); +// MNKs.push_back(std::make_tuple(1024,1024,32000)); + for(unsigned int N = 1 ; N <10 ; ++N) + MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N)); /*---------*/ /*--BLAS3--*/ /*---------*/ @@ -312,8 +313,7 @@ void bench(ad::numeric_type dtype, std::string operation) #if HAS_A_BLAS int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld(); #endif -// BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t); - //BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t); + BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events)), (double)2*M*N*K/t); /* clblas */ #ifdef BENCH_CLBLAS BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb, diff --git a/include/isaac/array.h b/include/isaac/array.h index 5ef23b16f..ca3955085 100644 --- a/include/isaac/array.h +++ b/include/isaac/array.h @@ -14,15 +14,22 @@ class scalar; class array: public array_base { +protected: + //Slices + array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld); + public: //1D Constructors array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context()); + array(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc); + template array(std::vector
const & data, driver::Context context = driver::queues.default_context()); array(array & v, slice const & s1); //2D Constructors array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context()); + array(int_t size1, int_t size2, numeric_type dtype, driver::Buffer data, int_t start, int_t ld); template array(int_t size1, int_t size2, std::vector
const & data, driver::Context context = driver::queues.default_context()); array(array & M, slice const & s1, slice const & s2); @@ -31,7 +38,6 @@ public: array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context()); //General constructor - array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld); array(array_expression const & proxy); array(array const &); diff --git a/lib/array.cpp b/lib/array.cpp index 7c1777f4b..6756a46d0 100644 --- a/lib/array.cpp +++ b/lib/array.cpp @@ -19,6 +19,11 @@ array::array(int_t shape0, numeric_type dtype, driver::Context context) : context_(context), data_(context_, size_of(dtype)*dsize()) { } +array::array(int_t shape0, numeric_type dtype, driver::Buffer data, int_t start, int_t inc): + dtype_(dtype), shape_(shape0), start_(start, 0, 0, 0), stride_(inc), ld_(shape_[0]), context_(data.context()), data_(data) +{ } + + template array::array(std::vector
const & x, driver::Context context): dtype_(to_numeric_type
::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]), @@ -45,21 +50,20 @@ INSTANTIATE(double); #undef INSTANTIATE // 2D -array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, 1, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0), +array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0), context_(context), data_(context_, size_of(dtype_)*dsize()) {} +array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Buffer data, int_t start, int_t ld) : + dtype_(dtype), shape_(shape0, shape1), start_(start%ld, start/ld, 0, 0), stride_(1, 1, 1, 1), ld_(ld), context_(data.context()), data_(data) +{ } + array::array(array & M, slice const & s0, slice const & s1) : dtype_(M.dtype_), shape_(s0.size, s1.size, 1, 1), start_(M.start_[0] + M.stride_[0]*s0.start, M.start_[1] + M.stride_[1]*s1.start, 0, 0), stride_(M.stride_[0]*s0.stride, M.stride_[1]*s1.stride, 1, 1), ld_(M.ld_), context_(M.data_.context()), data_(M.data_) { } -// 3D -array::array(int_t shape0, int_t shape1, int_t shape2, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, shape2, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0), - context_(context), data_(context_, size_of(dtype_)*dsize()) -{} - template array::array(int_t shape0, int_t shape1, std::vector
const & data, driver::Context context) : dtype_(to_numeric_type
::value), @@ -69,6 +73,19 @@ array::array(int_t shape0, int_t shape1, std::vector
const & data, driver::C isaac::copy(data, *this); } +// 3D +array::array(int_t shape0, int_t shape1, int_t shape2, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, shape2, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0), + context_(context), data_(context_, size_of(dtype_)*dsize()) +{} + +//Slices +array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld): + dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride), + ld_(ld), context_(data.context()), data_(data) +{ } + + + #define INSTANTIATE(T) template array::array(int_t, int_t, std::vector const &, driver::Context) INSTANTIATE(char); INSTANTIATE(unsigned char); @@ -84,12 +101,6 @@ INSTANTIATE(float); INSTANTIATE(double); #undef INSTANTIATE -// General -array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld): - dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride), - ld_(ld), context_(data.context()), data_(data) -{ } - array::array(array_expression const & proxy) : array(control(proxy)){} array::array(array const & other) : array(control(other)){} diff --git a/lib/wrap/clBLAS.cpp b/lib/wrap/clBLAS.cpp index 9c61b214d..d719843bd 100644 --- a/lib/wrap/clBLAS.cpp +++ b/lib/wrap/clBLAS.cpp @@ -2,6 +2,7 @@ #include "isaac/wrap/clBLAS.h" #include "isaac/array.h" +#include "isaac/symbolic/execute.h" namespace is = isaac; @@ -18,6 +19,136 @@ extern "C" } + void execute(is::array_expression const & operation, is::driver::Context const & context, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, + cl_event *events) + { + std::vector waitlist; + for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i) + waitlist.push_back(cl::Event(eventWaitList[i])); + for(cl_uint i = 0 ; i < numCommandQueues ; ++i) + { + std::list levents; + is::driver::CommandQueue queue(cl::CommandQueue(commandQueues[i])); + clRetainCommandQueue(commandQueues[i]); + is::execution_options_type options(queue, &levents, &waitlist); + is::execute(is::control(operation, options), is::models(options.queue(context))); + if(events) + { + events[i] = static_cast(levents.front())(); + clRetainEvent(events[i]); + } + } + } + + //***************** + //BLAS1 + //***************** + clblasStatus clblasSaxpy(size_t N, cl_float alpha, + const cl_mem mx, size_t offx, int incx, + cl_mem my, size_t offy, int incy, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, + cl_event *events) + { + is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx); + clRetainMemObject(mx); + is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy); + clRetainMemObject(my); + execute(is::detail::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + return clblasSuccess; + } + + clblasStatus clblasSscal(size_t N, cl_float alpha, + cl_mem mx, size_t offx, int incx, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) + { + is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx); + clRetainMemObject(mx); + execute(is::detail::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + return clblasSuccess; + } + + clblasStatus clblasScopy(size_t N, + const cl_mem mx, size_t offx, int incx, + cl_mem my, size_t offy, int incy, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) + { + const is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx); + clRetainMemObject(mx); + is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy); + clRetainMemObject(my); + execute(is::detail::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + return clblasSuccess; + } + + clblasStatus clblasSdot(size_t N, cl_mem dotProduct, size_t offDP, + const cl_mem mx, size_t offx, int incx, + const cl_mem my, size_t offy, int incy, + cl_mem /*scratchBuff*/, cl_uint numCommandQueues, + cl_command_queue *commandQueues, cl_uint numEventsInWaitList, + const cl_event *eventWaitList, cl_event *events) + { + is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx); + clRetainMemObject(mx); + is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy); + clRetainMemObject(my); + is::scalar s(is::FLOAT_TYPE, cl::Buffer(dotProduct), offDP); + clRetainMemObject(dotProduct); + execute(is::detail::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + return clblasSuccess; + } + + clblasStatus clblasSasum(size_t N, cl_mem asum, size_t offAsum, + const cl_mem mx, size_t offx, int incx, + cl_mem /*scratchBuff*/, cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) + { + is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx); + clRetainMemObject(mx); + is::scalar s(is::FLOAT_TYPE, cl::Buffer(asum), offAsum); + clRetainMemObject(asum); + execute(is::detail::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + return clblasSuccess; + } + + //***************** + //BLAS2 + //***************** + clblasStatus clblasSgemv(clblasOrder order, clblasTranspose transA, + size_t M, size_t N, + cl_float alpha, const cl_mem mA, size_t offA, size_t lda, + const cl_mem mx, size_t offx, int incx, + cl_float beta, cl_mem my, size_t offy, int incy, + cl_uint numCommandQueues, cl_command_queue *commandQueues, + cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) + { + //A + is::int_t As1 = M, As2 = N; + if(transA==clblasTrans) std::swap(As1, As2); + is::array A(As1, As2, is::FLOAT_TYPE, cl::Buffer(mA), offA, lda); + clRetainMemObject(mA); + //x + is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx); + clRetainMemObject(mx); + //y + is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy); + clRetainMemObject(my); + //Operation + is::driver::Context const & context = A.context(); + if(transA==clblasTrans) + execute(is::detail::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + else + execute(is::detail::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + return clblasSuccess; + } + + //***************** + //BLAS3 + //***************** clblasStatus clblasSgemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem mA, size_t offA, size_t lda, @@ -28,42 +159,26 @@ extern "C" { is::int_t As1 = M, As2 = K; is::int_t Bs1 = K, Bs2 = N; - + //Trans if(transA==clblasTrans) std::swap(As1, As2); if(transB==clblasTrans) std::swap(Bs1, Bs2); - - is::array A(is::FLOAT_TYPE, cl::Buffer(mA), is::_(offA%lda, As1, 1), is::_(offA/lda, As2, 1), lda); - is::array B(is::FLOAT_TYPE, cl::Buffer(mB), is::_(offB%ldb, Bs1, 1), is::_(offB/ldb, Bs2, 1), ldb); - is::array C(is::FLOAT_TYPE, cl::Buffer(mC), is::_(offC%ldc, M, 1), is::_(offC/ldc, N, 1), ldc); - - std::vector waitlist; - for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i) - waitlist.push_back(cl::Event(eventWaitList[i])); - - std::list levents; - - for(cl_uint i = 0 ; i < numCommandQueues ; ++i) - { - clRetainCommandQueue(commandQueues[i]); - cl::CommandQueue clqueue(commandQueues[i]); - is::driver::CommandQueue queue(clqueue); - - - is::execution_options_type opt(queue, &levents, &waitlist); - - if(transA==clblasTrans && transB==clblasTrans) - C = is::control(alpha*dot(A.T(), B.T()) + beta*C, opt); - else if(transA==clblasTrans && transB==clblasNoTrans) - C = is::control(alpha*dot(A.T(), B) + beta*C, opt); - else if(transA==clblasNoTrans && transB==clblasTrans) - C = is::control(alpha*dot(A, B.T()) + beta*C, opt); - else - C = is::control(alpha*dot(A, B) + beta*C, opt); - } - - if(events) - *events = static_cast(levents.front())(); - std::cout << events << std::endl; + //Struct + is::array A(As1, As2, is::FLOAT_TYPE, cl::Buffer(mA), offA, lda); + clRetainMemObject(mA); + is::array B(Bs1, Bs2, is::FLOAT_TYPE, cl::Buffer(mB), offB, ldb); + clRetainMemObject(mB); + is::array C(M, N, is::FLOAT_TYPE, cl::Buffer(mC), offC, ldc); + clRetainMemObject(mC); + is::driver::Context const & context = C.context(); + //Operation + if(transA==clblasTrans && transB==clblasTrans) + execute(is::detail::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + else if(transA==clblasTrans && transB==clblasNoTrans) + execute(is::detail::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + else if(transA==clblasNoTrans && transB==clblasTrans) + execute(is::detail::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); + else + execute(is::detail::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return clblasSuccess; }