C++: Added clBLAS sGEMM ABI (still buggy)

This commit is contained in:
Philippe Tillet
2015-06-24 07:51:27 -07:00
parent c61eaceb21
commit 9f7e34ba5d
10 changed files with 9928 additions and 140 deletions

View File

@@ -11,12 +11,12 @@ if(CUDA_FOUND)
endif() endif()
#CLAMDBLAS #CLAMDBLAS
find_package(CLAMDBLAS) #find_package(CLAMDBLAS)
if(CLAMDBLAS_FOUND) #if(CLAMDBLAS_FOUND)
set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CLBLAS") set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CLBLAS")
include_directories(${CLAMDBLAS_INCLUDE_DIR}) #include_directories(${CLAMDBLAS_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${CLAMDBLAS_LIBRARIES} ) #set(BLAS_LIBS ${BLAS_LIBS} ${CLAMDBLAS_LIBRARIES} )
endif() #endif()
#CBLAS #CBLAS
find_package(MKL) find_package(MKL)

View File

@@ -2,7 +2,7 @@
#include "isaac/symbolic/execute.h" #include "isaac/symbolic/execute.h"
#include "isaac/tools/timer.hpp" #include "isaac/tools/timer.hpp"
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS
#include "clBLAS.h" #include "isaac/wrap/clBLAS.h"
#endif #endif
#ifdef BENCH_CBLAS #ifdef BENCH_CBLAS
#include "cblas.h" #include "cblas.h"
@@ -166,127 +166,127 @@ void bench(ad::numeric_type dtype, std::string operation)
// /*--BLAS1--*/ // /*--BLAS1--*/
// /*---------*/ // /*---------*/
if(operation=="axpy") // if(operation=="axpy")
{ // {
for(int_t N: create_log_range(1e3, 2e7, 50, 64)) // for(int_t N: create_log_range(1e3, 2e7, 50, 64))
{ // {
std::cout << N; // std::cout << N;
ad::array x(N, dtype), y(N, dtype); // ad::array x(N, dtype), y(N, dtype);
/* ISAAC */ // /* ISAAC */
std::list<ad::driver::Event> events;\ // std::list<ad::driver::Event> events;\
BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t) // BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t) // BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
/* clblas */ // /* clblas */
#ifdef BENCH_CLBLAS // #ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t) // BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
#endif // #endif
/* BLAS */ // /* BLAS */
#ifdef BENCH_CBLAS // #ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N); // std::vector<float> cx(N), cy(N);
ad::copy(x, cx); // ad::copy(x, cx);
ad::copy(y, cy); // ad::copy(y, cy);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t); // BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif // #endif
/* CuBLAS */ // /* CuBLAS */
#ifdef BENCH_CUBLAS // #ifdef BENCH_CUBLAS
T *cux, *cuy; // T *cux, *cuy;
cudaMalloc((void**) &cux, N * sizeof(T)); // cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T)); // cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t) // BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux); // cudaFree(cux);
cudaFree(cuy); // cudaFree(cuy);
#endif // #endif
std::cout << std::endl; // std::cout << std::endl;
} // }
} // }
if(operation=="dot") // if(operation=="dot")
{ // {
for(int_t N: create_log_range(1e3, 2e7, 50, 64)) // for(int_t N: create_log_range(1e3, 2e7, 50, 64))
{ // {
std::cout << N; // std::cout << N;
/* ISAAC */ // /* ISAAC */
ad::array x(N, dtype), y(N, dtype); // ad::array x(N, dtype), y(N, dtype);
ad::array scratch(N, dtype); // ad::array scratch(N, dtype);
ad::scalar s(dtype); // ad::scalar s(dtype);
s = dot(x,y); queue.synchronize(); // s = dot(x,y); queue.synchronize();
BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t) // BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
/* clblas */ // /* clblas */
#ifdef BENCH_CLBLAS // #ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t) // BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
#endif // #endif
/* BLAS */ // /* BLAS */
#ifdef BENCH_CBLAS // #ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N); // std::vector<float> cx(N), cy(N);
ad::copy(x, cx); // ad::copy(x, cx);
ad::copy(y, cy); // ad::copy(y, cy);
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t); // BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
#endif // #endif
#ifdef BENCH_CUBLAS // #ifdef BENCH_CUBLAS
T *cux, *cuy; // T *cux, *cuy;
T result; // T result;
cudaMalloc((void**) &cux, N * sizeof(T)); // cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T)); // cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t) // BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
cudaFree(cux); // cudaFree(cux);
cudaFree(cuy); // cudaFree(cuy);
#endif // #endif
std::cout << std::endl; // std::cout << std::endl;
} // }
std::cout << "\n\n" << std::flush; // std::cout << "\n\n" << std::flush;
} // }
if(operation.substr(0, 4)=="gemv") // if(operation.substr(0, 4)=="gemv")
{ // {
std::vector<std::tuple<int_t, int_t> > MNs; // std::vector<std::tuple<int_t, int_t> > MNs;
MNs.push_back(std::make_tuple(896,896)); // MNs.push_back(std::make_tuple(896,896));
MNs.push_back(std::make_tuple(3072,3072)); // MNs.push_back(std::make_tuple(3072,3072));
MNs.push_back(std::make_tuple(64,32000)); // MNs.push_back(std::make_tuple(64,32000));
MNs.push_back(std::make_tuple(896,32000)); // MNs.push_back(std::make_tuple(896,32000));
MNs.push_back(std::make_tuple(32000, 64)); // MNs.push_back(std::make_tuple(32000, 64));
MNs.push_back(std::make_tuple(32000, 896)); // MNs.push_back(std::make_tuple(32000, 896));
/*---------*/ // /*---------*/
/*--BLAS2--*/ // /*--BLAS2--*/
/*---------*/ // /*---------*/
//T-layout // //T-layout
for(std::tuple<int_t, int_t> MN: MNs) // for(std::tuple<int_t, int_t> MN: MNs)
{ // {
int_t M = std::get<0>(MN); // int_t M = std::get<0>(MN);
int_t N = std::get<1>(MN); // int_t N = std::get<1>(MN);
std::cout << M << "," << N; // std::cout << M << "," << N;
/* ISAAC */ // /* ISAAC */
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype); // ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
#if HAS_A_BLAS // #if HAS_A_BLAS
int_t lda = A.ld(); // int_t lda = A.ld();
#endif // #endif
y = dot(trans(A),x); queue.synchronize(); // y = dot(trans(A),x); queue.synchronize();
BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t); // BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t);
BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t); // BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLBLAS // #ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t) // BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
#endif // #endif
#ifdef BENCH_CBLAS // #ifdef BENCH_CBLAS
std::vector<float> cA(N*M), cx(N), cy(M); // std::vector<float> cA(N*M), cx(N), cy(M);
ad::copy(x, cx); // ad::copy(x, cx);
ad::copy(y, cy); // ad::copy(y, cy);
ad::copy(A, cA); // ad::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t); // BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
#endif // #endif
#ifdef BENCH_CUBLAS // #ifdef BENCH_CUBLAS
T *cuA, *cux, *cuy; // T *cuA, *cux, *cuy;
cudaMalloc((void**) &cuA, N * M * sizeof(T)); // cudaMalloc((void**) &cuA, N * M * sizeof(T));
cudaMalloc((void**) &cux, N * sizeof(T)); // cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, M * sizeof(T)); // cudaMalloc((void**) &cuy, M * sizeof(T));
BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t) // BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
cudaFree(cuA); // cudaFree(cuA);
cudaFree(cux); // cudaFree(cux);
cudaFree(cuy); // cudaFree(cuy);
#endif // #endif
std::cout << std::endl; // std::cout << std::endl;
} // }
std::cout << "\n\n" << std::flush; // std::cout << "\n\n" << std::flush;
} // }
if(operation.substr(0,4)=="gemm") if(operation.substr(0,4)=="gemm")
{ {
@@ -312,7 +312,7 @@ void bench(ad::numeric_type dtype, std::string operation)
#if HAS_A_BLAS #if HAS_A_BLAS
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld(); int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
#endif #endif
BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t); // BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t);
//BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t); //BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t);
/* clblas */ /* clblas */
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS

View File

@@ -17,14 +17,12 @@ class array: public array_base
public: public:
//1D Constructors //1D Constructors
array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context()); array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context());
array(int_t size1, numeric_type dtype, driver::Buffer data);
template<typename DT> template<typename DT>
array(std::vector<DT> const & data, driver::Context context = driver::queues.default_context()); array(std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
array(array & v, slice const & s1); array(array & v, slice const & s1);
//2D Constructors //2D Constructors
array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context()); array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context());
// array(int_t size1, int_t size2, numeric_type dtype, cl_mem data);
template<typename DT> template<typename DT>
array(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context context = driver::queues.default_context()); array(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
array(array & M, slice const & s1, slice const & s2); array(array & M, slice const & s1, slice const & s2);
@@ -33,7 +31,7 @@ public:
array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context()); array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context());
//General constructor //General constructor
array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld, driver::Context context = driver::queues.default_context()); array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld);
array(array_expression const & proxy); array(array_expression const & proxy);
array(array const &); array(array const &);
@@ -105,7 +103,7 @@ private:
void inject(values_holder&) const; void inject(values_holder&) const;
template<class T> T cast() const; template<class T> T cast() const;
public: public:
explicit scalar(numeric_type dtype, driver::Buffer const & data, int_t offset, driver::Context context = driver::queues.default_context()); explicit scalar(numeric_type dtype, driver::Buffer const & data, int_t offset);
explicit scalar(value_scalar value, driver::Context context = driver::queues.default_context()); explicit scalar(value_scalar value, driver::Context context = driver::queues.default_context());
explicit scalar(numeric_type dtype, driver::Context context = driver::queues.default_context()); explicit scalar(numeric_type dtype, driver::Context context = driver::queues.default_context());
scalar(array_expression const & proxy); scalar(array_expression const & proxy);

View File

@@ -14,8 +14,11 @@ class Event
{ {
friend class CommandQueue; friend class CommandQueue;
public: public:
Event(cl::Event const & event);
Event(backend_type backend); Event(backend_type backend);
long elapsed_time() const; long elapsed_time() const;
operator cl::Event();
private: private:
backend_type backend_; backend_type backend_;
#ifdef ISAAC_WITH_CUDA #ifdef ISAAC_WITH_CUDA

View File

@@ -220,7 +220,12 @@ private:
struct execution_options_type struct execution_options_type
{ {
execution_options_type(unsigned int _queue_id = 0, std::list<driver::Event>* _events = NULL, std::vector<driver::Event>* _dependencies = NULL) : execution_options_type(unsigned int _queue_id = 0, std::list<driver::Event>* _events = NULL, std::vector<driver::Event>* _dependencies = NULL) :
events(_events), dependencies(_dependencies), queue_id_(_queue_id){} events(_events), dependencies(_dependencies), queue_id_(_queue_id)
{}
execution_options_type(driver::CommandQueue const & queue, std::list<driver::Event> *_events = NULL, std::vector<driver::Event> *_dependencies = NULL) :
events(_events), dependencies(_dependencies), queue_(new driver::CommandQueue(queue))
{}
void enqueue(driver::Context const & context, driver::Kernel const & kernel, driver::NDRange global, driver::NDRange local) const void enqueue(driver::Context const & context, driver::Kernel const & kernel, driver::NDRange global, driver::NDRange local) const
{ {

View File

@@ -0,0 +1,53 @@
/* ************************************************************************
* Copyright 2013 Advanced Micro Devices, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************/
#ifndef CLBLAS_COMPLEX_H_
#define CLBLAS_COMPLEX H_
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_float2 FloatComplex;
typedef cl_double2 DoubleComplex;
static __inline FloatComplex
floatComplex(float real, float imag)
{
FloatComplex z;
z.s[0] = real;
z.s[1] = imag;
return z;
}
static __inline DoubleComplex
doubleComplex(double real, double imag)
{
DoubleComplex z;
z.s[0] = real;
z.s[1] = imag;
return z;
}
#define CREAL(v) ((v).s[0])
#define CIMAG(v) ((v).s[1])
#ifdef __cplusplus
} /* extern "C" { */
#endif
#endif /* CLBLAS_COMPLEX_H_ */

9654
include/isaac/wrap/clBLAS.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -19,11 +19,6 @@ array::array(int_t shape0, numeric_type dtype, driver::Context context) :
context_(context), data_(context_, size_of(dtype)*dsize()) context_(context), data_(context_, size_of(dtype)*dsize())
{ } { }
array::array(int_t shape0, numeric_type dtype, driver::Buffer data) :
dtype_(dtype), shape_(shape0, 1, 1, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
context_(data.context()), data_(data)
{ }
template<class DT> template<class DT>
array::array(std::vector<DT> const & x, driver::Context context): array::array(std::vector<DT> const & x, driver::Context context):
dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]), dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
@@ -90,9 +85,9 @@ INSTANTIATE(double);
#undef INSTANTIATE #undef INSTANTIATE
// General // General
array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld, driver::Context context): array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld):
dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride), dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride),
ld_(ld), context_(context), data_(data) ld_(ld), context_(data.context()), data_(data)
{ } { }
array::array(array_expression const & proxy) : array(control(proxy)){} array::array(array_expression const & proxy) : array(control(proxy)){}
@@ -236,13 +231,13 @@ array_expression array::T() const
scalar array::operator [](int_t idx) scalar array::operator [](int_t idx)
{ {
assert(nshape()==1); assert(nshape()==1);
return scalar(dtype_, data_, idx, context_); return scalar(dtype_, data_, idx);
} }
const scalar array::operator [](int_t idx) const const scalar array::operator [](int_t idx) const
{ {
assert(nshape()==1); assert(nshape()==1);
return scalar(dtype_, data_, idx, context_); return scalar(dtype_, data_, idx);
} }
@@ -268,7 +263,7 @@ void copy(driver::Context & ctx, driver::Buffer const & data, T value)
} }
scalar::scalar(numeric_type dtype, const driver::Buffer &data, int_t offset, driver::Context context): array(dtype, data, _(offset, offset+1), _(1,2), 1, context) scalar::scalar(numeric_type dtype, const driver::Buffer &data, int_t offset): array(dtype, data, _(offset, offset+1), _(1,2), 1)
{ } { }
scalar::scalar(value_scalar value, driver::Context context) : array(1, value.dtype(), context) scalar::scalar(value_scalar value, driver::Context context) : array(1, value.dtype(), context)

View File

@@ -21,6 +21,11 @@ Event::Event(backend_type backend) : backend_(backend), h_(backend_)
} }
} }
Event::Event(cl::Event const & event) : backend_(OPENCL), h_(backend_)
{
*h_.cl = event;
}
long Event::elapsed_time() const long Event::elapsed_time() const
{ {
switch(backend_) switch(backend_)
@@ -38,6 +43,10 @@ long Event::elapsed_time() const
} }
} }
Event::operator cl::Event()
{
return *h_.cl;
}
} }
} }

71
lib/wrap/clBLAS.cpp Normal file
View File

@@ -0,0 +1,71 @@
#include "CL/cl.hpp"
#include "isaac/wrap/clBLAS.h"
#include "isaac/array.h"
namespace is = isaac;
extern "C"
{
clblasStatus clblasSetup()
{
return clblasSuccess;
}
void clblasTeardown()
{
}
clblasStatus clblasSgemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,
size_t M, size_t N, size_t K,
cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
const cl_mem mB, size_t offB, size_t ldb, cl_float beta,
cl_mem mC, size_t offC, size_t ldc,
cl_uint numCommandQueues, cl_command_queue *commandQueues,
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
{
is::int_t As1 = M, As2 = K;
is::int_t Bs1 = K, Bs2 = N;
if(transA==clblasTrans) std::swap(As1, As2);
if(transB==clblasTrans) std::swap(Bs1, Bs2);
is::array A(is::FLOAT_TYPE, cl::Buffer(mA), is::_(offA%lda, As1, 1), is::_(offA/lda, As2, 1), lda);
is::array B(is::FLOAT_TYPE, cl::Buffer(mB), is::_(offB%ldb, Bs1, 1), is::_(offB/ldb, Bs2, 1), ldb);
is::array C(is::FLOAT_TYPE, cl::Buffer(mC), is::_(offC%ldc, M, 1), is::_(offC/ldc, N, 1), ldc);
std::vector<is::driver::Event> waitlist;
for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i)
waitlist.push_back(cl::Event(eventWaitList[i]));
std::list<is::driver::Event> levents;
for(cl_uint i = 0 ; i < numCommandQueues ; ++i)
{
clRetainCommandQueue(commandQueues[i]);
cl::CommandQueue clqueue(commandQueues[i]);
is::driver::CommandQueue queue(clqueue);
is::execution_options_type opt(queue, &levents, &waitlist);
if(transA==clblasTrans && transB==clblasTrans)
C = is::control(alpha*dot(A.T(), B.T()) + beta*C, opt);
else if(transA==clblasTrans && transB==clblasNoTrans)
C = is::control(alpha*dot(A.T(), B) + beta*C, opt);
else if(transA==clblasNoTrans && transB==clblasTrans)
C = is::control(alpha*dot(A, B.T()) + beta*C, opt);
else
C = is::control(alpha*dot(A, B) + beta*C, opt);
}
if(events)
*events = static_cast<cl::Event>(levents.front())();
std::cout << events << std::endl;
return clblasSuccess;
}
}