C++: More clBLAS routines
This commit is contained in:
262
bench/blas.cpp
262
bench/blas.cpp
@@ -99,6 +99,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
double total_time = 0;\
|
double total_time = 0;\
|
||||||
while(total_time*1e-9 < 1e-3){\
|
while(total_time*1e-9 < 1e-3){\
|
||||||
cl::Event event;\
|
cl::Event event;\
|
||||||
|
flush = ad::zeros(1e6, 1, dtype);\
|
||||||
OP;\
|
OP;\
|
||||||
queue.synchronize();\
|
queue.synchronize();\
|
||||||
times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
|
times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
|
||||||
@@ -146,8 +147,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
ad::array flush(1e6, dtype);
|
ad::array flush(1e6, dtype);
|
||||||
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
|
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
|
||||||
std::cout << "N";
|
std::cout << "N";
|
||||||
std::cout << "\tISAAC (predictive)";
|
std::cout << "\tISAAC";
|
||||||
std::cout << "\tISAAC (optimal)";
|
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
std::cout << "\tclBLAS";
|
std::cout << "\tclBLAS";
|
||||||
#endif
|
#endif
|
||||||
@@ -162,142 +162,143 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
// RUN BENCHMARKS
|
// RUN BENCHMARKS
|
||||||
//
|
//
|
||||||
|
|
||||||
// /*---------*/
|
/*---------*/
|
||||||
// /*--BLAS1--*/
|
/*--BLAS1--*/
|
||||||
// /*---------*/
|
/*---------*/
|
||||||
|
|
||||||
// if(operation=="axpy")
|
if(operation=="axpy")
|
||||||
// {
|
{
|
||||||
// for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
float alpha = 1;
|
||||||
// {
|
for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
||||||
// std::cout << N;
|
{
|
||||||
// ad::array x(N, dtype), y(N, dtype);
|
std::cout << N;
|
||||||
// /* ISAAC */
|
ad::array x(N, dtype), y(N, dtype);
|
||||||
// std::list<ad::driver::Event> events;\
|
/* ISAAC */
|
||||||
// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
|
std::list<ad::driver::Event> events;\
|
||||||
// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
|
BENCHMARK_ISAAC(y = ad::control(x + alpha*y, ad::execution_options_type(0, &events)), 3*N*dtsize/t)
|
||||||
// /* clblas */
|
/* clblas */
|
||||||
// #ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
// BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
|
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
|
||||||
// #endif
|
#endif
|
||||||
// /* BLAS */
|
/* BLAS */
|
||||||
// #ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
// std::vector<float> cx(N), cy(N);
|
std::vector<float> cx(N), cy(N);
|
||||||
// ad::copy(x, cx);
|
ad::copy(x, cx);
|
||||||
// ad::copy(y, cy);
|
ad::copy(y, cy);
|
||||||
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
|
BENCHMARK_HOST(cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
|
||||||
// #endif
|
#endif
|
||||||
// /* CuBLAS */
|
/* CuBLAS */
|
||||||
// #ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
// T *cux, *cuy;
|
T *cux, *cuy;
|
||||||
// cudaMalloc((void**) &cux, N * sizeof(T));
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
||||||
// cudaMalloc((void**) &cuy, N * sizeof(T));
|
cudaMalloc((void**) &cuy, N * sizeof(T));
|
||||||
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
|
BENCHMARK_CUDA(cublasSaxpy(N, alpha, cux, 1, cuy, 1), 3*N*dtsize/t)
|
||||||
// cudaFree(cux);
|
cudaFree(cux);
|
||||||
// cudaFree(cuy);
|
cudaFree(cuy);
|
||||||
// #endif
|
#endif
|
||||||
// std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
// }
|
}
|
||||||
// }
|
}
|
||||||
|
|
||||||
// if(operation=="dot")
|
if(operation=="dot")
|
||||||
// {
|
{
|
||||||
// for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
||||||
// {
|
{
|
||||||
// std::cout << N;
|
std::cout << N;
|
||||||
// /* ISAAC */
|
/* ISAAC */
|
||||||
// ad::array x(N, dtype), y(N, dtype);
|
ad::array x(N, dtype), y(N, dtype);
|
||||||
// ad::array scratch(N, dtype);
|
ad::array scratch(N, dtype);
|
||||||
// ad::scalar s(dtype);
|
ad::scalar s(dtype);
|
||||||
// s = dot(x,y); queue.synchronize();
|
s = dot(x,y); queue.synchronize();
|
||||||
// BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
|
BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events)), 2*N*dtsize/t)
|
||||||
// /* clblas */
|
/* clblas */
|
||||||
// #ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
// BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
|
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
|
||||||
// #endif
|
#endif
|
||||||
// /* BLAS */
|
/* BLAS */
|
||||||
// #ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
// std::vector<float> cx(N), cy(N);
|
std::vector<float> cx(N), cy(N);
|
||||||
// ad::copy(x, cx);
|
ad::copy(x, cx);
|
||||||
// ad::copy(y, cy);
|
ad::copy(y, cy);
|
||||||
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
|
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
|
||||||
// #endif
|
#endif
|
||||||
// #ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
// T *cux, *cuy;
|
T *cux, *cuy;
|
||||||
// T result;
|
T result;
|
||||||
// cudaMalloc((void**) &cux, N * sizeof(T));
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
||||||
// cudaMalloc((void**) &cuy, N * sizeof(T));
|
cudaMalloc((void**) &cuy, N * sizeof(T));
|
||||||
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
|
BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
|
||||||
// cudaFree(cux);
|
cudaFree(cux);
|
||||||
// cudaFree(cuy);
|
cudaFree(cuy);
|
||||||
// #endif
|
#endif
|
||||||
// std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
// }
|
}
|
||||||
// std::cout << "\n\n" << std::flush;
|
std::cout << "\n\n" << std::flush;
|
||||||
// }
|
}
|
||||||
|
|
||||||
// if(operation.substr(0, 4)=="gemv")
|
if(operation.substr(0, 4)=="gemv")
|
||||||
// {
|
{
|
||||||
// std::vector<std::tuple<int_t, int_t> > MNs;
|
std::vector<std::tuple<int_t, int_t> > MNs;
|
||||||
// MNs.push_back(std::make_tuple(896,896));
|
MNs.push_back(std::make_tuple(896,896));
|
||||||
// MNs.push_back(std::make_tuple(3072,3072));
|
MNs.push_back(std::make_tuple(3072,3072));
|
||||||
// MNs.push_back(std::make_tuple(64,32000));
|
MNs.push_back(std::make_tuple(64,32000));
|
||||||
// MNs.push_back(std::make_tuple(896,32000));
|
MNs.push_back(std::make_tuple(896,32000));
|
||||||
// MNs.push_back(std::make_tuple(32000, 64));
|
MNs.push_back(std::make_tuple(32000, 64));
|
||||||
// MNs.push_back(std::make_tuple(32000, 896));
|
MNs.push_back(std::make_tuple(32000, 896));
|
||||||
|
|
||||||
// /*---------*/
|
/*---------*/
|
||||||
// /*--BLAS2--*/
|
/*--BLAS2--*/
|
||||||
// /*---------*/
|
/*---------*/
|
||||||
// //T-layout
|
//T-layout
|
||||||
// for(std::tuple<int_t, int_t> MN: MNs)
|
for(std::tuple<int_t, int_t> MN: MNs)
|
||||||
// {
|
{
|
||||||
// int_t M = std::get<0>(MN);
|
int_t M = std::get<0>(MN);
|
||||||
// int_t N = std::get<1>(MN);
|
int_t N = std::get<1>(MN);
|
||||||
// std::cout << M << "," << N;
|
std::cout << M << "," << N;
|
||||||
// /* ISAAC */
|
/* ISAAC */
|
||||||
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
|
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
|
||||||
// #if HAS_A_BLAS
|
#if HAS_A_BLAS
|
||||||
// int_t lda = A.ld();
|
int_t lda = A.ld();
|
||||||
// #endif
|
#endif
|
||||||
// y = dot(trans(A),x); queue.synchronize();
|
y = dot(trans(A),x); queue.synchronize();
|
||||||
// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t);
|
BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
|
||||||
// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
|
#ifdef BENCH_CLBLAS
|
||||||
// #ifdef BENCH_CLBLAS
|
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
|
||||||
// BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
|
#endif
|
||||||
// #endif
|
#ifdef BENCH_CBLAS
|
||||||
// #ifdef BENCH_CBLAS
|
std::vector<float> cA(N*M), cx(N), cy(M);
|
||||||
// std::vector<float> cA(N*M), cx(N), cy(M);
|
ad::copy(x, cx);
|
||||||
// ad::copy(x, cx);
|
ad::copy(y, cy);
|
||||||
// ad::copy(y, cy);
|
ad::copy(A, cA);
|
||||||
// ad::copy(A, cA);
|
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
|
||||||
// BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
|
#endif
|
||||||
// #endif
|
#ifdef BENCH_CUBLAS
|
||||||
// #ifdef BENCH_CUBLAS
|
T *cuA, *cux, *cuy;
|
||||||
// T *cuA, *cux, *cuy;
|
cudaMalloc((void**) &cuA, N * M * sizeof(T));
|
||||||
// cudaMalloc((void**) &cuA, N * M * sizeof(T));
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
||||||
// cudaMalloc((void**) &cux, N * sizeof(T));
|
cudaMalloc((void**) &cuy, M * sizeof(T));
|
||||||
// cudaMalloc((void**) &cuy, M * sizeof(T));
|
BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
|
||||||
// BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
|
cudaFree(cuA);
|
||||||
// cudaFree(cuA);
|
cudaFree(cux);
|
||||||
// cudaFree(cux);
|
cudaFree(cuy);
|
||||||
// cudaFree(cuy);
|
#endif
|
||||||
// #endif
|
std::cout << std::endl;
|
||||||
// std::cout << std::endl;
|
}
|
||||||
// }
|
std::cout << "\n\n" << std::flush;
|
||||||
// std::cout << "\n\n" << std::flush;
|
}
|
||||||
// }
|
|
||||||
|
|
||||||
if(operation.substr(0,4)=="gemm")
|
if(operation.substr(0,4)=="gemm")
|
||||||
{
|
{
|
||||||
std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
|
std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
|
||||||
MNKs.push_back(std::make_tuple(896,896,896));
|
// MNKs.push_back(std::make_tuple(896,896,896));
|
||||||
MNKs.push_back(std::make_tuple(3072,3072,3072));
|
// MNKs.push_back(std::make_tuple(3072,3072,3072));
|
||||||
MNKs.push_back(std::make_tuple(1024,64,768));
|
// MNKs.push_back(std::make_tuple(1024,64,768));
|
||||||
MNKs.push_back(std::make_tuple(768,64,128));
|
// MNKs.push_back(std::make_tuple(768,64,128));
|
||||||
MNKs.push_back(std::make_tuple(64,64,32000));
|
// MNKs.push_back(std::make_tuple(64,64,32000));
|
||||||
MNKs.push_back(std::make_tuple(1024,1024,32000));
|
// MNKs.push_back(std::make_tuple(1024,1024,32000));
|
||||||
|
|
||||||
|
for(unsigned int N = 1 ; N <10 ; ++N)
|
||||||
|
MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
|
||||||
/*---------*/
|
/*---------*/
|
||||||
/*--BLAS3--*/
|
/*--BLAS3--*/
|
||||||
/*---------*/
|
/*---------*/
|
||||||
@@ -312,8 +313,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
#if HAS_A_BLAS
|
#if HAS_A_BLAS
|
||||||
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
|
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
|
||||||
#endif
|
#endif
|
||||||
// BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t);
|
BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events)), (double)2*M*N*K/t);
|
||||||
//BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
|
||||||
/* clblas */
|
/* clblas */
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
|
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
|
||||||
|
@@ -14,15 +14,22 @@ class scalar;
|
|||||||
|
|
||||||
class array: public array_base
|
class array: public array_base
|
||||||
{
|
{
|
||||||
|
protected:
|
||||||
|
//Slices
|
||||||
|
array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
//1D Constructors
|
//1D Constructors
|
||||||
array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context());
|
array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context());
|
||||||
|
array(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc);
|
||||||
|
|
||||||
template<typename DT>
|
template<typename DT>
|
||||||
array(std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
|
array(std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
|
||||||
array(array & v, slice const & s1);
|
array(array & v, slice const & s1);
|
||||||
|
|
||||||
//2D Constructors
|
//2D Constructors
|
||||||
array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context());
|
array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context());
|
||||||
|
array(int_t size1, int_t size2, numeric_type dtype, driver::Buffer data, int_t start, int_t ld);
|
||||||
template<typename DT>
|
template<typename DT>
|
||||||
array(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
|
array(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
|
||||||
array(array & M, slice const & s1, slice const & s2);
|
array(array & M, slice const & s1, slice const & s2);
|
||||||
@@ -31,7 +38,6 @@ public:
|
|||||||
array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context());
|
array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context());
|
||||||
|
|
||||||
//General constructor
|
//General constructor
|
||||||
array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld);
|
|
||||||
|
|
||||||
array(array_expression const & proxy);
|
array(array_expression const & proxy);
|
||||||
array(array const &);
|
array(array const &);
|
||||||
|
@@ -19,6 +19,11 @@ array::array(int_t shape0, numeric_type dtype, driver::Context context) :
|
|||||||
context_(context), data_(context_, size_of(dtype)*dsize())
|
context_(context), data_(context_, size_of(dtype)*dsize())
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
array::array(int_t shape0, numeric_type dtype, driver::Buffer data, int_t start, int_t inc):
|
||||||
|
dtype_(dtype), shape_(shape0), start_(start, 0, 0, 0), stride_(inc), ld_(shape_[0]), context_(data.context()), data_(data)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
|
||||||
template<class DT>
|
template<class DT>
|
||||||
array::array(std::vector<DT> const & x, driver::Context context):
|
array::array(std::vector<DT> const & x, driver::Context context):
|
||||||
dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
|
dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
|
||||||
@@ -45,21 +50,20 @@ INSTANTIATE(double);
|
|||||||
#undef INSTANTIATE
|
#undef INSTANTIATE
|
||||||
|
|
||||||
// 2D
|
// 2D
|
||||||
array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, 1, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
|
array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
|
||||||
context_(context), data_(context_, size_of(dtype_)*dsize())
|
context_(context), data_(context_, size_of(dtype_)*dsize())
|
||||||
{}
|
{}
|
||||||
|
|
||||||
|
array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Buffer data, int_t start, int_t ld) :
|
||||||
|
dtype_(dtype), shape_(shape0, shape1), start_(start%ld, start/ld, 0, 0), stride_(1, 1, 1, 1), ld_(ld), context_(data.context()), data_(data)
|
||||||
|
{ }
|
||||||
|
|
||||||
array::array(array & M, slice const & s0, slice const & s1) : dtype_(M.dtype_), shape_(s0.size, s1.size, 1, 1),
|
array::array(array & M, slice const & s0, slice const & s1) : dtype_(M.dtype_), shape_(s0.size, s1.size, 1, 1),
|
||||||
start_(M.start_[0] + M.stride_[0]*s0.start, M.start_[1] + M.stride_[1]*s1.start, 0, 0),
|
start_(M.start_[0] + M.stride_[0]*s0.start, M.start_[1] + M.stride_[1]*s1.start, 0, 0),
|
||||||
stride_(M.stride_[0]*s0.stride, M.stride_[1]*s1.stride, 1, 1), ld_(M.ld_),
|
stride_(M.stride_[0]*s0.stride, M.stride_[1]*s1.stride, 1, 1), ld_(M.ld_),
|
||||||
context_(M.data_.context()), data_(M.data_)
|
context_(M.data_.context()), data_(M.data_)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
// 3D
|
|
||||||
array::array(int_t shape0, int_t shape1, int_t shape2, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, shape2, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
|
|
||||||
context_(context), data_(context_, size_of(dtype_)*dsize())
|
|
||||||
{}
|
|
||||||
|
|
||||||
template<typename DT>
|
template<typename DT>
|
||||||
array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::Context context)
|
array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::Context context)
|
||||||
: dtype_(to_numeric_type<DT>::value),
|
: dtype_(to_numeric_type<DT>::value),
|
||||||
@@ -69,6 +73,19 @@ array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::C
|
|||||||
isaac::copy(data, *this);
|
isaac::copy(data, *this);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 3D
|
||||||
|
array::array(int_t shape0, int_t shape1, int_t shape2, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, shape2, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
|
||||||
|
context_(context), data_(context_, size_of(dtype_)*dsize())
|
||||||
|
{}
|
||||||
|
|
||||||
|
//Slices
|
||||||
|
array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld):
|
||||||
|
dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride),
|
||||||
|
ld_(ld), context_(data.context()), data_(data)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define INSTANTIATE(T) template array::array(int_t, int_t, std::vector<T> const &, driver::Context)
|
#define INSTANTIATE(T) template array::array(int_t, int_t, std::vector<T> const &, driver::Context)
|
||||||
INSTANTIATE(char);
|
INSTANTIATE(char);
|
||||||
INSTANTIATE(unsigned char);
|
INSTANTIATE(unsigned char);
|
||||||
@@ -84,12 +101,6 @@ INSTANTIATE(float);
|
|||||||
INSTANTIATE(double);
|
INSTANTIATE(double);
|
||||||
#undef INSTANTIATE
|
#undef INSTANTIATE
|
||||||
|
|
||||||
// General
|
|
||||||
array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld):
|
|
||||||
dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride),
|
|
||||||
ld_(ld), context_(data.context()), data_(data)
|
|
||||||
{ }
|
|
||||||
|
|
||||||
array::array(array_expression const & proxy) : array(control(proxy)){}
|
array::array(array_expression const & proxy) : array(control(proxy)){}
|
||||||
array::array(array const & other) : array(control(other)){}
|
array::array(array const & other) : array(control(other)){}
|
||||||
|
|
||||||
|
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "isaac/wrap/clBLAS.h"
|
#include "isaac/wrap/clBLAS.h"
|
||||||
#include "isaac/array.h"
|
#include "isaac/array.h"
|
||||||
|
#include "isaac/symbolic/execute.h"
|
||||||
|
|
||||||
namespace is = isaac;
|
namespace is = isaac;
|
||||||
|
|
||||||
@@ -18,6 +19,136 @@ extern "C"
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void execute(is::array_expression const & operation, is::driver::Context const & context,
|
||||||
|
cl_uint numCommandQueues, cl_command_queue *commandQueues,
|
||||||
|
cl_uint numEventsInWaitList, const cl_event *eventWaitList,
|
||||||
|
cl_event *events)
|
||||||
|
{
|
||||||
|
std::vector<is::driver::Event> waitlist;
|
||||||
|
for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i)
|
||||||
|
waitlist.push_back(cl::Event(eventWaitList[i]));
|
||||||
|
for(cl_uint i = 0 ; i < numCommandQueues ; ++i)
|
||||||
|
{
|
||||||
|
std::list<is::driver::Event> levents;
|
||||||
|
is::driver::CommandQueue queue(cl::CommandQueue(commandQueues[i]));
|
||||||
|
clRetainCommandQueue(commandQueues[i]);
|
||||||
|
is::execution_options_type options(queue, &levents, &waitlist);
|
||||||
|
is::execute(is::control(operation, options), is::models(options.queue(context)));
|
||||||
|
if(events)
|
||||||
|
{
|
||||||
|
events[i] = static_cast<cl::Event>(levents.front())();
|
||||||
|
clRetainEvent(events[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//*****************
|
||||||
|
//BLAS1
|
||||||
|
//*****************
|
||||||
|
clblasStatus clblasSaxpy(size_t N, cl_float alpha,
|
||||||
|
const cl_mem mx, size_t offx, int incx,
|
||||||
|
cl_mem my, size_t offy, int incy,
|
||||||
|
cl_uint numCommandQueues, cl_command_queue *commandQueues,
|
||||||
|
cl_uint numEventsInWaitList, const cl_event *eventWaitList,
|
||||||
|
cl_event *events)
|
||||||
|
{
|
||||||
|
is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
|
||||||
|
clRetainMemObject(mx);
|
||||||
|
is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
|
||||||
|
clRetainMemObject(my);
|
||||||
|
execute(is::detail::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
return clblasSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
clblasStatus clblasSscal(size_t N, cl_float alpha,
|
||||||
|
cl_mem mx, size_t offx, int incx,
|
||||||
|
cl_uint numCommandQueues, cl_command_queue *commandQueues,
|
||||||
|
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
|
||||||
|
{
|
||||||
|
is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
|
||||||
|
clRetainMemObject(mx);
|
||||||
|
execute(is::detail::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
return clblasSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
clblasStatus clblasScopy(size_t N,
|
||||||
|
const cl_mem mx, size_t offx, int incx,
|
||||||
|
cl_mem my, size_t offy, int incy,
|
||||||
|
cl_uint numCommandQueues, cl_command_queue *commandQueues,
|
||||||
|
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
|
||||||
|
{
|
||||||
|
const is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
|
||||||
|
clRetainMemObject(mx);
|
||||||
|
is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
|
||||||
|
clRetainMemObject(my);
|
||||||
|
execute(is::detail::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
return clblasSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
clblasStatus clblasSdot(size_t N, cl_mem dotProduct, size_t offDP,
|
||||||
|
const cl_mem mx, size_t offx, int incx,
|
||||||
|
const cl_mem my, size_t offy, int incy,
|
||||||
|
cl_mem /*scratchBuff*/, cl_uint numCommandQueues,
|
||||||
|
cl_command_queue *commandQueues, cl_uint numEventsInWaitList,
|
||||||
|
const cl_event *eventWaitList, cl_event *events)
|
||||||
|
{
|
||||||
|
is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
|
||||||
|
clRetainMemObject(mx);
|
||||||
|
is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
|
||||||
|
clRetainMemObject(my);
|
||||||
|
is::scalar s(is::FLOAT_TYPE, cl::Buffer(dotProduct), offDP);
|
||||||
|
clRetainMemObject(dotProduct);
|
||||||
|
execute(is::detail::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
return clblasSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
clblasStatus clblasSasum(size_t N, cl_mem asum, size_t offAsum,
|
||||||
|
const cl_mem mx, size_t offx, int incx,
|
||||||
|
cl_mem /*scratchBuff*/, cl_uint numCommandQueues, cl_command_queue *commandQueues,
|
||||||
|
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
|
||||||
|
{
|
||||||
|
is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
|
||||||
|
clRetainMemObject(mx);
|
||||||
|
is::scalar s(is::FLOAT_TYPE, cl::Buffer(asum), offAsum);
|
||||||
|
clRetainMemObject(asum);
|
||||||
|
execute(is::detail::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
return clblasSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
//*****************
|
||||||
|
//BLAS2
|
||||||
|
//*****************
|
||||||
|
clblasStatus clblasSgemv(clblasOrder order, clblasTranspose transA,
|
||||||
|
size_t M, size_t N,
|
||||||
|
cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
|
||||||
|
const cl_mem mx, size_t offx, int incx,
|
||||||
|
cl_float beta, cl_mem my, size_t offy, int incy,
|
||||||
|
cl_uint numCommandQueues, cl_command_queue *commandQueues,
|
||||||
|
cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
|
||||||
|
{
|
||||||
|
//A
|
||||||
|
is::int_t As1 = M, As2 = N;
|
||||||
|
if(transA==clblasTrans) std::swap(As1, As2);
|
||||||
|
is::array A(As1, As2, is::FLOAT_TYPE, cl::Buffer(mA), offA, lda);
|
||||||
|
clRetainMemObject(mA);
|
||||||
|
//x
|
||||||
|
is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
|
||||||
|
clRetainMemObject(mx);
|
||||||
|
//y
|
||||||
|
is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
|
||||||
|
clRetainMemObject(my);
|
||||||
|
//Operation
|
||||||
|
is::driver::Context const & context = A.context();
|
||||||
|
if(transA==clblasTrans)
|
||||||
|
execute(is::detail::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
else
|
||||||
|
execute(is::detail::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
return clblasSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
//*****************
|
||||||
|
//BLAS3
|
||||||
|
//*****************
|
||||||
clblasStatus clblasSgemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,
|
clblasStatus clblasSgemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,
|
||||||
size_t M, size_t N, size_t K,
|
size_t M, size_t N, size_t K,
|
||||||
cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
|
cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
|
||||||
@@ -28,42 +159,26 @@ extern "C"
|
|||||||
{
|
{
|
||||||
is::int_t As1 = M, As2 = K;
|
is::int_t As1 = M, As2 = K;
|
||||||
is::int_t Bs1 = K, Bs2 = N;
|
is::int_t Bs1 = K, Bs2 = N;
|
||||||
|
//Trans
|
||||||
if(transA==clblasTrans) std::swap(As1, As2);
|
if(transA==clblasTrans) std::swap(As1, As2);
|
||||||
if(transB==clblasTrans) std::swap(Bs1, Bs2);
|
if(transB==clblasTrans) std::swap(Bs1, Bs2);
|
||||||
|
//Struct
|
||||||
is::array A(is::FLOAT_TYPE, cl::Buffer(mA), is::_(offA%lda, As1, 1), is::_(offA/lda, As2, 1), lda);
|
is::array A(As1, As2, is::FLOAT_TYPE, cl::Buffer(mA), offA, lda);
|
||||||
is::array B(is::FLOAT_TYPE, cl::Buffer(mB), is::_(offB%ldb, Bs1, 1), is::_(offB/ldb, Bs2, 1), ldb);
|
clRetainMemObject(mA);
|
||||||
is::array C(is::FLOAT_TYPE, cl::Buffer(mC), is::_(offC%ldc, M, 1), is::_(offC/ldc, N, 1), ldc);
|
is::array B(Bs1, Bs2, is::FLOAT_TYPE, cl::Buffer(mB), offB, ldb);
|
||||||
|
clRetainMemObject(mB);
|
||||||
std::vector<is::driver::Event> waitlist;
|
is::array C(M, N, is::FLOAT_TYPE, cl::Buffer(mC), offC, ldc);
|
||||||
for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i)
|
clRetainMemObject(mC);
|
||||||
waitlist.push_back(cl::Event(eventWaitList[i]));
|
is::driver::Context const & context = C.context();
|
||||||
|
//Operation
|
||||||
std::list<is::driver::Event> levents;
|
if(transA==clblasTrans && transB==clblasTrans)
|
||||||
|
execute(is::detail::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
for(cl_uint i = 0 ; i < numCommandQueues ; ++i)
|
else if(transA==clblasTrans && transB==clblasNoTrans)
|
||||||
{
|
execute(is::detail::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
clRetainCommandQueue(commandQueues[i]);
|
else if(transA==clblasNoTrans && transB==clblasTrans)
|
||||||
cl::CommandQueue clqueue(commandQueues[i]);
|
execute(is::detail::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
is::driver::CommandQueue queue(clqueue);
|
else
|
||||||
|
execute(is::detail::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
|
||||||
|
|
||||||
is::execution_options_type opt(queue, &levents, &waitlist);
|
|
||||||
|
|
||||||
if(transA==clblasTrans && transB==clblasTrans)
|
|
||||||
C = is::control(alpha*dot(A.T(), B.T()) + beta*C, opt);
|
|
||||||
else if(transA==clblasTrans && transB==clblasNoTrans)
|
|
||||||
C = is::control(alpha*dot(A.T(), B) + beta*C, opt);
|
|
||||||
else if(transA==clblasNoTrans && transB==clblasTrans)
|
|
||||||
C = is::control(alpha*dot(A, B.T()) + beta*C, opt);
|
|
||||||
else
|
|
||||||
C = is::control(alpha*dot(A, B) + beta*C, opt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(events)
|
|
||||||
*events = static_cast<cl::Event>(levents.front())();
|
|
||||||
std::cout << events << std::endl;
|
|
||||||
|
|
||||||
return clblasSuccess;
|
return clblasSuccess;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user