C++: More clBLAS routines
This commit is contained in:
262
bench/blas.cpp
262
bench/blas.cpp
@@ -99,6 +99,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
||||
double total_time = 0;\
|
||||
while(total_time*1e-9 < 1e-3){\
|
||||
cl::Event event;\
|
||||
flush = ad::zeros(1e6, 1, dtype);\
|
||||
OP;\
|
||||
queue.synchronize();\
|
||||
times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
|
||||
@@ -146,8 +147,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
||||
ad::array flush(1e6, dtype);
|
||||
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
|
||||
std::cout << "N";
|
||||
std::cout << "\tISAAC (predictive)";
|
||||
std::cout << "\tISAAC (optimal)";
|
||||
std::cout << "\tISAAC";
|
||||
#ifdef BENCH_CLBLAS
|
||||
std::cout << "\tclBLAS";
|
||||
#endif
|
||||
@@ -162,142 +162,143 @@ void bench(ad::numeric_type dtype, std::string operation)
|
||||
// RUN BENCHMARKS
|
||||
//
|
||||
|
||||
// /*---------*/
|
||||
// /*--BLAS1--*/
|
||||
// /*---------*/
|
||||
/*---------*/
|
||||
/*--BLAS1--*/
|
||||
/*---------*/
|
||||
|
||||
// if(operation=="axpy")
|
||||
// {
|
||||
// for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
||||
// {
|
||||
// std::cout << N;
|
||||
// ad::array x(N, dtype), y(N, dtype);
|
||||
// /* ISAAC */
|
||||
// std::list<ad::driver::Event> events;\
|
||||
// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
|
||||
// BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
|
||||
// /* clblas */
|
||||
// #ifdef BENCH_CLBLAS
|
||||
// BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
|
||||
// #endif
|
||||
// /* BLAS */
|
||||
// #ifdef BENCH_CBLAS
|
||||
// std::vector<float> cx(N), cy(N);
|
||||
// ad::copy(x, cx);
|
||||
// ad::copy(y, cy);
|
||||
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
|
||||
// #endif
|
||||
// /* CuBLAS */
|
||||
// #ifdef BENCH_CUBLAS
|
||||
// T *cux, *cuy;
|
||||
// cudaMalloc((void**) &cux, N * sizeof(T));
|
||||
// cudaMalloc((void**) &cuy, N * sizeof(T));
|
||||
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
|
||||
// cudaFree(cux);
|
||||
// cudaFree(cuy);
|
||||
// #endif
|
||||
// std::cout << std::endl;
|
||||
// }
|
||||
// }
|
||||
if(operation=="axpy")
|
||||
{
|
||||
float alpha = 1;
|
||||
for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
||||
{
|
||||
std::cout << N;
|
||||
ad::array x(N, dtype), y(N, dtype);
|
||||
/* ISAAC */
|
||||
std::list<ad::driver::Event> events;\
|
||||
BENCHMARK_ISAAC(y = ad::control(x + alpha*y, ad::execution_options_type(0, &events)), 3*N*dtsize/t)
|
||||
/* clblas */
|
||||
#ifdef BENCH_CLBLAS
|
||||
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
|
||||
#endif
|
||||
/* BLAS */
|
||||
#ifdef BENCH_CBLAS
|
||||
std::vector<float> cx(N), cy(N);
|
||||
ad::copy(x, cx);
|
||||
ad::copy(y, cy);
|
||||
BENCHMARK_HOST(cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
|
||||
#endif
|
||||
/* CuBLAS */
|
||||
#ifdef BENCH_CUBLAS
|
||||
T *cux, *cuy;
|
||||
cudaMalloc((void**) &cux, N * sizeof(T));
|
||||
cudaMalloc((void**) &cuy, N * sizeof(T));
|
||||
BENCHMARK_CUDA(cublasSaxpy(N, alpha, cux, 1, cuy, 1), 3*N*dtsize/t)
|
||||
cudaFree(cux);
|
||||
cudaFree(cuy);
|
||||
#endif
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// if(operation=="dot")
|
||||
// {
|
||||
// for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
||||
// {
|
||||
// std::cout << N;
|
||||
// /* ISAAC */
|
||||
// ad::array x(N, dtype), y(N, dtype);
|
||||
// ad::array scratch(N, dtype);
|
||||
// ad::scalar s(dtype);
|
||||
// s = dot(x,y); queue.synchronize();
|
||||
// BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
|
||||
// /* clblas */
|
||||
// #ifdef BENCH_CLBLAS
|
||||
// BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
|
||||
// #endif
|
||||
// /* BLAS */
|
||||
// #ifdef BENCH_CBLAS
|
||||
// std::vector<float> cx(N), cy(N);
|
||||
// ad::copy(x, cx);
|
||||
// ad::copy(y, cy);
|
||||
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
|
||||
// #endif
|
||||
// #ifdef BENCH_CUBLAS
|
||||
// T *cux, *cuy;
|
||||
// T result;
|
||||
// cudaMalloc((void**) &cux, N * sizeof(T));
|
||||
// cudaMalloc((void**) &cuy, N * sizeof(T));
|
||||
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
|
||||
// cudaFree(cux);
|
||||
// cudaFree(cuy);
|
||||
// #endif
|
||||
// std::cout << std::endl;
|
||||
// }
|
||||
// std::cout << "\n\n" << std::flush;
|
||||
// }
|
||||
if(operation=="dot")
|
||||
{
|
||||
for(int_t N: create_log_range(1e3, 2e7, 50, 64))
|
||||
{
|
||||
std::cout << N;
|
||||
/* ISAAC */
|
||||
ad::array x(N, dtype), y(N, dtype);
|
||||
ad::array scratch(N, dtype);
|
||||
ad::scalar s(dtype);
|
||||
s = dot(x,y); queue.synchronize();
|
||||
BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events)), 2*N*dtsize/t)
|
||||
/* clblas */
|
||||
#ifdef BENCH_CLBLAS
|
||||
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
|
||||
#endif
|
||||
/* BLAS */
|
||||
#ifdef BENCH_CBLAS
|
||||
std::vector<float> cx(N), cy(N);
|
||||
ad::copy(x, cx);
|
||||
ad::copy(y, cy);
|
||||
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
|
||||
#endif
|
||||
#ifdef BENCH_CUBLAS
|
||||
T *cux, *cuy;
|
||||
T result;
|
||||
cudaMalloc((void**) &cux, N * sizeof(T));
|
||||
cudaMalloc((void**) &cuy, N * sizeof(T));
|
||||
BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
|
||||
cudaFree(cux);
|
||||
cudaFree(cuy);
|
||||
#endif
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << "\n\n" << std::flush;
|
||||
}
|
||||
|
||||
// if(operation.substr(0, 4)=="gemv")
|
||||
// {
|
||||
// std::vector<std::tuple<int_t, int_t> > MNs;
|
||||
// MNs.push_back(std::make_tuple(896,896));
|
||||
// MNs.push_back(std::make_tuple(3072,3072));
|
||||
// MNs.push_back(std::make_tuple(64,32000));
|
||||
// MNs.push_back(std::make_tuple(896,32000));
|
||||
// MNs.push_back(std::make_tuple(32000, 64));
|
||||
// MNs.push_back(std::make_tuple(32000, 896));
|
||||
if(operation.substr(0, 4)=="gemv")
|
||||
{
|
||||
std::vector<std::tuple<int_t, int_t> > MNs;
|
||||
MNs.push_back(std::make_tuple(896,896));
|
||||
MNs.push_back(std::make_tuple(3072,3072));
|
||||
MNs.push_back(std::make_tuple(64,32000));
|
||||
MNs.push_back(std::make_tuple(896,32000));
|
||||
MNs.push_back(std::make_tuple(32000, 64));
|
||||
MNs.push_back(std::make_tuple(32000, 896));
|
||||
|
||||
// /*---------*/
|
||||
// /*--BLAS2--*/
|
||||
// /*---------*/
|
||||
// //T-layout
|
||||
// for(std::tuple<int_t, int_t> MN: MNs)
|
||||
// {
|
||||
// int_t M = std::get<0>(MN);
|
||||
// int_t N = std::get<1>(MN);
|
||||
// std::cout << M << "," << N;
|
||||
// /* ISAAC */
|
||||
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
|
||||
// #if HAS_A_BLAS
|
||||
// int_t lda = A.ld();
|
||||
// #endif
|
||||
// y = dot(trans(A),x); queue.synchronize();
|
||||
// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t);
|
||||
// BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
|
||||
// #ifdef BENCH_CLBLAS
|
||||
// BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
|
||||
// #endif
|
||||
// #ifdef BENCH_CBLAS
|
||||
// std::vector<float> cA(N*M), cx(N), cy(M);
|
||||
// ad::copy(x, cx);
|
||||
// ad::copy(y, cy);
|
||||
// ad::copy(A, cA);
|
||||
// BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
|
||||
// #endif
|
||||
// #ifdef BENCH_CUBLAS
|
||||
// T *cuA, *cux, *cuy;
|
||||
// cudaMalloc((void**) &cuA, N * M * sizeof(T));
|
||||
// cudaMalloc((void**) &cux, N * sizeof(T));
|
||||
// cudaMalloc((void**) &cuy, M * sizeof(T));
|
||||
// BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
|
||||
// cudaFree(cuA);
|
||||
// cudaFree(cux);
|
||||
// cudaFree(cuy);
|
||||
// #endif
|
||||
// std::cout << std::endl;
|
||||
// }
|
||||
// std::cout << "\n\n" << std::flush;
|
||||
// }
|
||||
/*---------*/
|
||||
/*--BLAS2--*/
|
||||
/*---------*/
|
||||
//T-layout
|
||||
for(std::tuple<int_t, int_t> MN: MNs)
|
||||
{
|
||||
int_t M = std::get<0>(MN);
|
||||
int_t N = std::get<1>(MN);
|
||||
std::cout << M << "," << N;
|
||||
/* ISAAC */
|
||||
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
|
||||
#if HAS_A_BLAS
|
||||
int_t lda = A.ld();
|
||||
#endif
|
||||
y = dot(trans(A),x); queue.synchronize();
|
||||
BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
|
||||
#ifdef BENCH_CLBLAS
|
||||
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
|
||||
#endif
|
||||
#ifdef BENCH_CBLAS
|
||||
std::vector<float> cA(N*M), cx(N), cy(M);
|
||||
ad::copy(x, cx);
|
||||
ad::copy(y, cy);
|
||||
ad::copy(A, cA);
|
||||
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
|
||||
#endif
|
||||
#ifdef BENCH_CUBLAS
|
||||
T *cuA, *cux, *cuy;
|
||||
cudaMalloc((void**) &cuA, N * M * sizeof(T));
|
||||
cudaMalloc((void**) &cux, N * sizeof(T));
|
||||
cudaMalloc((void**) &cuy, M * sizeof(T));
|
||||
BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
|
||||
cudaFree(cuA);
|
||||
cudaFree(cux);
|
||||
cudaFree(cuy);
|
||||
#endif
|
||||
std::cout << std::endl;
|
||||
}
|
||||
std::cout << "\n\n" << std::flush;
|
||||
}
|
||||
|
||||
if(operation.substr(0,4)=="gemm")
|
||||
{
|
||||
std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
|
||||
MNKs.push_back(std::make_tuple(896,896,896));
|
||||
MNKs.push_back(std::make_tuple(3072,3072,3072));
|
||||
MNKs.push_back(std::make_tuple(1024,64,768));
|
||||
MNKs.push_back(std::make_tuple(768,64,128));
|
||||
MNKs.push_back(std::make_tuple(64,64,32000));
|
||||
MNKs.push_back(std::make_tuple(1024,1024,32000));
|
||||
// MNKs.push_back(std::make_tuple(896,896,896));
|
||||
// MNKs.push_back(std::make_tuple(3072,3072,3072));
|
||||
// MNKs.push_back(std::make_tuple(1024,64,768));
|
||||
// MNKs.push_back(std::make_tuple(768,64,128));
|
||||
// MNKs.push_back(std::make_tuple(64,64,32000));
|
||||
// MNKs.push_back(std::make_tuple(1024,1024,32000));
|
||||
|
||||
for(unsigned int N = 1 ; N <10 ; ++N)
|
||||
MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
|
||||
/*---------*/
|
||||
/*--BLAS3--*/
|
||||
/*---------*/
|
||||
@@ -312,8 +313,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
||||
#if HAS_A_BLAS
|
||||
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
|
||||
#endif
|
||||
// BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t);
|
||||
//BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
||||
BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events)), (double)2*M*N*K/t);
|
||||
/* clblas */
|
||||
#ifdef BENCH_CLBLAS
|
||||
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
|
||||
|
Reference in New Issue
Block a user