2015-04-29 15:50:57 -04:00
|
|
|
#include "isaac/array.h"
|
|
|
|
#include "isaac/symbolic/execute.h"
|
|
|
|
#ifdef BENCH_CLBLAS
|
2015-06-24 07:51:27 -07:00
|
|
|
#include "isaac/wrap/clBLAS.h"
|
2015-01-27 15:32:59 -05:00
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
#include "cblas.h"
|
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
#include <cublas.h>
|
|
|
|
#endif
|
2014-10-27 05:35:04 -04:00
|
|
|
#include <iomanip>
|
|
|
|
#include <stdlib.h>
|
2015-01-12 13:20:53 -05:00
|
|
|
#include <cmath>
|
2015-02-08 23:19:38 -05:00
|
|
|
#include <numeric>
|
2015-04-29 15:50:57 -04:00
|
|
|
#include <regex>
|
2014-11-06 07:07:27 -05:00
|
|
|
|
2015-08-13 17:19:07 -07:00
|
|
|
#include "common.hpp"
|
2015-08-06 12:05:12 -07:00
|
|
|
|
|
|
|
|
2015-08-12 19:38:53 -07:00
|
|
|
namespace sc = isaac;
|
|
|
|
typedef sc::int_t int_t;
|
2014-10-27 05:35:04 -04:00
|
|
|
|
2015-08-20 21:24:41 -04:00
|
|
|
static long time_event(long sum, sc::driver::Event const & e)
|
2015-02-08 23:19:38 -05:00
|
|
|
{
|
2015-08-20 21:24:41 -04:00
|
|
|
return sum + e.elapsed_time();
|
2015-02-08 23:19:38 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
template<class T>
|
2015-08-12 19:38:53 -07:00
|
|
|
void bench(sc::numeric_type dtype, std::string operation)
|
2015-04-29 15:50:57 -04:00
|
|
|
{
|
2014-10-29 17:03:24 +01:00
|
|
|
|
2015-04-29 15:50:57 -04:00
|
|
|
//
|
|
|
|
// MACROS FOR BENCHMARKING
|
|
|
|
//
|
2015-07-27 11:37:19 -07:00
|
|
|
#define CL_HANDLE(X) X.handle().cl()
|
2015-07-23 09:39:13 -07:00
|
|
|
|
2015-04-29 15:50:57 -04:00
|
|
|
#define BENCHMARK_ISAAC(OP, PERF) \
|
2015-02-08 00:56:24 -05:00
|
|
|
{\
|
2015-08-05 12:47:20 -07:00
|
|
|
std::vector<double> times;\
|
2015-02-08 00:56:24 -05:00
|
|
|
double total_time = 0;\
|
2015-08-17 10:31:58 -07:00
|
|
|
while(total_time*1e-9 < 1e-2){\
|
2015-08-12 19:38:53 -07:00
|
|
|
std::list<sc::driver::Event> events;\
|
2015-08-05 09:24:10 -07:00
|
|
|
queue.synchronize();\
|
2015-02-08 00:56:24 -05:00
|
|
|
OP;\
|
2015-04-29 15:50:57 -04:00
|
|
|
queue.synchronize();\
|
2015-08-05 12:47:20 -07:00
|
|
|
times.push_back((double)std::accumulate(events.begin(), events.end(), 0, &time_event));\
|
2015-02-08 00:56:24 -05:00
|
|
|
total_time+=times.back();\
|
|
|
|
}\
|
2015-08-17 10:31:58 -07:00
|
|
|
double t = mean(times);\
|
2015-08-13 17:19:07 -07:00
|
|
|
std::cout << " " << (int)(PERF) << std::flush;\
|
2015-02-08 00:56:24 -05:00
|
|
|
}
|
|
|
|
|
2015-04-29 15:50:57 -04:00
|
|
|
#define BENCHMARK_CLBLAS(OP, PERF) \
|
2015-01-24 14:51:48 -05:00
|
|
|
{\
|
2015-02-05 23:11:16 -05:00
|
|
|
std::vector<long> times;\
|
|
|
|
double total_time = 0;\
|
2015-08-17 10:31:58 -07:00
|
|
|
while(total_time*1e-9 < 1e-2){\
|
2015-07-27 11:37:19 -07:00
|
|
|
cl_event event;\
|
2015-08-03 17:37:19 -07:00
|
|
|
queue.synchronize();\
|
2014-10-29 17:03:24 +01:00
|
|
|
OP;\
|
2015-04-29 15:50:57 -04:00
|
|
|
queue.synchronize();\
|
2015-08-12 19:38:53 -07:00
|
|
|
times.push_back(sc::driver::Event(event).elapsed_time());\
|
2015-02-05 23:11:16 -05:00
|
|
|
total_time+=times.back();\
|
2014-10-29 17:03:24 +01:00
|
|
|
}\
|
2015-08-17 10:31:58 -07:00
|
|
|
double t = mean(times);\
|
2015-08-13 17:19:07 -07:00
|
|
|
std::cout << " " << (int)(PERF) << std::flush;\
|
2015-02-05 23:11:16 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
#define BENCHMARK_HOST(OP, PERF) \
|
|
|
|
{\
|
2015-08-13 17:19:07 -07:00
|
|
|
Timer tmr;\
|
|
|
|
long total_time = 0;\
|
|
|
|
std::vector<long> times;\
|
2015-08-17 10:31:58 -07:00
|
|
|
while(total_time*1e-9 < 1e-2){\
|
2015-06-28 17:53:16 -07:00
|
|
|
tmr.start();\
|
|
|
|
OP;\
|
2015-08-13 17:19:07 -07:00
|
|
|
long time = tmr.get().count();\
|
2015-06-28 17:53:16 -07:00
|
|
|
times.push_back(time);\
|
|
|
|
total_time += time;\
|
|
|
|
}\
|
2015-08-17 10:31:58 -07:00
|
|
|
double t = mean(times);\
|
2015-08-13 17:19:07 -07:00
|
|
|
std::cout << " " << (int)(PERF) << std::flush;\
|
2015-02-05 23:11:16 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
#define BENCHMARK_CUDA(OP, PERF) \
|
|
|
|
{\
|
|
|
|
std::vector<long> times;\
|
|
|
|
double total_time = 0;\
|
2015-02-05 23:42:31 -05:00
|
|
|
float time;\
|
|
|
|
cudaEvent_t start, stop;\
|
|
|
|
cudaEventCreate(&start);\
|
|
|
|
cudaEventCreate(&stop);\
|
2015-06-28 17:53:16 -07:00
|
|
|
OP;\
|
|
|
|
cudaThreadSynchronize();\
|
2015-08-21 13:06:20 -04:00
|
|
|
while(total_time*1e-3 < 1e-2){\
|
2015-08-12 19:38:53 -07:00
|
|
|
flush = sc::zeros(1e6, 1, dtype);\
|
2015-02-05 23:42:31 -05:00
|
|
|
cudaEventRecord(start,0);\
|
2015-02-05 23:11:16 -05:00
|
|
|
OP;\
|
2015-02-05 23:42:31 -05:00
|
|
|
cudaEventRecord(stop,0);\
|
|
|
|
cudaEventSynchronize(stop);\
|
2015-02-05 23:11:16 -05:00
|
|
|
cudaEventElapsedTime(&time, start, stop);\
|
2015-02-05 23:42:31 -05:00
|
|
|
times.push_back(time*1e6);\
|
2015-02-05 23:11:16 -05:00
|
|
|
total_time+=time;\
|
|
|
|
}\
|
2015-02-05 23:42:31 -05:00
|
|
|
double t = median(times);\
|
2015-08-13 17:19:07 -07:00
|
|
|
std::cout << "\t" << (int)(PERF) << std::flush;\
|
2015-01-24 14:51:48 -05:00
|
|
|
}
|
2014-10-27 05:35:04 -04:00
|
|
|
|
2015-08-12 19:38:53 -07:00
|
|
|
unsigned int dtsize = sc::size_of(dtype);
|
|
|
|
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
|
2015-04-29 15:50:57 -04:00
|
|
|
std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::array flush((int)1e6, sc::FLOAT_TYPE);
|
2015-04-29 15:50:57 -04:00
|
|
|
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
|
|
|
|
std::cout << "N";
|
2015-06-25 08:12:16 -07:00
|
|
|
std::cout << "\tISAAC";
|
2015-04-29 15:50:57 -04:00
|
|
|
#ifdef BENCH_CLBLAS
|
|
|
|
std::cout << "\tclBLAS";
|
2015-02-08 23:19:38 -05:00
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CBLAS
|
2015-04-29 15:50:57 -04:00
|
|
|
std::cout << "\tBLAS";
|
2015-02-08 23:19:38 -05:00
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
2015-04-29 15:50:57 -04:00
|
|
|
std::cout << "\tcuBLAS";
|
2015-02-08 23:19:38 -05:00
|
|
|
#endif
|
2015-04-29 15:50:57 -04:00
|
|
|
std::cout << std::endl;
|
|
|
|
//
|
|
|
|
// RUN BENCHMARKS
|
|
|
|
//
|
2015-02-08 23:19:38 -05:00
|
|
|
|
2015-06-25 08:12:16 -07:00
|
|
|
/*---------*/
|
|
|
|
/*--BLAS1--*/
|
|
|
|
/*---------*/
|
2015-02-09 01:58:32 -05:00
|
|
|
|
2015-06-25 08:12:16 -07:00
|
|
|
if(operation=="axpy")
|
|
|
|
{
|
|
|
|
float alpha = 1;
|
2015-08-17 10:31:58 -07:00
|
|
|
for(int_t N: create_log_range((int)1e3, (int)1e8, 50, 64))
|
2015-06-25 08:12:16 -07:00
|
|
|
{
|
|
|
|
std::cout << N;
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::array x(N, dtype), y(N, dtype);
|
2015-06-25 08:12:16 -07:00
|
|
|
/* ISAAC */
|
2015-08-12 19:38:53 -07:00
|
|
|
std::list<sc::driver::Event> events;
|
|
|
|
BENCHMARK_ISAAC(y = sc::control(x + alpha*y, sc::execution_options_type(0, &events)), 3*N*dtsize/t)
|
2015-06-25 08:12:16 -07:00
|
|
|
/* clblas */
|
|
|
|
#ifdef BENCH_CLBLAS
|
2015-08-26 14:12:50 -04:00
|
|
|
if(A.context().backend()==sc::driver::OPENCL)
|
|
|
|
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event), 3*N*dtsize/t);
|
2015-06-25 08:12:16 -07:00
|
|
|
#endif
|
|
|
|
/* BLAS */
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
std::vector<float> cx(N), cy(N);
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::copy(x, cx);
|
|
|
|
sc::copy(y, cy);
|
2015-06-25 08:12:16 -07:00
|
|
|
BENCHMARK_HOST(cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
|
|
|
|
#endif
|
|
|
|
/* CuBLAS */
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
T *cux, *cuy;
|
|
|
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuy, N * sizeof(T));
|
|
|
|
BENCHMARK_CUDA(cublasSaxpy(N, alpha, cux, 1, cuy, 1), 3*N*dtsize/t)
|
|
|
|
cudaFree(cux);
|
|
|
|
cudaFree(cuy);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
}
|
2015-01-28 22:07:09 -05:00
|
|
|
|
2015-06-25 08:12:16 -07:00
|
|
|
if(operation=="dot")
|
|
|
|
{
|
2015-08-17 10:31:58 -07:00
|
|
|
for(int_t N: create_log_range((int)1e3, (int)1e8, 50, 64))
|
2015-06-25 08:12:16 -07:00
|
|
|
{
|
|
|
|
std::cout << N;
|
|
|
|
/* ISAAC */
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::array x(N, dtype), y(N, dtype);
|
|
|
|
sc::array scratch(N, dtype);
|
|
|
|
sc::scalar s(dtype);
|
2015-06-25 08:12:16 -07:00
|
|
|
s = dot(x,y); queue.synchronize();
|
2015-08-12 19:38:53 -07:00
|
|
|
BENCHMARK_ISAAC(s = sc::control(dot(x,y), sc::execution_options_type(0, &events)), 2*N*dtsize/t)
|
2015-06-25 08:12:16 -07:00
|
|
|
/* clblas */
|
|
|
|
#ifdef BENCH_CLBLAS
|
2015-07-27 11:37:19 -07:00
|
|
|
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event), 2*N*dtsize/t)
|
2015-06-25 08:12:16 -07:00
|
|
|
#endif
|
|
|
|
/* BLAS */
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
std::vector<float> cx(N), cy(N);
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::copy(x, cx);
|
|
|
|
sc::copy(y, cy);
|
2015-06-25 08:12:16 -07:00
|
|
|
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
|
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
T *cux, *cuy;
|
|
|
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuy, N * sizeof(T));
|
|
|
|
BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
|
|
|
|
cudaFree(cux);
|
|
|
|
cudaFree(cuy);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
}
|
2015-04-29 15:50:57 -04:00
|
|
|
|
2015-06-25 08:12:16 -07:00
|
|
|
if(operation.substr(0, 4)=="gemv")
|
|
|
|
{
|
2015-08-10 09:35:38 -07:00
|
|
|
std::vector<std::tuple<char,int_t, int_t> > MNs;
|
|
|
|
MNs.push_back(std::make_tuple('N',896,896));
|
|
|
|
MNs.push_back(std::make_tuple('N',3072,3072));
|
|
|
|
//AlexNet
|
|
|
|
MNs.push_back(std::make_tuple('N',1000,256));
|
|
|
|
MNs.push_back(std::make_tuple('N',4096,256));
|
2015-08-21 13:06:20 -04:00
|
|
|
|
2015-08-10 09:35:38 -07:00
|
|
|
MNs.push_back(std::make_tuple('T',169,256));
|
|
|
|
MNs.push_back(std::make_tuple('T',169,384));
|
|
|
|
MNs.push_back(std::make_tuple('T',729,256));
|
|
|
|
MNs.push_back(std::make_tuple('T',3025,96));
|
2015-04-29 15:50:57 -04:00
|
|
|
|
2015-06-25 08:12:16 -07:00
|
|
|
/*---------*/
|
|
|
|
/*--BLAS2--*/
|
|
|
|
/*---------*/
|
|
|
|
//T-layout
|
2015-08-10 09:35:38 -07:00
|
|
|
for(std::tuple<char, int_t, int_t> MN: MNs)
|
2015-06-25 08:12:16 -07:00
|
|
|
{
|
2015-08-10 09:35:38 -07:00
|
|
|
bool AT = std::get<0>(MN) == 'T';
|
|
|
|
int_t M = std::get<1>(MN);
|
|
|
|
int_t N = std::get<2>(MN);
|
|
|
|
std::cout << MN << ",";
|
|
|
|
int_t As1 = M, As2 = N;
|
|
|
|
if(AT) std::swap(As1, As2);
|
|
|
|
|
2015-06-25 08:12:16 -07:00
|
|
|
/* ISAAC */
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::array A(As1, As2, dtype), y(M, dtype), x(N, dtype);
|
2015-08-05 12:07:51 -07:00
|
|
|
#ifdef HAS_A_BLAS
|
2015-06-25 08:12:16 -07:00
|
|
|
int_t lda = A.ld();
|
|
|
|
#endif
|
2015-08-12 19:38:53 -07:00
|
|
|
BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
|
2015-06-25 08:12:16 -07:00
|
|
|
#ifdef BENCH_CLBLAS
|
2015-08-26 14:12:50 -04:00
|
|
|
if(A.context().backend()==sc::driver::OPENCL)
|
|
|
|
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t)
|
2015-06-25 08:12:16 -07:00
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CBLAS
|
2015-08-10 09:35:38 -07:00
|
|
|
std::vector<float> cA(M*N), cx(N), cy(M);
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::copy(x, cx);
|
|
|
|
sc::copy(y, cy);
|
|
|
|
sc::copy(A, cA);
|
2015-08-10 09:35:38 -07:00
|
|
|
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
|
2015-06-25 08:12:16 -07:00
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
T *cuA, *cux, *cuy;
|
|
|
|
cudaMalloc((void**) &cuA, N * M * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuy, M * sizeof(T));
|
2015-08-10 09:35:38 -07:00
|
|
|
BENCHMARK_CUDA(cublasSgemv(AT?'t':'n', As1, As2, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
|
2015-06-25 08:12:16 -07:00
|
|
|
cudaFree(cuA);
|
|
|
|
cudaFree(cux);
|
|
|
|
cudaFree(cuy);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
}
|
2015-04-29 15:50:57 -04:00
|
|
|
|
|
|
|
if(operation.substr(0,4)=="gemm")
|
|
|
|
{
|
2015-08-17 10:31:58 -07:00
|
|
|
std::vector<std::tuple<std::string, char, char, int_t, int_t, int_t> > MNKs;
|
2015-08-21 13:06:20 -04:00
|
|
|
//Square
|
|
|
|
MNKs.push_back(std::make_tuple("Square [N=896]",'N','T',896,896,896));
|
|
|
|
MNKs.push_back(std::make_tuple("Square [N=2560]",'N','T',2560,2560,2560));
|
|
|
|
|
|
|
|
//Convolution
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution [AlexNet-1]",'N','N',3025,96,363));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution [AlexNet-2]",'N','N',729,128,1200));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution [AlexNet-3]",'N','N',169,384,2304));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution [AlexNet-4]",'N','N',169,192,1728));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution [AlexNet-5]",'N','N',169,128,1728));
|
|
|
|
// MNKs.push_back(std::make_tuple("Convolution [LeNet-1],'N','N',576,20,25));
|
|
|
|
// MNKs.push_back(std::make_tuple("Convolution [LeNet-2]",'N','N',64,50,500));
|
2015-08-10 10:19:50 -07:00
|
|
|
|
2015-08-21 13:06:20 -04:00
|
|
|
//Convolution Gradient-1
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-5]",'T','N',1728,128,169));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-4]",'T','N',1728,192,169));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-3]",'T','N',2304,384,169));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-2]",'T','N',1200,128,729));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-1 [AlexNet-1]",'T','N',363,96,3025));
|
|
|
|
// MNKs.push_back(std::make_tuple("Conv. Gradient-1 [LeNet-2]",'T','N',500,50,64));
|
|
|
|
// MNKs.push_back(std::make_tuple("Conv. Gradient-1 [LeNet-1]",'T','N',25,20,576));
|
2015-08-18 16:36:41 -07:00
|
|
|
|
2015-08-21 13:06:20 -04:00
|
|
|
//Convolution Gradient-2
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-5]",'N','T',169,1728,128));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-4]",'N','T',169,1728,192));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-3]",'N','T',169,2304,384));
|
|
|
|
MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-2]",'N','T',729,1200,128));
|
|
|
|
// MNKs.push_back(std::make_tuple("Conv. Gradient-2 [LeNet-2]",'N','T',64,500,50));
|
2015-08-17 10:31:58 -07:00
|
|
|
|
2015-08-21 13:06:20 -04:00
|
|
|
//Covariance (e.g., ICA, 10minutes/1khz)
|
|
|
|
MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,600000));
|
|
|
|
MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,600000));
|
2015-07-14 13:33:23 -04:00
|
|
|
|
2015-08-21 13:06:20 -04:00
|
|
|
//Bi-diagonalization
|
|
|
|
MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 1]",'N','T',4096,4096,32));
|
|
|
|
MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 10]",'N','T',3456,3456,32));
|
|
|
|
MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 50]",'N','T',896,896,32));
|
2015-04-29 15:50:57 -04:00
|
|
|
|
|
|
|
/*---------*/
|
|
|
|
/*--BLAS3--*/
|
|
|
|
/*---------*/
|
2015-08-17 10:31:58 -07:00
|
|
|
for(std::tuple<std::string, char, char, int_t, int_t, int_t> MNK: MNKs)
|
2015-04-29 15:50:57 -04:00
|
|
|
{
|
2015-08-17 10:31:58 -07:00
|
|
|
bool AT = std::get<1>(MNK)=='T';
|
|
|
|
bool BT = std::get<2>(MNK)=='T';
|
|
|
|
int_t M = std::get<3>(MNK);
|
|
|
|
int_t N = std::get<4>(MNK);
|
|
|
|
int_t K = std::get<5>(MNK);
|
|
|
|
std::cout << "\"" << std::get<0>(MNK) << "\"";
|
2015-06-28 17:53:16 -07:00
|
|
|
std::cout << std::flush;
|
2015-04-29 15:50:57 -04:00
|
|
|
/* ISAAC */
|
2015-07-14 13:33:23 -04:00
|
|
|
int_t As1 = M, As2 = K;
|
|
|
|
if(AT) std::swap(As1, As2);
|
|
|
|
int_t Bs1 = K, Bs2 = N;
|
|
|
|
if(BT) std::swap(Bs1, Bs2);
|
|
|
|
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::array C(M, N, dtype), A(As1, As2, dtype), B(Bs1, Bs2, dtype);
|
2015-08-05 12:07:51 -07:00
|
|
|
#ifdef HAS_A_BLAS
|
2015-04-29 15:50:57 -04:00
|
|
|
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
|
2015-04-29 16:11:32 -04:00
|
|
|
#endif
|
2015-08-21 13:06:20 -04:00
|
|
|
BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t);
|
2015-04-29 15:50:57 -04:00
|
|
|
/* clblas */
|
|
|
|
#ifdef BENCH_CLBLAS
|
2015-08-26 14:12:50 -04:00
|
|
|
if(A.context().backend()==sc::driver::OPENCL)
|
|
|
|
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
|
|
|
|
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, &event), (double)2*M*N*K/t)
|
2015-04-29 15:50:57 -04:00
|
|
|
#endif
|
|
|
|
/* BLAS */
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
std::vector<float> cC(M*N), cA(M*K), cB(N*K);
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::copy(C, cC);
|
|
|
|
sc::copy(A, cA);
|
|
|
|
sc::copy(B, cB);
|
2015-08-13 17:19:07 -07:00
|
|
|
BENCHMARK_HOST(cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc), (double)2*M*N*K/t);
|
2015-04-29 15:50:57 -04:00
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
T *cuA, *cuB, *cuC;
|
|
|
|
cudaMalloc((void**) &cuA, M * K * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuB, K * N * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuC, M * N * sizeof(T));
|
2015-08-21 13:06:20 -04:00
|
|
|
BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, cuA, lda, cuB, ldb, 1, cuC, ldc), (double)2*M*N*K/t)
|
2015-04-29 15:50:57 -04:00
|
|
|
cudaFree(cuA);
|
|
|
|
cudaFree(cuB);
|
|
|
|
cudaFree(cuC);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
}
|
2015-01-25 18:19:19 -05:00
|
|
|
|
2014-10-27 05:35:04 -04:00
|
|
|
}
|
|
|
|
|
2014-10-29 17:03:24 +01:00
|
|
|
int main(int argc, char* argv[])
|
2014-10-27 05:35:04 -04:00
|
|
|
{
|
2015-04-29 15:50:57 -04:00
|
|
|
std::vector<std::string> args(argv, argv + argc);
|
|
|
|
#ifdef BENCH_CLBLAS
|
|
|
|
clblasSetup();
|
2015-01-24 14:51:48 -05:00
|
|
|
#endif
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::driver::backend::default_queue_properties = CL_QUEUE_PROFILING_ENABLE;
|
2015-02-05 04:42:57 -05:00
|
|
|
|
2015-01-24 14:51:48 -05:00
|
|
|
int device_idx = 0;
|
2015-08-12 19:38:53 -07:00
|
|
|
std::list<sc::driver::Context const *> contexts;
|
|
|
|
sc::driver::backend::contexts::get(contexts);
|
2015-02-04 22:06:15 -05:00
|
|
|
|
2015-04-29 15:50:57 -04:00
|
|
|
std::string operation;
|
2015-07-31 01:02:59 -07:00
|
|
|
if(contexts.size() > 1)
|
2015-04-29 15:50:57 -04:00
|
|
|
{
|
|
|
|
if(args.size() != 3)
|
2015-01-24 14:51:48 -05:00
|
|
|
{
|
2015-04-29 15:50:57 -04:00
|
|
|
std::cerr << "usage : blas-bench DEVICE_IDX OPERATION" << std::endl;
|
2015-01-24 14:51:48 -05:00
|
|
|
std::cout << "Devices available: " << std::endl;
|
|
|
|
unsigned int current=0;
|
2015-08-12 19:38:53 -07:00
|
|
|
for(sc::driver::Context const * context: contexts)
|
2015-07-31 15:47:41 -07:00
|
|
|
{
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::driver::Device device = sc::driver::backend::queues::get(*context,0).device();
|
2015-07-31 15:47:41 -07:00
|
|
|
std::cout << current++ << ": " << device.name() << " on " << device.platform().name() << " " << device.platform().version() << std::endl;
|
|
|
|
}
|
2015-01-24 14:51:48 -05:00
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
2015-04-29 15:50:57 -04:00
|
|
|
device_idx = atoi(argv[1]);
|
|
|
|
operation = args[2];
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if(args.size() != 2)
|
|
|
|
{
|
|
|
|
std::cerr << "usage : blas-bench OPERATION" << std::endl;
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
operation = args[1];
|
2015-01-24 14:51:48 -05:00
|
|
|
}
|
|
|
|
|
2015-08-12 19:38:53 -07:00
|
|
|
sc::driver::backend::default_device = device_idx;
|
2014-10-30 13:04:33 -04:00
|
|
|
std::cout << "#Benchmark : BLAS" << std::endl;
|
|
|
|
std::cout << "#----------------" << std::endl;
|
2015-08-12 19:38:53 -07:00
|
|
|
bench<float>(sc::FLOAT_TYPE, operation);
|
2015-01-24 14:51:48 -05:00
|
|
|
|
2015-04-29 15:50:57 -04:00
|
|
|
#ifdef BENCH_CLBLAS
|
|
|
|
clblasTeardown();
|
2015-01-24 14:51:48 -05:00
|
|
|
#endif
|
2014-10-27 05:35:04 -04:00
|
|
|
}
|