Files
triton/bench/blas.cpp

368 lines
12 KiB
C++
Raw Normal View History

#include "isaac/array.h"
#include "isaac/runtime/execute.h"
#ifdef BENCH_CLBLAS
2015-11-29 15:30:11 -05:00
#include "clBLAS.h"
2015-01-27 15:32:59 -05:00
#endif
#ifdef BENCH_MKL
#include "mkl_cblas.h"
#elif defined(BENCH_CBLAS)
2015-01-27 15:32:59 -05:00
#include "cblas.h"
#endif
#ifdef BENCH_CUBLAS
#include <cublas.h>
#endif
2014-10-27 05:35:04 -04:00
#include <iomanip>
#include <stdlib.h>
#include <cmath>
2015-02-08 23:19:38 -05:00
#include <numeric>
#include <regex>
2014-11-06 07:07:27 -05:00
2015-08-13 17:19:07 -07:00
#include "common.hpp"
namespace sc = isaac;
typedef sc::int_t int_t;
2014-10-27 05:35:04 -04:00
2015-02-08 23:19:38 -05:00
template<class T>
void bench(sc::numeric_type dtype, std::string operation)
{
2014-10-29 17:03:24 +01:00
Timer tmr;
//
// MACROS FOR BENCHMARKING
//
2015-07-27 11:37:19 -07:00
#define CL_HANDLE(X) X.handle().cl()
#define CU_HANDLE(X) X.handle().cu()
#define BENCHMARK_ISAAC(OP, PERF) \
{\
std::vector<long> times;\
double total_time = 0;\
OP;\
queue.synchronize();\
while(total_time*1e-9 < 1e-1){\
tmr.start();\
OP;\
queue.synchronize();\
times.push_back(tmr.get().count());\
total_time+=times.back();\
}\
double t = min(times);\
std::cout << "\t" << (int)PERF << std::flush;\
}
#define BENCHMARK_CLBLAS(OP, PERF) \
{\
2015-02-05 23:11:16 -05:00
std::vector<long> times;\
double total_time = 0;\
OP;\
queue.synchronize();\
while(total_time*1e-9 < 1e-1){\
tmr.start();\
2014-10-29 17:03:24 +01:00
OP;\
queue.synchronize();\
times.push_back(tmr.get().count());\
2015-02-05 23:11:16 -05:00
total_time+=times.back();\
2014-10-29 17:03:24 +01:00
}\
double t = min(times);\
std::cout << "\t" << PERF << std::flush;\
2015-02-05 23:11:16 -05:00
}
#define BENCHMARK_HOST(OP, PERF) \
{\
2015-08-13 17:19:07 -07:00
long total_time = 0;\
std::vector<long> times;\
OP;\
while(total_time*1e-9 < 1e-1){\
tmr.start();\
OP;\
2015-08-13 17:19:07 -07:00
long time = tmr.get().count();\
times.push_back(time);\
total_time += time;\
}\
double t = min(times);\
std::cout << "\t" << PERF << std::flush;\
2015-02-05 23:11:16 -05:00
}
#define BENCHMARK_CUDA(OP, PERF) \
{\
std::vector<long> times;\
double total_time = 0;\
2015-02-05 23:11:16 -05:00
OP;\
cudaDeviceSynchronize();\
while(total_time*1e-9 < 1e-1){\
tmr.start();\
OP;\
cudaDeviceSynchronize();\
times.push_back(tmr.get().count());\
total_time+=times.back();\
}\
double t = min(times);\
std::cout << "\t" << PERF << std::flush;\
}
2014-10-27 05:35:04 -04:00
unsigned int dtsize = sc::size_of(dtype);
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
sc::array flush((int)1e6, sc::FLOAT_TYPE);
//
// RUN BENCHMARKS
//
2015-02-08 23:19:38 -05:00
2015-06-25 08:12:16 -07:00
/*---------*/
/*--BLAS1--*/
/*---------*/
2015-02-09 01:58:32 -05:00
2015-06-25 08:12:16 -07:00
if(operation=="axpy")
{
float alpha = 1;
2015-08-17 10:31:58 -07:00
for(int_t N: create_log_range((int)1e3, (int)1e8, 50, 64))
2015-06-25 08:12:16 -07:00
{
std::cout << N;
sc::array x(N, dtype), y(N, dtype);
2015-06-25 08:12:16 -07:00
/* ISAAC */
BENCHMARK_ISAAC(y = x + alpha*y, 3*N*dtsize/t)
2015-12-21 18:42:56 -05:00
// BENCHMARK_ISAAC(y = sc::execution_handler(x + alpha*y, sc::execution_options_type(), sc::dispatcher_options_type(true)), 3*N*dtsize/t)
2015-06-25 08:12:16 -07:00
/* clblas */
#ifdef BENCH_CLBLAS
2015-08-26 14:24:12 -04:00
if(x.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, NULL), 3*N*dtsize/t);
2015-06-25 08:12:16 -07:00
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
sc::copy(x, cx);
sc::copy(y, cy);
2015-06-25 08:12:16 -07:00
BENCHMARK_HOST(cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
BENCHMARK_CUDA(cublasSaxpy(N, alpha, (T*)CU_HANDLE(x.data()), 1, (T*)CU_HANDLE(y.data()), 1), 3*N*dtsize/t)
2015-06-25 08:12:16 -07:00
#endif
std::cout << std::endl;
}
}
2015-06-25 08:12:16 -07:00
if(operation=="dot")
{
2015-08-17 10:31:58 -07:00
for(int_t N: create_log_range((int)1e3, (int)1e8, 50, 64))
2015-06-25 08:12:16 -07:00
{
std::cout << N;
/* ISAAC */
sc::array x(N, dtype), y(N, dtype);
sc::array scratch(N, dtype);
sc::scalar s(dtype);
2015-06-25 08:12:16 -07:00
s = dot(x,y); queue.synchronize();
BENCHMARK_ISAAC(s = dot(x,y), 2*N*dtsize/t)
2015-06-25 08:12:16 -07:00
/* clblas */
#ifdef BENCH_CLBLAS
2015-08-26 14:24:12 -04:00
if(x.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, NULL), 2*N*dtsize/t)
2015-06-25 08:12:16 -07:00
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
sc::copy(x, cx);
sc::copy(y, cy);
2015-06-25 08:12:16 -07:00
BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
BENCHMARK_CUDA(cublasSdot(N, (T*)CU_HANDLE(x.data()), 1, (T*)CU_HANDLE(y.data()), 1), 2*N*dtsize/t)
2015-06-25 08:12:16 -07:00
#endif
std::cout << std::endl;
}
}
2015-06-25 08:12:16 -07:00
if(operation.substr(0, 4)=="gemv")
{
2015-08-31 13:35:29 -04:00
std::vector<std::tuple<std::string, char,int_t, int_t> > MNs;
2015-08-30 02:23:40 -04:00
//Linear System
2015-12-21 18:42:56 -05:00
MNs.push_back(std::make_tuple("square153[N]", 'N',153,153));
MNs.push_back(std::make_tuple("square153[T]", 'T',153,153));
MNs.push_back(std::make_tuple("square1024[T]", 'T',1024,1024));
MNs.push_back(std::make_tuple("square2867[N]", 'N',2867,2867));
MNs.push_back(std::make_tuple("square2867[T]", 'T',2867,2867));
2015-08-30 02:23:40 -04:00
//Normalization
2015-12-21 18:42:56 -05:00
MNs.push_back(std::make_tuple("norm64[N]", 'N', 64, 60000));
MNs.push_back(std::make_tuple("norm64[T]", 'T', 64, 60000));
MNs.push_back(std::make_tuple("norm256[N]", 'N', 256, 60000));
MNs.push_back(std::make_tuple("norm256[T]", 'T', 256, 60000));
MNs.push_back(std::make_tuple("norm1024[N]", 'N', 1024, 60000));
MNs.push_back(std::make_tuple("norm1024[T]", 'T', 1024, 60000));
2015-08-21 13:06:20 -04:00
2015-08-30 02:23:40 -04:00
//Householder
2015-12-21 18:42:56 -05:00
MNs.push_back(std::make_tuple("tallskinny-1[N]", 'N', 10, 60000));
MNs.push_back(std::make_tuple("tallskinny-1[T]", 'T', 10, 60000));
MNs.push_back(std::make_tuple("tallskinny-2[N]", 'N', 30, 60000));
MNs.push_back(std::make_tuple("tallskinny-2[T]", 'T', 30, 60000));
2015-06-25 08:12:16 -07:00
/*---------*/
/*--BLAS2--*/
/*---------*/
2015-08-31 13:35:29 -04:00
for(std::tuple<std::string, char, int_t, int_t> MN: MNs)
2015-06-25 08:12:16 -07:00
{
2015-08-31 13:35:29 -04:00
bool AT = std::get<1>(MN) == 'T';
int_t M = std::get<2>(MN);
int_t N = std::get<3>(MN);
std::cout << '"' << std::get<0>(MN) << '"';
int_t As1 = M, As2 = N;
if(AT) std::swap(As1, As2);
2015-06-25 08:12:16 -07:00
/* ISAAC */
sc::array A(As1, As2, dtype), y(M, dtype), x(N, dtype);
2015-08-05 12:07:51 -07:00
#ifdef HAS_A_BLAS
2015-11-19 20:49:38 -05:00
int_t lda = A.stride()[1];
2015-06-25 08:12:16 -07:00
#endif
BENCHMARK_ISAAC(y = AT?dot(A.T,x):dot(A,x),(M*N + M + N)*dtsize/t);
2015-12-21 18:42:56 -05:00
// BENCHMARK_ISAAC(y = sc::execution_handler(AT?dot(A.T,x):dot(A,x), sc::execution_options_type(), sc::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
2015-06-25 08:12:16 -07:00
#ifdef BENCH_CLBLAS
2015-08-26 14:24:12 -04:00
if(y.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, NULL), (M*N + M + N)*dtsize/t)
2015-06-25 08:12:16 -07:00
#endif
#ifdef BENCH_CBLAS
std::vector<float> cA(M*N), cx(N), cy(M);
sc::copy(x, cx);
sc::copy(y, cy);
sc::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
2015-06-25 08:12:16 -07:00
#endif
#ifdef BENCH_CUBLAS
BENCHMARK_CUDA(cublasSgemv(AT?'t':'n', As1, As2, 1, (T*)CU_HANDLE(A.data()), lda, (T*)CU_HANDLE(x.data()), 1, 0, (T*)CU_HANDLE(y.data()), 1), (M*N + M + N)*dtsize/t)
2015-06-25 08:12:16 -07:00
#endif
std::cout << std::endl;
}
}
if(operation.substr(0,4)=="gemm")
{
std::vector<std::tuple<std::string, int_t, int_t, int_t, char, char> > MNKs;
2015-08-21 13:06:20 -04:00
//Square
MNKs.push_back(std::make_tuple("Square",896,896,896,'N','T'));
MNKs.push_back(std::make_tuple("Square",2560,2560,2560,'N','T'));
//DeepBench
for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
for(size_t N: std::vector<size_t>{16, 32, 64, 128, MK})
MNKs.push_back(std::make_tuple("Deep", MK, N, MK, 'N', 'N'));
for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
for(size_t N: std::vector<size_t>{16, 32, 64, 128, MK})
MNKs.push_back(std::make_tuple("Deep", MK, N, MK, 'T', 'N'));
for(size_t MK: std::vector<size_t>{1760})
MNKs.push_back(std::make_tuple("Deep", MK, 1733, MK, 'N', 'T'));
2015-08-30 02:23:40 -04:00
//Covariance (e.g., ICA, 10minutes/100Hz)
MNKs.push_back(std::make_tuple("Cov",32,32,60000,'N','T'));
MNKs.push_back(std::make_tuple("Cov",256,256,60000,'N','T'));
//Bi-diagonalization
MNKs.push_back(std::make_tuple("Lapack",4096,4096,32,'N','T'));
MNKs.push_back(std::make_tuple("Lapack",3456,3456,32,'N','T'));
MNKs.push_back(std::make_tuple("Lapack",896,896,32,'N','T'));
std::cout << "BENCH\tM\tN\tK\ta_t\tb_t\tISAAC";;
#ifdef BENCH_CLBLAS
std::cout << "\tclBLAS";
#endif
#ifdef BENCH_CBLAS
std::cout << "\tBLAS";
#endif
#ifdef BENCH_CUBLAS
std::cout << "\tcuBLAS";
#endif
std::cout << std::endl;
/*---------*/
/*--BLAS3--*/
/*---------*/
for(auto MNK: MNKs)
{
bool AT = std::get<4>(MNK)=='T';
bool BT = std::get<5>(MNK)=='T';
int_t M = std::get<1>(MNK);
int_t N = std::get<2>(MNK);
int_t K = std::get<3>(MNK);
std::cout << std::get<0>(MNK) << "\t" << M << "\t" << N
<< "\t" << K << "\t" << std::get<4>(MNK) << "\t" << std::get<5>(MNK);
/* ISAAC */
int_t As1 = M, As2 = K;
if(AT) std::swap(As1, As2);
int_t Bs1 = K, Bs2 = N;
if(BT) std::swap(Bs1, Bs2);
sc::array C(M, N, dtype), A(As1, As2, dtype), B(Bs1, Bs2, dtype);
2015-08-05 12:07:51 -07:00
#ifdef HAS_A_BLAS
2015-11-19 20:49:38 -05:00
int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
#endif
BENCHMARK_ISAAC(C = AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), (double)2*M*N*K/t*1e-3);
2015-12-21 18:42:56 -05:00
// BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
/* clblas */
#ifdef BENCH_CLBLAS
2015-08-26 14:24:12 -04:00
if(C.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, NULL), (double)2*M*N*K/t*1e-3)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cC(M*N), cA(M*K), cB(N*K);
sc::copy(C, cC);
sc::copy(A, cA);
sc::copy(B, cB);
BENCHMARK_HOST(cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc), (double)2*M*N*K/t*1e-3);
#endif
#ifdef BENCH_CUBLAS
BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)CU_HANDLE(A.data()), lda, (T*)CU_HANDLE(B.data()), ldb, 1, (T*)CU_HANDLE(C.data()), ldc), (double)2*M*N*K/t*1e-3)
#endif
std::cout << std::endl;
}
}
2015-01-25 18:19:19 -05:00
2014-10-27 05:35:04 -04:00
}
2014-10-29 17:03:24 +01:00
int main(int argc, char* argv[])
2014-10-27 05:35:04 -04:00
{
std::vector<std::string> args(argv, argv + argc);
#ifdef BENCH_CLBLAS
clblasSetup();
#endif
sc::driver::backend::default_queue_properties = CL_QUEUE_PROFILING_ENABLE;
2015-02-05 04:42:57 -05:00
int device_idx = 0;
std::list<sc::driver::Context const *> contexts;
sc::driver::backend::contexts::get(contexts);
2015-02-04 22:06:15 -05:00
std::string operation;
if(contexts.size() > 1)
{
if(args.size() != 3)
{
std::cerr << "usage : blas-bench DEVICE_IDX OPERATION" << std::endl;
std::cout << "Devices available: " << std::endl;
unsigned int current=0;
for(sc::driver::Context const * context: contexts)
{
sc::driver::Device device = sc::driver::backend::queues::get(*context,0).device();
std::cout << current++ << ": " << device.name() << " on " << device.platform().name() << " " << device.platform().version() << std::endl;
}
exit(EXIT_FAILURE);
}
device_idx = atoi(argv[1]);
operation = args[2];
}
else
{
if(args.size() != 2)
{
std::cerr << "usage : blas-bench OPERATION" << std::endl;
exit(EXIT_FAILURE);
}
operation = args[1];
}
sc::driver::backend::default_device = device_idx;
std::cout << std::fixed << std::setprecision(2);
bench<float>(sc::FLOAT_TYPE, operation);
#ifdef BENCH_CLBLAS
clblasTeardown();
#endif
2014-10-27 05:35:04 -04:00
}