2015-01-12 13:20:53 -05:00
|
|
|
#include "atidlas/array.h"
|
2015-02-01 23:56:05 -05:00
|
|
|
#include "atidlas/symbolic/execute.h"
|
2015-02-08 23:19:38 -05:00
|
|
|
#include "atidlas/tools/timer.hpp"
|
2014-11-06 07:07:27 -05:00
|
|
|
#include "common.hpp"
|
2015-01-27 15:32:59 -05:00
|
|
|
#ifdef BENCH_CLAMDBLAS
|
|
|
|
#include "clAmdBlas.h"
|
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
#include "cblas.h"
|
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
#include <cublas.h>
|
|
|
|
#endif
|
2014-10-27 05:35:04 -04:00
|
|
|
#include <iomanip>
|
|
|
|
#include <stdlib.h>
|
2015-01-12 13:20:53 -05:00
|
|
|
#include <cmath>
|
2015-02-08 23:19:38 -05:00
|
|
|
#include <numeric>
|
2014-11-06 07:07:27 -05:00
|
|
|
|
2014-10-27 05:35:04 -04:00
|
|
|
namespace ad = atidlas;
|
2015-01-27 16:14:02 -05:00
|
|
|
typedef ad::int_t int_t;
|
2014-10-27 05:35:04 -04:00
|
|
|
|
2015-02-08 23:19:38 -05:00
|
|
|
// Rounds N up to a multiple of pad.
// Returns N itself when it is already a multiple of pad; otherwise returns
// (N + pad - 1) / pad * pad, which for non-negative N is the next multiple.
// pad must be non-zero (it is used as a divisor).
int ceil(int N, int pad)
{
  const int remainder = N % pad;
  if(remainder == 0)
    return N;
  return (N + pad - 1) / pad * pad;
}
|
|
|
|
|
|
|
|
std::vector<int> create_log_range(int min, int max, int N, int pad)
|
|
|
|
{
|
|
|
|
std::vector<int> res(N);
|
|
|
|
for(int i = 0 ; i < N ; ++i)
|
|
|
|
{
|
|
|
|
res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
|
|
|
|
res[i] = ceil(res[i], pad);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<int> create_full_range(int min, int max, int pad)
|
|
|
|
{
|
|
|
|
std::vector<int> N;
|
|
|
|
for(int i = ceil(min, pad) ; i < ceil(max, pad) ; i+=pad)
|
|
|
|
N.push_back(i);
|
|
|
|
return N;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Small builder that lets a std::vector<T> be written as a chained expression:
//   std::vector<int> v = make_vector<int>() << 1 << 2 << 3;
template <typename T>
class make_vector {
  // Elements collected so far, in insertion order.
  std::vector<T> data_;

public:
  typedef make_vector<T> my_type;

  // Appends val and returns the builder so further << calls can be chained.
  my_type& operator<< (const T& val) {
    data_.insert(data_.end(), val);
    return *this;
  }

  // Implicit conversion yielding a copy of the collected elements.
  operator std::vector<T>() const {
    std::vector<T> copy(data_);
    return copy;
  }
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2015-01-27 15:32:59 -05:00
|
|
|
// Returns the median of x.
// The vector is taken by value on purpose: it is sorted locally so the
// caller's data is left untouched.
// - odd number of elements : the middle element
// - even number of elements: (lower middle + upper middle) / 2, evaluated in T
//   (so it truncates for integral T)
// - empty input            : T() instead of indexing out of bounds
template<class T>
T median(std::vector<T> x)
{
  if(x.empty())
    return T();
  std::sort(x.begin(), x.end());
  const typename std::vector<T>::size_type mid = x.size() / 2;
  if(x.size() % 2 == 0)
    return (x[mid - 1] + x[mid]) / 2;
  return x[mid];
}
|
|
|
|
|
|
|
|
// Returns the arithmetic mean of x, evaluated in T (so it truncates for
// integral T). An empty input yields T() rather than dividing by zero.
template<class T>
T mean(std::vector<T> x)
{
  if(x.empty())
    return T();
  const T total = std::accumulate(x.begin(), x.end(), T(0));
  return total / static_cast<int>(x.size());
}
|
|
|
|
|
|
|
|
static double time_event(unsigned long sum, cl::Event const & e)
|
|
|
|
{ return sum + e.getProfilingInfo<CL_PROFILING_COMMAND_END>() - e.getProfilingInfo<CL_PROFILING_COMMAND_START>();}
|
|
|
|
|
|
|
|
template<class T>
|
|
|
|
void bench(ad::numeric_type dtype){
|
2014-10-29 17:03:24 +01:00
|
|
|
|
2015-02-08 00:56:24 -05:00
|
|
|
#define BENCHMARK_ATIDLAS(OP, PERF) \
|
|
|
|
{\
|
|
|
|
std::vector<long> times;\
|
|
|
|
double total_time = 0;\
|
2015-02-09 01:58:32 -05:00
|
|
|
queue.finish();\
|
|
|
|
while(total_time*1e-9 < 1e-3){\
|
2015-02-08 00:56:24 -05:00
|
|
|
std::list<cl::Event> events;\
|
|
|
|
OP;\
|
|
|
|
queue.finish();\
|
2015-02-08 23:19:38 -05:00
|
|
|
times.push_back(std::accumulate(events.begin(), events.end(), 0, &time_event));\
|
2015-02-08 00:56:24 -05:00
|
|
|
total_time+=times.back();\
|
|
|
|
}\
|
|
|
|
double t = median(times);\
|
|
|
|
std::cout << " " << PERF << std::flush;\
|
|
|
|
}
|
|
|
|
|
|
|
|
#define BENCHMARK_CLAMDBLAS(OP, PERF) \
|
2015-01-24 14:51:48 -05:00
|
|
|
{\
|
2015-02-05 23:11:16 -05:00
|
|
|
std::vector<long> times;\
|
|
|
|
double total_time = 0;\
|
2015-02-09 01:58:32 -05:00
|
|
|
while(total_time*1e-9 < 1e-3){\
|
2015-02-05 04:42:57 -05:00
|
|
|
cl::Event event;\
|
2014-10-29 17:03:24 +01:00
|
|
|
OP;\
|
2015-02-05 23:11:16 -05:00
|
|
|
queue.finish();\
|
2015-02-09 01:58:32 -05:00
|
|
|
times.push_back(time_event(0, event));\
|
2015-02-05 23:11:16 -05:00
|
|
|
total_time+=times.back();\
|
2014-10-29 17:03:24 +01:00
|
|
|
}\
|
2015-02-05 23:11:16 -05:00
|
|
|
double t = median(times);\
|
|
|
|
std::cout << " " << PERF << std::flush;\
|
|
|
|
}
|
|
|
|
|
|
|
|
#define BENCHMARK_HOST(OP, PERF) \
|
|
|
|
{\
|
2015-02-08 23:19:38 -05:00
|
|
|
ad::tools::timer tmr;\
|
2015-02-05 23:11:16 -05:00
|
|
|
std::vector<int> cache_flusher(10000000, 0);\
|
2015-02-08 23:19:38 -05:00
|
|
|
tmr.start();\
|
2015-02-05 23:11:16 -05:00
|
|
|
OP;\
|
2015-02-08 23:19:38 -05:00
|
|
|
double t = 1e9*tmr.get();\
|
2015-02-05 23:11:16 -05:00
|
|
|
std::cout << " " << PERF << std::flush;\
|
|
|
|
}
|
|
|
|
|
|
|
|
#define BENCHMARK_CUDA(OP, PERF) \
|
|
|
|
{\
|
|
|
|
std::vector<long> times;\
|
|
|
|
double total_time = 0;\
|
2015-02-05 23:42:31 -05:00
|
|
|
float time;\
|
|
|
|
cudaEvent_t start, stop;\
|
|
|
|
cudaEventCreate(&start);\
|
|
|
|
cudaEventCreate(&stop);\
|
2015-02-05 23:11:16 -05:00
|
|
|
while(total_time*1e-3 < 1e-1){\
|
2015-02-05 23:42:31 -05:00
|
|
|
cudaEventRecord(start,0);\
|
2015-02-05 23:11:16 -05:00
|
|
|
OP;\
|
2015-02-05 23:42:31 -05:00
|
|
|
cudaEventRecord(stop,0);\
|
|
|
|
cudaEventSynchronize(stop);\
|
2015-02-05 23:11:16 -05:00
|
|
|
cudaEventElapsedTime(&time, start, stop);\
|
2015-02-05 23:42:31 -05:00
|
|
|
times.push_back(time*1e6);\
|
2015-02-05 23:11:16 -05:00
|
|
|
total_time+=time;\
|
|
|
|
}\
|
2015-02-05 23:42:31 -05:00
|
|
|
double t = median(times);\
|
2015-02-05 23:11:16 -05:00
|
|
|
std::cout << " " << PERF << std::flush;\
|
2015-01-24 14:51:48 -05:00
|
|
|
}
|
2014-10-27 05:35:04 -04:00
|
|
|
|
2015-02-08 23:19:38 -05:00
|
|
|
unsigned int dtsize = ad::size_of(dtype);
|
|
|
|
cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
|
|
|
|
|
|
|
|
// BLAS1 Sizes
|
|
|
|
static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
|
|
|
|
|
|
|
|
// BLAS2 Sizes
|
2015-02-10 03:09:12 -05:00
|
|
|
static const std::vector<int> BLAS2_N = make_vector<int>() << 128;
|
2015-02-08 23:19:38 -05:00
|
|
|
static const std::vector<int> BLAS2_M = create_full_range(128, 10000, 64);
|
|
|
|
|
|
|
|
// BLAS3 Sizes
|
|
|
|
static const std::vector<int> BLAS3_M = make_vector<int>() << 1024;
|
|
|
|
static const std::vector<int> BLAS3_N = make_vector<int>() << 128;
|
|
|
|
static const std::vector<int> BLAS3_K = create_full_range(128, 5000, 64);
|
2014-11-06 07:07:27 -05:00
|
|
|
|
2015-02-08 23:19:38 -05:00
|
|
|
// /*---------*/
|
|
|
|
// /*--BLAS1--*/
|
|
|
|
// /*---------*/
|
2015-02-09 01:58:32 -05:00
|
|
|
std::cout << "#AXPY" << std::endl;
|
2015-02-08 23:19:38 -05:00
|
|
|
for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
|
|
|
|
{
|
|
|
|
int_t N = BLAS1_N[i];
|
|
|
|
std::cout << N;
|
|
|
|
ad::array x(N, dtype), y(N, dtype);
|
2015-02-09 01:58:32 -05:00
|
|
|
/* ATIDLAS */
|
|
|
|
std::list<cl::Event> events;\
|
2015-02-10 03:09:12 -05:00
|
|
|
BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
|
2015-02-08 23:19:38 -05:00
|
|
|
/* clAmdBlas */
|
|
|
|
#ifdef BENCH_CLAMDBLAS
|
2015-02-09 01:58:32 -05:00
|
|
|
BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
|
2015-02-08 23:19:38 -05:00
|
|
|
#endif
|
|
|
|
/* BLAS */
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
std::vector<float> cx(N), cy(N);
|
|
|
|
ad::copy(x, cx);
|
|
|
|
ad::copy(y, cy);
|
2015-02-09 01:58:32 -05:00
|
|
|
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
|
2015-02-08 23:19:38 -05:00
|
|
|
#endif
|
2015-02-09 01:58:32 -05:00
|
|
|
/* CuBLAS */
|
2015-02-08 23:19:38 -05:00
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
T *cux, *cuy;
|
|
|
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuy, N * sizeof(T));
|
2015-02-09 01:58:32 -05:00
|
|
|
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
|
2015-02-08 23:19:38 -05:00
|
|
|
cudaFree(cux);
|
|
|
|
cudaFree(cuy);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
std::cout << "\n\n" << std::flush;
|
|
|
|
|
2015-02-09 01:58:32 -05:00
|
|
|
// std::cout << "#DOT" << std::endl;
|
|
|
|
// for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
|
|
|
|
// {
|
|
|
|
// int_t N = BLAS1_N[i];
|
|
|
|
// std::cout << N;
|
|
|
|
// /* ATIDLAS */
|
|
|
|
// ad::array x(N, dtype), y(N, dtype);
|
|
|
|
// ad::array scratch(N, dtype);
|
|
|
|
// ad::scalar s(dtype);
|
|
|
|
// s = dot(x,y); queue.finish();
|
|
|
|
// BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
|
|
|
|
// /* clAmdBlas */
|
|
|
|
//#ifdef BENCH_CLAMDBLAS
|
|
|
|
// BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
|
|
|
|
//#endif
|
|
|
|
// /* BLAS */
|
|
|
|
//#ifdef BENCH_CBLAS
|
|
|
|
// std::vector<float> cx(N), cy(N);
|
|
|
|
// ad::copy(x, cx);
|
|
|
|
// ad::copy(y, cy);
|
|
|
|
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
|
|
|
|
//#endif
|
|
|
|
//#ifdef BENCH_CUBLAS
|
|
|
|
// T *cux, *cuy;
|
|
|
|
// T result;
|
|
|
|
// cudaMalloc((void**) &cux, N * sizeof(T));
|
|
|
|
// cudaMalloc((void**) &cuy, N * sizeof(T));
|
|
|
|
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
|
|
|
|
// cudaFree(cux);
|
|
|
|
// cudaFree(cuy);
|
|
|
|
//#endif
|
|
|
|
// std::cout << std::endl;
|
|
|
|
// }
|
|
|
|
// std::cout << "\n\n" << std::flush;
|
|
|
|
|
2015-02-10 23:01:16 -05:00
|
|
|
/*---------*/
|
|
|
|
/*--BLAS2--*/
|
|
|
|
/*---------*/
|
|
|
|
//T-layout
|
|
|
|
std::cout << "#GEMV-T" << std::endl;
|
|
|
|
for(int_t i = 0 ; i < BLAS2_N.size() ; ++i)
|
|
|
|
for(int_t j = 0 ; j < BLAS2_M.size() ; ++j)
|
|
|
|
{
|
|
|
|
int_t N = BLAS2_N[i];
|
|
|
|
int_t M = BLAS2_M[j];
|
|
|
|
std::cout << M << "," << N;
|
|
|
|
/* ATIDLAS */
|
|
|
|
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
|
|
|
|
y = dot(trans(A),x); queue.finish();
|
|
|
|
BENCHMARK_ATIDLAS(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
|
|
|
|
#ifdef BENCH_CLAMDBLAS
|
|
|
|
BENCHMARK_CLAMDBLAS(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
|
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
std::vector<float> cA(N*M), cx(N), cy(M);
|
|
|
|
ad::copy(x, cx);
|
|
|
|
ad::copy(y, cy);
|
|
|
|
ad::copy(A, cA);
|
|
|
|
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
|
|
|
|
#endif
|
|
|
|
#ifdef BENCH_CUBLAS
|
|
|
|
T *cuA, *cux, *cuy;
|
|
|
|
cudaMalloc((void**) &cuA, N * M * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cux, N * sizeof(T));
|
|
|
|
cudaMalloc((void**) &cuy, M * sizeof(T));
|
|
|
|
BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
|
|
|
|
cudaFree(cuA);
|
|
|
|
cudaFree(cux);
|
|
|
|
cudaFree(cuy);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
|
|
|
std::cout << "\n\n" << std::flush;
|
2015-01-28 22:07:09 -05:00
|
|
|
|
2015-02-10 03:09:12 -05:00
|
|
|
/*---------*/
|
|
|
|
/*--BLAS3--*/
|
|
|
|
/*---------*/
|
|
|
|
std::cout << "#GEMM-NT" << std::endl;
|
|
|
|
for(std::vector<int_t>::const_iterator Mit = BLAS3_M.begin() ; Mit != BLAS3_M.end() ; ++Mit)
|
|
|
|
for(std::vector<int_t>::const_iterator Nit = BLAS3_N.begin() ; Nit != BLAS3_N.end() ; ++Nit)
|
|
|
|
for(std::vector<int_t>::const_iterator Kit = BLAS3_K.begin() ; Kit != BLAS3_K.end() ; ++Kit)
|
|
|
|
{
|
|
|
|
int_t M = *Kit, N = *Kit, K = *Kit;
|
|
|
|
std::cout << M << "," << N << "," << K;
|
|
|
|
/* ATIDLAS */
|
|
|
|
ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
|
|
|
|
BENCHMARK_ATIDLAS(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
|
|
|
/* clAmdBlas */
|
|
|
|
#ifdef BENCH_CLAMDBLAS
|
|
|
|
BENCHMARK_CLAMDBLAS(clAmdBlasSgemm(clAmdBlasColumnMajor, clAmdBlasNoTrans, clAmdBlasTrans, M, N, K, 1, A.data()(), A.ld(), B.data()(), B.ld(),
|
|
|
|
0, C.data()(), C.ld(), 1, &queue(),0, NULL, &event()), (double)2*M*N*K/t)
|
|
|
|
#endif
|
|
|
|
/* BLAS */
|
|
|
|
#ifdef BENCH_CBLAS
|
|
|
|
std::vector<float> cC(M*N), cA(M*K), cB(N*K);
|
|
|
|
ad::copy(C, cC);
|
|
|
|
ad::copy(A, cA);
|
|
|
|
ad::copy(B, cB);
|
|
|
|
BENCHMARK_HOST(cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, cA.data(), M, cB.data(), N, 1, cC.data(), M), (double)2*M*N*K/t);
|
|
|
|
#endif
|
|
|
|
std::cout << std::endl;
|
|
|
|
}
|
2015-01-25 18:19:19 -05:00
|
|
|
|
2014-10-27 05:35:04 -04:00
|
|
|
}
|
|
|
|
|
2014-10-29 17:03:24 +01:00
|
|
|
int main(int argc, char* argv[])
|
2014-10-27 05:35:04 -04:00
|
|
|
{
|
2015-01-24 14:51:48 -05:00
|
|
|
#ifdef BENCH_CLAMDBLAS
|
|
|
|
clAmdBlasSetup();
|
|
|
|
#endif
|
|
|
|
|
2015-02-05 04:42:57 -05:00
|
|
|
ad::cl_ext::queue_properties = CL_QUEUE_PROFILING_ENABLE;
|
|
|
|
|
2015-01-24 14:51:48 -05:00
|
|
|
int device_idx = 0;
|
2015-02-04 22:06:15 -05:00
|
|
|
ad::cl_ext::queues_type::data_type const & queues = ad::cl_ext::queues.data();
|
|
|
|
|
|
|
|
if(queues.size()>1){
|
2015-01-24 14:51:48 -05:00
|
|
|
if(argc!=2)
|
|
|
|
{
|
|
|
|
std::cerr << "usage : blas-bench [DEVICE_IDX]" << std::endl;
|
|
|
|
std::cout << "Devices available: " << std::endl;
|
|
|
|
unsigned int current=0;
|
2015-02-08 23:19:38 -05:00
|
|
|
for(ad::cl_ext::queues_type::data_type::const_iterator it = queues.begin() ; it != queues.end() ; ++it){
|
|
|
|
cl::Device device = it->first.getInfo<CL_CONTEXT_DEVICES>()[0];
|
2015-01-27 16:14:02 -05:00
|
|
|
std::cout << current++ << ": " << device.getInfo<CL_DEVICE_NAME>() << "(" << cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_NAME>() << ")" << std::endl;
|
2015-01-24 14:51:48 -05:00
|
|
|
}
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
|
|
|
else if(argc==2)
|
|
|
|
device_idx = atoi(argv[1]);
|
|
|
|
}
|
|
|
|
|
2015-01-27 16:14:02 -05:00
|
|
|
ad::cl_ext::default_context_idx = device_idx;
|
2014-10-30 13:04:33 -04:00
|
|
|
std::cout << "#Benchmark : BLAS" << std::endl;
|
|
|
|
std::cout << "#----------------" << std::endl;
|
2015-01-27 15:32:59 -05:00
|
|
|
bench<float>(ad::FLOAT_TYPE);
|
2015-01-24 14:51:48 -05:00
|
|
|
|
|
|
|
#ifdef BENCH_CLAMDBLAS
|
|
|
|
clAmdBlasTeardown();
|
|
|
|
#endif
|
2014-10-27 05:35:04 -04:00
|
|
|
}
|