2015-01-12 13:20:53 -05:00
# include "atidlas/array.h"
2015-02-01 23:56:05 -05:00
# include "atidlas/symbolic/execute.h"
2015-01-12 13:20:53 -05:00
# include "atidlas/tools/timer.hpp"
2014-11-06 07:07:27 -05:00
# include "common.hpp"
2015-01-27 15:32:59 -05:00
# ifdef BENCH_CLAMDBLAS
# include "clAmdBlas.h"
# endif
# ifdef BENCH_CBLAS
# include "cblas.h"
# endif
# ifdef BENCH_CUBLAS
# include <cublas.h>
# endif
2014-10-27 05:35:04 -04:00
# include <iomanip>
# include <stdlib.h>
2015-01-12 13:20:53 -05:00
# include <cmath>
2014-10-27 05:35:04 -04:00
2014-11-06 07:07:27 -05:00
2014-10-27 05:35:04 -04:00
namespace ad = atidlas ;
2015-01-27 16:14:02 -05:00
typedef ad : : int_t int_t ;
2014-10-27 05:35:04 -04:00
2015-01-27 15:32:59 -05:00
template < class T >
2015-01-12 13:20:53 -05:00
void bench ( ad : : numeric_type dtype )
2014-10-30 13:04:33 -04:00
{
2014-10-29 17:03:24 +01:00
float total_time = 0 ;
2015-01-12 13:20:53 -05:00
std : : vector < double > times ;
ad : : tools : : timer timer ;
2015-01-29 15:19:40 -05:00
unsigned int dtsize = ad : : size_of ( dtype ) ;
2014-10-29 17:03:24 +01:00
2015-01-28 20:06:41 -05:00
# define BENCHMARK(OP, PERF, SYNC) \
2015-01-24 14:51:48 -05:00
{ \
2014-10-29 17:03:24 +01:00
times . clear ( ) ; \
total_time = 0 ; \
2014-10-27 05:35:04 -04:00
OP ; \
2015-02-01 23:56:05 -05:00
SYNC ; \
2015-02-01 22:28:49 -05:00
while ( total_time < 5e-1 ) { \
2014-10-29 17:03:24 +01:00
timer . start ( ) ; \
OP ; \
2015-01-28 20:06:41 -05:00
SYNC ; \
2014-10-29 17:03:24 +01:00
times . push_back ( timer . get ( ) ) ; \
total_time + = times . back ( ) ; \
} \
2015-01-24 14:51:48 -05:00
float tres = median ( times ) ; \
2015-02-01 22:28:49 -05:00
std : : cout < < " " < < PERF < < std : : flush ; \
2015-01-24 14:51:48 -05:00
}
2014-10-27 05:35:04 -04:00
2015-02-01 23:56:05 -05:00
# define CL_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, queue.flush(); queue.finish();)
2015-01-28 20:06:41 -05:00
# define CPU_SYNCHRONIZE
# define CPU_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, CPU_SYNCHRONIZE)
# ifdef BENCH_CUBLAS
# define CUDA_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, cudaThreadSynchronize())
# endif
2015-01-27 02:41:27 -05:00
/*---------*/
/*--BLAS1--*/
/*---------*/
std : : cout < < " #AXPY " < < std : : endl ;
for ( std : : vector < int_t > : : const_iterator it = BLAS1_N . begin ( ) ; it ! = BLAS1_N . end ( ) ; + + it )
{
int_t N = * it ;
std : : cout < < N ;
/* ATIDLAS */
2015-01-27 16:14:02 -05:00
ad : : array x ( N , dtype ) , y ( N , dtype ) ;
2015-02-01 23:56:05 -05:00
cl : : CommandQueue & queue = ad : : cl_ext : : get_queue ( x . context ( ) , 0 ) ;
ad : : model & model = ad : : get_model ( queue , ad : : VECTOR_AXPY_TYPE , dtype ) ;
2015-02-01 18:59:27 -05:00
ad : : array_expression E = ad : : detail : : assign ( y , x + y ) ;
2015-02-01 22:28:49 -05:00
model . tune ( E ) ;
2015-02-01 23:56:05 -05:00
ad : : operation_cache cache ;
model . execute ( E , & cache ) ;
queue . flush ( ) ;
queue . finish ( ) ;
CL_BENCHMARK ( cache . enqueue ( ) , bandwidth ( 3 * N , tres , dtsize ) ) ;
2015-01-27 02:41:27 -05:00
/* clAmdBlas */
# ifdef BENCH_CLAMDBLAS
2015-01-28 20:06:41 -05:00
CL_BENCHMARK ( clAmdBlasSaxpy ( N , 1 , x . data ( ) ( ) , 0 , 1 , y . data ( ) ( ) , 0 , 1 , 1 , & ad : : cl_ext : : get_queue ( x . context ( ) , 0 ) ( ) , 0 , NULL , NULL ) , bandwidth ( 3 * N , tres , dtsize ) )
2015-01-27 02:41:27 -05:00
# endif
/* BLAS */
# ifdef BENCH_CBLAS
std : : vector < float > cx ( N ) , cy ( N ) ;
2015-01-27 16:14:02 -05:00
ad : : copy ( x , cx ) ;
ad : : copy ( y , cy ) ;
2015-01-28 20:06:41 -05:00
CPU_BENCHMARK ( cblas_saxpy ( N , 1 , cx . data ( ) , 1 , cy . data ( ) , 1 ) , bandwidth ( 3 * N , tres , dtsize ) ) ;
2015-01-27 15:32:59 -05:00
# endif
/* CuBLAS */
# ifdef BENCH_CUBLAS
T * cux , * cuy ;
cudaMalloc ( ( void * * ) & cux , N * sizeof ( T ) ) ;
cudaMalloc ( ( void * * ) & cuy , N * sizeof ( T ) ) ;
2015-01-28 20:06:41 -05:00
CUDA_BENCHMARK ( cublasSaxpy ( N , 2 , cux , 1 , cuy , 1 ) , bandwidth ( 3 * N , tres , dtsize ) )
2015-01-27 15:32:59 -05:00
cudaFree ( cux ) ;
cudaFree ( cuy ) ;
2015-01-27 02:41:27 -05:00
# endif
std : : cout < < std : : endl ;
}
std : : cout < < " \n \n " < < std : : flush ;
2014-11-06 07:07:27 -05:00
2015-01-28 22:07:09 -05:00
// std::cout << "#DOT" << std::endl;
// for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
// {
// int_t N = *it;
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
2014-11-06 07:07:27 -05:00
2015-01-12 13:20:53 -05:00
// /*---------*/
2015-01-28 22:07:09 -05:00
// /*--BLAS2--*/
2015-01-12 13:20:53 -05:00
// /*---------*/
2015-01-28 22:07:09 -05:00
// //T-layout
// std::cout << "#GEMV-T" << std::endl;
// for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
// for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
// {
// int_t M = *Mit;
// int_t N = *Nit;
// std::cout << M << "," << N;
// /* ATIDLAS */
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
// CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
// /* clAmdBlas */
// #ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cA(N*M), cx(N), cy(M);
// ad::copy(x, cx);
// ad::copy(y, cy);
// ad::copy(A, cA);
// CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
// #endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
//// /*---------*/
//// /*--BLAS3--*/
//// /*---------*/
// std::cout << "#GEMM-NT" << std::endl;
// for(std::vector<int_t>::const_iterator Mit = BLAS3_M.begin() ; Mit != BLAS3_M.end() ; ++Mit)
// for(std::vector<int_t>::const_iterator Nit = BLAS3_N.begin() ; Nit != BLAS3_N.end() ; ++Nit)
// for(std::vector<int_t>::const_iterator Kit = BLAS3_K.begin() ; Kit != BLAS3_K.end() ; ++Kit)
// {
// int_t M = *Kit, N = *Kit, K = *Kit;
// std::cout << M << "," << N << "," << K;
// /* ATIDLAS */
// ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
// CL_BENCHMARK(C = dot(A,trans(B)), gflops((double)2*M*N*K, tres));
// /* clAmdBlas */
// #ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSgemm(clAmdBlasColumnMajor, clAmdBlasNoTrans, clAmdBlasTrans, M, N, K, 1, A.data()(), A.ld(), B.data()(), B.ld(),
// 0, C.data()(), C.ld(), 1, &ad::cl_ext::get_queue(C.context(), 0)(),0, NULL, NULL), gflops((double)2*M*N*K, tres))
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cC(M*N), cA(M*K), cB(N*K);
// ad::copy(C, cC);
// ad::copy(A, cA);
// ad::copy(B, cB);
// CPU_BENCHMARK(cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, cA.data(), M, cB.data(), N, 1, cC.data(), M), gflops((double)2*M*N*K, tres));
// #endif
// std::cout << std::endl;
// }
2015-01-25 18:19:19 -05:00
2014-10-27 05:35:04 -04:00
}
2014-10-29 17:03:24 +01:00
int main ( int argc , char * argv [ ] )
2014-10-27 05:35:04 -04:00
{
2015-01-24 14:51:48 -05:00
# ifdef BENCH_CLAMDBLAS
clAmdBlasSetup ( ) ;
# endif
int device_idx = 0 ;
2015-01-27 16:14:02 -05:00
if ( ad : : cl_ext : : queues . size ( ) > 1 ) {
ad : : cl_ext : : queues_t & queues = ad : : cl_ext : : queues ;
2015-01-24 14:51:48 -05:00
if ( argc ! = 2 )
{
std : : cerr < < " usage : blas-bench [DEVICE_IDX] " < < std : : endl ;
std : : cout < < " Devices available: " < < std : : endl ;
unsigned int current = 0 ;
2015-01-27 16:14:02 -05:00
for ( ad : : cl_ext : : queues_t : : const_iterator it = queues . begin ( ) ; it ! = queues . end ( ) ; + + it ) {
cl : : Device device = it - > first . getInfo < CL_CONTEXT_DEVICES > ( ) [ 0 ] ;
std : : cout < < current + + < < " : " < < device . getInfo < CL_DEVICE_NAME > ( ) < < " ( " < < cl : : Platform ( device . getInfo < CL_DEVICE_PLATFORM > ( ) ) . getInfo < CL_PLATFORM_NAME > ( ) < < " ) " < < std : : endl ;
2015-01-24 14:51:48 -05:00
}
exit ( EXIT_FAILURE ) ;
}
else if ( argc = = 2 )
device_idx = atoi ( argv [ 1 ] ) ;
}
2015-01-27 16:14:02 -05:00
ad : : cl_ext : : default_context_idx = device_idx ;
2014-10-30 13:04:33 -04:00
std : : cout < < " #Benchmark : BLAS " < < std : : endl ;
std : : cout < < " #---------------- " < < std : : endl ;
2015-01-27 15:32:59 -05:00
bench < float > ( ad : : FLOAT_TYPE ) ;
2015-01-24 14:51:48 -05:00
# ifdef BENCH_CLAMDBLAS
clAmdBlasTeardown ( ) ;
# endif
2014-10-27 05:35:04 -04:00
}