Bench: Now displaying results in a table

This commit is contained in:
Philippe Tillet
2016-09-29 14:41:29 -04:00
parent 29b3a576df
commit fa4cb6866d
6 changed files with 71 additions and 78 deletions

View File

@@ -19,20 +19,20 @@ if(CLBLAS_FOUND)
set(BLAS_LIBS ${BLAS_LIBS} ${CLBLAS_LIBRARIES} ) set(BLAS_LIBS ${BLAS_LIBS} ${CLBLAS_LIBRARIES} )
endif() endif()
#CBLAS ##CBLAS
find_package(MKL QUIET) #find_package(MKL QUIET)
if(MKL_FOUND) #if(MKL_FOUND)
set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_MKL -DBENCH_CBLAS") # set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_MKL")
include_directories(${MKL_INCLUDE_DIR}) # include_directories(${MKL_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${MKL_LIBRARIES} ) # set(BLAS_LIBS ${BLAS_LIBS} ${MKL_LIBRARIES} )
else() #else()
find_package(OpenBlas) # find_package(OpenBlas)
if(OPENBLAS_FOUND) # if(OPENBLAS_FOUND)
set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_CBLAS") # set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_CBLAS")
include_directories(${OPENBLAS_INCLUDE_DIR}) # include_directories(${OPENBLAS_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} ) # set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
endif() # endif()
endif() #endif()
string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}") string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")

View File

@@ -49,7 +49,7 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time+=times.back();\ total_time+=times.back();\
}\ }\
double t = min(times);\ double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\ std::cout << "\t" << (int)PERF << std::flush;\
} }
#define BENCHMARK_CLBLAS(OP, PERF) \ #define BENCHMARK_CLBLAS(OP, PERF) \
@@ -66,7 +66,7 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time+=times.back();\ total_time+=times.back();\
}\ }\
double t = min(times);\ double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\ std::cout << "\t" << PERF << std::flush;\
} }
#define BENCHMARK_HOST(OP, PERF) \ #define BENCHMARK_HOST(OP, PERF) \
@@ -82,7 +82,7 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time += time;\ total_time += time;\
}\ }\
double t = min(times);\ double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\ std::cout << "\t" << PERF << std::flush;\
} }
#define BENCHMARK_CUDA(OP, PERF) \ #define BENCHMARK_CUDA(OP, PERF) \
@@ -99,27 +99,14 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time+=times.back();\ total_time+=times.back();\
}\ }\
double t = min(times);\ double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\ std::cout << "\t" << PERF << std::flush;\
} }
unsigned int dtsize = sc::size_of(dtype); unsigned int dtsize = sc::size_of(dtype);
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0); sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}}; std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
sc::array flush((int)1e6, sc::FLOAT_TYPE); sc::array flush((int)1e6, sc::FLOAT_TYPE);
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
std::cout << "\"N\"";
std::cout << " \"ISAAC\"";
// std::cout << " \"ISAAC (Best impl.)\"";
#ifdef BENCH_CLBLAS
std::cout << " \"clBLAS\"";
#endif
#ifdef BENCH_CBLAS
std::cout << " \"BLAS\"";
#endif
#ifdef BENCH_CUBLAS
std::cout << " \"cuBLAS\"";
#endif
std::cout << std::endl;
// //
// RUN BENCHMARKS // RUN BENCHMARKS
// //
@@ -250,53 +237,51 @@ void bench(sc::numeric_type dtype, std::string operation)
if(operation.substr(0,4)=="gemm") if(operation.substr(0,4)=="gemm")
{ {
std::vector<std::tuple<std::string, char, char, int_t, int_t, int_t> > MNKs; std::vector<std::tuple<std::string, int_t, int_t, int_t, char, char> > MNKs;
//Square //Square
MNKs.push_back(std::make_tuple("square896",'N','T',896,896,896)); MNKs.push_back(std::make_tuple("Square",896,896,896,'N','T'));
MNKs.push_back(std::make_tuple("square2560",'N','T',2560,2560,2560)); MNKs.push_back(std::make_tuple("Square",2560,2560,2560,'N','T'));
//DeepBench
//Convolution for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
MNKs.push_back(std::make_tuple("conv1",'N','N',3025,64,363)); for(size_t N: std::vector<size_t>{16, 32, 64, 128, MK})
MNKs.push_back(std::make_tuple("conv2",'N','N',729,192,1600)); MNKs.push_back(std::make_tuple("Deep", MK, N, MK, 'N', 'N'));
MNKs.push_back(std::make_tuple("conv3",'N','N',169,384,1728)); for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
MNKs.push_back(std::make_tuple("conv4",'N','N',169,256,3456)); for(size_t N: std::vector<size_t>{16, 32, 64, 128, MK})
MNKs.push_back(std::make_tuple("conv5",'N','N',169,128,2304)); MNKs.push_back(std::make_tuple("Deep", MK, N, MK, 'T', 'N'));
for(size_t MK: std::vector<size_t>{1760})
// //Convolution Gradient-1 MNKs.push_back(std::make_tuple("Deep", MK, 1733, MK, 'N', 'T'));
// MNKs.push_back(std::make_tuple("convgrad5-1]",'T','N',2304,256,169));
// MNKs.push_back(std::make_tuple("convgrad4-1]",'T','N',3456,256,169));
// MNKs.push_back(std::make_tuple("convgrad3-1]",'T','N',1728,384,169));
// MNKs.push_back(std::make_tuple("convgrad2-1]",'T','N',1600,192,729));
// MNKs.push_back(std::make_tuple("convgrad1-1]",'T','N',363,64,3025));
// //Convolution Gradient-2
// MNKs.push_back(std::make_tuple("convgrad5-2]",'N','T',169,2304,256));
// MNKs.push_back(std::make_tuple("convgrad4-2]",'N','T',169,3456,256));
// MNKs.push_back(std::make_tuple("convgrad3-2]",'N','T',169,1728,384));
// MNKs.push_back(std::make_tuple("convgrad2-2]",'N','T',729,1600,192));
// MNKs.push_back(std::make_tuple("convgrad1-2]",'N','T',3025,363,64));
//Covariance (e.g., ICA, 10minutes/100Hz) //Covariance (e.g., ICA, 10minutes/100Hz)
MNKs.push_back(std::make_tuple("ica32",'N','T',32,32,60000)); MNKs.push_back(std::make_tuple("Cov",32,32,60000,'N','T'));
MNKs.push_back(std::make_tuple("ica256",'N','T',256,256,60000)); MNKs.push_back(std::make_tuple("Cov",256,256,60000,'N','T'));
//Bi-diagonalization
MNKs.push_back(std::make_tuple("Lapack",4096,4096,32,'N','T'));
MNKs.push_back(std::make_tuple("Lapack",3456,3456,32,'N','T'));
MNKs.push_back(std::make_tuple("Lapack",896,896,32,'N','T'));
// //Bi-diagonalization std::cout << "BENCH\tM\tN\tK\ta_t\tb_t\tISAAC";;
MNKs.push_back(std::make_tuple("32rank1-4096",'N','T',4096,4096,32)); #ifdef BENCH_CLBLAS
MNKs.push_back(std::make_tuple("32rank1-3456",'N','T',3456,3456,32)); std::cout << "\tclBLAS";
MNKs.push_back(std::make_tuple("32rank1-896",'N','T',896,896,32)); #endif
#ifdef BENCH_CBLAS
std::cout << "\tBLAS";
#endif
#ifdef BENCH_CUBLAS
std::cout << "\tcuBLAS";
#endif
std::cout << std::endl;
/*---------*/ /*---------*/
/*--BLAS3--*/ /*--BLAS3--*/
/*---------*/ /*---------*/
for(std::tuple<std::string, char, char, int_t, int_t, int_t> MNK: MNKs) for(auto MNK: MNKs)
{ {
bool AT = std::get<1>(MNK)=='T'; bool AT = std::get<4>(MNK)=='T';
bool BT = std::get<2>(MNK)=='T'; bool BT = std::get<5>(MNK)=='T';
int_t M = std::get<3>(MNK); int_t M = std::get<1>(MNK);
int_t N = std::get<4>(MNK); int_t N = std::get<2>(MNK);
int_t K = std::get<5>(MNK); int_t K = std::get<3>(MNK);
std::cout << "\"" << std::get<0>(MNK) << "\""; std::cout << std::get<0>(MNK) << "\t" << M << "\t" << N
std::cout << std::flush; << "\t" << K << "\t" << std::get<4>(MNK) << "\t" << std::get<5>(MNK);
/* ISAAC */ /* ISAAC */
int_t As1 = M, As2 = K; int_t As1 = M, As2 = K;
if(AT) std::swap(As1, As2); if(AT) std::swap(As1, As2);
@@ -307,13 +292,13 @@ void bench(sc::numeric_type dtype, std::string operation)
#ifdef HAS_A_BLAS #ifdef HAS_A_BLAS
int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1]; int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
#endif #endif
BENCHMARK_ISAAC(C = AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), (double)2*M*N*K/t); BENCHMARK_ISAAC(C = AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), (double)2*M*N*K/t*1e-3);
// BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0), sc::dispatcher_options_type(true)), (double)2*M*N*K/t); // BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
/* clblas */ /* clblas */
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS
if(C.context().backend()==sc::driver::OPENCL) if(C.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb, BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, NULL), (double)2*M*N*K/t) 0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, NULL), (double)2*M*N*K/t*1e-3)
#endif #endif
/* BLAS */ /* BLAS */
#ifdef BENCH_CBLAS #ifdef BENCH_CBLAS
@@ -321,10 +306,10 @@ void bench(sc::numeric_type dtype, std::string operation)
sc::copy(C, cC); sc::copy(C, cC);
sc::copy(A, cA); sc::copy(A, cA);
sc::copy(B, cB); sc::copy(B, cB);
BENCHMARK_HOST(cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc), (double)2*M*N*K/t); BENCHMARK_HOST(cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc), (double)2*M*N*K/t*1e-3);
#endif #endif
#ifdef BENCH_CUBLAS #ifdef BENCH_CUBLAS
BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)CU_HANDLE(A.data()), lda, (T*)CU_HANDLE(B.data()), ldb, 1, (T*)CU_HANDLE(C.data()), ldc), (double)2*M*N*K/t) BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)CU_HANDLE(A.data()), lda, (T*)CU_HANDLE(B.data()), ldb, 1, (T*)CU_HANDLE(C.data()), ldc), (double)2*M*N*K/t*1e-3)
#endif #endif
std::cout << std::endl; std::cout << std::endl;
} }
@@ -373,8 +358,7 @@ int main(int argc, char* argv[])
} }
sc::driver::backend::default_device = device_idx; sc::driver::backend::default_device = device_idx;
std::cout << "#Benchmark : BLAS" << std::endl; std::cout << std::fixed << std::setprecision(2);
std::cout << "#----------------" << std::endl;
bench<float>(sc::FLOAT_TYPE, operation); bench<float>(sc::FLOAT_TYPE, operation);
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS

View File

@@ -74,6 +74,7 @@ public:
SM_3_7, SM_3_7,
SM_5_0, SM_5_0,
SM_5_2, SM_5_2,
SM_6_1,
//AMD //AMD
TERASCALE_2, TERASCALE_2,

View File

@@ -81,6 +81,12 @@ Device::Architecture Device::architecture() const
std::pair<unsigned int, unsigned int> sm = nv_compute_capability(); std::pair<unsigned int, unsigned int> sm = nv_compute_capability();
switch(sm.first) switch(sm.first)
{ {
// Compute capability 6.x: only minor version 1 is mapped here.
// NOTE(review): there is no break/return after the inner switch, so any other
// minor version falls through into the `case 5:` handling below - confirm
// that this fallback is intended.
case 6:
switch(sm.second)
{
case 1: return Architecture::SM_6_1;
}
case 5: case 5:
switch(sm.second) switch(sm.second)
{ {

View File

@@ -33,6 +33,7 @@
#include "database/nvidia/sm_3_0.hpp" #include "database/nvidia/sm_3_0.hpp"
#include "database/nvidia/sm_3_5.hpp" #include "database/nvidia/sm_3_5.hpp"
#include "database/nvidia/sm_5_2.hpp" #include "database/nvidia/sm_5_2.hpp"
#include "database/nvidia/sm_6_1.hpp"
//AMD //AMD
#include "database/amd/gcn_1_1.hpp" #include "database/amd/gcn_1_1.hpp"
@@ -60,6 +61,7 @@ const profiles::presets_type profiles::presets_ =
DATABASE_ENTRY(GPU, NVIDIA, SM_3_7, database::nvidia::sm_3_5), DATABASE_ENTRY(GPU, NVIDIA, SM_3_7, database::nvidia::sm_3_5),
DATABASE_ENTRY(GPU, NVIDIA, SM_5_0, database::nvidia::sm_5_2), DATABASE_ENTRY(GPU, NVIDIA, SM_5_0, database::nvidia::sm_5_2),
DATABASE_ENTRY(GPU, NVIDIA, SM_5_2, database::nvidia::sm_5_2), DATABASE_ENTRY(GPU, NVIDIA, SM_5_2, database::nvidia::sm_5_2),
//SM_6_1 devices get the sm_6_1 database (keyed on SM_6_1, not a second SM_5_2 entry)
DATABASE_ENTRY(GPU, NVIDIA, SM_6_1, database::nvidia::sm_6_1),
//AMD //AMD
DATABASE_ENTRY(GPU, AMD, GCN_1_0, database::amd::gcn_1_1), DATABASE_ENTRY(GPU, AMD, GCN_1_0, database::amd::gcn_1_1),
DATABASE_ENTRY(GPU, AMD, GCN_1_1, database::amd::gcn_1_1), DATABASE_ENTRY(GPU, AMD, GCN_1_1, database::amd::gcn_1_1),

View File

@@ -73,7 +73,7 @@ def main():
libraries += ['gnustl_shared'] libraries += ['gnustl_shared']
#Source files #Source files
src = 'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/base.cpp src/lib/runtime/execute.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']] src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp 
src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/' boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']: for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x] src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]