Bench: Now displaying results in a table

This commit is contained in:
Philippe Tillet
2016-09-29 14:41:29 -04:00
parent 29b3a576df
commit fa4cb6866d
6 changed files with 71 additions and 78 deletions

View File

@@ -19,20 +19,20 @@ if(CLBLAS_FOUND)
set(BLAS_LIBS ${BLAS_LIBS} ${CLBLAS_LIBRARIES} )
endif()
#CBLAS
find_package(MKL QUIET)
if(MKL_FOUND)
set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_MKL -DBENCH_CBLAS")
include_directories(${MKL_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${MKL_LIBRARIES} )
else()
find_package(OpenBlas)
if(OPENBLAS_FOUND)
set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_CBLAS")
include_directories(${OPENBLAS_INCLUDE_DIR})
set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
endif()
endif()
##CBLAS
#find_package(MKL QUIET)
#if(MKL_FOUND)
# set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_MKL")
# include_directories(${MKL_INCLUDE_DIR})
# set(BLAS_LIBS ${BLAS_LIBS} ${MKL_LIBRARIES} )
#else()
# find_package(OpenBlas)
# if(OPENBLAS_FOUND)
# set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_CBLAS")
# include_directories(${OPENBLAS_INCLUDE_DIR})
# set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
# endif()
#endif()
string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")

View File

@@ -49,7 +49,7 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time+=times.back();\
}\
double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\
/* PERF is parenthesized so the whole macro argument is streamed as one value; */\
/* a bare (int)PERF would cast only the first token of the expanded expression. */\
std::cout << "\t" << (PERF) << std::flush;\
}
#define BENCHMARK_CLBLAS(OP, PERF) \
@@ -66,7 +66,7 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time+=times.back();\
}\
double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\
std::cout << "\t" << PERF << std::flush;\
}
#define BENCHMARK_HOST(OP, PERF) \
@@ -82,7 +82,7 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time += time;\
}\
double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\
std::cout << "\t" << PERF << std::flush;\
}
#define BENCHMARK_CUDA(OP, PERF) \
@@ -99,27 +99,14 @@ void bench(sc::numeric_type dtype, std::string operation)
total_time+=times.back();\
}\
double t = min(times);\
std::cout << " " << (int)(PERF) << std::flush;\
std::cout << "\t" << PERF << std::flush;\
}
unsigned int dtsize = sc::size_of(dtype);
sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
sc::array flush((int)1e6, sc::FLOAT_TYPE);
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
std::cout << "\"N\"";
std::cout << " \"ISAAC\"";
// std::cout << " \"ISAAC (Best impl.)\"";
#ifdef BENCH_CLBLAS
std::cout << " \"clBLAS\"";
#endif
#ifdef BENCH_CBLAS
std::cout << " \"BLAS\"";
#endif
#ifdef BENCH_CUBLAS
std::cout << " \"cuBLAS\"";
#endif
std::cout << std::endl;
//
// RUN BENCHMARKS
//
@@ -250,53 +237,51 @@ void bench(sc::numeric_type dtype, std::string operation)
if(operation.substr(0,4)=="gemm")
{
std::vector<std::tuple<std::string, char, char, int_t, int_t, int_t> > MNKs;
std::vector<std::tuple<std::string, int_t, int_t, int_t, char, char> > MNKs;
//Square
MNKs.push_back(std::make_tuple("square896",'N','T',896,896,896));
MNKs.push_back(std::make_tuple("square2560",'N','T',2560,2560,2560));
//Convolution
MNKs.push_back(std::make_tuple("conv1",'N','N',3025,64,363));
MNKs.push_back(std::make_tuple("conv2",'N','N',729,192,1600));
MNKs.push_back(std::make_tuple("conv3",'N','N',169,384,1728));
MNKs.push_back(std::make_tuple("conv4",'N','N',169,256,3456));
MNKs.push_back(std::make_tuple("conv5",'N','N',169,128,2304));
// //Convolution Gradient-1
// MNKs.push_back(std::make_tuple("convgrad5-1]",'T','N',2304,256,169));
// MNKs.push_back(std::make_tuple("convgrad4-1]",'T','N',3456,256,169));
// MNKs.push_back(std::make_tuple("convgrad3-1]",'T','N',1728,384,169));
// MNKs.push_back(std::make_tuple("convgrad2-1]",'T','N',1600,192,729));
// MNKs.push_back(std::make_tuple("convgrad1-1]",'T','N',363,64,3025));
// //Convolution Gradient-2
// MNKs.push_back(std::make_tuple("convgrad5-2]",'N','T',169,2304,256));
// MNKs.push_back(std::make_tuple("convgrad4-2]",'N','T',169,3456,256));
// MNKs.push_back(std::make_tuple("convgrad3-2]",'N','T',169,1728,384));
// MNKs.push_back(std::make_tuple("convgrad2-2]",'N','T',729,1600,192));
// MNKs.push_back(std::make_tuple("convgrad1-2]",'N','T',3025,363,64));
MNKs.push_back(std::make_tuple("Square",896,896,896,'N','T'));
MNKs.push_back(std::make_tuple("Square",2560,2560,2560,'N','T'));
//DeepBench
for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
for(size_t N: std::vector<size_t>{16, 32, 64, 128, MK})
MNKs.push_back(std::make_tuple("Deep", MK, N, MK, 'N', 'N'));
for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
for(size_t N: std::vector<size_t>{16, 32, 64, 128, MK})
MNKs.push_back(std::make_tuple("Deep", MK, N, MK, 'T', 'N'));
for(size_t MK: std::vector<size_t>{1760})
MNKs.push_back(std::make_tuple("Deep", MK, 1733, MK, 'N', 'T'));
//Covariance (e.g., ICA, 10minutes/100Hz)
MNKs.push_back(std::make_tuple("ica32",'N','T',32,32,60000));
MNKs.push_back(std::make_tuple("ica256",'N','T',256,256,60000));
MNKs.push_back(std::make_tuple("Cov",32,32,60000,'N','T'));
MNKs.push_back(std::make_tuple("Cov",256,256,60000,'N','T'));
//Bi-diagonalization
MNKs.push_back(std::make_tuple("Lapack",4096,4096,32,'N','T'));
MNKs.push_back(std::make_tuple("Lapack",3456,3456,32,'N','T'));
MNKs.push_back(std::make_tuple("Lapack",896,896,32,'N','T'));
// //Bi-diagonalization
MNKs.push_back(std::make_tuple("32rank1-4096",'N','T',4096,4096,32));
MNKs.push_back(std::make_tuple("32rank1-3456",'N','T',3456,3456,32));
MNKs.push_back(std::make_tuple("32rank1-896",'N','T',896,896,32));
// Print the tab-separated column header of the results table: the benchmark
// label, the problem sizes, the transposition flags, then one column for
// ISAAC and one for each reference library enabled at compile time.
std::cout << "BENCH\tM\tN\tK\ta_t\tb_t\tISAAC";
#ifdef BENCH_CLBLAS
std::cout << "\tclBLAS";
#endif
#ifdef BENCH_CBLAS
std::cout << "\tBLAS";
#endif
#ifdef BENCH_CUBLAS
std::cout << "\tcuBLAS";
#endif
std::cout << std::endl;
/*---------*/
/*--BLAS3--*/
/*---------*/
for(std::tuple<std::string, char, char, int_t, int_t, int_t> MNK: MNKs)
for(auto MNK: MNKs)
{
bool AT = std::get<1>(MNK)=='T';
bool BT = std::get<2>(MNK)=='T';
int_t M = std::get<3>(MNK);
int_t N = std::get<4>(MNK);
int_t K = std::get<5>(MNK);
std::cout << "\"" << std::get<0>(MNK) << "\"";
std::cout << std::flush;
bool AT = std::get<4>(MNK)=='T';
bool BT = std::get<5>(MNK)=='T';
int_t M = std::get<1>(MNK);
int_t N = std::get<2>(MNK);
int_t K = std::get<3>(MNK);
std::cout << std::get<0>(MNK) << "\t" << M << "\t" << N
<< "\t" << K << "\t" << std::get<4>(MNK) << "\t" << std::get<5>(MNK);
/* ISAAC */
int_t As1 = M, As2 = K;
if(AT) std::swap(As1, As2);
@@ -307,13 +292,13 @@ void bench(sc::numeric_type dtype, std::string operation)
#ifdef HAS_A_BLAS
int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
#endif
BENCHMARK_ISAAC(C = AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), (double)2*M*N*K/t);
BENCHMARK_ISAAC(C = AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), (double)2*M*N*K/t*1e-3);
// BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
/* clblas */
#ifdef BENCH_CLBLAS
if(C.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, NULL), (double)2*M*N*K/t)
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, NULL), (double)2*M*N*K/t*1e-3)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
@@ -321,10 +306,10 @@ void bench(sc::numeric_type dtype, std::string operation)
sc::copy(C, cC);
sc::copy(A, cA);
sc::copy(B, cB);
BENCHMARK_HOST(cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc), (double)2*M*N*K/t);
BENCHMARK_HOST(cblas_sgemm(CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc), (double)2*M*N*K/t*1e-3);
#endif
#ifdef BENCH_CUBLAS
BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)CU_HANDLE(A.data()), lda, (T*)CU_HANDLE(B.data()), ldb, 1, (T*)CU_HANDLE(C.data()), ldc), (double)2*M*N*K/t)
BENCHMARK_CUDA(cublasSgemm(AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)CU_HANDLE(A.data()), lda, (T*)CU_HANDLE(B.data()), ldb, 1, (T*)CU_HANDLE(C.data()), ldc), (double)2*M*N*K/t*1e-3)
#endif
std::cout << std::endl;
}
@@ -373,8 +358,7 @@ int main(int argc, char* argv[])
}
sc::driver::backend::default_device = device_idx;
std::cout << "#Benchmark : BLAS" << std::endl;
std::cout << "#----------------" << std::endl;
std::cout << std::fixed << std::setprecision(2);
bench<float>(sc::FLOAT_TYPE, operation);
#ifdef BENCH_CLBLAS

View File

@@ -74,6 +74,7 @@ public:
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_1,
//AMD
TERASCALE_2,

View File

@@ -81,6 +81,12 @@ Device::Architecture Device::architecture() const
std::pair<unsigned int, unsigned int> sm = nv_compute_capability();
switch(sm.first)
{
// Compute capability 6.x: only minor version 1 is mapped.
case 6:
switch(sm.second)
{
case 1: return Architecture::SM_6_1;
}
// NOTE(review): the inner switch has no default and there is no break here, so
// any other 6.x minor version falls through into the `case 5` handling that
// follows — confirm this is intended rather than a missing default/break.
case 5:
switch(sm.second)
{

View File

@@ -33,6 +33,7 @@
#include "database/nvidia/sm_3_0.hpp"
#include "database/nvidia/sm_3_5.hpp"
#include "database/nvidia/sm_5_2.hpp"
#include "database/nvidia/sm_6_1.hpp"
//AMD
#include "database/amd/gcn_1_1.hpp"
@@ -60,6 +61,7 @@ const profiles::presets_type profiles::presets_ =
DATABASE_ENTRY(GPU, NVIDIA, SM_3_7, database::nvidia::sm_3_5),
DATABASE_ENTRY(GPU, NVIDIA, SM_5_0, database::nvidia::sm_5_2),
DATABASE_ENTRY(GPU, NVIDIA, SM_5_2, database::nvidia::sm_5_2),
// SM_6_1 devices use the sm_6_1 preset database (the key must be SM_6_1;
// SM_5_2 is already registered by the entry above).
DATABASE_ENTRY(GPU, NVIDIA, SM_6_1, database::nvidia::sm_6_1),
//AMD
DATABASE_ENTRY(GPU, AMD, GCN_1_0, database::amd::gcn_1_1),
DATABASE_ENTRY(GPU, AMD, GCN_1_1, database::amd::gcn_1_1),

View File

@@ -73,7 +73,7 @@ def main():
libraries += ['gnustl_shared']
#Source files
src = 'src/lib/random/rand.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/base.cpp src/lib/runtime/execute.cpp src/lib/runtime/inference/database.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/exception/api.cpp src/lib/exception/driver.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/matrix_product.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/scheduler/dag.cpp src/lib/runtime/scheduler/strategies/heft.cpp src/lib/runtime/inference/predictors/random_forest.cpp src/lib/runtime/inference/profiles.cpp src/lib/runtime/inference/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]