Improved benchmark suite
This commit is contained in:
@@ -1,12 +1,15 @@
|
|||||||
|
set(CMAKE_BUILD_TYPE Release)
|
||||||
|
|
||||||
|
find_package(CUDA)
|
||||||
|
|
||||||
foreach(PROG blas)
|
foreach(PROG blas)
|
||||||
add_executable(${PROG}-bench ${PROG}.cpp)
|
add_executable(${PROG}-bench ${PROG}.cpp)
|
||||||
target_link_libraries(${PROG}-bench ${OPENCL_LIBRARIES})
|
target_link_libraries(${PROG}-bench ${OPENCL_LIBRARIES})
|
||||||
set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -Wall -Wextra")
|
set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -Wall -Wextra")
|
||||||
endforeach(PROG)
|
|
||||||
|
|
||||||
# packages
|
if(CUDA_FOUND)
|
||||||
find_package(CUDA)
|
set(CUPROG cu${PROG})
|
||||||
if(CUDA_FOUND)
|
cuda_add_executable(${CUPROG}-bench ${CUPROG}.cu OPTIONS "-DVIENNACL_WITH_OPENCL")
|
||||||
cuda_add_executable(cublas-bench "cublas.cu")
|
target_link_libraries(${CUPROG}-bench ${CUPROG} OpenCL)
|
||||||
target_link_libraries(cublas-bench "cublas")
|
endif()
|
||||||
endif()
|
endforeach(PROG)
|
||||||
|
@@ -1,26 +1,31 @@
|
|||||||
|
//#define VIENNACL_DEBUG_ALL
|
||||||
|
|
||||||
#include "viennacl/matrix.hpp"
|
#include "viennacl/matrix.hpp"
|
||||||
#include "viennacl/vector.hpp"
|
#include "viennacl/vector.hpp"
|
||||||
#include "viennacl/tools/timer.hpp"
|
#include "viennacl/tools/timer.hpp"
|
||||||
|
#include "viennacl/linalg/prod.hpp"
|
||||||
|
#include "viennacl/linalg/inner_prod.hpp"
|
||||||
|
#include "viennacl/scheduler/execute.hpp"
|
||||||
|
|
||||||
#include "atidlas/tools/misc.hpp"
|
#include "atidlas/tools/misc.hpp"
|
||||||
#include "atidlas/model/import.hpp"
|
#include "atidlas/model/import.hpp"
|
||||||
#include "atidlas/model/model.hpp"
|
#include "atidlas/model/model.hpp"
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
#include <iomanip>
|
#include <iomanip>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
|
||||||
namespace ad = atidlas;
|
namespace ad = atidlas;
|
||||||
typedef atidlas::atidlas_int_t int_t;
|
typedef atidlas::atidlas_int_t int_t;
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
float bandwidth(std::size_t N, float t)
|
void bench(std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
|
||||||
{
|
{
|
||||||
return N * sizeof(T) * 1e-9 / t;
|
typedef viennacl::matrix<T,viennacl::column_major> matrix_type;
|
||||||
}
|
typedef viennacl::vector<T> vector_type;
|
||||||
|
|
||||||
template<class T>
|
|
||||||
void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
|
|
||||||
{
|
|
||||||
viennacl::tools::timer timer;
|
viennacl::tools::timer timer;
|
||||||
float total_time = 0;
|
float total_time = 0;
|
||||||
std::vector<T> times;
|
std::vector<T> times;
|
||||||
@@ -30,7 +35,7 @@ void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_p
|
|||||||
total_time = 0;\
|
total_time = 0;\
|
||||||
OP;\
|
OP;\
|
||||||
viennacl::backend::finish();\
|
viennacl::backend::finish();\
|
||||||
while(total_time < 1e-1){\
|
while(total_time < 1e-2){\
|
||||||
timer.start(); \
|
timer.start(); \
|
||||||
OP;\
|
OP;\
|
||||||
viennacl::backend::finish();\
|
viennacl::backend::finish();\
|
||||||
@@ -40,36 +45,67 @@ void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_p
|
|||||||
viennacl::backend::finish();\
|
viennacl::backend::finish();\
|
||||||
float resname = ad::tools::median(times);
|
float resname = ad::tools::median(times);
|
||||||
|
|
||||||
std::cout << "#N PerfNaive PerfModel PerfOpt" << std::endl;
|
|
||||||
|
|
||||||
#define BENCH(declarations, statement_op, sizes, measure, N, key) \
|
#define BENCH(declarations, statement_op, sizes, measure, N, key) \
|
||||||
|
if(models.find(key)!=models.end()){\
|
||||||
|
if(!first)\
|
||||||
|
{\
|
||||||
|
std::cout << std::endl;\
|
||||||
|
std::cout << std::endl;\
|
||||||
|
}\
|
||||||
std::cout << "#" << key << std::endl;\
|
std::cout << "#" << key << std::endl;\
|
||||||
for(std::vector<int_t>::const_iterator it = sizes.begin() ; it != sizes.end() ; ++it)\
|
for(std::vector<int_t>::const_iterator it = sizes.begin() ; it != sizes.end() ; ++it)\
|
||||||
{\
|
{\
|
||||||
declarations;\
|
declarations;\
|
||||||
viennacl::scheduler::statement statement(statement_op);\
|
viennacl::scheduler::statement statement(statement_op);\
|
||||||
BENCHMARK(y = x + y, time_viennacl);\
|
BENCHMARK(models.at(key)->execute(statement), time_model);\
|
||||||
BENCHMARK(models[key]->execute(statement), time_model);\
|
|
||||||
BENCHMARK(models[key]->execute(statement, true), time_unique_kernel);\
|
BENCHMARK(models[key]->execute(statement, true), time_unique_kernel);\
|
||||||
models[key]->tune(statement);\
|
models[key]->tune(statement);\
|
||||||
BENCHMARK(models[key]->execute(statement), time_opt);\
|
BENCHMARK(models[key]->execute(statement), time_opt);\
|
||||||
std::cout << *it << " " << measure<T>(N, time_viennacl) << " " << measure<T>(N,time_unique_kernel) << " " << measure<T>(N,time_model) << " " << measure<T>(N,time_opt) << std::endl;\
|
std::cout << *it << " " << measure<T>(N,time_unique_kernel) << " " << measure<T>(N,time_model) << " " << measure<T>(N,time_opt) << std::endl;\
|
||||||
|
}\
|
||||||
}\
|
}\
|
||||||
|
|
||||||
#define DECLARE(type, ...) type __VA_ARGS__
|
#define DECLARE(type, ...) type __VA_ARGS__
|
||||||
#define ARGS(...) __VA_ARGS__
|
#define ARGS(...) __VA_ARGS__
|
||||||
|
|
||||||
BENCH(DECLARE(viennacl::vector<T>, x(*it), y(*it)), ARGS(y, viennacl::op_assign(), x + y), BLAS1_N, bandwidth, 3*(*it), "vector-axpy-float32");
|
/*---------*/
|
||||||
std::cout << std::endl;
|
/*--BLAS1--*/
|
||||||
std::cout << std::endl;
|
/*---------*/
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int_t> create_log_range(int_t min, int_t max, int_t N)
|
//AXPY
|
||||||
{
|
bool first =true;
|
||||||
std::vector<int_t> res(N);
|
BENCH(DECLARE(viennacl::vector<T>, x(*it), y(*it)), ARGS(y, viennacl::op_assign(), x + y),
|
||||||
for(int_t i = 0 ; i < N ; ++i)
|
BLAS1_N, bandwidth, 3*(*it), "vector-axpy-float32");
|
||||||
res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
|
first=false;
|
||||||
return res;
|
|
||||||
|
|
||||||
|
//DOT
|
||||||
|
BENCH(DECLARE(viennacl::scalar<T> s(0)); DECLARE(vector_type, x(*it), y(*it)), ARGS(s, viennacl::op_assign(), viennacl::linalg::inner_prod(x,y)),
|
||||||
|
BLAS1_N, bandwidth, 2*(*it), "reduction-float32");
|
||||||
|
|
||||||
|
|
||||||
|
/*---------*/
|
||||||
|
/*--BLAS2--*/
|
||||||
|
/*---------*/
|
||||||
|
|
||||||
|
//N-layout
|
||||||
|
for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
|
||||||
|
{
|
||||||
|
BENCH(DECLARE(matrix_type, A(*Mit,*it)); DECLARE(vector_type, y(*Mit), x(*it)),ARGS(y, viennacl::op_assign(), viennacl::linalg::prod(A,x)), BLAS2_N,
|
||||||
|
bandwidth, (*Mit)*(*it), "row-wise-reductionN-float32");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//T-layout
|
||||||
|
for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
|
||||||
|
{
|
||||||
|
BENCH(DECLARE(matrix_type, A(*it,*Mit)) ; DECLARE(vector_type, y(*Mit), x(*it)), ARGS(y, viennacl::op_assign(), viennacl::linalg::prod(viennacl::trans(A),x)), BLAS2_N,
|
||||||
|
bandwidth, (*Mit)*(*it), "row-wise-reductionT-float32");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*---------*/
|
||||||
|
/*--BLAS3--*/
|
||||||
|
/*---------*/
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
@@ -81,9 +117,7 @@ int main(int argc, char* argv[])
|
|||||||
}
|
}
|
||||||
std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import(argv[1]);
|
std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import(argv[1]);
|
||||||
|
|
||||||
std::vector<int_t> BLAS1_N = create_log_range(1e3, 2e7, 50);
|
|
||||||
|
|
||||||
std::cout << "#Benchmark : BLAS" << std::endl;
|
std::cout << "#Benchmark : BLAS" << std::endl;
|
||||||
std::cout << "#----------------" << std::endl;
|
std::cout << "#----------------" << std::endl;
|
||||||
bench<float>(BLAS1_N, models);
|
bench<float>(models);
|
||||||
}
|
}
|
||||||
|
59
bench/common.hpp
Normal file
59
bench/common.hpp
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#ifndef ATIDLAS_BENCH_COMMON_HPP_
|
||||||
|
#define ATIDLAS_BENCH_COMMON_HPP_
|
||||||
|
|
||||||
|
#include <algorithm>
#include <cmath>
#include <cstddef>
#include "vector"

#include "atidlas/tools/misc.hpp"
|
||||||
|
|
||||||
|
// Rounds N up to the nearest multiple of pad.
//
//   N   : value to round.
//   pad : alignment. A non-positive pad disables padding and N is returned
//         unchanged (the previous code divided by zero for pad == 0).
//
// Returns the smallest multiple of pad that is >= N.
//
// Declared inline so that including this header from more than one
// translation unit does not produce duplicate-definition link errors.
inline int ceil(int N, int pad)
{
  if(pad <= 0)
    return N;
  const int remainder = N % pad;
  if(remainder == 0)
    return N;
  // In C++ the remainder carries the sign of N: for a negative N, dropping the
  // remainder already moves towards zero, i.e. upwards.
  return (remainder > 0) ? N + (pad - remainder) : N - remainder;
}
|
||||||
|
|
||||||
|
std::vector<int> create_log_range(int min, int max, int N, int pad)
|
||||||
|
{
|
||||||
|
std::vector<int> res(N);
|
||||||
|
for(int i = 0 ; i < N ; ++i)
|
||||||
|
{
|
||||||
|
res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
|
||||||
|
res[i] = ceil(res[i], pad);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> create_full_range(int min, int max, int pad)
|
||||||
|
{
|
||||||
|
std::vector<int> N;
|
||||||
|
for(int i = ceil(min, pad) ; i < ceil(max, pad) ; i+=pad)
|
||||||
|
N.push_back(i);
|
||||||
|
return N;
|
||||||
|
}
|
||||||
|
|
||||||
|
// BLAS1 Sizes
|
||||||
|
static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
|
||||||
|
|
||||||
|
// BLAS2 Sizes
|
||||||
|
static const std::vector<int> BLAS2_M = atidlas::tools::make_vector<int>() << 256;
|
||||||
|
static const std::vector<int> BLAS2_N = create_full_range(128, 5000, 64);
|
||||||
|
|
||||||
|
// BLAS3 Sizes
|
||||||
|
static const std::vector<int> BLAS3_N = create_full_range(128, 5000, 64);
|
||||||
|
|
||||||
|
|
||||||
|
// Converts an element count and an elapsed time into a throughput figure.
//
//   T : element type; sizeof(T) is the number of bytes moved per element.
//   N : number of elements transferred.
//   t : elapsed time (presumably seconds, which makes the result GB/s --
//       confirm against the timer used by the caller).
//
// Returns N * sizeof(T) * 1e-9 / t. The division is done in double precision
// and narrowed to float explicitly; no special handling is done for t == 0.
template<class T>
float bandwidth(std::size_t N, float t)
{
  const double gigabytes = static_cast<double>(N) * sizeof(T) * 1e-9;
  return static_cast<float>(gigabytes / t);
}
|
||||||
|
|
||||||
|
// Returns the median of x.
//
//   x : samples, taken by value so the caller's vector is left unsorted.
//
// For an odd count the middle element of the sorted samples is returned; for
// an even count, the two middle elements are added and divided by 2 (integer
// division when T is an integer type). An empty input returns T() instead of
// reading out of bounds as the previous code did.
template<class T>
T median(std::vector<T> x)
{
  if(x.empty())
    return T();

  std::sort(x.begin(), x.end());
  const std::size_t mid = x.size() / 2;
  if(x.size() % 2 == 0)
    return (x[mid - 1] + x[mid]) / 2;
  return x[mid];
}
|
||||||
|
|
||||||
|
#endif
|
@@ -8,8 +8,11 @@
|
|||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
|
|
||||||
|
#include "CL/cl.h"
|
||||||
|
#include "atidlas/execute.hpp"
|
||||||
|
|
||||||
template<class NumericT>
|
template<class NumericT>
|
||||||
void run(std::vector<int> const & BLAS1_N, std::vector<int> const & BLAS2_N, std::vector<int> const & BLAS3_N)
|
void run()
|
||||||
{
|
{
|
||||||
#define FILL_TIMINGS(OP, timings) \
|
#define FILL_TIMINGS(OP, timings) \
|
||||||
{\
|
{\
|
||||||
@@ -74,9 +77,11 @@ void run(std::vector<int> const & BLAS1_N, std::vector<int> const & BLAS2_N, std
|
|||||||
|
|
||||||
std::cout << "#GEMV" << std::endl;
|
std::cout << "#GEMV" << std::endl;
|
||||||
std::cout << "#N Perf" << std::endl;
|
std::cout << "#N Perf" << std::endl;
|
||||||
|
for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
|
||||||
|
{
|
||||||
for(std::vector<int>::const_iterator it = BLAS2_N.begin() ; it != BLAS2_N.end() ; ++it)
|
for(std::vector<int>::const_iterator it = BLAS2_N.begin() ; it != BLAS2_N.end() ; ++it)
|
||||||
{
|
{
|
||||||
int M = *it;
|
int M = *Mit;
|
||||||
int N = *it;
|
int N = *it;
|
||||||
NumericT *x, *y, *A;
|
NumericT *x, *y, *A;
|
||||||
cudaMalloc((void**) &A, M * N * sizeof(NumericT));
|
cudaMalloc((void**) &A, M * N * sizeof(NumericT));
|
||||||
@@ -84,7 +89,7 @@ void run(std::vector<int> const & BLAS1_N, std::vector<int> const & BLAS2_N, std
|
|||||||
cudaMalloc((void**) &y, N * sizeof(NumericT));
|
cudaMalloc((void**) &y, N * sizeof(NumericT));
|
||||||
//Bench
|
//Bench
|
||||||
std::vector<float> timings;
|
std::vector<float> timings;
|
||||||
FILL_TIMINGS(cublasSgemv('T', M, N, 1.0, A, M, x, 1, 1.0, y, 1), timings);
|
FILL_TIMINGS(cublasSgemv('N', M, N, 1.0, A, M, x, 1, 1.0, y, 1), timings);
|
||||||
std::cout << N << " " << (M + N + M*N)*sizeof(NumericT)*1e-9/median(timings) << std::endl;
|
std::cout << N << " " << (M + N + M*N)*sizeof(NumericT)*1e-9/median(timings) << std::endl;
|
||||||
//Free
|
//Free
|
||||||
cudaFree(A);
|
cudaFree(A);
|
||||||
@@ -93,6 +98,7 @@ void run(std::vector<int> const & BLAS1_N, std::vector<int> const & BLAS2_N, std
|
|||||||
}
|
}
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << "#GEMM" << std::endl;
|
std::cout << "#GEMM" << std::endl;
|
||||||
std::cout << "#N Perf" << std::endl;
|
std::cout << "#N Perf" << std::endl;
|
||||||
@@ -124,10 +130,5 @@ int main(int argc, char** argv)
|
|||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
cudaGetDeviceProperties(&prop, 0);
|
cudaGetDeviceProperties(&prop, 0);
|
||||||
std::cout << "Device: " << prop.name << std::endl;
|
std::cout << "Device: " << prop.name << std::endl;
|
||||||
|
run<float>();
|
||||||
std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50);
|
|
||||||
std::vector<int> BLAS2_N = create_log_range(100, 4000, 50);
|
|
||||||
std::vector<int> BLAS3_N = create_log_range(100, 4000, 50);
|
|
||||||
|
|
||||||
run<float>(BLAS1_N, BLAS2_N, BLAS3_N);
|
|
||||||
}
|
}
|
||||||
|
@@ -1,7 +1,14 @@
|
|||||||
set logscale x
|
|
||||||
|
|
||||||
set terminal pdf
|
set terminal pdf
|
||||||
set output 'saxpy.pdf'
|
set output 'bench.pdf'
|
||||||
plot "out.dat" i 0 using 1:2 with lines title 'Naive', \
|
|
||||||
"out.dat" i 0 using 1:3 with lines title 'Model', \
|
set xlabel 'N'
|
||||||
"out.dat" i 0 using 1:4 with lines title 'Optimal'
|
set ylabel 'Bandwidth (GB/s)'
|
||||||
|
set key top left
|
||||||
|
stats "out.dat" nooutput
|
||||||
|
|
||||||
|
set logscale x
|
||||||
|
do for [i=1:STATS_blocks]{
|
||||||
|
plot "out.dat" index (i-1) using 1:2 with lines title 'Naive', \
|
||||||
|
"out.dat" index (i-1) using 1:3 with lines title 'Model', \
|
||||||
|
"out.dat" index (i-1) using 1:4 with lines title 'Optimal'
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user