diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index e648f092c..8ad73fb46 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -1,12 +1,15 @@
+set(CMAKE_BUILD_TYPE Release)
+
+find_package(CUDA)
+
 foreach(PROG blas)
   add_executable(${PROG}-bench ${PROG}.cpp)
   target_link_libraries(${PROG}-bench ${OPENCL_LIBRARIES})
   set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -Wall -Wextra")
-endforeach(PROG)
-
-# packages
-find_package(CUDA)
-if(CUDA_FOUND)
-  cuda_add_executable(cublas-bench "cublas.cu")
-  target_link_libraries(cublas-bench "cublas")
-endif()
+  if(CUDA_FOUND)
+    set(CUPROG cu${PROG})
+    cuda_add_executable(${CUPROG}-bench ${CUPROG}.cu OPTIONS "-DVIENNACL_WITH_OPENCL")
+    target_link_libraries(${CUPROG}-bench ${CUPROG} OpenCL)
+  endif()
+
+endforeach(PROG)
diff --git a/bench/blas.cpp b/bench/blas.cpp
index 5e380c579..687525149 100644
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -1,26 +1,30 @@
+//#define VIENNACL_DEBUG_ALL
+
 #include "viennacl/matrix.hpp"
 #include "viennacl/vector.hpp"
 #include "viennacl/tools/timer.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/scheduler/execute.hpp"
 
 #include "atidlas/tools/misc.hpp"
 #include "atidlas/model/import.hpp"
 #include "atidlas/model/model.hpp"
 
+#include "common.hpp"
+
 #include <cmath>
 #include <iostream>
 
 namespace ad = atidlas;
 typedef atidlas::atidlas_int_t int_t;
 
 template<class T>
-float bandwidth(std::size_t N, float t)
+void bench(std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
 {
-  return N * sizeof(T) * 1e-9 / t;
-}
+  typedef viennacl::matrix<T> matrix_type;
+  typedef viennacl::vector<T> vector_type;
 
-template<class T>
-void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
-{
   viennacl::tools::timer timer;
   float total_time = 0;
   std::vector<float> times;
@@ -30,29 +34,61 @@ void bench(std::vector<int_t> BLAS1_N, std::map<std::string, ad::tools::shared_ptr<ad::model> > & models)
 #define BENCH(declarations, statement_op, sizes, measure, N, key)\
-  std::cout << "#" << key << std::endl;\
-  for(std::vector<int_t>::const_iterator it = sizes.begin() ; it != sizes.end() ; ++it)\
-  {\
-    declarations;\
-    viennacl::scheduler::statement statement(statement_op);\
-    BENCHMARK(y = x + y, time_viennacl);\
-    BENCHMARK(models[key]->execute(statement), time_model);\
-    BENCHMARK(models[key]->execute(statement, true), time_unique_kernel);\
-    models[key]->tune(statement);\
-    BENCHMARK(models[key]->execute(statement), time_opt);\
-    std::cout << *it << " " << measure(N, time_viennacl) << " " << measure(N, time_unique_kernel) << " " << measure(N, time_model) << " " << measure(N, time_opt) << std::endl;\
-  }\
+  if(models.find(key)!=models.end())\
+  {\
+    if(!first)\
+    {\
+      std::cout << std::endl;\
+      std::cout << std::endl;\
+    }\
+    std::cout << "#" << key << std::endl;\
+    for(std::vector<int_t>::const_iterator it = sizes.begin() ; it != sizes.end() ; ++it)\
+    {\
+      declarations;\
+      viennacl::scheduler::statement statement(statement_op);\
+      BENCHMARK(models.at(key)->execute(statement), time_model);\
+      BENCHMARK(models[key]->execute(statement, true), time_unique_kernel);\
+      models[key]->tune(statement);\
+      BENCHMARK(models[key]->execute(statement), time_opt);\
+      std::cout << *it << " " << measure(N, time_unique_kernel) << " " << measure(N, time_model) << " " << measure(N, time_opt) << std::endl;\
+    }\
+  }\
 
 #define DECLARE(type, ...) type __VA_ARGS__
 #define ARGS(...) __VA_ARGS__
 
-  BENCH(DECLARE(viennacl::vector<T>, x(*it), y(*it)), ARGS(y, viennacl::op_assign(), x + y), BLAS1_N, bandwidth<T>, 3*(*it), "vector-axpy-float32");
-  std::cout << std::endl;
-  std::cout << std::endl;
-}
+  /*---------*/
+  /*--BLAS1--*/
+  /*---------*/
 
-std::vector<int_t> create_log_range(int_t min, int_t max, int_t N)
-{
-  std::vector<int_t> res(N);
-  for(int_t i = 0 ; i < N ; ++i)
-    res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
-  return res;
+  //AXPY
+  bool first = true;
+  BENCH(DECLARE(viennacl::vector<T>, x(*it), y(*it)), ARGS(y, viennacl::op_assign(), x + y),
+        BLAS1_N, bandwidth<T>, 3*(*it), "vector-axpy-float32");
+  first = false;
+
+
+  //DOT
+  BENCH(DECLARE(viennacl::scalar<T> s(0)); DECLARE(vector_type, x(*it), y(*it)), ARGS(s, viennacl::op_assign(), viennacl::linalg::inner_prod(x,y)),
+        BLAS1_N, bandwidth<T>, 2*(*it), "reduction-float32");
+
+  /*---------*/
+  /*--BLAS2--*/
+  /*---------*/
+
+  //N-layout
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+  {
+    BENCH(DECLARE(matrix_type, A(*Mit,*it)); DECLARE(vector_type, y(*Mit), x(*it)), ARGS(y, viennacl::op_assign(), viennacl::linalg::prod(A,x)),
+          BLAS2_N, bandwidth<T>, (*Mit)*(*it), "row-wise-reductionN-float32");
+  }
+
+  //T-layout
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+  {
+    BENCH(DECLARE(matrix_type, A(*it,*Mit)); DECLARE(vector_type, y(*Mit), x(*it)), ARGS(y, viennacl::op_assign(), viennacl::linalg::prod(viennacl::trans(A),x)),
+          BLAS2_N, bandwidth<T>, (*Mit)*(*it), "row-wise-reductionT-float32");
+  }
+
+  /*---------*/
+  /*--BLAS3--*/
+  /*---------*/
 }
@@ -81,9 +117,7 @@ int main(int argc, char* argv[])
   }
 
   std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import(argv[1]);
-  std::vector<int_t> BLAS1_N = create_log_range(1e3, 2e7, 50);
-
   std::cout << "#Benchmark : BLAS" << std::endl;
   std::cout << "#----------------" << std::endl;
-  bench<float>(BLAS1_N, models);
+  bench<float>(models);
 }
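A note on the macro plumbing in blas.cpp, since it is easy to misread: DECLARE and ARGS exist only to smuggle comma-separated token lists through a single macro argument, so that BENCH can splice variable declarations and a statement constructor into its loop body. A minimal, self-contained illustration of that pattern (hypothetical names, not code from this patch):

#include <iostream>

#define DECLARE(type, ...) type __VA_ARGS__
#define ARGS(...) __VA_ARGS__

// RUN mirrors the shape of BENCH above: one argument carries the
// declarations, the other carries the comma-separated call arguments.
#define RUN(declarations, call_args) \
  { \
    declarations; \
    print(call_args); \
  }

void print(int a, int b) { std::cout << a + b << std::endl; }

int main()
{
  // Expands to { int x(1), y(2); print(x, y); } and prints 3.
  RUN(DECLARE(int, x(1), y(2)), ARGS(x, y));
}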
diff --git a/bench/common.hpp b/bench/common.hpp
new file mode 100644
index 000000000..3971f1cfb
--- /dev/null
+++ b/bench/common.hpp
@@ -0,0 +1,62 @@
+#ifndef ATIDLAS_BENCH_COMMON_HPP_
+#define ATIDLAS_BENCH_COMMON_HPP_
+
+#include <vector>
+#include <cmath>
+#include <algorithm>
+#include <cstddef>
+
+#include "atidlas/tools/misc.hpp"
+
+int ceil(int N, int pad)
+{
+  return (N%pad==0)?N:(N+pad-1)/pad*pad;
+}
+
+std::vector<int> create_log_range(int min, int max, int N, int pad)
+{
+  std::vector<int> res(N);
+  for(int i = 0 ; i < N ; ++i)
+  {
+    res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
+    res[i] = ceil(res[i], pad);
+  }
+  return res;
+}
+
+std::vector<int> create_full_range(int min, int max, int pad)
+{
+  std::vector<int> N;
+  for(int i = ceil(min, pad) ; i < ceil(max, pad) ; i+=pad)
+    N.push_back(i);
+  return N;
+}
+
+// BLAS1 Sizes
+static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
+
+// BLAS2 Sizes
+static const std::vector<int> BLAS2_M = atidlas::tools::make_vector<int>() << 256;
+static const std::vector<int> BLAS2_N = create_full_range(128, 5000, 64);
+
+// BLAS3 Sizes
+static const std::vector<int> BLAS3_N = create_full_range(128, 5000, 64);
+
+template<class T>
+float bandwidth(std::size_t N, float t)
+{
+  return N * sizeof(T) * 1e-9 / t;
+}
+
+template<class T>
+T median(std::vector<T> x)
+{
+  size_t size = x.size();
+  std::sort(x.begin(), x.end());
+  if (size % 2 == 0)
+    return (x[size / 2 - 1] + x[size / 2]) / 2;
+  else
+    return x[size / 2];
+}
+
+#endif
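To sanity-check the helpers in common.hpp, here is a hypothetical standalone driver (it assumes the atidlas headers that common.hpp pulls in are on the include path; none of this is part of the patch):

#include <iostream>
#include "common.hpp"

int main()
{
  // 50 logarithmically spaced sizes in [1e3, 2e7], each rounded up to a multiple of 64.
  std::vector<int> logN = create_log_range(1e3, 2e7, 50, 64);
  // Every multiple of 64 in [128, 5000).
  std::vector<int> fullN = create_full_range(128, 5000, 64);
  std::cout << logN.front() << " ... " << logN.back()
            << " (" << fullN.size() << " dense sizes)" << std::endl;

  // median() + bandwidth<T>() turn raw timings into a GB/s figure:
  // an AXPY of size N moves 3N floats (read x, read y, write y).
  std::vector<float> timings;
  timings.push_back(1.2e-3f);
  timings.push_back(0.9e-3f);
  timings.push_back(1.0e-3f);
  int N = 1000000;
  std::cout << bandwidth<float>(3*N, median(timings)) << " GB/s" << std::endl;
}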
diff --git a/bench/cublas.cu b/bench/cublas.cu
index 09ffec4b2..ef0a8f7f3 100644
--- a/bench/cublas.cu
+++ b/bench/cublas.cu
@@ -8,8 +8,11 @@
 
 #include "common.hpp"
 
+#include "CL/cl.h"
+#include "atidlas/execute.hpp"
+
 template<class NumericT>
-void run(std::vector<int> const & BLAS1_N, std::vector<int> const & BLAS2_N, std::vector<int> const & BLAS3_N)
+void run()
 {
 #define FILL_TIMINGS(OP, timings) \
 {\
@@ -74,25 +77,28 @@ void run()
   std::cout << "#GEMV" << std::endl;
   std::cout << "#N Perf" << std::endl;
 
-  for(std::vector<int>::const_iterator it = BLAS2_N.begin() ; it != BLAS2_N.end() ; ++it)
+  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
   {
-    int M = *it;
-    int N = *it;
-    NumericT *x, *y, *A;
-    cudaMalloc((void**) &A, M * N * sizeof(NumericT));
-    cudaMalloc((void**) &x, M * sizeof(NumericT));
-    cudaMalloc((void**) &y, N * sizeof(NumericT));
-    //Bench
-    std::vector<float> timings;
-    FILL_TIMINGS(cublasSgemv('T', M, N, 1.0, A, M, x, 1, 1.0, y, 1), timings);
-    std::cout << N << " " << (M + N + M*N)*sizeof(NumericT)*1e-9/median(timings) << std::endl;
-    //Free
-    cudaFree(A);
-    cudaFree(x);
-    cudaFree(y);
+    for(std::vector<int>::const_iterator it = BLAS2_N.begin() ; it != BLAS2_N.end() ; ++it)
+    {
+      int M = *Mit;
+      int N = *it;
+      NumericT *x, *y, *A;
+      cudaMalloc((void**) &A, M * N * sizeof(NumericT));
+      cudaMalloc((void**) &x, N * sizeof(NumericT));
+      cudaMalloc((void**) &y, M * sizeof(NumericT));
+      //Bench
+      std::vector<float> timings;
+      FILL_TIMINGS(cublasSgemv('N', M, N, 1.0, A, M, x, 1, 1.0, y, 1), timings);
+      std::cout << N << " " << (M + N + M*N)*sizeof(NumericT)*1e-9/median(timings) << std::endl;
+      //Free
+      cudaFree(A);
+      cudaFree(x);
+      cudaFree(y);
+    }
+    std::cout << std::endl;
+    std::cout << std::endl;
   }
-  std::cout << std::endl;
-  std::cout << std::endl;
 
   std::cout << "#GEMM" << std::endl;
   std::cout << "#N Perf" << std::endl;
@@ -124,10 +130,5 @@ int main(int argc, char** argv)
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);
   std::cout << "Device: " << prop.name << std::endl;
-
-  std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50);
-  std::vector<int> BLAS2_N = create_log_range(100, 4000, 50);
-  std::vector<int> BLAS3_N = create_log_range(100, 4000, 50);
-
-  run<float>(BLAS1_N, BLAS2_N, BLAS3_N);
+  run<float>();
 }
diff --git a/bench/plot.gnuplot b/bench/plot.gnuplot
index eac214b5f..c0b59ddb0 100644
--- a/bench/plot.gnuplot
+++ b/bench/plot.gnuplot
@@ -1,7 +1,14 @@
-set logscale x
-
 set terminal pdf
-set output 'saxpy.pdf'
-plot "out.dat" i 0 using 1:2 with lines title 'Naive', \
-     "out.dat" i 0 using 1:3 with lines title 'Model', \
-     "out.dat" i 0 using 1:4 with lines title 'Optimal'
+set output 'bench.pdf'
+
+set xlabel 'N'
+set ylabel 'Bandwidth (GB/s)'
+set key top left
+stats "out.dat" nooutput
+
+set logscale x
+do for [i=1:STATS_blocks]{
+  plot "out.dat" index (i-1) using 1:2 with lines title 'Naive', \
+       "out.dat" index (i-1) using 1:3 with lines title 'Model', \
+       "out.dat" index (i-1) using 1:4 with lines title 'Optimal'
+}
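The body of FILL_TIMINGS is elided by the context lines above; for orientation, one plausible shape for such a harness is to synchronize, time each call, repeat, and let median() discard outliers. A sketch under those assumptions (hypothetical helper, not the macro's actual definition):

#include <vector>
#include <chrono>
#include <cuda_runtime.h>

// Run op() nruns times, synchronizing around each call, and collect
// per-call wall-clock durations (in seconds) into timings.
template<class OP>
void fill_timings(OP op, std::vector<float> & timings, int nruns = 10)
{
  for(int r = 0 ; r < nruns ; ++r)
  {
    cudaDeviceSynchronize();  // drain any pending work
    std::chrono::high_resolution_clock::time_point t0 = std::chrono::high_resolution_clock::now();
    op();                     // e.g. [&]{ cublasSgemv('N', M, N, 1.0, A, M, x, 1, 1.0, y, 1); }
    cudaDeviceSynchronize();  // wait for the asynchronous call to finish
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    timings.push_back(std::chrono::duration<float>(t1 - t0).count());
  }
}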