Rearranged benchmarking script

2015-02-05 23:11:16 -05:00
parent e214927b16
commit 8f8b01938b
3 changed files with 60 additions and 27 deletions
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -23,6 +23,13 @@ if(MKL_FOUND)
    set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CBLAS")
    include_directories(${MKL_INCLUDE_DIR})
    set(BLAS_LIBS ${BLAS_LIBS}  ${MKL_LIBRARIES} )
 else()
    find_package(OpenBlas)
    if(OPENBLAS_FOUND)
        set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CBLAS")
        include_directories(${OPENBLAS_INCLUDE_DIR})
        set(BLAS_LIBS ${BLAS_LIBS}  ${OPENBLAS_LIBRARIES} )
    endif()
 endif()
 string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -1,6 +1,5 @@
 #include "atidlas/array.h"
 #include "atidlas/symbolic/execute.h"
 #include "atidlas/tools/timer.hpp"
 #include "common.hpp"
 #ifdef BENCH_CLAMDBLAS
  #include "clAmdBlas.h"
@@ -14,7 +13,7 @@
 #include <iomanip>
 #include <stdlib.h>
 #include <cmath>
-
+#include <chrono>
 namespace ad = atidlas;
 typedef ad::int_t int_t;
@@ -22,26 +21,55 @@ typedef ad::int_t int_t;
 template<class T>
 void bench(ad::numeric_type dtype)
 {
  float total_time = 0;
  std::vector<double> times;
  ad::tools::timer timer;
  unsigned int dtsize = ad::size_of(dtype);
-#define BENCHMARK(OP, TIME, PERF) \
+#define BENCHMARK_OPENCL(OP, PERF) \
  {\
-  total_time = 0;\
+  std::vector<long> times;\
-  while(total_time < 1e-3){\
+  double total_time = 0;\
  while(total_time*1e-9 < 1e-1){\
    cl::Event event;\
    OP;\
-    times.push_back(TIME);\
+    queue.flush();\
    queue.finish();\
    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
    total_time+=times.back();\
  }\
-  float tres = median(times);\
+  double t = median(times);\
-  std::cout << " " << tres << std::flush;\
+  std::cout << " " << PERF << std::flush;\
  }
 #define BENCHMARK_HOST(OP, PERF) \
  {\
  std::vector<int> cache_flusher(10000000, 0);\
  auto start = std::chrono::steady_clock::now();\
  OP;\
  auto end = std::chrono::steady_clock::now();\
  double t = std::chrono::duration<double, std::nano>(end - start).count();\
  std::cout << " " << PERF << std::flush;\
  }
 #define BENCHMARK_CUDA(OP, PERF) \
  {\
  std::vector<long> times;\
  double total_time = 0;\
  double time;\
  while(total_time*1e-3 < 1e-1){\
    cudaEvent_t start, stop;\
    cudaEventCreate(&start);\
    cudaEventCreate(&stop);\
    cudaEventRecord(start);\
    OP;\
    cudaEventRecord(stop);\
    cudaEventSynchronize();\
    cudaEventElapsedTime(&time, start, stop);\
    times.push_back(time);\
    total_time+=time;\
  }\
  double t = 1e-6*median(times);\
  std::cout << " " << PERF << std::flush;\
  }
 #define CL_TIME
 #define CL_SYNC queue.flush(); queue.finish()
  /*---------*/
  /*--BLAS1--*/
  /*---------*/
@@ -49,30 +77,28 @@ void bench(ad::numeric_type dtype)
  for(auto N : BLAS1_N)
  {
    std::cout << N;
    /* ATIDLAS */
    ad::array x(N, dtype), y(N, dtype);
    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
-    cl::Event event;
+    /* ATIDLAS */
-    y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event));
+    y = x + y; queue.flush(); queue.finish();
-    queue.flush(); queue.finish();
+    BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
-    std::cout << " " << bandwidth(3*N, 1e-9*(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()), dtsize) << std::flush;
+    /* clAmdBlas */
-//    /* clAmdBlas */
+#ifdef BENCH_CLAMDBLAS
-//#ifdef BENCH_CLAMDBLAS
+    BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
-//    BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, &event(), NULL); CL_SYNC,  CL_TIME, bandwidth(3*N, tres, dtsize))
+#endif
 //#endif
    /* BLAS */
 #ifdef BENCH_CBLAS
    std::vector<float> cx(N), cy(N);
    ad::copy(x, cx);
    ad::copy(y, cy);
-    BENCHMARK(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), bandwidth(3*N, tres, dtsize));
+    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 #endif
    /* CuBLAS */
 #ifdef BENCH_CUBLAS
    T *cux, *cuy;
    cudaMalloc((void**) &cux, N * sizeof(T));
    cudaMalloc((void**) &cuy, N * sizeof(T));
-    BENCHMARK(cublasSaxpy(N, 2, cux, 1, cuy, 1); cudaThreadSynchronize();, bandwidth(3*N, tres, dtsize))
+    BENCHMARK(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
    cudaFree(cux);
    cudaFree(cuy);
 #endif
--- a/bench/common.hpp
+++ b/bench/common.hpp
@@ -57,7 +57,7 @@ static const std::vector<int> BLAS3_N = make_vector<int>() << 128;
 static const std::vector<int> BLAS3_K = create_full_range(128, 5000, 64);
-float bandwidth(std::size_t N, float t, unsigned int dtsize)
+double bandwidth(std::size_t N, double t, unsigned int dtsize)
 { return N * dtsize * 1e-9 / t; }
 double gflops(double nops, double t)