Now using events to time autotuning

2015-02-06 22:11:03 -05:00
parent 385f007c0b
commit b768e913c9
7 changed files with 163 additions and 95 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -30,7 +30,6 @@ void bench(ad::numeric_type dtype)
  while(total_time*1e-9 < 1e-1){\
    cl::Event event;\
    OP;\
-    queue.flush();\
    queue.finish();\
    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
    total_time+=times.back();\
@@ -70,96 +69,114 @@ void bench(ad::numeric_type dtype)
  std::cout << " " << PERF << std::flush;\
  }

-  /*---------*/
-  /*--BLAS1--*/
-  /*---------*/
-  std::cout << "#AXPY" << std::endl;
-  for(auto N : BLAS1_N)
-  {
-    std::cout << N;
-    ad::array x(N, dtype), y(N, dtype);
-    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
-    /* ATIDLAS */
-    y = x + y; queue.flush(); queue.finish();
-    BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
-    /* clAmdBlas */
-#ifdef BENCH_CLAMDBLAS
-    BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
-#endif
-    /* BLAS */
-#ifdef BENCH_CBLAS
-    std::vector<float> cx(N), cy(N);
-    ad::copy(x, cx);
-    ad::copy(y, cy);
-    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
-#endif
-    /* CuBLAS */
-#ifdef BENCH_CUBLAS
-    T *cux, *cuy;
-    cudaMalloc((void**) &cux, N * sizeof(T));
-    cudaMalloc((void**) &cuy, N * sizeof(T));
-    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
-    cudaFree(cux);
-    cudaFree(cuy);
-#endif
-    std::cout << std::endl;
-  }
-  std::cout << "\n\n" << std::flush;
-
-//  std::cout << "#DOT" << std::endl;
-//  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
+//  /*---------*/
+//  /*--BLAS1--*/
+//  /*---------*/
+//  std::cout << "#AXPY" << std::endl;
+//  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
 //  {
-//    int_t N = *it;
 //    std::cout << N;
-//    /* ATIDLAS */
 //    ad::array x(N, dtype), y(N, dtype);
-//    ad::array scratch(N, dtype);
-//    ad::scalar s(dtype);
-//    CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
+//    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
+//    /* ATIDLAS */
+//    y = x + y; queue.flush(); queue.finish();
+//    BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
 //    /* clAmdBlas */
 //#ifdef BENCH_CLAMDBLAS
-//    CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
+//    BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 //#endif
 //    /* BLAS */
 //#ifdef BENCH_CBLAS
 //    std::vector<float> cx(N), cy(N);
 //    ad::copy(x, cx);
 //    ad::copy(y, cy);
-//    CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
+//    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
+//#endif
+//    /* CuBLAS */
+//#ifdef BENCH_CUBLAS
+//    T *cux, *cuy;
+//    cudaMalloc((void**) &cux, N * sizeof(T));
+//    cudaMalloc((void**) &cuy, N * sizeof(T));
+//    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
+//    cudaFree(cux);
+//    cudaFree(cuy);
 //#endif
 //    std::cout << std::endl;
 //  }
 //  std::cout << "\n\n" << std::flush;

-//  /*---------*/
-//  /*--BLAS2--*/
-//  /*---------*/
-//  //T-layout
-//  std::cout << "#GEMV-T" << std::endl;
-//  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
-//    for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
-//    {
-//      int_t M = *Mit;
-//      int_t N = *Nit;
-//      std::cout << M << "," << N;
-//      /* ATIDLAS */
-//      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
-//      CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
-//      /* clAmdBlas */
-//  #ifdef BENCH_CLAMDBLAS
-//      CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
-//  #endif
-//      /* BLAS */
-//  #ifdef BENCH_CBLAS
-//      std::vector<float> cA(N*M), cx(N), cy(M);
-//      ad::copy(x, cx);
-//      ad::copy(y, cy);
-//      ad::copy(A, cA);
-//      CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
-//  #endif
-//      std::cout << std::endl;
-//    }
-//    std::cout << "\n\n" << std::flush;
+//  std::cout << "#DOT" << std::endl;
+//  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
+//  {
+//    std::cout << N;
+//    /* ATIDLAS */
+//    ad::array x(N, dtype), y(N, dtype);
+//    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
+//    ad::array scratch(N, dtype);
+//    ad::scalar s(dtype);
+//    s = dot(x,y); queue.flush(); queue.finish();
+//    BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
+//    /* clAmdBlas */
+//#ifdef BENCH_CLAMDBLAS
+//    BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
+//#endif
+//    /* BLAS */
+//#ifdef BENCH_CBLAS
+//    std::vector<float> cx(N), cy(N);
+//    ad::copy(x, cx);
+//    ad::copy(y, cy);
+//    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+//#endif
+//#ifdef BENCH_CUBLAS
+//    T *cux, *cuy;
+//    T result;
+//    cudaMalloc((void**) &cux, N * sizeof(T));
+//    cudaMalloc((void**) &cuy, N * sizeof(T));
+//    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
+//    cudaFree(cux);
+//    cudaFree(cuy);
+//#endif
+//    std::cout << std::endl;
+//  }
+//  std::cout << "\n\n" << std::flush;
+
+  /*---------*/
+  /*--BLAS2--*/
+  /*---------*/
+  //T-layout
+  std::cout << "#GEMV-T" << std::endl;
+  for(int_t N: std::vector<int>{64})
+    for(int_t M: create_full_range(128, 10000, 64))
+    {
+      std::cout << M << "," << N;
+      /* ATIDLAS */
+      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
+      cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
+      y = dot(trans(A),x); queue.flush(); queue.finish();
+      BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
+  #ifdef BENCH_CLAMDBLAS
+      BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
+  #endif
+  #ifdef BENCH_CBLAS
+      std::vector<float> cA(N*M), cx(N), cy(M);
+      ad::copy(x, cx);
+      ad::copy(y, cy);
+      ad::copy(A, cA);
+      BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
+  #endif
+  #ifdef BENCH_CUBLAS
+      T *cuA, *cux, *cuy;
+      cudaMalloc((void**) &cuA, N * M * sizeof(T));
+      cudaMalloc((void**) &cux, N * sizeof(T));
+      cudaMalloc((void**) &cuy, M * sizeof(T));
+      BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
+      cudaFree(cuA);
+      cudaFree(cux);
+      cudaFree(cuy);
+  #endif
+      std::cout << std::endl;
+    }
+    std::cout << "\n\n" << std::flush;

 ////  /*---------*/
 ////  /*--BLAS3--*/