Reducing overhead; reverted custom CL/ header because CL/cl.hpp was buggy.

2015-01-28 22:07:09 -05:00
parent 1246fbe9a8
commit c7665021d1
21 changed files with 10317 additions and 1474 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -40,7 +40,7 @@ void bench(ad::numeric_type dtype)
    total_time += times.back();\
  }\
  float tres = median(times);\
-  std::cout << " " << PERF << std::flush;\
+  std::cout << " " << tres << std::flush;\
  }

 #define CL_BENCHMARK(OP, PERF) BENCHMARK(OP, PERF, ad::cl_ext::synchronize(ad::cl_ext::default_context()))
@@ -86,89 +86,89 @@ void bench(ad::numeric_type dtype)
  }
  std::cout << "\n\n" << std::flush;

-  std::cout << "#DOT" << std::endl;
-  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
-  {
-    int_t N = *it;
-    std::cout << N;
-    /* ATIDLAS */
-    ad::array x(N, dtype), y(N, dtype);
-    ad::array scratch(N, dtype);
-    ad::scalar s(dtype);
-    CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
-    /* clAmdBlas */
-#ifdef BENCH_CLAMDBLAS
-    CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
-#endif
-    /* BLAS */
-#ifdef BENCH_CBLAS
-    std::vector<float> cx(N), cy(N);
-    ad::copy(x, cx);
-    ad::copy(y, cy);
-    CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
-#endif
-    std::cout << std::endl;
-  }
-  std::cout << "\n\n" << std::flush;
-
-  /*---------*/
-  /*--BLAS2--*/
-  /*---------*/
-  //T-layout
-  std::cout << "#GEMV-T" << std::endl;
-  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
-    for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
-    {
-      int_t M = *Mit;
-      int_t N = *Nit;
-      std::cout << M << "," << N;
-      /* ATIDLAS */
-      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
-      CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
-      /* clAmdBlas */
-  #ifdef BENCH_CLAMDBLAS
-      CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
-  #endif
-      /* BLAS */
-  #ifdef BENCH_CBLAS
-      std::vector<float> cA(N*M), cx(N), cy(M);
-      ad::copy(x, cx);
-      ad::copy(y, cy);
-      ad::copy(A, cA);
-      CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
-  #endif
-      std::cout << std::endl;
-    }
-    std::cout << "\n\n" << std::flush;
+//  std::cout << "#DOT" << std::endl;
+//  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
+//  {
+//    int_t N = *it;
+//    std::cout << N;
+//    /* ATIDLAS */
+//    ad::array x(N, dtype), y(N, dtype);
+//    ad::array scratch(N, dtype);
+//    ad::scalar s(dtype);
+//    CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
+//    /* clAmdBlas */
+//#ifdef BENCH_CLAMDBLAS
+//    CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
+//#endif
+//    /* BLAS */
+//#ifdef BENCH_CBLAS
+//    std::vector<float> cx(N), cy(N);
+//    ad::copy(x, cx);
+//    ad::copy(y, cy);
+//    CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
+//#endif
+//    std::cout << std::endl;
+//  }
+//  std::cout << "\n\n" << std::flush;

 //  /*---------*/
-//  /*--BLAS3--*/
+//  /*--BLAS2--*/
 //  /*---------*/
-    std::cout << "#GEMM-NT" << std::endl;
-    for(std::vector<int_t>::const_iterator Mit = BLAS3_M.begin() ; Mit != BLAS3_M.end() ; ++Mit)
-    for(std::vector<int_t>::const_iterator Nit = BLAS3_N.begin() ; Nit != BLAS3_N.end() ; ++Nit)
-    for(std::vector<int_t>::const_iterator Kit = BLAS3_K.begin() ; Kit != BLAS3_K.end() ; ++Kit)
-    {
-      int_t M = *Kit, N = *Kit, K = *Kit;
-      std::cout << M << "," << N << "," << K;
-      /* ATIDLAS */
-      ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
-      CL_BENCHMARK(C = dot(A,trans(B)), gflops((double)2*M*N*K, tres));
-      /* clAmdBlas */
-  #ifdef BENCH_CLAMDBLAS
-      CL_BENCHMARK(clAmdBlasSgemm(clAmdBlasColumnMajor, clAmdBlasNoTrans, clAmdBlasTrans, M, N, K, 1, A.data()(), A.ld(), B.data()(), B.ld(),
-                               0, C.data()(), C.ld(), 1, &ad::cl_ext::get_queue(C.context(), 0)(),0, NULL, NULL), gflops((double)2*M*N*K, tres))
-  #endif
-      /* BLAS */
-  #ifdef BENCH_CBLAS
-      std::vector<float> cC(M*N), cA(M*K), cB(N*K);
-      ad::copy(C, cC);
-      ad::copy(A, cA);
-      ad::copy(B, cB);
-      CPU_BENCHMARK(cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, cA.data(), M, cB.data(), N, 1, cC.data(), M), gflops((double)2*M*N*K, tres));
-  #endif
-      std::cout << std::endl;
-    }
+//  //T-layout
+//  std::cout << "#GEMV-T" << std::endl;
+//  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+//    for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
+//    {
+//      int_t M = *Mit;
+//      int_t N = *Nit;
+//      std::cout << M << "," << N;
+//      /* ATIDLAS */
+//      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
+//      CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
+//      /* clAmdBlas */
+//  #ifdef BENCH_CLAMDBLAS
+//      CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
+//  #endif
+//      /* BLAS */
+//  #ifdef BENCH_CBLAS
+//      std::vector<float> cA(N*M), cx(N), cy(M);
+//      ad::copy(x, cx);
+//      ad::copy(y, cy);
+//      ad::copy(A, cA);
+//      CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
+//  #endif
+//      std::cout << std::endl;
+//    }
+//    std::cout << "\n\n" << std::flush;
+
+////  /*---------*/
+////  /*--BLAS3--*/
+////  /*---------*/
+//    std::cout << "#GEMM-NT" << std::endl;
+//    for(std::vector<int_t>::const_iterator Mit = BLAS3_M.begin() ; Mit != BLAS3_M.end() ; ++Mit)
+//    for(std::vector<int_t>::const_iterator Nit = BLAS3_N.begin() ; Nit != BLAS3_N.end() ; ++Nit)
+//    for(std::vector<int_t>::const_iterator Kit = BLAS3_K.begin() ; Kit != BLAS3_K.end() ; ++Kit)
+//    {
+//      int_t M = *Kit, N = *Kit, K = *Kit;
+//      std::cout << M << "," << N << "," << K;
+//      /* ATIDLAS */
+//      ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
+//      CL_BENCHMARK(C = dot(A,trans(B)), gflops((double)2*M*N*K, tres));
+//      /* clAmdBlas */
+//  #ifdef BENCH_CLAMDBLAS
+//      CL_BENCHMARK(clAmdBlasSgemm(clAmdBlasColumnMajor, clAmdBlasNoTrans, clAmdBlasTrans, M, N, K, 1, A.data()(), A.ld(), B.data()(), B.ld(),
+//                               0, C.data()(), C.ld(), 1, &ad::cl_ext::get_queue(C.context(), 0)(),0, NULL, NULL), gflops((double)2*M*N*K, tres))
+//  #endif
+//      /* BLAS */
+//  #ifdef BENCH_CBLAS
+//      std::vector<float> cC(M*N), cA(M*K), cB(N*K);
+//      ad::copy(C, cC);
+//      ad::copy(A, cA);
+//      ad::copy(B, cB);
+//      CPU_BENCHMARK(cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, cA.data(), M, cB.data(), N, 1, cC.data(), M), gflops((double)2*M*N*K, tres));
+//  #endif
+//      std::cout << std::endl;
+//    }

 }