Now using events to time autotuning
bench/blas.cpp
@@ -30,7 +30,6 @@ void bench(ad::numeric_type dtype)
  while(total_time*1e-9 < 1e-1){\
    cl::Event event;\
    OP;\
    queue.flush();\
    queue.finish();\
    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
    total_time+=times.back();\
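For context on the change above: the elapsed time now comes from the OpenCL event's profiling counters (device-side, in nanoseconds) rather than from host timers. A minimal standalone sketch of the same measurement, assuming a command queue created with CL_QUEUE_PROFILING_ENABLE; the helper name time_kernel_ns is illustrative and not part of the repository.

#include <CL/cl.hpp>

// Device-side time of one enqueue, from submission to completion, in
// nanoseconds -- the same two counters the BENCHMARK_OPENCL macro queries.
double time_kernel_ns(cl::CommandQueue & queue, cl::Kernel & kernel, cl::NDRange global)
{
  cl::Event event;
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, cl::NullRange, NULL, &event);
  queue.flush();
  queue.finish();  // profiling info is only valid once the command has completed
  cl_ulong submit = event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();
  cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return static_cast<double>(end - submit);
}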
@@ -70,96 +69,114 @@ void bench(ad::numeric_type dtype)
std::cout << " " << PERF << std::flush;\
}

/*---------*/
/*--BLAS1--*/
/*---------*/
std::cout << "#AXPY" << std::endl;
for(auto N : BLAS1_N)
{
std::cout << N;
ad::array x(N, dtype), y(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
/* ATIDLAS */
y = x + y; queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
/* clAmdBlas */
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
T *cux, *cuy;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;

// std::cout << "#DOT" << std::endl;
// for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
// /*---------*/
// /*--BLAS1--*/
// /*---------*/
// std::cout << "#AXPY" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// int_t N = *it;
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
// cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// /* ATIDLAS */
// y = x + y; queue.flush(); queue.finish();
// BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
// BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
//#endif
// /* CuBLAS */
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;

// /*---------*/
// /*--BLAS2--*/
// /*---------*/
// //T-layout
// std::cout << "#GEMV-T" << std::endl;
// for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
// for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
// {
// int_t M = *Mit;
// int_t N = *Nit;
// std::cout << M << "," << N;
// /* ATIDLAS */
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
// CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
// /* clAmdBlas */
// #ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cA(N*M), cx(N), cy(M);
// ad::copy(x, cx);
// ad::copy(y, cy);
// ad::copy(A, cA);
// CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
// #endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
// std::cout << "#DOT" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// s = dot(x,y); queue.flush(); queue.finish();
// BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
//#endif
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// T result;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;

/*---------*/
/*--BLAS2--*/
/*---------*/
//T-layout
std::cout << "#GEMV-T" << std::endl;
for(int_t N: std::vector<int>{64})
for(int_t M: create_full_range(128, 10000, 64))
{
std::cout << M << "," << N;
/* ATIDLAS */
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
y = dot(trans(A),x); queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
#endif
#ifdef BENCH_CBLAS
std::vector<float> cA(N*M), cx(N), cy(M);
ad::copy(x, cx);
ad::copy(y, cy);
ad::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
T *cuA, *cux, *cuy;
cudaMalloc((void**) &cuA, N * M * sizeof(T));
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, M * sizeof(T));
BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
cudaFree(cuA);
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;

//// /*---------*/
//// /*--BLAS3--*/
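A note on the PERF expressions used above: t is the event time in nanoseconds, so dividing the bytes moved by t yields GB/s directly. A small sketch of that arithmetic (the function names are illustrative, not part of the repository):

#include <cstddef>

// AXPY (y = x + y) reads x and y and writes y: 3*N elements of dtsize bytes.
double axpy_gb_per_s(std::size_t N, std::size_t dtsize, double t_ns)
{ return 3.0 * N * dtsize / t_ns; }  // bytes per nanosecond == GB/s

// GEMV-T (y = A^T x) reads A (M*N elements) and x (N) and writes y (M).
double gemv_gb_per_s(std::size_t M, std::size_t N, std::size_t dtsize, double t_ns)
{ return (static_cast<double>(M) * N + M + N) * dtsize / t_ns; }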
@@ -34,9 +34,11 @@ public:

//General constructor
array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());

array(array_expression const & proxy);
array(array const &);

template<class T>
array(controller<T> const &);
//Getters
numeric_type dtype() const;
size4 shape() const;
@@ -284,6 +284,10 @@ private:
compilation_options_type compilation_options_;
};

template<class TYPE>
controller<TYPE> control(TYPE const & x, execution_options_type const& execution_options = execution_options_type(),
dispatcher_options_type const & dispatcher_options = dispatcher_options_type(), compilation_options_type const & compilation_options = compilation_options_type())
{ return controller<TYPE>(x, execution_options, dispatcher_options, compilation_options); }

class expressions_tuple
{
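A sketch of how the control() helper added in the hunk above can replace the explicit controller<atidlas::array_expression> construction used in bench/blas.cpp; it assumes the ad namespace alias and the cl_ext::queues map from that file, and profiled_axpy is an illustrative name.

void profiled_axpy(ad::array & x, ad::array & y)
{
  cl::Event event;
  // Bundle the expression with execution options: dispatch on queue 0 and
  // have the runtime fill `event` for profiling.
  y = ad::control(x + y, ad::execution_options_type(0, &event));
  cl::CommandQueue & queue = ad::cl_ext::queues[y.context()][0];
  queue.flush();
  queue.finish();
  cl_ulong elapsed_ns = event.getProfilingInfo<CL_PROFILING_COMMAND_END>()
                      - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();
  (void) elapsed_ns;  // same quantity the BENCHMARK_OPENCL macro accumulates
}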
@@ -83,23 +83,22 @@ array::array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const
ld_(ld), context_(context), data_(data)
{ }

array::array(array_expression const & proxy) :
dtype_(proxy.dtype()),
shape_(proxy.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(proxy.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = proxy;
}
array::array(array_expression const & proxy) : array(control(proxy)){}
array::array(array const & other) : array(control(other)){}

array::array(array const & other) :
dtype_(other.dtype()),
shape_(other.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
template<class TYPE>
array::array(controller<TYPE> const & other) :
dtype_(other.x().dtype()),
shape_(other.x().shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.x().context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = other;
}


template array::array(controller<array> const&);
template array::array(controller<array_expression> const&);

/*--- Getters ---*/
numeric_type array::dtype() const
{ return dtype_; }
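Two patterns in the hunk above may be worth spelling out: the rewritten array constructors delegate to the new controller<TYPE> constructor via control(), and the two `template array::array(controller<...> const&);` lines are explicit instantiations, needed because the template constructor is defined in the .cpp file. A minimal, self-contained illustration of the explicit-instantiation pattern (the holder type is purely illustrative):

// In a header, only the declaration is visible:
struct holder
{
  template<class T> holder(T const & v);
  double stored;
};

// In the .cpp file, the template is defined out of line...
template<class T>
holder::holder(T const & v) : stored(static_cast<double>(v)) {}

// ...and the specializations other translation units may link against are
// explicitly instantiated here.
template holder::holder(int const &);
template holder::holder(double const &);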
@@ -213,6 +213,7 @@ class ArgumentsHandler:
        self.blas3_size = map(int, self.blas3_size)

if __name__ == "__main__":
    atd.state.queue_properties = atd.queue_properties_type.CL_QUEUE_PROFILING_ENABLE

    platforms = atd.get_platforms()
    devices = [d for platform in platforms for d in platform.get_devices()]
@@ -222,13 +222,9 @@ def benchmark(template, symbolic):
    atd.synchronize(symbolic.context)
    current_time = 0
    timings = []
    while current_time < 1e-1:
        time_before = time.time()
        x = atd.array(symbolic)
        atd.synchronize(symbolic.context)
        timings.append(time.time() - time_before)
        current_time = current_time + timings[-1]
    return np.median(timings)
    x, event, cache = atd.flush(symbolic)
    atd.synchronize(symbolic.context)
    return 1e-9*(event.end - event.start)


def sanitize_string(string, keep_chars = ['_']):
@@ -166,6 +166,16 @@ namespace detail
return res;
}

template<class T>
std::vector<T> to_vector(bp::list const & list)
{
std::size_t len = bp::len(list);
std::vector<T> res; res.reserve(len);
for(int i = 0 ; i < len ; ++i)
res.push_back(boost::python::extract<T>(list[i]));
return res;
}

bp::list nv_compute_capability(cl::Device const & device)
{
bp::list res;
@@ -288,6 +298,10 @@ namespace detail
wrap_command_queue_info(cl::CommandQueue const & x)
{ return x.getInfo<INFO>(NULL); }

template<cl_int INFO>
typename cl::detail::param_traits<cl::detail::cl_profiling_info, INFO>::param_type
wrap_profiling_info(cl::Event const & x)
{ return x.getProfilingInfo<INFO>(NULL); }

std::string to_string(cl_device_type type)
{
@@ -301,8 +315,20 @@ namespace detail
boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
{ return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }

bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
{
cl::Event event;
atd::operation_cache cache;
std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &event, &cache, &cdependencies),
atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));

return bp::make_tuple(*parray, event, cache);
}
}

struct state_type{ };
state_type state;

void export_cl()
{
@@ -362,9 +388,32 @@ void export_cl()
.add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
;

bp::class_<cl::Event>("event")
#define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_profiling_info<NAME>)
WRAP("start", CL_PROFILING_COMMAND_START)
WRAP("submit", CL_PROFILING_COMMAND_SUBMIT)
WRAP("end", CL_PROFILING_COMMAND_END)
;

bp::class_<atd::operation_cache>("operation_cache", bp::no_init)
.def("enqueue", &atd::operation_cache::enqueue)
;

bp::def("synchronize", &atd::cl_ext::synchronize);
bp::def("get_platforms", &detail::get_platforms);

bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));

bp::enum_<cl_command_queue_properties>("queue_properties_type")
.value("CL_QUEUE_PROFILING_ENABLE", CL_QUEUE_PROFILING_ENABLE)
.value("CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
;

bp::class_<state_type>("state_type")
.def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
;

bp::scope().attr("state") = bp::object(bp::ptr(&state));
}

namespace detail
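For completeness, a sketch of the C++-side call that the new flush() binding above performs, stripped of the Boost.Python wrapping; the names follow the diff (atd aliasing the atidlas namespace) and profiled_flush is an illustrative name. The real binding also returns the event and the operation_cache, which is how the Python autotuner can compute 1e-9*(event.end - event.start).

atd::array profiled_flush(atd::array_expression const & expression, unsigned int queue_id)
{
  cl::Event event;                       // filled with the profiling event of the dispatch
  atd::operation_cache cache;            // the dispatched operation, exposed to Python as operation_cache
  std::vector<cl::Event> dependencies;   // events that must complete before this expression runs
  return atd::array(atd::control(expression,
                                 atd::execution_options_type(queue_id, &event, &cache, &dependencies),
                                 atd::dispatcher_options_type(-1),
                                 atd::compilation_options_type("", false)));
}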