Now using events to time autotuning

2015-02-06 22:11:03 -05:00
parent 385f007c0b
commit b768e913c9
7 changed files with 163 additions and 95 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -30,7 +30,6 @@ void bench(ad::numeric_type dtype)
  while(total_time*1e-9 < 1e-1){\
    cl::Event event;\
    OP;\
    queue.flush();\
    queue.finish();\
    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
    total_time+=times.back();\
@@ -70,96 +69,114 @@ void bench(ad::numeric_type dtype)
  std::cout << " " << PERF << std::flush;\
  }
-  /*---------*/
+//  /*---------*/
-  /*--BLAS1--*/
+//  /*--BLAS1--*/
-  /*---------*/
+//  /*---------*/
-  std::cout << "#AXPY" << std::endl;
+//  std::cout << "#AXPY" << std::endl;
-  for(auto N : BLAS1_N)
+//  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
  {
    std::cout << N;
    ad::array x(N, dtype), y(N, dtype);
    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
    /* ATIDLAS */
    y = x + y; queue.flush(); queue.finish();
    BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
    /* clAmdBlas */
 #ifdef BENCH_CLAMDBLAS
    BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 #endif
    /* BLAS */
 #ifdef BENCH_CBLAS
    std::vector<float> cx(N), cy(N);
    ad::copy(x, cx);
    ad::copy(y, cy);
    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 #endif
    /* CuBLAS */
 #ifdef BENCH_CUBLAS
    T *cux, *cuy;
    cudaMalloc((void**) &cux, N * sizeof(T));
    cudaMalloc((void**) &cuy, N * sizeof(T));
    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
    cudaFree(cux);
    cudaFree(cuy);
 #endif
    std::cout << std::endl;
  }
  std::cout << "\n\n" << std::flush;
 //  std::cout << "#DOT" << std::endl;
 //  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
 //  {
 //    int_t N = *it;
 //    std::cout << N;
 //    /* ATIDLAS */
 //    ad::array x(N, dtype), y(N, dtype);
-//    ad::array scratch(N, dtype);
+//    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
-//    ad::scalar s(dtype);
+//    /* ATIDLAS */
-//    CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
+//    y = x + y; queue.flush(); queue.finish();
 //    BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
 //    /* clAmdBlas */
 //#ifdef BENCH_CLAMDBLAS
-//    CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
+//    BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 //#endif
 //    /* BLAS */
 //#ifdef BENCH_CBLAS
 //    std::vector<float> cx(N), cy(N);
 //    ad::copy(x, cx);
 //    ad::copy(y, cy);
-//    CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
+//    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 //#endif
 //    /* CuBLAS */
 //#ifdef BENCH_CUBLAS
 //    T *cux, *cuy;
 //    cudaMalloc((void**) &cux, N * sizeof(T));
 //    cudaMalloc((void**) &cuy, N * sizeof(T));
 //    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
 //    cudaFree(cux);
 //    cudaFree(cuy);
 //#endif
 //    std::cout << std::endl;
 //  }
 //  std::cout << "\n\n" << std::flush;
-//  /*---------*/
+//  std::cout << "#DOT" << std::endl;
-//  /*--BLAS2--*/
+//  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
-//  /*---------*/
+//  {
-//  //T-layout
+//    std::cout << N;
-//  std::cout << "#GEMV-T" << std::endl;
+//    /* ATIDLAS */
-//  for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
+//    ad::array x(N, dtype), y(N, dtype);
-//    for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
+//    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
-//    {
+//    ad::array scratch(N, dtype);
-//      int_t M = *Mit;
+//    ad::scalar s(dtype);
-//      int_t N = *Nit;
+//    s = dot(x,y); queue.flush(); queue.finish();
-//      std::cout << M << "," << N;
+//    BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
-//      /* ATIDLAS */
+//    /* clAmdBlas */
-//      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
+//#ifdef BENCH_CLAMDBLAS
-//      CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
+//    BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
-//      /* clAmdBlas */
+//#endif
-//  #ifdef BENCH_CLAMDBLAS
+//    /* BLAS */
-//      CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
+//#ifdef BENCH_CBLAS
-//  #endif
+//    std::vector<float> cx(N), cy(N);
-//      /* BLAS */
+//    ad::copy(x, cx);
-//  #ifdef BENCH_CBLAS
+//    ad::copy(y, cy);
-//      std::vector<float> cA(N*M), cx(N), cy(M);
+//    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
-//      ad::copy(x, cx);
+//#endif
-//      ad::copy(y, cy);
+//#ifdef BENCH_CUBLAS
-//      ad::copy(A, cA);
+//    T *cux, *cuy;
-//      CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
+//    T result;
-//  #endif
+//    cudaMalloc((void**) &cux, N * sizeof(T));
-//      std::cout << std::endl;
+//    cudaMalloc((void**) &cuy, N * sizeof(T));
-//    }
+//    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
-//    std::cout << "\n\n" << std::flush;
+//    cudaFree(cux);
 //    cudaFree(cuy);
 //#endif
 //    std::cout << std::endl;
 //  }
 //  std::cout << "\n\n" << std::flush;
  /*---------*/
  /*--BLAS2--*/
  /*---------*/
  //T-layout
  std::cout << "#GEMV-T" << std::endl;
  for(int_t N: std::vector<int>{64})
    for(int_t M: create_full_range(128, 10000, 64))
    {
      std::cout << M << "," << N;
      /* ATIDLAS */
      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
      cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
      y = dot(trans(A),x); queue.flush(); queue.finish();
      BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
  #ifdef BENCH_CLAMDBLAS
      BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
  #endif
  #ifdef BENCH_CBLAS
      std::vector<float> cA(N*M), cx(N), cy(M);
      ad::copy(x, cx);
      ad::copy(y, cy);
      ad::copy(A, cA);
      BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
  #endif
  #ifdef BENCH_CUBLAS
      T *cuA, *cux, *cuy;
      cudaMalloc((void**) &cuA, N * M * sizeof(T));
      cudaMalloc((void**) &cux, N * sizeof(T));
      cudaMalloc((void**) &cuy, M * sizeof(T));
      BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
      cudaFree(cuA);
      cudaFree(cux);
      cudaFree(cuy);
  #endif
      std::cout << std::endl;
    }
    std::cout << "\n\n" << std::flush;
 ////  /*---------*/
 ////  /*--BLAS3--*/
--- a/include/atidlas/array.h
+++ b/include/atidlas/array.h
@@ -34,9 +34,11 @@ public:
  //General constructor
  array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
  array(array_expression const & proxy);
  array(array const &);
-
+  template<class T>
  array(controller<T> const &);
  //Getters
  numeric_type dtype() const;
  size4 shape() const;
--- a/include/atidlas/symbolic/expression.h
+++ b/include/atidlas/symbolic/expression.h
@@ -284,6 +284,10 @@ private:
  compilation_options_type compilation_options_;
 };
 template<class TYPE>
 controller<TYPE> control(TYPE const & x, execution_options_type const& execution_options = execution_options_type(),
                         dispatcher_options_type const & dispatcher_options = dispatcher_options_type(), compilation_options_type const & compilation_options = compilation_options_type())
 { return controller<TYPE>(x, execution_options, dispatcher_options, compilation_options); }
 class expressions_tuple
 {
--- a/lib/array.cpp
+++ b/lib/array.cpp
@@ -83,23 +83,22 @@ array::array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const
   ld_(ld), context_(context), data_(data)
 { }
-array::array(array_expression const & proxy) :
+array::array(array_expression const & proxy) : array(control(proxy)){}
-  dtype_(proxy.dtype()),
+array::array(array const & other) : array(control(other)){}
  shape_(proxy.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
  context_(proxy.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {
  *this = proxy;
 }
-array::array(array const & other) :
+template<class TYPE>
-  dtype_(other.dtype()),
+array::array(controller<TYPE> const & other) :
-  shape_(other.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
+  dtype_(other.x().dtype()),
-  context_(other.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
+  shape_(other.x().shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
  context_(other.x().context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {
  *this = other;
 }
 template array::array(controller<array> const&);
 template array::array(controller<array_expression> const&);
 /*--- Getters ---*/
 numeric_type array::dtype() const
 { return dtype_; }
--- a/python/autotune/pysrc/autotune.py
+++ b/python/autotune/pysrc/autotune.py
@@ -213,6 +213,7 @@ class ArgumentsHandler:
                self.blas3_size = map(int, self.blas3_size)
 if __name__ == "__main__":
    atd.state.queue_properties = atd.queue_properties_type.CL_QUEUE_PROFILING_ENABLE
    platforms = atd.get_platforms()
    devices = [d for platform in platforms for d in platform.get_devices()]
--- a/python/autotune/pysrc/misc_tools.py
+++ b/python/autotune/pysrc/misc_tools.py
@@ -222,13 +222,9 @@ def benchmark(template, symbolic):
        atd.synchronize(symbolic.context)
        current_time = 0
        timings = []
-        while current_time < 1e-1:
+        x, event, cache = atd.flush(symbolic)
-            time_before = time.time()
+        atd.synchronize(symbolic.context)
-            x = atd.array(symbolic)
+        return 1e-9*(event.end - event.start)
            atd.synchronize(symbolic.context)
            timings.append(time.time() - time_before)
            current_time = current_time + timings[-1]
        return np.median(timings)
 def sanitize_string(string, keep_chars = ['_']):
--- a/python/pyatidlas/src/_atidlas.cpp
+++ b/python/pyatidlas/src/_atidlas.cpp
@@ -166,6 +166,16 @@ namespace detail
    return res;
  }
  template<class T>
  std::vector<T> to_vector(bp::list const & list)
  {
    std::size_t len = bp::len(list);
    std::vector<T> res; res.reserve(len);
    for(int i = 0 ; i < len ; ++i)
      res.push_back(boost::python::extract<T>(list[i]));
    return res;
  }
  bp::list nv_compute_capability(cl::Device const & device)
  {
    bp::list res;
@@ -288,6 +298,10 @@ namespace detail
  wrap_command_queue_info(cl::CommandQueue const & x)
  { return x.getInfo<INFO>(NULL); }
  template<cl_int INFO>
  typename cl::detail::param_traits<cl::detail::cl_profiling_info, INFO>::param_type
  wrap_profiling_info(cl::Event const & x)
  { return x.getProfilingInfo<INFO>(NULL); }
  std::string to_string(cl_device_type type)
  {
@@ -301,8 +315,20 @@ namespace detail
  boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
  { return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }
  bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
  {
      cl::Event event;
      atd::operation_cache cache;
      std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
      boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &event, &cache, &cdependencies),
                                                                       atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));
      return bp::make_tuple(*parray, event, cache);
  }
 }
 struct state_type{ };
 state_type state;
 void export_cl()
 {
@@ -362,9 +388,32 @@ void export_cl()
      .add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
      ;
  bp::class_<cl::Event>("event")
    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_profiling_info<NAME>)
      WRAP("start", CL_PROFILING_COMMAND_START)
      WRAP("submit", CL_PROFILING_COMMAND_SUBMIT)
      WRAP("end", CL_PROFILING_COMMAND_END)
     ;
  bp::class_<atd::operation_cache>("operation_cache", bp::no_init)
      .def("enqueue", &atd::operation_cache::enqueue)
      ;
  bp::def("synchronize", &atd::cl_ext::synchronize);
  bp::def("get_platforms", &detail::get_platforms);
  bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
  bp::enum_<cl_command_queue_properties>("queue_properties_type")
      .value("CL_QUEUE_PROFILING_ENABLE", CL_QUEUE_PROFILING_ENABLE)
      .value("CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
      ;
  bp::class_<state_type>("state_type")
          .def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
      ;
  bp::scope().attr("state") = bp::object(bp::ptr(&state));
 }
 namespace detail