From b32de3ac76d8ae0c8efe50cf402ae933e63f0b0e Mon Sep 17 00:00:00 2001
From: Philippe Tillet <ptillet@g.harvard.edu>
Date: Thu, 25 Jun 2015 08:12:16 -0700
Subject: [PATCH] C++: More clBLAS routines

---
 bench/blas.cpp        | 262 +++++++++++++++++++++---------------------
 include/isaac/array.h |   8 +-
 lib/array.cpp         |  35 ++++--
 lib/wrap/clBLAS.cpp   | 183 +++++++++++++++++++++++------
 4 files changed, 310 insertions(+), 178 deletions(-)
diff --git a/bench/blas.cpp b/bench/blas.cpp
index 4959697e2..c263d3077 100644
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -99,6 +99,7 @@ void bench(ad::numeric_type dtype, std::string operation)
   double total_time = 0;\
   while(total_time*1e-9 < 1e-3){\
     cl::Event event;\
+    flush = ad::zeros(1e6, 1, dtype);\
     OP;\
     queue.synchronize();\
     times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
@@ -146,8 +147,7 @@ void bench(ad::numeric_type dtype, std::string operation)
   ad::array flush(1e6, dtype);
   std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
   std::cout << "N";
-  std::cout << "\tISAAC (predictive)";
-  std::cout << "\tISAAC (optimal)";
+  std::cout << "\tISAAC";
 #ifdef BENCH_CLBLAS
   std::cout << "\tclBLAS";
 #endif
@@ -162,142 +162,143 @@ void bench(ad::numeric_type dtype, std::string operation)
   // RUN BENCHMARKS
   //
 
-//  /*---------*/
-//  /*--BLAS1--*/
-//  /*---------*/
+  /*---------*/
+  /*--BLAS1--*/
+  /*---------*/
 
-//  if(operation=="axpy")
-//  {
-//    for(int_t N: create_log_range(1e3, 2e7, 50, 64))
-//    {
-//      std::cout << N;
-//      ad::array x(N, dtype), y(N, dtype);
-//      /* ISAAC */
-//      std::list<ad::driver::Event> events;\
-//      BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), 3*N*dtsize/t)
-//      BENCHMARK_ISAAC(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
-//      /* clblas */
-//  #ifdef BENCH_CLBLAS
-//      BENCHMARK_CLBLAS(clblasSaxpy(N, 1, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
-//  #endif
-//      /* BLAS */
-//  #ifdef BENCH_CBLAS
-//      std::vector<float> cx(N), cy(N);
-//      ad::copy(x, cx);
-//      ad::copy(y, cy);
-//      BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
-//  #endif
-//      /* CuBLAS */
-//  #ifdef BENCH_CUBLAS
-//      T *cux, *cuy;
-//      cudaMalloc((void**) &cux, N * sizeof(T));
-//      cudaMalloc((void**) &cuy, N * sizeof(T));
-//      BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
-//      cudaFree(cux);
-//      cudaFree(cuy);
-//  #endif
-//      std::cout << std::endl;
-//    }
-//  }
+  if(operation=="axpy")
+  {
+    float alpha = 1;
+    for(int_t N: create_log_range(1e3, 2e7, 50, 64))
+    {
+      std::cout << N;
+      ad::array x(N, dtype), y(N, dtype);
+      /* ISAAC */
+      std::list<ad::driver::Event> events;\
+      BENCHMARK_ISAAC(y = ad::control(x + alpha*y, ad::execution_options_type(0, &events)), 3*N*dtsize/t)
+      /* clblas */
+  #ifdef BENCH_CLBLAS
+      BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event()), 3*N*dtsize/t)
+  #endif
+      /* BLAS */
+  #ifdef BENCH_CBLAS
+      std::vector<float> cx(N), cy(N);
+      ad::copy(x, cx);
+      ad::copy(y, cy);
+      BENCHMARK_HOST(cblas_saxpy(N, alpha, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
+  #endif
+      /* CuBLAS */
+  #ifdef BENCH_CUBLAS
+      T *cux, *cuy;
+      cudaMalloc((void**) &cux, N * sizeof(T));
+      cudaMalloc((void**) &cuy, N * sizeof(T));
+      BENCHMARK_CUDA(cublasSaxpy(N, alpha, cux, 1, cuy, 1), 3*N*dtsize/t)
+      cudaFree(cux);
+      cudaFree(cuy);
+  #endif
+      std::cout << std::endl;
+    }
+  }
 
-//  if(operation=="dot")
-//  {
-//    for(int_t N: create_log_range(1e3, 2e7, 50, 64))
-//    {
-//      std::cout << N;
-//      /* ISAAC */
-//      ad::array x(N, dtype), y(N, dtype);
-//      ad::array scratch(N, dtype);
-//      ad::scalar s(dtype);
-//      s = dot(x,y); queue.synchronize();
-//      BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
-//      /* clblas */
-//  #ifdef BENCH_CLBLAS
-//      BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
-//  #endif
-//      /* BLAS */
-//  #ifdef BENCH_CBLAS
-//      std::vector<float> cx(N), cy(N);
-//      ad::copy(x, cx);
-//      ad::copy(y, cy);
-//      BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
-//  #endif
-//  #ifdef BENCH_CUBLAS
-//      T *cux, *cuy;
-//      T result;
-//      cudaMalloc((void**) &cux, N * sizeof(T));
-//      cudaMalloc((void**) &cuy, N * sizeof(T));
-//      BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
-//      cudaFree(cux);
-//      cudaFree(cuy);
-//  #endif
-//      std::cout << std::endl;
-//    }
-//    std::cout << "\n\n" << std::flush;
-//  }
+  if(operation=="dot")
+  {
+    for(int_t N: create_log_range(1e3, 2e7, 50, 64))
+    {
+      std::cout << N;
+      /* ISAAC */
+      ad::array x(N, dtype), y(N, dtype);
+      ad::array scratch(N, dtype);
+      ad::scalar s(dtype);
+      s = dot(x,y); queue.synchronize();
+      BENCHMARK_ISAAC(s = ad::control(dot(x,y), ad::execution_options_type(0, &events)), 2*N*dtsize/t)
+      /* clblas */
+  #ifdef BENCH_CLBLAS
+      BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event()), 2*N*dtsize/t)
+  #endif
+      /* BLAS */
+  #ifdef BENCH_CBLAS
+      std::vector<float> cx(N), cy(N);
+      ad::copy(x, cx);
+      ad::copy(y, cy);
+      BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+  #endif
+  #ifdef BENCH_CUBLAS
+      T *cux, *cuy;
+      T result;
+      cudaMalloc((void**) &cux, N * sizeof(T));
+      cudaMalloc((void**) &cuy, N * sizeof(T));
+      BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1), 2*N*dtsize/t)
+      cudaFree(cux);
+      cudaFree(cuy);
+  #endif
+      std::cout << std::endl;
+    }
+    std::cout << "\n\n" << std::flush;
+  }
 
-//  if(operation.substr(0, 4)=="gemv")
-//  {
-//    std::vector<std::tuple<int_t, int_t> > MNs;
-//    MNs.push_back(std::make_tuple(896,896));
-//    MNs.push_back(std::make_tuple(3072,3072));
-//    MNs.push_back(std::make_tuple(64,32000));
-//    MNs.push_back(std::make_tuple(896,32000));
-//    MNs.push_back(std::make_tuple(32000, 64));
-//    MNs.push_back(std::make_tuple(32000, 896));
+  if(operation.substr(0, 4)=="gemv")
+  {
+    std::vector<std::tuple<int_t, int_t> > MNs;
+    MNs.push_back(std::make_tuple(896,896));
+    MNs.push_back(std::make_tuple(3072,3072));
+    MNs.push_back(std::make_tuple(64,32000));
+    MNs.push_back(std::make_tuple(896,32000));
+    MNs.push_back(std::make_tuple(32000, 64));
+    MNs.push_back(std::make_tuple(32000, 896));
 
-//    /*---------*/
-//    /*--BLAS2--*/
-//    /*---------*/
-//    //T-layout
-//    for(std::tuple<int_t, int_t> MN: MNs)
-//    {
-//        int_t M = std::get<0>(MN);
-//        int_t N = std::get<1>(MN);
-//        std::cout << M << "," << N;
-//        /* ISAAC */
-//        ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
-//    #if HAS_A_BLAS
-//        int_t lda = A.ld();
-//    #endif
-//        y = dot(trans(A),x); queue.synchronize();
-//        BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)),(M*N + M + N)*dtsize/t);
-//        BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
-//    #ifdef BENCH_CLBLAS
-//        BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
-//    #endif
-//    #ifdef BENCH_CBLAS
-//        std::vector<float> cA(N*M), cx(N), cy(M);
-//        ad::copy(x, cx);
-//        ad::copy(y, cy);
-//        ad::copy(A, cA);
-//        BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
-//    #endif
-//    #ifdef BENCH_CUBLAS
-//        T *cuA, *cux, *cuy;
-//        cudaMalloc((void**) &cuA, N * M * sizeof(T));
-//        cudaMalloc((void**) &cux, N * sizeof(T));
-//        cudaMalloc((void**) &cuy, M * sizeof(T));
-//        BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
-//        cudaFree(cuA);
-//        cudaFree(cux);
-//        cudaFree(cuy);
-//    #endif
-//        std::cout << std::endl;
-//      }
-//      std::cout << "\n\n" << std::flush;
-//  }
+    /*---------*/
+    /*--BLAS2--*/
+    /*---------*/
+    //T-layout
+    for(std::tuple<int_t, int_t> MN: MNs)
+    {
+        int_t M = std::get<0>(MN);
+        int_t N = std::get<1>(MN);
+        std::cout << M << "," << N;
+        /* ISAAC */
+        ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
+    #if HAS_A_BLAS
+        int_t lda = A.ld();
+    #endif
+        y = dot(trans(A),x); queue.synchronize();
+        BENCHMARK_ISAAC(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
+    #ifdef BENCH_CLBLAS
+        BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, clblasTrans, N, M, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event()), (M*N + M + N)*dtsize/t)
+    #endif
+    #ifdef BENCH_CBLAS
+        std::vector<float> cA(N*M), cx(N), cy(M);
+        ad::copy(x, cx);
+        ad::copy(y, cy);
+        ad::copy(A, cA);
+        BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
+    #endif
+    #ifdef BENCH_CUBLAS
+        T *cuA, *cux, *cuy;
+        cudaMalloc((void**) &cuA, N * M * sizeof(T));
+        cudaMalloc((void**) &cux, N * sizeof(T));
+        cudaMalloc((void**) &cuy, M * sizeof(T));
+        BENCHMARK_CUDA(cublasSgemv('t', N, M, 1, cuA, lda, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
+        cudaFree(cuA);
+        cudaFree(cux);
+        cudaFree(cuy);
+    #endif
+        std::cout << std::endl;
+      }
+      std::cout << "\n\n" << std::flush;
+  }
 
   if(operation.substr(0,4)=="gemm")
   {
     std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
-    MNKs.push_back(std::make_tuple(896,896,896));
-    MNKs.push_back(std::make_tuple(3072,3072,3072));
-    MNKs.push_back(std::make_tuple(1024,64,768));
-    MNKs.push_back(std::make_tuple(768,64,128));
-    MNKs.push_back(std::make_tuple(64,64,32000));
-    MNKs.push_back(std::make_tuple(1024,1024,32000));
+//    MNKs.push_back(std::make_tuple(896,896,896));
+//    MNKs.push_back(std::make_tuple(3072,3072,3072));
+//    MNKs.push_back(std::make_tuple(1024,64,768));
+//    MNKs.push_back(std::make_tuple(768,64,128));
+//    MNKs.push_back(std::make_tuple(64,64,32000));
+//    MNKs.push_back(std::make_tuple(1024,1024,32000));
 
+    for(unsigned int N = 1 ; N <10 ; ++N)
+        MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
     /*---------*/
     /*--BLAS3--*/
     /*---------*/
@@ -312,8 +313,7 @@ void bench(ad::numeric_type dtype, std::string operation)
     #if HAS_A_BLAS
         int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
     #endif
-//        BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(false)), (double)2*M*N*K/t);
-        //BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), (double)2*M*N*K/t);
+        BENCHMARK_ISAAC(C = ad::control(dot(A,trans(B)), ad::execution_options_type(0, &events)), (double)2*M*N*K/t);
         /* clblas */
     #ifdef BENCH_CLBLAS
         BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
diff --git a/include/isaac/array.h b/include/isaac/array.h
index 5ef23b16f..ca3955085 100644
--- a/include/isaac/array.h
+++ b/include/isaac/array.h
@@ -14,15 +14,22 @@ class scalar;
 
 class array: public array_base
 {
+protected:
+  //Slices
+  array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld);
+
 public:
   //1D Constructors
   array(int_t size1, numeric_type dtype, driver::Context context = driver::queues.default_context());
+  array(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc);
+
   template<typename DT>
   array(std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
   array(array & v, slice const & s1);
 
   //2D Constructors
   array(int_t size1, int_t size2, numeric_type dtype, driver::Context context = driver::queues.default_context());
+  array(int_t size1, int_t size2, numeric_type dtype, driver::Buffer data, int_t start, int_t ld);
   template<typename DT>
   array(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context context = driver::queues.default_context());
   array(array & M, slice const & s1, slice const & s2);
@@ -31,7 +38,6 @@ public:
   array(int_t size1, int_t size2, int_t size3, numeric_type dtype, driver::Context context = driver::queues.default_context());
 
   //General constructor
-  array(numeric_type dtype, driver::Buffer data, slice const & s1, slice const & s2, int_t ld);
 
   array(array_expression const & proxy);
   array(array const &);
diff --git a/lib/array.cpp b/lib/array.cpp
index 7c1777f4b..6756a46d0 100644
--- a/lib/array.cpp
+++ b/lib/array.cpp
@@ -19,6 +19,11 @@ array::array(int_t shape0, numeric_type dtype, driver::Context context) :
   context_(context), data_(context_, size_of(dtype)*dsize())
 { }
 
+array::array(int_t shape0, numeric_type dtype, driver::Buffer data, int_t start, int_t inc):
+    dtype_(dtype), shape_(shape0), start_(start, 0, 0, 0), stride_(inc), ld_(shape_[0]), context_(data.context()), data_(data)
+{ }
+
+
 template<class DT>
 array::array(std::vector<DT> const & x, driver::Context context):
   dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape_[0]),
@@ -45,21 +50,20 @@ INSTANTIATE(double);
 #undef INSTANTIATE
 
 // 2D
-array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, 1, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
+array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
                                                                                         context_(context), data_(context_, size_of(dtype_)*dsize())
 {}
 
+array::array(int_t shape0, int_t shape1, numeric_type dtype, driver::Buffer data, int_t start, int_t ld) :
+    dtype_(dtype), shape_(shape0, shape1), start_(start%ld, start/ld, 0, 0), stride_(1, 1, 1, 1), ld_(ld), context_(data.context()), data_(data)
+{ }
+
 array::array(array & M, slice const & s0, slice const & s1) :  dtype_(M.dtype_), shape_(s0.size, s1.size, 1, 1),
                                                           start_(M.start_[0] + M.stride_[0]*s0.start, M.start_[1] + M.stride_[1]*s1.start, 0, 0),
                                                           stride_(M.stride_[0]*s0.stride, M.stride_[1]*s1.stride, 1, 1), ld_(M.ld_),
                                                           context_(M.data_.context()), data_(M.data_)
 { }
 
-// 3D
-array::array(int_t shape0, int_t shape1, int_t shape2, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, shape2, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
-                                                                                        context_(context), data_(context_, size_of(dtype_)*dsize())
-{}
-
 template<typename DT>
 array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::Context context)
   : dtype_(to_numeric_type<DT>::value),
@@ -69,6 +73,19 @@ array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::C
   isaac::copy(data, *this);
 }
 
+// 3D
+array::array(int_t shape0, int_t shape1, int_t shape2, numeric_type dtype, driver::Context context) : dtype_(dtype), shape_(shape0, shape1, shape2, 1), start_(0, 0, 0, 0), stride_(1, 1, 1, 1), ld_(shape0),
+                                                                                        context_(context), data_(context_, size_of(dtype_)*dsize())
+{}
+
+//Slices
+array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld):
+  dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride),
+   ld_(ld), context_(data.context()), data_(data)
+{ }
+
+
+
 #define INSTANTIATE(T) template array::array(int_t, int_t, std::vector<T> const &, driver::Context)
 INSTANTIATE(char);
 INSTANTIATE(unsigned char);
@@ -84,12 +101,6 @@ INSTANTIATE(float);
 INSTANTIATE(double);
 #undef INSTANTIATE
 
-// General
-array::array(numeric_type dtype, driver::Buffer data, slice const & s0, slice const & s1, int_t ld):
-  dtype_(dtype), shape_(s0.size, s1.size), start_(s0.start, s1.start), stride_(s0.stride, s1.stride),
-   ld_(ld), context_(data.context()), data_(data)
-{ }
-
 array::array(array_expression const & proxy) : array(control(proxy)){}
 array::array(array const & other) : array(control(other)){}
 
diff --git a/lib/wrap/clBLAS.cpp b/lib/wrap/clBLAS.cpp
index 9c61b214d..d719843bd 100644
--- a/lib/wrap/clBLAS.cpp
+++ b/lib/wrap/clBLAS.cpp
@@ -2,6 +2,7 @@
 
 #include "isaac/wrap/clBLAS.h"
 #include "isaac/array.h"
+#include "isaac/symbolic/execute.h"
 
 namespace is = isaac;
 
@@ -18,6 +19,136 @@ extern "C"
 
     }
 
+    void execute(is::array_expression const & operation, is::driver::Context const & context,
+                 cl_uint numCommandQueues, cl_command_queue *commandQueues,
+                 cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                 cl_event *events)
+    {
+        std::vector<is::driver::Event> waitlist;
+        for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i)
+            waitlist.push_back(cl::Event(eventWaitList[i]));
+        for(cl_uint i = 0 ; i < numCommandQueues ; ++i)
+        {
+            std::list<is::driver::Event> levents;
+            is::driver::CommandQueue queue(cl::CommandQueue(commandQueues[i]));
+            clRetainCommandQueue(commandQueues[i]);
+            is::execution_options_type options(queue, &levents, &waitlist);
+            is::execute(is::control(operation, options), is::models(options.queue(context)));
+            if(events)
+            {
+                events[i] = static_cast<cl::Event>(levents.front())();
+                clRetainEvent(events[i]);
+            }
+        }
+    }
+
+    //*****************
+    //BLAS1
+    //*****************
+    clblasStatus clblasSaxpy(size_t N, cl_float alpha,
+                            const cl_mem mx,  size_t offx, int incx,
+                            cl_mem my, size_t offy, int incy,
+                            cl_uint numCommandQueues, cl_command_queue *commandQueues,
+                            cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                            cl_event *events)
+    {
+        is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
+        clRetainMemObject(mx);
+        is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
+        clRetainMemObject(my);
+        execute(is::detail::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        return clblasSuccess;
+    }
+
+    clblasStatus clblasSscal(size_t N, cl_float alpha,
+                             cl_mem mx, size_t offx, int incx,
+                             cl_uint numCommandQueues, cl_command_queue *commandQueues,
+                             cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
+    {
+        is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
+        clRetainMemObject(mx);
+        execute(is::detail::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        return clblasSuccess;
+    }
+
+    clblasStatus clblasScopy(size_t N,
+                             const cl_mem mx, size_t offx, int incx,
+                             cl_mem my, size_t offy, int incy,
+                             cl_uint numCommandQueues, cl_command_queue *commandQueues,
+                             cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
+    {
+        const is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
+        clRetainMemObject(mx);
+        is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
+        clRetainMemObject(my);
+        execute(is::detail::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        return clblasSuccess;
+    }
+
+    clblasStatus clblasSdot(size_t N, cl_mem dotProduct, size_t offDP,
+               const cl_mem mx, size_t offx, int incx,
+               const cl_mem my, size_t offy, int incy,
+               cl_mem /*scratchBuff*/, cl_uint numCommandQueues,
+               cl_command_queue *commandQueues, cl_uint numEventsInWaitList,
+               const cl_event *eventWaitList, cl_event *events)
+    {
+        is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
+        clRetainMemObject(mx);
+        is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
+        clRetainMemObject(my);
+        is::scalar s(is::FLOAT_TYPE, cl::Buffer(dotProduct), offDP);
+        clRetainMemObject(dotProduct);
+        execute(is::detail::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        return clblasSuccess;
+    }
+
+    clblasStatus clblasSasum(size_t N, cl_mem asum, size_t offAsum,
+                             const cl_mem mx, size_t offx, int incx,
+                             cl_mem /*scratchBuff*/, cl_uint numCommandQueues, cl_command_queue *commandQueues,
+                             cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
+    {
+        is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
+        clRetainMemObject(mx);
+        is::scalar s(is::FLOAT_TYPE, cl::Buffer(asum), offAsum);
+        clRetainMemObject(asum);
+        execute(is::detail::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        return clblasSuccess;
+    }
+
+    //*****************
+    //BLAS2
+    //*****************
+    clblasStatus clblasSgemv(clblasOrder order, clblasTranspose transA,
+                             size_t M, size_t N,
+                             cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
+                             const cl_mem mx, size_t offx, int incx,
+                             cl_float beta, cl_mem my, size_t offy, int incy,
+                             cl_uint numCommandQueues, cl_command_queue *commandQueues,
+                             cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events)
+    {
+        //A
+        is::int_t As1 = M, As2 = N;
+        if(transA==clblasTrans) std::swap(As1, As2);
+        is::array A(As1, As2, is::FLOAT_TYPE, cl::Buffer(mA), offA, lda);
+        clRetainMemObject(mA);
+        //x
+        is::array x(N, is::FLOAT_TYPE, cl::Buffer(mx), offx, incx);
+        clRetainMemObject(mx);
+        //y
+        is::array y(N, is::FLOAT_TYPE, cl::Buffer(my), offy, incy);
+        clRetainMemObject(my);
+        //Operation
+        is::driver::Context const & context = A.context();
+        if(transA==clblasTrans)
+            execute(is::detail::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        else
+            execute(is::detail::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        return clblasSuccess;
+    }
+
+    //*****************
+    //BLAS3
+    //*****************
     clblasStatus clblasSgemm(clblasOrder order, clblasTranspose transA,  clblasTranspose transB,
                             size_t M, size_t N, size_t K,
                             cl_float alpha, const cl_mem mA, size_t offA, size_t lda,
@@ -28,42 +159,26 @@ extern "C"
     {
         is::int_t As1 = M, As2 = K;
         is::int_t Bs1 = K, Bs2 = N;
-
+        //Trans
         if(transA==clblasTrans) std::swap(As1, As2);
         if(transB==clblasTrans) std::swap(Bs1, Bs2);
-
-        is::array A(is::FLOAT_TYPE, cl::Buffer(mA), is::_(offA%lda, As1, 1), is::_(offA/lda, As2, 1), lda);
-        is::array B(is::FLOAT_TYPE, cl::Buffer(mB), is::_(offB%ldb, Bs1, 1), is::_(offB/ldb, Bs2, 1), ldb);
-        is::array C(is::FLOAT_TYPE, cl::Buffer(mC), is::_(offC%ldc, M, 1), is::_(offC/ldc, N, 1), ldc);
-
-        std::vector<is::driver::Event> waitlist;
-        for(cl_uint i = 0 ; i < numEventsInWaitList ; ++i)
-            waitlist.push_back(cl::Event(eventWaitList[i]));
-
-        std::list<is::driver::Event> levents;
-
-        for(cl_uint i = 0 ; i < numCommandQueues ; ++i)
-        {
-            clRetainCommandQueue(commandQueues[i]);
-            cl::CommandQueue clqueue(commandQueues[i]);
-            is::driver::CommandQueue queue(clqueue);
-
-
-            is::execution_options_type opt(queue, &levents, &waitlist);
-
-            if(transA==clblasTrans && transB==clblasTrans)
-                C = is::control(alpha*dot(A.T(), B.T()) + beta*C, opt);
-            else if(transA==clblasTrans && transB==clblasNoTrans)
-                C = is::control(alpha*dot(A.T(), B) + beta*C, opt);
-            else if(transA==clblasNoTrans && transB==clblasTrans)
-                C = is::control(alpha*dot(A, B.T()) + beta*C, opt);
-            else
-                C = is::control(alpha*dot(A, B) + beta*C, opt);
-        }
-
-        if(events)
-            *events = static_cast<cl::Event>(levents.front())();
-        std::cout << events << std::endl;
+        //Struct
+        is::array A(As1, As2, is::FLOAT_TYPE, cl::Buffer(mA), offA, lda);
+        clRetainMemObject(mA);
+        is::array B(Bs1, Bs2, is::FLOAT_TYPE, cl::Buffer(mB), offB, ldb);
+        clRetainMemObject(mB);
+        is::array C(M, N, is::FLOAT_TYPE, cl::Buffer(mC), offC, ldc);
+        clRetainMemObject(mC);
+        is::driver::Context const & context = C.context();
+        //Operation
+        if(transA==clblasTrans && transB==clblasTrans)
+            execute(is::detail::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        else if(transA==clblasTrans && transB==clblasNoTrans)
+            execute(is::detail::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        else if(transA==clblasNoTrans && transB==clblasTrans)
+            execute(is::detail::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
+        else
+            execute(is::detail::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
 
         return clblasSuccess;
     }