removing C++11 interface

2015-02-08 23:19:38 -05:00
parent 85fb438806
commit a6d7671831
21 changed files with 423 additions and 956 deletions
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -33,7 +33,7 @@ else()
 endif()

 string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} " ${BLAS_DEF_STR} -std=c++11")
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} " ${BLAS_DEF_STR}")
 foreach(PROG blas overhead)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR})
    if(CUDA_FOUND)
@@ -44,7 +44,7 @@ foreach(PROG blas overhead)
 	cuda_add_cublas_to_target(${PROG}-bench)
    else()
        add_executable(${PROG}-bench  ${PROG}.cpp)
-        set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-Wall -Wextra ${BLAS_DEF_STR} -std=c++11")
+        set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-Wall -Wextra ${BLAS_DEF_STR}")
    endif()
     target_link_libraries(${PROG}-bench ${BLAS_LIBS})
 endforeach(PROG)
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -1,5 +1,6 @@
 #include "atidlas/array.h"
 #include "atidlas/symbolic/execute.h"
+#include "atidlas/tools/timer.hpp"
 #include "common.hpp"
 #ifdef BENCH_CLAMDBLAS
  #include "clAmdBlas.h"
@@ -13,27 +14,89 @@
 #include <iomanip>
 #include <stdlib.h>
 #include <cmath>
-#include <chrono>
+#include <numeric>

 namespace ad = atidlas;
 typedef ad::int_t int_t;

-template<class T>
-void bench(ad::numeric_type dtype)
+int ceil(int N, int pad)
 {
-  unsigned int dtsize = ad::size_of(dtype);
-  cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
+    return (N%pad==0)?N:(N+pad-1)/pad*pad;
+}
+
+// Returns N sizes log-spaced over [min, max), each passed through ceil(value, pad).
+std::vector<int> create_log_range(int min, int max, int N, int pad)
+{
+  std::vector<int> res(N);
+  const double log_min = std::log(min);
+  const float log_span = (float)(std::log(max) - log_min);
+  for(int i = 0 ; i < N ; ++i)
+    res[i] = ceil((int)std::exp(log_min + log_span*i/N), pad);
+  return res;
+}
+
+// Collects the values ceil(min, pad), ceil(min, pad)+pad, ... that stay below ceil(max, pad).
+std::vector<int> create_full_range(int min, int max, int pad)
+{
+    std::vector<int> N;
+    for(int v = ceil(min, pad), stop = ceil(max, pad) ; v < stop ; v+=pad) N.push_back(v);
+    return N;
+}
+
+// Small builder: chain operator<< to append values, then convert to std::vector<T>.
+template <typename T>
+class make_vector {
+  std::vector<T> data_;  // values appended so far, in insertion order
+public:
+  typedef make_vector<T> my_type;
+  // Appends val and returns *this so that further << calls can be chained.
+  my_type& operator<< (const T& val) {
+    data_.push_back(val);
+    return *this;
+  }
+  // Implicit conversion yielding a copy of the accumulated values.
+  operator std::vector<T>() const { return data_; }
+};
+
+
+
+
+// Median of x (mean of the two middle values when the count is even); T() if x is empty.
+template<class T>
+T median(std::vector<T> x)
+{
+  if (x.empty())
+    return T(); // an empty vector has no middle element to index
+  std::sort(x.begin(), x.end());
+  const size_t mid = x.size() / 2;
+  return (x.size() % 2 == 0) ? (x[mid - 1] + x[mid]) / 2 : x[mid];
+}
+
+// Arithmetic mean of x, accumulated and divided in T; returns T() when x is empty.
+template<class T>
+T mean(std::vector<T> x)
+{
+  if (x.empty()) return T(); // avoid dividing by a zero element count
+  T res = 0;
+  for(size_t i = 0 ; i < x.size() ; ++i) res += x[i];
+  return res/static_cast<int>(x.size());
+}
+
+// Binary op for std::accumulate: adds e's COMMAND_END - COMMAND_START profiling span to sum.
+static double time_event(unsigned long sum, cl::Event const & e) { return sum + (e.getProfilingInfo<CL_PROFILING_COMMAND_END>() - e.getProfilingInfo<CL_PROFILING_COMMAND_START>()); }
+
+template<class T>
+void bench(ad::numeric_type dtype){

 #define BENCHMARK_ATIDLAS(OP, PERF) \
  {\
  std::vector<long> times;\
  double total_time = 0;\
-  while(total_time*1e-9 < 1e-1){\
+  while(total_time*1e-9 < 1e-2){\
    std::list<cl::Event> events;\
    OP;\
    queue.finish();\
-    times.push_back(std::accumulate(events.begin(), events.end(), 0, \
-                    [](unsigned long sum, cl::Event const & e){ return sum + e.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  e.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();}));\
+    times.push_back(std::accumulate(events.begin(), events.end(), 0, &time_event));\
    total_time+=times.back();\
  }\
  double t = median(times);\
@@ -44,11 +107,11 @@ void bench(ad::numeric_type dtype)
  {\
  std::vector<long> times;\
  double total_time = 0;\
-  while(total_time*1e-9 < 1e-1){\
+  while(total_time*1e-9 < 1e-2){\
    cl::Event event;\
    OP;\
    queue.finish();\
-    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
+    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
    total_time+=times.back();\
  }\
  double t = median(times);\
@@ -57,11 +120,11 @@ void bench(ad::numeric_type dtype)

 #define BENCHMARK_HOST(OP, PERF) \
  {\
+  ad::tools::timer tmr;\
  std::vector<int> cache_flusher(10000000, 0);\
-  auto start = std::chrono::steady_clock::now();\
+  tmr.start();\
  OP;\
-  auto end = std::chrono::steady_clock::now();\
-  double t = std::chrono::duration<double, std::nano>(end - start).count();\
+  double t = 1e9*tmr.get();\
  std::cout << " " << PERF << std::flush;\
  }

@@ -86,68 +149,49 @@ void bench(ad::numeric_type dtype)
  std::cout << " " << PERF << std::flush;\
  }

-  /*---------*/
-  /*--BLAS1--*/
-  /*---------*/
-  std::cout << "#AXPY" << std::endl;
-  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
-  {
-    std::cout << N;
-    ad::array x(N, dtype), y(N, dtype);
-    /* ATIDLAS */
-    y = x + y; queue.flush(); queue.finish();
-    BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
-    /* clAmdBlas */
-#ifdef BENCH_CLAMDBLAS
-    BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
-#endif
-    /* BLAS */
-#ifdef BENCH_CBLAS
-    std::vector<float> cx(N), cy(N);
-    ad::copy(x, cx);
-    ad::copy(y, cy);
-    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
-#endif
-    /* CuBLAS */
-#ifdef BENCH_CUBLAS
-    T *cux, *cuy;
-    cudaMalloc((void**) &cux, N * sizeof(T));
-    cudaMalloc((void**) &cuy, N * sizeof(T));
-    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
-    cudaFree(cux);
-    cudaFree(cuy);
-#endif
-    std::cout << std::endl;
-  }
-  std::cout << "\n\n" << std::flush;
+unsigned int dtsize = ad::size_of(dtype);
+cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];

-//  std::cout << "#DOT" << std::endl;
+  // BLAS1 Sizes
+  static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
+
+  // BLAS2 Sizes
+  static const std::vector<int> BLAS2_N = make_vector<int>() << 64;
+  static const std::vector<int> BLAS2_M = create_full_range(128, 10000, 64);
+
+  // BLAS3 Sizes
+  static const std::vector<int> BLAS3_M = make_vector<int>() << 1024;
+  static const std::vector<int> BLAS3_N = make_vector<int>() << 128;
+  static const std::vector<int> BLAS3_K = create_full_range(128, 5000, 64);
+
+//  /*---------*/
+//  /*--BLAS1--*/
+//  /*---------*/
+//  std::cout << "#AXPY" << std::endl;
 //  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
 //  {
 //    std::cout << N;
-//    /* ATIDLAS */
 //    ad::array x(N, dtype), y(N, dtype);
-//    ad::array scratch(N, dtype);
-//    ad::scalar s(dtype);
-//    s = dot(x,y); queue.flush(); queue.finish();
-//    BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
+//    /* ATIDLAS */
+//    y = x + y; queue.finish();
+//    BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
 //    /* clAmdBlas */
 //#ifdef BENCH_CLAMDBLAS
-//    BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
+//    BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 //#endif
 //    /* BLAS */
 //#ifdef BENCH_CBLAS
 //    std::vector<float> cx(N), cy(N);
 //    ad::copy(x, cx);
 //    ad::copy(y, cy);
-//    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+//    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 //#endif
+//    /* CuBLAS */
 //#ifdef BENCH_CUBLAS
 //    T *cux, *cuy;
-//    T result;
 //    cudaMalloc((void**) &cux, N * sizeof(T));
 //    cudaMalloc((void**) &cuy, N * sizeof(T));
-//    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
+//    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
 //    cudaFree(cux);
 //    cudaFree(cuy);
 //#endif
@@ -155,21 +199,56 @@ void bench(ad::numeric_type dtype)
 //  }
 //  std::cout << "\n\n" << std::flush;

+  std::cout << "#DOT" << std::endl;
+  for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
+  {
+    int_t N = BLAS1_N[i];
+    std::cout << N;
+    /* ATIDLAS */
+    ad::array x(N, dtype), y(N, dtype);
+    ad::array scratch(N, dtype);
+    ad::scalar s(dtype);
+    s = dot(x,y); queue.finish();
+    BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
+    /* clAmdBlas */
+#ifdef BENCH_CLAMDBLAS
+    BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
+#endif
+    /* BLAS */
+#ifdef BENCH_CBLAS
+    std::vector<float> cx(N), cy(N);
+    ad::copy(x, cx);
+    ad::copy(y, cy);
+    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+#endif
+#ifdef BENCH_CUBLAS
+    T *cux, *cuy;
+    T result;
+    cudaMalloc((void**) &cux, N * sizeof(T));
+    cudaMalloc((void**) &cuy, N * sizeof(T));
+    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
+    cudaFree(cux);
+    cudaFree(cuy);
+#endif
+    std::cout << std::endl;
+  }
+  std::cout << "\n\n" << std::flush;
+
 //  /*---------*/
 //  /*--BLAS2--*/
 //  /*---------*/
 //  //T-layout
 //  std::cout << "#GEMV-T" << std::endl;
-//  for(int_t N: std::vector<int>{64})
+//  for(int_t N: std::vector<int>{128})
 //    for(int_t M: create_full_range(128, 10000, 64))
 //    {
 //      std::cout << M << "," << N;
 //      /* ATIDLAS */
 //      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
-//      y = dot(trans(A),x); queue.flush(); queue.finish();
-//      BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
+//      y = dot(trans(A),x); queue.finish();
+//      BENCHMARK_ATIDLAS(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
 //  #ifdef BENCH_CLAMDBLAS
-//      BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
+//      BENCHMARK_CLAMDBLAS(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
 //  #endif
 //  #ifdef BENCH_CBLAS
 //      std::vector<float> cA(N*M), cx(N), cy(M);
@@ -192,9 +271,9 @@ void bench(ad::numeric_type dtype)
 //    }
 //    std::cout << "\n\n" << std::flush;

-////  /*---------*/
-////  /*--BLAS3--*/
-////  /*---------*/
+//  /*---------*/
+//  /*--BLAS3--*/
+//  /*---------*/
 //    std::cout << "#GEMM-NT" << std::endl;
 //    for(std::vector<int_t>::const_iterator Mit = BLAS3_M.begin() ; Mit != BLAS3_M.end() ; ++Mit)
 //    for(std::vector<int_t>::const_iterator Nit = BLAS3_N.begin() ; Nit != BLAS3_N.end() ; ++Nit)
@@ -240,8 +319,8 @@ int main(int argc, char* argv[])
      std::cerr << "usage : blas-bench [DEVICE_IDX]" << std::endl;
      std::cout << "Devices available: " << std::endl;
      unsigned int current=0;
-      for(const auto & queue : queues){
-        cl::Device device = queue.first.getInfo<CL_CONTEXT_DEVICES>()[0];
+      for(ad::cl_ext::queues_type::data_type::const_iterator it = queues.begin() ; it != queues.end() ; ++it){
+        cl::Device device = it->first.getInfo<CL_CONTEXT_DEVICES>()[0];
        std::cout << current++ << ": " << device.getInfo<CL_DEVICE_NAME>() << "(" << cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_NAME>() << ")" << std::endl;
      }
      exit(EXIT_FAILURE);
--- a/bench/common.hpp
+++ b/bench/common.hpp
@@ -5,83 +5,6 @@
 #include <cmath>
 #include <algorithm>

-int ceil(int N, int pad)
-{
-    return (N%pad==0)?N:(N+pad-1)/pad*pad;
-}

-std::vector<int> create_log_range(int min, int max, int N, int pad)
-{
-  std::vector<int> res(N);
-  for(int i = 0 ; i < N ; ++i)
-  {
-    res[i] = std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N);
-    res[i] = ceil(res[i], pad);
-  }
-  return res;
-}
-
-std::vector<int> create_full_range(int min, int max, int pad)
-{
-    std::vector<int> N;
-    for(int i = ceil(min, pad) ; i < ceil(max, pad) ; i+=pad)
-        N.push_back(i);
-    return N;
-}
-
-template <typename T>
-class make_vector {
-public:
-  typedef make_vector<T> my_type;
-  my_type& operator<< (const T& val) {
-    data_.push_back(val);
-    return *this;
-  }
-  operator std::vector<T>() const {
-    return data_;
-  }
-private:
-  std::vector<T> data_;
-};
-
-// BLAS1 Sizes
-static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
-
-// BLAS2 Sizes
-static const std::vector<int> BLAS2_N = make_vector<int>() << 64;
-static const std::vector<int> BLAS2_M = create_full_range(128, 10000, 64);
-
-// BLAS3 Sizes
-static const std::vector<int> BLAS3_M = make_vector<int>() << 1024;
-static const std::vector<int> BLAS3_N = make_vector<int>() << 128;
-static const std::vector<int> BLAS3_K = create_full_range(128, 5000, 64);
-
-
-double bandwidth(std::size_t N, double t, unsigned int dtsize)
-{ return N * dtsize * 1e-9 / t; }
-
-double gflops(double nops, double t)
-{ return nops * 1e-9 / t; }
-
-template<class T>
-T median(std::vector<T> x)
-{
-  size_t size = x.size();
-  std::sort(x.begin(), x.end());
-  if (size  % 2 == 0)
-      return (x[size / 2 - 1] + x[size / 2]) / 2;
-  else
-      return x[size / 2];
-}
-
-template<class T>
-T mean(std::vector<T> x)
-{
-  T res = 0;
-  int N = x.size();
-  for(int i = 0 ; i < N ; ++i)
-    res += x[i];
-  return res/N;
-}

 #endif
--- a/include/CL/cl.hpp
+++ b/include/CL/cl.hpp
@@ -160,26 +160,21 @@

 #pragma push_macro("max")
 #undef max
-#if defined(USE_DX_INTEROP)
-#include <CL/cl_d3d10.h>
-#include <CL/cl_dx9_media_sharing.h>
-#endif
+
 #endif // _WIN32

+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/opencl.h>
+#include <libkern/OSAtomic.h>
+#else
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
 // 
 #if defined(USE_CL_DEVICE_FISSION)
 #include <CL/cl_ext.h>
 #endif

-#if defined(__APPLE__) || defined(__MACOSX)
-#include <OpenGL/OpenGL.h>
-#include <OpenCL/opencl.h>
-#include <libkern/OSAtomic.h>
-#else
-#include <GL/gl.h>
-#include <CL/opencl.h>
-#endif // !__APPLE__
-
 // To avoid accidentally taking ownership of core OpenCL types
 // such as cl_kernel constructors are made explicit
 // under OpenCL 1.2
@@ -367,9 +362,6 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
 #define __COPY_ERR                          __ERR_STR(cl::copy)
 #define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
-#define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
-#define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLBuffer)
-#define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
 #if defined(CL_VERSION_1_2)
 #define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
 #define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
@@ -3219,266 +3211,6 @@ public:
 #endif
 };

-#if defined (USE_DX_INTEROP)
-/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
- *
- *  This is provided to facilitate interoperability with Direct3D.
- * 
- *  See Memory for details about copy semantics, etc.
- *
- *  \see Memory
- */
-class BufferD3D10 : public Buffer
-{
-public:
-    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
-    cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
-    cl_int* errcode_ret);
-
-    /*! \brief Constructs a BufferD3D10, in a specified context, from a
-     *         given ID3D10Buffer.
-     *
-     *  Wraps clCreateFromD3D10BufferKHR().
-     */
-    BufferD3D10(
-        const Context& context,
-        cl_mem_flags flags,
-        ID3D10Buffer* bufobj,
-        cl_int * err = NULL)
-    {
-        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
-
-#if defined(CL_VERSION_1_2)
-        vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
-        cl_platform platform = -1;
-        for( int i = 0; i < props.size(); ++i ) {
-            if( props[i] == CL_CONTEXT_PLATFORM ) {
-                platform = props[i+1];
-            }
-        }
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
-#endif
-#if defined(CL_VERSION_1_1)
-        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
-#endif
-
-        cl_int error;
-        object_ = pfn_clCreateFromD3D10BufferKHR(
-            context(),
-            flags,
-            bufobj,
-            &error);
-
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-
-    //! \brief Default constructor - initializes to NULL.
-    BufferD3D10() : Buffer() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
-
-    /*! \brief Assignment from BufferD3D10 - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferD3D10& operator = (const BufferD3D10& rhs)
-    {
-        if (this != &rhs) {
-            Buffer::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferD3D10& operator = (const cl_mem& rhs)
-    {
-        Buffer::operator=(rhs);
-        return *this;
-    }
-};
-#endif
-
-/*! \brief Class interface for GL Buffer Memory Objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class BufferGL : public Buffer
-{
-public:
-    /*! \brief Constructs a BufferGL in a specified context, from a given
-     *         GL buffer.
-     *
-     *  Wraps clCreateFromGLBuffer().
-     */
-    BufferGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLuint bufobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLBuffer(
-            context(),
-            flags,
-            bufobj,
-            &error);
-
-        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-
-    //! \brief Default constructor - initializes to NULL.
-    BufferGL() : Buffer() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
-
-    /*! \brief Assignment from BufferGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferGL& operator = (const BufferGL& rhs)
-    {
-        if (this != &rhs) {
-            Buffer::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferGL& operator = (const cl_mem& rhs)
-    {
-        Buffer::operator=(rhs);
-        return *this;
-    }
-
-    //! \brief Wrapper for clGetGLObjectInfo().
-    cl_int getObjectInfo(
-        cl_gl_object_type *type,
-        GLuint * gl_object_name)
-    {
-        return detail::errHandler(
-            ::clGetGLObjectInfo(object_,type,gl_object_name),
-            __GET_GL_OBJECT_INFO_ERR);
-    }
-};
-
-/*! \brief Class interface for GL Render Buffer Memory Objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class BufferRenderGL : public Buffer
-{
-public:
-    /*! \brief Constructs a BufferRenderGL in a specified context, from a given
-     *         GL Renderbuffer.
-     *
-     *  Wraps clCreateFromGLRenderbuffer().
-     */
-    BufferRenderGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLuint bufobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLRenderbuffer(
-            context(),
-            flags,
-            bufobj,
-            &error);
-
-        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-
-    //! \brief Default constructor - initializes to NULL.
-    BufferRenderGL() : Buffer() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
-
-    /*! \brief Assignment from BufferGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferRenderGL& operator = (const BufferRenderGL& rhs)
-    {
-        if (this != &rhs) {
-            Buffer::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    BufferRenderGL& operator = (const cl_mem& rhs)
-    {
-        Buffer::operator=(rhs);
-        return *this;
-    }
-
-    //! \brief Wrapper for clGetGLObjectInfo().
-    cl_int getObjectInfo(
-        cl_gl_object_type *type,
-        GLuint * gl_object_name)
-    {
-        return detail::errHandler(
-            ::clGetGLObjectInfo(object_,type,gl_object_name),
-            __GET_GL_OBJECT_INFO_ERR);
-    }
-};

 /*! \brief C++ base class for Image Memory objects.
 *
@@ -3869,86 +3601,6 @@ public:
 };


-#if !defined(CL_VERSION_1_2)
-/*! \brief Class interface for GL 2D Image Memory objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
- */
-class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
-{
-public:
-    /*! \brief Constructs an Image2DGL in a specified context, from a given
-     *         GL Texture.
-     *
-     *  Wraps clCreateFromGLTexture2D().
-     */
-    Image2DGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLTexture2D(
-            context(),
-            flags,
-            target,
-            miplevel,
-            texobj,
-            &error);
-
-        detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-
-    }
-    
-    //! \brief Default constructor - initializes to NULL.
-    Image2DGL() : Image2D() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2DGL(const Image2DGL& image) : Image2D(image) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
-
-    /*! \brief Assignment from Image2DGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2DGL& operator = (const Image2DGL& rhs)
-    {
-        if (this != &rhs) {
-            Image2D::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2DGL& operator = (const cl_mem& rhs)
-    {
-        Image2D::operator=(rhs);
-        return *this;
-    }
-};
-#endif // #if !defined(CL_VERSION_1_2)

 #if defined(CL_VERSION_1_2)
 /*! \class Image2DArray
@@ -4138,246 +3790,6 @@ public:
    }
 };

-#if !defined(CL_VERSION_1_2)
-/*! \brief Class interface for GL 3D Image Memory objects.
- *
- *  This is provided to facilitate interoperability with OpenGL.
- * 
- *  See Memory for details about copy semantics, etc.
- * 
- *  \see Memory
- */
-class Image3DGL : public Image3D
-{
-public:
-    /*! \brief Constructs an Image3DGL in a specified context, from a given
-     *         GL Texture.
-     *
-     *  Wraps clCreateFromGLTexture3D().
-     */
-    Image3DGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLTexture3D(
-            context(),
-            flags,
-            target,
-            miplevel,
-            texobj,
-            &error);
-
-        detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-
-    //! \brief Default constructor - initializes to NULL.
-    Image3DGL() : Image3D() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image3DGL(const Image3DGL& image) : Image3D(image) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
-
-    /*! \brief Assignment from Image3DGL - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image3DGL& operator = (const Image3DGL& rhs)
-    {
-        if (this != &rhs) {
-            Image3D::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image3DGL& operator = (const cl_mem& rhs)
-    {
-        Image3D::operator=(rhs);
-        return *this;
-    }
-};
-#endif // #if !defined(CL_VERSION_1_2)
-
-#if defined(CL_VERSION_1_2)
-/*! \class ImageGL
- * \brief general image interface for GL interop.
- * We abstract the 2D and 3D GL images into a single instance here
- * that wraps all GL sourced images on the grounds that setup information
- * was performed by OpenCL anyway.
- */
-class ImageGL : public Image
-{
-public:
-    ImageGL(
-        const Context& context,
-        cl_mem_flags flags,
-        GLenum target,
-        GLint  miplevel,
-        GLuint texobj,
-        cl_int * err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateFromGLTexture(
-            context(), 
-            flags, 
-            target,
-            miplevel,
-            texobj,
-            &error);
-
-        detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-
-    ImageGL() : Image() { }
-
-    ImageGL(const ImageGL& image) : Image(image) { }
-
-    __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
-
-    ImageGL& operator = (const ImageGL& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-
-    ImageGL& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-};
-#endif // #if defined(CL_VERSION_1_2)
-
-/*! \brief Class interface for cl_sampler.
- *
- *  \note Copies of these objects are shallow, meaning that the copy will refer
- *        to the same underlying cl_sampler as the original.  For details, see
- *        clRetainSampler() and clReleaseSampler().
- *
- *  \see cl_sampler 
- */
-class Sampler : public detail::Wrapper<cl_sampler>
-{
-public:
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseSampler() on the value held by this instance.
-     */
-    ~Sampler() { }
-
-    //! \brief Default constructor - initializes to NULL.
-    Sampler() { }
-
-    /*! \brief Constructs a Sampler in a specified context.
-     *
-     *  Wraps clCreateSampler().
-     */
-    Sampler(
-        const Context& context,
-        cl_bool normalized_coords,
-        cl_addressing_mode addressing_mode,
-        cl_filter_mode filter_mode,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        object_ = ::clCreateSampler(
-            context(), 
-            normalized_coords,
-            addressing_mode,
-            filter_mode,
-            &error);
-
-        detail::errHandler(error, __CREATE_SAMPLER_ERR);
-        if (err != NULL) {
-            *err = error;
-        }
-    }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     * 
-     *  This calls clRetainSampler() on the parameter's cl_sampler.
-     */
-    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
-
-    /*! \brief Constructor from cl_sampler - takes ownership.
-     * 
-     *  This effectively transfers ownership of a refcount on the cl_sampler
-     *  into the new Sampler object.
-     */
-    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
-
-    /*! \brief Assignment operator from Sampler.
-     * 
-     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
-     *  on the previous value held by this instance.
-     */
-    Sampler& operator = (const Sampler& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment operator from cl_sampler - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseSampler() on the value previously held by this instance.
-     */
-    Sampler& operator = (const cl_sampler& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-
-    //! \brief Wrapper for clGetSamplerInfo().
-    template <typename T>
-    cl_int getInfo(cl_sampler_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
-            __GET_SAMPLER_INFO_ERR);
-    }
-
-    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_sampler_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_sampler_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-};
-
 class Program;
 class CommandQueue;
 class Kernel;
@@ -6012,126 +5424,8 @@ public:
    }
 #endif // #if defined(CL_VERSION_1_1)

-    cl_int enqueueAcquireGLObjects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-             ::clEnqueueAcquireGLObjects(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
-             __ENQUEUE_ACQUIRE_GL_ERR);

-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;

-        return err;
-     }
-
-    cl_int enqueueReleaseGLObjects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-     {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-             ::clEnqueueReleaseGLObjects(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
-             __ENQUEUE_RELEASE_GL_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
-     }
-
-#if defined (USE_DX_INTEROP)
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
-    cl_command_queue command_queue, cl_uint num_objects,
-    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event);
-typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
-    cl_command_queue command_queue, cl_uint num_objects,
-    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list, cl_event* event);
-
-    cl_int enqueueAcquireD3D10Objects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-    {
-        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
-        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
-        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
-        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
-#endif
-#if defined(CL_VERSION_1_1)
-        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
-#endif
-        
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-             pfn_clEnqueueAcquireD3D10ObjectsKHR(
-                 object_,
-                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                 (events != NULL) ? (cl_uint) events->size() : 0,
-                 (events != NULL) ? (cl_event*) &events->front() : NULL,
-                 (event != NULL) ? &tmp : NULL),
-             __ENQUEUE_ACQUIRE_GL_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
-     }
-
-    cl_int enqueueReleaseD3D10Objects(
-         const VECTOR_CLASS<Memory>* mem_objects = NULL,
-         const VECTOR_CLASS<Event>* events = NULL,
-         Event* event = NULL) const
-    {
-        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
-#if defined(CL_VERSION_1_2)
-        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
-        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
-        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
-        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_2)
-#if defined(CL_VERSION_1_1)
-        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
-#endif // #if defined(CL_VERSION_1_1)
-
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            pfn_clEnqueueReleaseD3D10ObjectsKHR(
-                object_,
-                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
-                (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_RELEASE_GL_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
-    }
-#endif

 /**
 * Deprecated APIs for 1.2
--- a/include/atidlas/array.h
+++ b/include/atidlas/array.h
@@ -2,7 +2,6 @@
 #define ATIDLAS_ARRAY_H_

 #include <iostream>
-#include <type_traits>
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
 #include "atidlas/cl_ext/backend.h"
@@ -17,8 +16,7 @@ class scalar;
 class array: public array_base
 {
  friend array reshape(array const &, int_t, int_t);
-  template<class T>
-  struct is_array { enum{ value = std::is_same<T, array>::value || std::is_same<T, array_expression>::value}; };
+
 public:
  //1D Constructors
  array(int_t size1, numeric_type dtype, cl::Context context = cl_ext::default_context());
--- a/include/atidlas/backend/mapped_object.h
+++ b/include/atidlas/backend/mapped_object.h
@@ -20,7 +20,7 @@ enum leaf_t
 class mapped_object;

 typedef std::pair<int_t, leaf_t> mapping_key;
-typedef std::map<mapping_key, std::shared_ptr<mapped_object> > mapping_type;
+typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;

 /** @brief Mapped Object
 *
--- a/include/atidlas/backend/templates/base.h
+++ b/include/atidlas/backend/templates/base.h
@@ -4,11 +4,11 @@

 #include <list>
 #include <set>
+#include <CL/cl.hpp>

 #include "atidlas/types.h"
 #include "atidlas/backend/parse.h"
 #include "atidlas/backend/stream.h"
-#include <CL/cl.hpp>
 #include "atidlas/cl_ext/lazy_compiler.h"
 #include "atidlas/symbolic/expression.h"

@@ -75,15 +75,15 @@ protected:
    /** @brief Accessor for the numeric type */
    numeric_type get_numeric_type(atidlas::array_expression const * array_expression, int_t root_idx) const;
    /** @brief Creates a binary leaf */
-    template<class T> std::shared_ptr<mapped_object> binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const;
+    template<class T> tools::shared_ptr<mapped_object> binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const;
    /** @brief Creates a value scalar mapping */
-    std::shared_ptr<mapped_object> create(numeric_type dtype, values_holder) const;
+    tools::shared_ptr<mapped_object> create(numeric_type dtype, values_holder) const;
    /** @brief Creates a vector mapping */
-    std::shared_ptr<mapped_object> create(array_infos const &) const;
+    tools::shared_ptr<mapped_object> create(array_infos const &) const;
    /** @brief Creates a tuple mapping */
-    std::shared_ptr<mapped_object> create(repeat_infos const &) const;
+    tools::shared_ptr<mapped_object> create(repeat_infos const &) const;
    /** @brief Creates a mapping */
-    std::shared_ptr<mapped_object> create(lhs_rhs_element const &) const;
+    tools::shared_ptr<mapped_object> create(lhs_rhs_element const &) const;
  public:
    map_functor(symbolic_binder & binder, mapping_type & mapping);
    /** @brief Functor for traversing the tree */
@@ -143,7 +143,7 @@ protected:
  static bool is_reduction(array_expression::node const & node);
  static bool is_index_reduction(op_element const & op);

-  std::shared_ptr<symbolic_binder> make_binder();
+  tools::shared_ptr<symbolic_binder> make_binder();
  static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr);
  static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr);
  static std::string append_width(std::string const & str, unsigned int width);
@@ -164,7 +164,7 @@ public:
  virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0;
  virtual void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
                       unsigned int label, controller<expressions_tuple> const & expressions) = 0;
-  virtual std::shared_ptr<base> clone() const = 0;
+  virtual tools::shared_ptr<base> clone() const = 0;
 private:
  binding_policy_t binding_policy_;
 };
@@ -180,7 +180,7 @@ public:
  base_impl(parameters_type const & parameters, binding_policy_t binding_policy);
  int_t local_size_0() const;
  int_t local_size_1() const;
-  std::shared_ptr<base> clone() const;
+  tools::shared_ptr<base> clone() const;
  /** @brief returns whether or not the profile has undefined behavior on particular device */
  int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const;
 protected:
--- a/include/atidlas/model/model.h
+++ b/include/atidlas/model/model.h
@@ -16,7 +16,7 @@ namespace atidlas

  class model
  {
-    typedef std::vector< std::shared_ptr<base> > templates_container;
+    typedef std::vector< tools::shared_ptr<base> > templates_container;

  private:
    std::string define_extension(std::string const & extensions, std::string const & ext);
@@ -24,21 +24,21 @@ namespace atidlas
    std::vector<cl_ext::lazy_compiler>& init(controller<expressions_tuple> const &);

  public:
-    model(predictors::random_forest const &, std::vector< std::shared_ptr<base> > const &, cl::CommandQueue &);
-    model(std::vector< std::shared_ptr<base> > const &, cl::CommandQueue &);
+    model(predictors::random_forest const &, std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
+    model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
    model(base const &, cl::CommandQueue &);

    void execute(controller<expressions_tuple> const &);
    templates_container const & templates() const;
  private:
    templates_container templates_;
-    std::shared_ptr<predictors::random_forest> predictor_;
+    tools::shared_ptr<predictors::random_forest> predictor_;
    std::map<std::vector<int_t>, int> hardcoded_;
    std::map<cl_context, std::map<std::string, std::vector<cl_ext::lazy_compiler> > > lazy_programs_;
    cl::CommandQueue & queue_;
  };

-  typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<model> > model_map_t;
+  typedef std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<model> > model_map_t;

  model_map_t init_models(cl::CommandQueue const & queue);
  model_map_t& get_model_map(cl::CommandQueue & queue);
--- a/include/atidlas/symbolic/expression.h
+++ b/include/atidlas/symbolic/expression.h
@@ -6,7 +6,8 @@
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
 #include "atidlas/value_scalar.h"
-#include <memory>
+#include "atidlas/tools/shared_ptr.hpp"
+#include <iostream>

 namespace atidlas
 {
@@ -221,21 +222,14 @@ class operation_cache
  };

 public:
-  void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector<cl::Event>* dependencies)
-  { l_.push_back({queue, kernel, offset, global, local, dependencies});  }
-
-  void enqueue(std::list<cl::Event>* events = NULL)
-  {
-    for(infos & i : l_){
-      events->push_back(cl::Event());
-      i.queue.enqueueNDRangeKernel(i.kernel, i.offset, i.global, i.local, i.dependencies, &events->back());
-    }
-  }
-
+  void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector<cl::Event>* dependencies);
+  void enqueue(std::list<cl::Event>* events = NULL);
 private:
  std::list<infos> l_;
 };

+
+
 struct execution_options_type
 {
  execution_options_type(unsigned int _queue_id = 0, std::list<cl::Event>* _events = NULL, operation_cache* _cache = NULL, std::vector<cl::Event>* _dependencies = NULL) : queue_id(_queue_id), events(_events), cache(_cache), dependencies(_dependencies){}
@@ -299,9 +293,9 @@ controller<TYPE> control(TYPE const & x, execution_options_type const& execution
 class expressions_tuple
 {
 private:
-  std::shared_ptr<array_expression> create(array_expression const & s);
+  tools::shared_ptr<array_expression> create(array_expression const & s);
 public:
-  typedef std::list<std::shared_ptr<array_expression> > data_type;
+  typedef std::list<tools::shared_ptr<array_expression> > data_type;
  enum order_type { SEQUENTIAL, INDEPENDENT };

  expressions_tuple(array_expression const & s0);
--- a/include/atidlas/tools/shared_ptr.hpp
+++ b/include/atidlas/tools/shared_ptr.hpp
@@ -0,0 +1,162 @@
+#ifndef ATIDLAS_TOOLS_SHARED_PTR_HPP
+#define ATIDLAS_TOOLS_SHARED_PTR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2012, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file tools/shared_ptr.hpp
+    @brief Implementation of a shared pointer class (cf. std::shared_ptr, boost::shared_ptr). Will be used until C++11 is widely available.
+
+    Contributed by Philippe Tillet.
+*/
+
+#include <cstdlib>
+#include <algorithm>
+
+namespace atidlas
+{
+namespace tools
+{
+namespace detail
+{
+
+  /** @brief Reference counting class for the shared_ptr implementation */
+  class count
+  {
+  public:
+    count(unsigned int val) : val_(val){ } // start the counter at val
+    void dec(){ --val_; } // unsigned arithmetic: decrementing at 0 wraps around instead of going negative
+    void inc(){ ++val_; }
+    bool is_null(){ return val_ == 0; } // true once the counter has dropped to zero
+    unsigned int val(){ return val_; } // current counter value
+  private:
+    unsigned int val_; // plain (non-atomic) integer counter
+  };
+
+  /** @brief Interface for the reference counter inside the shared_ptr */
+  struct aux
+  {
+    detail::count count; // number of shared_ptr instances sharing this block (see shared_ptr::inc / shared_ptr::dec)
+
+    aux() :count(1) {} // the shared_ptr that creates the block is its first owner
+    virtual void destroy()=0; // releases the managed object; shared_ptr::dec calls it when the count reaches zero
+    virtual ~aux() {} // virtual because shared_ptr::dec deletes the block through an aux*
+  };
+
+  /** @brief Implementation helper for the reference counting mechanism inside shared_ptr. */
+  template<class U, class Deleter>
+  struct auximpl: public detail::aux
+  {
+    U* p; // managed object, kept with the type U it was handed over with
+    Deleter d; // callable invoked as d(p) to release the object
+
+    auximpl(U* pu, Deleter x) :p(pu), d(x) {} // stores a copy of the deleter next to the pointer
+    virtual void destroy() { d(p); } // hand the stored pointer to the stored deleter
+  };
+
+  /** @brief Default deleter class for a pointer. The default is to just call 'delete' on the pointer. Provide your own implementations for 'delete[]' and 'free'. */
+  template<class U>
+  struct default_deleter
+  {
+    void operator()(U* p) const { delete p; } // 'delete' on a null pointer is a no-op, so a NULL pointee is harmless here
+  };
+
+}
+
+class shared_ptr_base // type-independent part of shared_ptr: holds the control block and reports the use count
+{
+protected:
+  detail::aux* pa; // control block shared by all owners; NULL when the pointer is empty
+public:
+  explicit shared_ptr_base(detail::aux* p = NULL) : pa(p) {} // always initializes pa so count() never reads garbage
+  unsigned int count() const { return pa ? pa->count.val() : 0; } // number of owners; 0 for an empty pointer
+};
+
+/** @brief A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependency. Will be replaced by std::shared_ptr as soon as C++11 is widely available. */
+template<class T>
+class shared_ptr : public shared_ptr_base
+{
+  template<class U>
+  friend class shared_ptr;
+
+  T* pt; // managed object; the control block is the inherited shared_ptr_base::pa (deliberately not redeclared here)
+
+public:
+  /** @brief Creates an empty pointer that owns nothing. */
+  shared_ptr() : shared_ptr_base(NULL), pt(NULL) {}
+  /** @brief Takes ownership of pu; d(pu) is called once the last owner is gone. */
+  template<class U, class Deleter>
+  shared_ptr(U* pu, Deleter d) : shared_ptr_base(new detail::auximpl<U, Deleter>(pu, d)), pt(pu) {}
+  /** @brief Takes ownership of pu; the object is released with 'delete'. */
+  template<class U>
+  explicit shared_ptr(U* pu) : shared_ptr_base(new detail::auximpl<U, detail::default_deleter<U> >(pu, detail::default_deleter<U>())), pt(pu) {}
+  /** @brief Shares ownership with a shared_ptr<U>; requires U* to be convertible to T*. */
+  template<class U>
+  shared_ptr(const shared_ptr<U>& s) : shared_ptr_base(s.pa), pt(s.pt)  { inc(); }
+
+  shared_ptr(const shared_ptr& s) : shared_ptr_base(s.pa), pt(s.pt)  { inc(); }
+  ~shared_ptr() { dec(); }
+
+  T* get() const {  return pt; }
+  T* operator->() const {  return pt; }
+  T& operator*() const { return *pt; }
+
+  void reset() { shared_ptr<T>().swap(*this); }
+  void reset(T * ptr) { shared_ptr<T>(ptr).swap(*this); }
+
+  void swap(shared_ptr<T> & other)
+  {
+    std::swap(pt,other.pt);
+    std::swap(pa, other.pa);
+  }
+
+  shared_ptr& operator=(const shared_ptr& s)
+  {
+    if (this!=&s)
+    {
+      // Copy-and-swap: take the new reference before dropping the old one, so
+      // assigning from a pointer that is kept alive only by our current object
+      // (e.g. p = p->next) never reads s after it has been destroyed.
+      shared_ptr<T>(s).swap(*this);
+    }
+    return *this;
+  }
+
+  void inc()
+  {
+    if (pa) pa->count.inc(); // an empty pointer has no control block to count on
+  }
+
+  void dec()
+  {
+    if (pa)
+    {
+      pa->count.dec();
+      if (pa->count.is_null()) // we were the last owner
+      {
+        pa->destroy(); // release the managed object through its deleter
+        delete pa; // then free the control block itself
+        pa = NULL;
+      }
+    }
+  }
+};
+
+}
+}
+
+#endif
--- a/lib/backend/templates/base.cpp
+++ b/lib/backend/templates/base.cpp
@@ -30,44 +30,44 @@ numeric_type base::map_functor::get_numeric_type(atidlas::array_expression const

 /** @brief Binary leaf */
 template<class T>
-std::shared_ptr<mapped_object> base::map_functor::binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const
+tools::shared_ptr<mapped_object> base::map_functor::binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const
 {
-  return std::shared_ptr<mapped_object>(new T(numeric_type_to_string(array_expression->dtype()), binder_.get(NULL), mapped_object::node_info(mapping, array_expression, root_idx)));
+  return tools::shared_ptr<mapped_object>(new T(numeric_type_to_string(array_expression->dtype()), binder_.get(NULL), mapped_object::node_info(mapping, array_expression, root_idx)));
 }

 /** @brief Scalar mapping */
-std::shared_ptr<mapped_object> base::map_functor::create(numeric_type dtype, values_holder) const
+tools::shared_ptr<mapped_object> base::map_functor::create(numeric_type dtype, values_holder) const
 {
  std::string strdtype = numeric_type_to_string(dtype);
-  return std::shared_ptr<mapped_object>(new mapped_host_scalar(strdtype, binder_.get(NULL)));
+  return tools::shared_ptr<mapped_object>(new mapped_host_scalar(strdtype, binder_.get(NULL)));
 }

 /** @brief Vector mapping */
-std::shared_ptr<mapped_object> base::map_functor::create(array_infos const & a) const
+tools::shared_ptr<mapped_object> base::map_functor::create(array_infos const & a) const
 {
  std::string dtype = numeric_type_to_string(a.dtype);
  unsigned int id = binder_.get(a.data);
  //Scalar
  if(a.shape1==1 && a.shape2==1)
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 's'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 's'));
  //Column vector
  else if(a.shape1>1 && a.shape2==1)
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'c'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'c'));
  //Row vector
  else if(a.shape1==1 && a.shape2>1)
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'r'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'r'));
  //Matrix
  else
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'm'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'm'));
 }

-std::shared_ptr<mapped_object> base::map_functor::create(repeat_infos const &) const
+tools::shared_ptr<mapped_object> base::map_functor::create(repeat_infos const &) const
 {
  //TODO: Make it less specific!
-  return std::shared_ptr<mapped_object>(new mapped_tuple("int",binder_.get(NULL),4));
+  return tools::shared_ptr<mapped_object>(new mapped_tuple("int",binder_.get(NULL),4));
 }

-std::shared_ptr<mapped_object> base::map_functor::create(lhs_rhs_element const & lhs_rhs) const
+tools::shared_ptr<mapped_object> base::map_functor::create(lhs_rhs_element const & lhs_rhs) const
 {
  switch(lhs_rhs.type_family)
  {
@@ -111,7 +111,7 @@ void base::map_functor::operator()(atidlas::array_expression const & array_expre
    else if (root_node.op.type == OPERATOR_OUTER_PROD_TYPE)
      mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_outer>(&array_expression, root_idx, &mapping_)));
    else if (detail::is_cast(root_node.op))
-      mapping_.insert(mapping_type::value_type(key, std::shared_ptr<mapped_object>(new mapped_cast(root_node.op.type, binder_.get(NULL)))));
+      mapping_.insert(mapping_type::value_type(key, tools::shared_ptr<mapped_object>(new mapped_cast(root_node.op.type, binder_.get(NULL)))));
  }
 }

@@ -280,7 +280,7 @@ std::string base::generate_arguments(std::string const & data_type, std::vector<

 void base::set_arguments(expressions_tuple const & expressions, cl::Kernel & kernel, unsigned int & current_arg)
 {
-  std::shared_ptr<symbolic_binder> binder = make_binder();
+  tools::shared_ptr<symbolic_binder> binder = make_binder();
  for (const auto & elem : expressions.data())
    traverse(*elem, (elem)->root(), set_arguments_functor(*binder, current_arg, kernel), true);
 }
@@ -478,12 +478,12 @@ unsigned int base::align(unsigned int to_round, unsigned int base)
  return (to_round + base - 1)/base * base;
 }

-std::shared_ptr<symbolic_binder> base::make_binder()
+tools::shared_ptr<symbolic_binder> base::make_binder()
 {
  if (binding_policy_==BIND_TO_HANDLE)
-    return std::shared_ptr<symbolic_binder>(new bind_to_handle());
+    return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
  else
-    return std::shared_ptr<symbolic_binder>(new bind_all_unique());
+    return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
 }


@@ -509,7 +509,7 @@ std::vector<std::string> base::generate(unsigned int label, expressions_tuple co

  //Create mapping
  std::vector<mapping_type> mappings(expressions.data().size());
-  std::shared_ptr<symbolic_binder> binder = make_binder();
+  tools::shared_ptr<symbolic_binder> binder = make_binder();
  for (mit = mappings.begin(), sit = expressions.data().begin(); sit != expressions.data().end(); ++sit, ++mit)
    traverse(**sit, (*sit)->root(), map_functor(*binder,*mit), true);

@@ -533,8 +533,8 @@ int_t base_impl<TType, PType>::local_size_1() const
 { return p_.local_size_1; }

 template<class TType, class PType>
-std::shared_ptr<base> base_impl<TType, PType>::clone() const
-{ return std::shared_ptr<base>(new TType(*dynamic_cast<TType const *>(this))); }
+tools::shared_ptr<base> base_impl<TType, PType>::clone() const
+{ return tools::shared_ptr<base>(new TType(*dynamic_cast<TType const *>(this))); }

 template<class TType, class PType>
 int base_impl<TType, PType>::check_invalid(expressions_tuple const & expressions, cl::Device const & device) const
--- a/lib/backend/templates/mproduct.cpp
+++ b/lib/backend/templates/mproduct.cpp
@@ -587,7 +587,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
    kernel.setArg(current_arg++, cl_uint(N));
    kernel.setArg(current_arg++, cl_uint(K));

-    std::shared_ptr<symbolic_binder> binder = make_binder();
+    tools::shared_ptr<symbolic_binder> binder = make_binder();
    set_arguments_functor fun(*binder, current_arg, kernel);
    fun.set_arguments(C);
    fun.set_arguments(alpha.dtype(), alpha.values());
--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -82,11 +82,11 @@ std::vector<cl_ext::lazy_compiler>& model::init(controller<expressions_tuple> co
  return to_init;
 }

-model::model(predictors::random_forest const & predictor, std::vector< std::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :
+model::model(predictors::random_forest const & predictor, std::vector< tools::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :
  templates_(templates), predictor_(new predictors::random_forest(predictor)), queue_(queue)
 {}

-model::model(std::vector< std::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :  templates_(templates), queue_(queue)
+model::model(std::vector< tools::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :  templates_(templates), queue_(queue)
 {}

 model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
@@ -158,27 +158,27 @@ namespace detail
    throw std::invalid_argument("Invalid datatype: " + name);
  }

-  static std::shared_ptr<base> create(std::string const & template_name, std::vector<int> const & a)
+  static tools::shared_ptr<base> create(std::string const & template_name, std::vector<int> const & a)
  {
    fetching_policy_type fetch[] = {FETCH_FROM_LOCAL, FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_CONTIGUOUS};
    if(template_name=="vaxpy")
-      return std::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
+      return tools::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="dot")
-      return std::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
+      return tools::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="maxpy")
-      return std::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
+      return tools::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
    else if(template_name.find("gemvN")!=std::string::npos)
-      return std::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
+      return tools::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
    else if(template_name.find("gemvT")!=std::string::npos)
-      return std::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
+      return tools::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
    else if(template_name.find("gemmNN")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmTN")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmNT")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmTT")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else
      throw std::invalid_argument("Invalid expression: " + template_name);
  }
@@ -214,7 +214,7 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
          numeric_type dtype = detail::get_dtype(elem);

          // Get profiles
-          std::vector<std::shared_ptr<base> > templates;
+          std::vector<tools::shared_ptr<base> > templates;
          js::Value const & profiles = document[opcstr][dtcstr]["profiles"];
          for (js::SizeType id = 0 ; id < profiles.Size() ; ++id)
            templates.push_back(detail::create(operation, tools::to_int_array<int>(profiles[id])));
@@ -222,10 +222,10 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
          {
            // Get predictor
            predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);
-            result[std::make_pair(etype, dtype)] = std::shared_ptr<model>(new model(predictor, templates, queue));
+            result[std::make_pair(etype, dtype)] = tools::shared_ptr<model>(new model(predictor, templates, queue));
          }
          else
-            result[std::make_pair(etype, dtype)] = std::shared_ptr<model>(new model(templates, queue));
+            result[std::make_pair(etype, dtype)] = tools::shared_ptr<model>(new model(templates, queue));
        }
      }
    }
@@ -235,7 +235,7 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
 model_map_t init_models(cl::CommandQueue & queue)
 {
  model_map_t res;
-  typedef std::shared_ptr<model> ptr_t;
+  typedef tools::shared_ptr<model> ptr_t;
  numeric_type types[] = {CHAR_TYPE, UCHAR_TYPE, SHORT_TYPE, USHORT_TYPE, INT_TYPE, UINT_TYPE, LONG_TYPE, ULONG_TYPE, FLOAT_TYPE, DOUBLE_TYPE};

  for(auto DTYPE : types){
--- a/lib/symbolic/execute.cpp
+++ b/lib/symbolic/execute.cpp
@@ -173,30 +173,30 @@ namespace atidlas

    /*----Parse required temporaries-----*/
    detail::parse(tree, rootidx, current_type, breakpoints, final_type);
-    std::vector<std::shared_ptr<array> > temporaries_;
+    std::vector<tools::shared_ptr<array> > temporaries_;

    /*----Compute required temporaries----*/
    for(detail::breakpoints_t::reverse_iterator rit = breakpoints.rbegin() ; rit != breakpoints.rend() ; ++rit)
    {
-      std::shared_ptr<model> const & pmodel = models[std::make_pair(rit->first, dtype)];
+      tools::shared_ptr<model> const & pmodel = models[std::make_pair(rit->first, dtype)];
      array_expression::node const & node = tree[rit->second->node_index];
      array_expression::node const & lmost = lhs_most(tree, node);

      //Creates temporary
-      std::shared_ptr<array> tmp;
+      tools::shared_ptr<array> tmp;
      switch(rit->first){
        case SCALAR_AXPY_TYPE:
-        case REDUCTION_TYPE:           tmp = std::shared_ptr<array>(new array(1, dtype, context));                                                        break;
+        case REDUCTION_TYPE:           tmp = tools::shared_ptr<array>(new array(1, dtype, context));                                                        break;

-        case VECTOR_AXPY_TYPE:         tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
-        case ROW_WISE_REDUCTION_TYPE:  tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
-        case COL_WISE_REDUCTION_TYPE:  tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape2, dtype, context));                              break;
+        case VECTOR_AXPY_TYPE:         tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
+        case ROW_WISE_REDUCTION_TYPE:  tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
+        case COL_WISE_REDUCTION_TYPE:  tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape2, dtype, context));                              break;

-        case MATRIX_AXPY_TYPE:         tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, lmost.lhs.array.shape2, dtype, context)); break;
-        case MATRIX_PRODUCT_NN_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape2, dtype, context));   break;
-        case MATRIX_PRODUCT_NT_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape1, dtype, context));   break;
-        case MATRIX_PRODUCT_TN_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape2, dtype, context));   break;
-        case MATRIX_PRODUCT_TT_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape1, dtype, context));   break;
+        case MATRIX_AXPY_TYPE:         tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape1, lmost.lhs.array.shape2, dtype, context)); break;
+        case MATRIX_PRODUCT_NN_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape2, dtype, context));   break;
+        case MATRIX_PRODUCT_NT_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape1, dtype, context));   break;
+        case MATRIX_PRODUCT_TN_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape2, dtype, context));   break;
+        case MATRIX_PRODUCT_TT_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape1, dtype, context));   break;

        default: throw std::invalid_argument("Unrecognized operation");
      }
--- a/lib/symbolic/expression.cpp
+++ b/lib/symbolic/expression.cpp
@@ -176,11 +176,24 @@ array_expression array_expression::operator-()
 array_expression array_expression::operator!()
 { return array_expression(*this, invalid_node(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_NEGATE_TYPE), context_, INT_TYPE, shape_); }

+// operation_cache::push_back -- appends one record (queue, kernel, offset/global/local ranges, dependency list) to l_.
+void operation_cache::push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector<cl::Event>* dependencies)
+{
+  l_.push_back({queue, kernel, offset, global, local, dependencies}); // `dependencies` is forwarded as the pointer it was given; NOTE(review): braced init is C++11 while this commit drops -std=c++11 from the bench build -- confirm the library's own flags
+}
+
+void operation_cache::enqueue(std::list<cl::Event>* events) // enqueues every record stored in l_, appending one cl::Event per record to *events
+{
+  for(infos & i : l_){ // NOTE(review): range-for is C++11 -- confirm against the build flags left after this commit
+    events->push_back(cl::Event()); // new event slot for this launch; `events` is dereferenced without a null check
+    i.queue.enqueueNDRangeKernel(i.kernel, i.offset, i.global, i.local, i.dependencies, &events->back()); // the slot appended just above is the event argument of the enqueue call
+  }
+}

 //
-std::shared_ptr<array_expression> expressions_tuple::create(array_expression const & s)
+tools::shared_ptr<array_expression> expressions_tuple::create(array_expression const & s) // heap-allocates a new array_expression built from s and returns it wrapped in tools::shared_ptr
 {
-  return std::shared_ptr<array_expression>(new array_expression(static_cast<array_expression const &>(s)));
+  return tools::shared_ptr<array_expression>(new array_expression(static_cast<array_expression const &>(s))); // the static_cast targets the type s is already declared with
 }

 expressions_tuple::expressions_tuple(data_type const & data, order_type order) : data_(data), order_(order)
--- a/python/autotune/pysrc/autotune.py
+++ b/python/autotune/pysrc/autotune.py
@@ -143,7 +143,7 @@ def do_tuning(args):
                      A = atd.empty(sizes, datatype, context=context)
                      C = atd.empty(sizes, datatype, context=context)
                      return execute(A + C, sizes, Template, parameters, fname)
-                  tune(execution_handler, 100, 5000, 2, (),'log', 'log')
+                  tune(execution_handler, 64, 5000, 2, (),'log', 'log')
              #Row-wise dot
              if operation=='gemv':
                  for A_trans in  args.gemv_layouts:
@@ -152,7 +152,7 @@ def do_tuning(args):
                          x = atd.empty(sizes[1], datatype, context=context)
                          LHS = A if A_trans=='N' else A.T
                          return execute(atd.dot(LHS, x), sizes, Template[A_trans], parameters, fname)
-                      tune(execution_handler, 100, 5000, 2, (A_trans,),'log', 'log')
+                      tune(execution_handler, 64, 6000, 2, (A_trans,),'log', 'log')
              #Matrix Product
              if operation=='gemm':
                  for L in args.gemm_layouts:
@@ -194,7 +194,7 @@ class ArgumentsHandler:

        full_parser = tune_subparsers.add_parser('full', help = 'Tune each operation for randomly chosen sizes')
        full_parser.add_argument("--build-model", default=True, type=bool)
-        full_parser.add_argument("--sample-size", default=30, type=int)
+        full_parser.add_argument("--sample-size", default=60, type=int)

        args = parser.parse_args()
        self.__dict__ = args.__dict__.copy()
--- a/python/autotune/pysrc/misc_tools.py
+++ b/python/autotune/pysrc/misc_tools.py
@@ -218,11 +218,16 @@ def benchmark(template, symbolic):
        raise ValueError("Template has too low occupancy")
    else:
        queue.models[template, atd.float32] = atd.model(template, queue)
-        x = atd.array(symbolic)
-        atd.synchronize(symbolic.context)
        x, events, cache = atd.flush(symbolic)
        atd.synchronize(symbolic.context)
-        return 1e-9*sum([e.end - e.start for e in events])
+        timings = []
+        current_time = 0
+        while current_time < 1e-3:
+            x, events, cache = atd.flush(symbolic)
+            atd.synchronize(symbolic.context)
+            timings.append(1e-9*sum([e.end - e.start for e in events]))
+            current_time = current_time + timings[-1]
+        return np.median(timings)


 def sanitize_string(string, keep_chars = ['_']):
--- a/python/pyatidlas/external/boost/include/boost/functional/hash/extensions.hpp
+++ b/python/pyatidlas/external/boost/include/boost/functional/hash/extensions.hpp
@@ -193,7 +193,7 @@ namespace boost

 #if !defined(BOOST_NO_CXX11_SMART_PTR)
    template <typename T>
-    inline std::size_t hash_value(std::shared_ptr<T> const& x) {
+    inline std::size_t hash_value(std::shared_ptr<T> const& x) { // hashes the raw pointer held by x; stays on std::shared_ptr because no `tools` namespace is visible in this vendored Boost header
        return boost::hash_value(x.get());
    }

--- a/python/pyatidlas/external/boost/include/boost/get_pointer.hpp
+++ b/python/pyatidlas/external/boost/include/boost/get_pointer.hpp
@@ -36,7 +36,7 @@ template<class T> T * get_pointer( std::unique_ptr<T> const& p )
    return p.get();
 }

-template<class T> T * get_pointer( std::shared_ptr<T> const& p )
+template<class T> T * get_pointer( std::shared_ptr<T> const& p ) // returns the raw pointer held by p; stays on std::shared_ptr because no `tools` namespace is visible in this vendored Boost header
 {
    return p.get();
 }
--- a/python/pyatidlas/src/_atidlas.cpp
+++ b/python/pyatidlas/src/_atidlas.cpp
@@ -315,15 +315,14 @@ namespace detail
  boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
  { return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }

-  bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
+  bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile) // builds an atd::array from `expression` and returns (array, events, cache); `tune` is forwarded to dispatcher_options_type
  {
      std::list<cl::Event> events;
      atd::operation_cache cache;
      std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
      boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &events, &cache, &cdependencies),
-                                                                       atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));
-
-      return bp::make_tuple(*parray, to_list(events.begin(), events.end()), cache);
+                                                                       atd::dispatcher_options_type(tune, label), atd::compilation_options_type(program_name, force_recompile))));
+      return bp::make_tuple(parray, to_list(events.begin(), events.end()), cache); // the tuple now carries the shared_ptr itself (previously *parray); NOTE(review): cache was handed &cdependencies, a local -- if it retains that pointer, using the returned cache later would dangle; confirm
  }
 }

@@ -402,7 +401,7 @@ void export_cl()
  bp::def("synchronize", &atd::cl_ext::synchronize);
  bp::def("get_platforms", &detail::get_platforms);

-  bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
+  bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("tune") = false, bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));

  bp::class_<state_type>("state_type")
          .def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
--- a/tests/model.cpp
+++ b/tests/model.cpp
@@ -10,7 +10,7 @@ namespace ad = atidlas;
 int main()
 {
  viennacl::vector<float> x(10000), y(10000), z(10000);
-  std::map<std::string, ad::std::shared_ptr<ad::model> > models = ad::import("geforce_gt_540m.json");
+  std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import("geforce_gt_540m.json");
  models["vector-axpy-float32"]->tune(viennacl::symbolic_expression(z, viennacl::op_assign(), x));
  models["vector-axpy-float32"]->execute(viennacl::symbolic_expression(z, viennacl::op_assign(), x));
  return EXIT_SUCCESS;