removing C++11 interface

2015-02-08 23:19:38 -05:00
parent 85fb438806
commit a6d7671831
21 changed files with 423 additions and 956 deletions
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -33,7 +33,7 @@ else()
 endif()
 string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} " ${BLAS_DEF_STR} -std=c++11")
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} " ${BLAS_DEF_STR}")
 foreach(PROG blas overhead)
    include_directories(${CMAKE_CURRENT_SOURCE_DIR})
    if(CUDA_FOUND)
@@ -44,7 +44,7 @@ foreach(PROG blas overhead)
 	cuda_add_cublas_to_target(${PROG}-bench)
    else()
        add_executable(${PROG}-bench  ${PROG}.cpp)
-        set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-Wall -Wextra ${BLAS_DEF_STR} -std=c++11")
+        set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "-Wall -Wextra ${BLAS_DEF_STR}")
    endif()
     target_link_libraries(${PROG}-bench ${BLAS_LIBS})
 endforeach(PROG)
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -1,5 +1,6 @@
 #include "atidlas/array.h"
 #include "atidlas/symbolic/execute.h"
 #include "atidlas/tools/timer.hpp"
 #include "common.hpp"
 #ifdef BENCH_CLAMDBLAS
  #include "clAmdBlas.h"
@@ -13,27 +14,89 @@
 #include <iomanip>
 #include <stdlib.h>
 #include <cmath>
-#include <chrono>
+#include <numeric>
 namespace ad = atidlas;
 typedef ad::int_t int_t;
-template<class T>
+int ceil(int N, int pad)
 void bench(ad::numeric_type dtype)
 {
-  unsigned int dtsize = ad::size_of(dtype);
+    return (N%pad==0)?N:(N+pad-1)/pad*pad;
-  cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
+}
 /* Builds N sizes spread logarithmically from min towards max (the i/N step
  * never reaches 1, so max itself is not produced before padding), each one
  * passed through the file's ceil(int, int) helper with the given pad. */
 std::vector<int> create_log_range(int min, int max, int N, int pad)
 {
  std::vector<int> sizes(N);
  const double log_lo = std::log(min);
  // The span is narrowed to float on purpose: the interpolation term is
  // evaluated in single precision exactly as before.
  const float log_span = (float)(std::log(max) - log_lo);
  for(int k = 0 ; k < N ; ++k)
  {
    // Implicit double -> int conversion truncates the exponential.
    const int raw = std::exp(log_lo + log_span*k/N);
    sizes[k] = ceil(raw, pad);
  }
  return sizes;
 }
 /* Returns every value first, first+pad, first+2*pad, ... that is strictly
  * below the upper bound, where both bounds are min and max passed through the
  * file's ceil(int, int) helper. */
 std::vector<int> create_full_range(int min, int max, int pad)
 {
    const int first = ceil(min, pad);
    const int limit = ceil(max, pad);
    std::vector<int> range;
    int value = first;
    while(value < limit)
    {
        range.push_back(value);
        value += pad;
    }
    return range;
 }
 /* Chaining helper that collects values with operator<< and converts
  * implicitly to std::vector<T>, e.g. make_vector<int>() << 64. */
 template <typename T>
 class make_vector
 {
 public:
  typedef make_vector<T> my_type;

  /* Conversion to std::vector<T>: returns a copy of everything collected. */
  operator std::vector<T>() const
  {
    std::vector<T> copy(items_);
    return copy;
  }

  /* Appends val and returns *this so further << calls can be chained. */
  my_type& operator<< (const T& val)
  {
    items_.insert(items_.end(), val);
    return *this;
  }

 private:
  std::vector<T> items_;
 };
 /* Median of x. The argument is taken by value because it is sorted in place.
  * For an even number of samples the two middle values are averaged with T's
  * own division (so integer T truncates).
  * An empty input returns T() instead of reading x[size/2 - 1] out of bounds. */
 template<class T>
 T median(std::vector<T> x)
 {
  const size_t size = x.size();
  if (size == 0)
      return T();
  std::sort(x.begin(), x.end());
  const size_t mid = size / 2;
  if (size % 2 == 0)
      return (x[mid - 1] + x[mid]) / 2;
  return x[mid];
 }
 /* Arithmetic mean of x, computed in T (so integer T truncates).
  * An empty input returns T() instead of dividing by a zero element count. */
 template<class T>
 T mean(std::vector<T> x)
 {
  const int N = static_cast<int>(x.size());
  if (N == 0)
      return T();
  T res = 0;
  for(int i = 0 ; i < N ; ++i)
    res += x[i];
  return res/N;
 }
 /* Binary operation handed to std::accumulate over a list of cl::Event:
  * adds the event's (COMMAND_END - COMMAND_START) profiling interval to the
  * running sum and returns the result as a double. */
 static double time_event(unsigned long sum, cl::Event const & e)
 {
  return sum
         + e.getProfilingInfo<CL_PROFILING_COMMAND_END>()
         - e.getProfilingInfo<CL_PROFILING_COMMAND_START>();
 }
 template<class T>
 void bench(ad::numeric_type dtype){
 #define BENCHMARK_ATIDLAS(OP, PERF) \
  {\
  std::vector<long> times;\
  double total_time = 0;\
-  while(total_time*1e-9 < 1e-1){\
+  while(total_time*1e-9 < 1e-2){\
    std::list<cl::Event> events;\
    OP;\
    queue.finish();\
-    times.push_back(std::accumulate(events.begin(), events.end(), 0, \
+    times.push_back(std::accumulate(events.begin(), events.end(), 0, &time_event));\
                    [](unsigned long sum, cl::Event const & e){ return sum + e.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  e.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();}));\
    total_time+=times.back();\
  }\
  double t = median(times);\
@@ -44,11 +107,11 @@ void bench(ad::numeric_type dtype)
  {\
  std::vector<long> times;\
  double total_time = 0;\
-  while(total_time*1e-9 < 1e-1){\
+  while(total_time*1e-9 < 1e-2){\
    cl::Event event;\
    OP;\
    queue.finish();\
-    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
+    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -  event.getProfilingInfo<CL_PROFILING_COMMAND_START>());\
    total_time+=times.back();\
  }\
  double t = median(times);\
@@ -57,11 +120,11 @@ void bench(ad::numeric_type dtype)
 #define BENCHMARK_HOST(OP, PERF) \
  {\
  ad::tools::timer tmr;\
  std::vector<int> cache_flusher(10000000, 0);\
-  auto start = std::chrono::steady_clock::now();\
+  tmr.start();\
  OP;\
-  auto end = std::chrono::steady_clock::now();\
+  double t = 1e9*tmr.get();\
  double t = std::chrono::duration<double, std::nano>(end - start).count();\
  std::cout << " " << PERF << std::flush;\
  }
@@ -86,68 +149,49 @@ void bench(ad::numeric_type dtype)
  std::cout << " " << PERF << std::flush;\
  }
-  /*---------*/
+unsigned int dtsize = ad::size_of(dtype);
-  /*--BLAS1--*/
+cl::CommandQueue & queue = ad::cl_ext::queues[ad::cl_ext::default_context()][0];
  /*---------*/
  std::cout << "#AXPY" << std::endl;
  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
  {
    std::cout << N;
    ad::array x(N, dtype), y(N, dtype);
    /* ATIDLAS */
    y = x + y; queue.flush(); queue.finish();
    BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
    /* clAmdBlas */
 #ifdef BENCH_CLAMDBLAS
    BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 #endif
    /* BLAS */
 #ifdef BENCH_CBLAS
    std::vector<float> cx(N), cy(N);
    ad::copy(x, cx);
    ad::copy(y, cy);
    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 #endif
    /* CuBLAS */
 #ifdef BENCH_CUBLAS
    T *cux, *cuy;
    cudaMalloc((void**) &cux, N * sizeof(T));
    cudaMalloc((void**) &cuy, N * sizeof(T));
    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
    cudaFree(cux);
    cudaFree(cuy);
 #endif
    std::cout << std::endl;
  }
  std::cout << "\n\n" << std::flush;
-//  std::cout << "#DOT" << std::endl;
+  // BLAS1 Sizes
  static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
  // BLAS2 Sizes
  static const std::vector<int> BLAS2_N = make_vector<int>() << 64;
  static const std::vector<int> BLAS2_M = create_full_range(128, 10000, 64);
  // BLAS3 Sizes
  static const std::vector<int> BLAS3_M = make_vector<int>() << 1024;
  static const std::vector<int> BLAS3_N = make_vector<int>() << 128;
  static const std::vector<int> BLAS3_K = create_full_range(128, 5000, 64);
 //  /*---------*/
 //  /*--BLAS1--*/
 //  /*---------*/
 //  std::cout << "#AXPY" << std::endl;
 //  for(int_t N : create_log_range(1e3, 2e7, 50, 64))
 //  {
 //    std::cout << N;
 //    /* ATIDLAS */
 //    ad::array x(N, dtype), y(N, dtype);
-//    ad::array scratch(N, dtype);
+//    /* ATIDLAS */
-//    ad::scalar s(dtype);
+//    y = x + y; queue.finish();
-//    s = dot(x,y); queue.flush(); queue.finish();
+//    BENCHMARK_ATIDLAS(y = ad::control(x + y, ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 3*N*dtsize/t)
 //    BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
 //    /* clAmdBlas */
 //#ifdef BENCH_CLAMDBLAS
-//    BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
+//    BENCHMARK_CLAMDBLAS(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
 //#endif
 //    /* BLAS */
 //#ifdef BENCH_CBLAS
 //    std::vector<float> cx(N), cy(N);
 //    ad::copy(x, cx);
 //    ad::copy(y, cy);
-//    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
+//    BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
 //#endif
 //    /* CuBLAS */
 //#ifdef BENCH_CUBLAS
 //    T *cux, *cuy;
 //    T result;
 //    cudaMalloc((void**) &cux, N * sizeof(T));
 //    cudaMalloc((void**) &cuy, N * sizeof(T));
-//    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
+//    BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
 //    cudaFree(cux);
 //    cudaFree(cuy);
 //#endif
@@ -155,21 +199,56 @@ void bench(ad::numeric_type dtype)
 //  }
 //  std::cout << "\n\n" << std::flush;
  std::cout << "#DOT" << std::endl;
  for(int_t i = 0 ; i < BLAS1_N.size() ; ++i)
  {
    int_t N = BLAS1_N[i];
    std::cout << N;
    /* ATIDLAS */
    ad::array x(N, dtype), y(N, dtype);
    ad::array scratch(N, dtype);
    ad::scalar s(dtype);
    s = dot(x,y); queue.finish();
    BENCHMARK_ATIDLAS(s = ad::control(dot(x,y), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)), 2*N*dtsize/t)
    /* clAmdBlas */
 #ifdef BENCH_CLAMDBLAS
    BENCHMARK_CLAMDBLAS(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
 #endif
    /* BLAS */
 #ifdef BENCH_CBLAS
    std::vector<float> cx(N), cy(N);
    ad::copy(x, cx);
    ad::copy(y, cy);
    BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
 #endif
 #ifdef BENCH_CUBLAS
    T *cux, *cuy;
    T result;
    cudaMalloc((void**) &cux, N * sizeof(T));
    cudaMalloc((void**) &cuy, N * sizeof(T));
    BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
    cudaFree(cux);
    cudaFree(cuy);
 #endif
    std::cout << std::endl;
  }
  std::cout << "\n\n" << std::flush;
 //  /*---------*/
 //  /*--BLAS2--*/
 //  /*---------*/
 //  //T-layout
 //  std::cout << "#GEMV-T" << std::endl;
-//  for(int_t N: std::vector<int>{64})
+//  for(int_t N: std::vector<int>{128})
 //    for(int_t M: create_full_range(128, 10000, 64))
 //    {
 //      std::cout << M << "," << N;
 //      /* ATIDLAS */
 //      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
-//      y = dot(trans(A),x); queue.flush(); queue.finish();
+//      y = dot(trans(A),x); queue.finish();
-//      BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
+//      BENCHMARK_ATIDLAS(y = ad::control(dot(trans(A),x), ad::execution_options_type(0, &events), ad::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
 //  #ifdef BENCH_CLAMDBLAS
-//      BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
+//      BENCHMARK_CLAMDBLAS(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
 //  #endif
 //  #ifdef BENCH_CBLAS
 //      std::vector<float> cA(N*M), cx(N), cy(M);
@@ -192,9 +271,9 @@ void bench(ad::numeric_type dtype)
 //    }
 //    std::cout << "\n\n" << std::flush;
-////  /*---------*/
+//  /*---------*/
-////  /*--BLAS3--*/
+//  /*--BLAS3--*/
-////  /*---------*/
+//  /*---------*/
 //    std::cout << "#GEMM-NT" << std::endl;
 //    for(std::vector<int_t>::const_iterator Mit = BLAS3_M.begin() ; Mit != BLAS3_M.end() ; ++Mit)
 //    for(std::vector<int_t>::const_iterator Nit = BLAS3_N.begin() ; Nit != BLAS3_N.end() ; ++Nit)
@@ -240,8 +319,8 @@ int main(int argc, char* argv[])
      std::cerr << "usage : blas-bench [DEVICE_IDX]" << std::endl;
      std::cout << "Devices available: " << std::endl;
      unsigned int current=0;
-      for(const auto & queue : queues){
+      for(ad::cl_ext::queues_type::data_type::const_iterator it = queues.begin() ; it != queues.end() ; ++it){
-        cl::Device device = queue.first.getInfo<CL_CONTEXT_DEVICES>()[0];
+        cl::Device device = it->first.getInfo<CL_CONTEXT_DEVICES>()[0];
        std::cout << current++ << ": " << device.getInfo<CL_DEVICE_NAME>() << "(" << cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_NAME>() << ")" << std::endl;
      }
      exit(EXIT_FAILURE);
--- a/bench/common.hpp
+++ b/bench/common.hpp
@@ -5,83 +5,6 @@
 #include <cmath>
 #include <algorithm>
 /* Rounds N up to a multiple of pad (for non-negative N and positive pad).
  * A value that is already a multiple of pad is returned unchanged. */
 int ceil(int N, int pad)
 {
    if(N % pad == 0)
        return N;
    const int blocks = (N + pad - 1) / pad;
    return blocks * pad;
 }
 /* Builds N sizes spread logarithmically from min towards max (the i/N step
  * never reaches 1, so max itself is not produced before padding), each one
  * rounded with ceil(int, int) using the given pad. */
 std::vector<int> create_log_range(int min, int max, int N, int pad)
 {
  std::vector<int> sizes(N);
  const double log_lo = std::log(min);
  // The span is narrowed to float on purpose: the interpolation term is
  // evaluated in single precision exactly as before.
  const float log_span = (float)(std::log(max) - log_lo);
  for(int k = 0 ; k < N ; ++k)
  {
    // Implicit double -> int conversion truncates the exponential.
    const int raw = std::exp(log_lo + log_span*k/N);
    sizes[k] = ceil(raw, pad);
  }
  return sizes;
 }
 /* Returns every value first, first+pad, first+2*pad, ... that is strictly
  * below the upper bound, where both bounds are min and max rounded with
  * ceil(int, int). */
 std::vector<int> create_full_range(int min, int max, int pad)
 {
    const int first = ceil(min, pad);
    const int limit = ceil(max, pad);
    std::vector<int> range;
    int value = first;
    while(value < limit)
    {
        range.push_back(value);
        value += pad;
    }
    return range;
 }
 /* Chaining helper that collects values with operator<< and converts
  * implicitly to std::vector<T>, e.g. make_vector<int>() << 64. */
 template <typename T>
 class make_vector
 {
 public:
  typedef make_vector<T> my_type;

  /* Conversion to std::vector<T>: returns a copy of everything collected. */
  operator std::vector<T>() const
  {
    std::vector<T> copy(items_);
    return copy;
  }

  /* Appends val and returns *this so further << calls can be chained. */
  my_type& operator<< (const T& val)
  {
    items_.insert(items_.end(), val);
    return *this;
  }

 private:
  std::vector<T> items_;
 };
 // BLAS1 Sizes
 // 50 logarithmically spaced values starting at 1e3 and growing towards 2e7,
 // each rounded up to a multiple of 64 by create_log_range().
 static const std::vector<int> BLAS1_N = create_log_range(1e3, 2e7, 50, 64);
 // BLAS2 Sizes
 // Single fixed value for N; M sweeps 128 up to (excluding) ceil(10000, 64)
 // in steps of 64.
 static const std::vector<int> BLAS2_N = make_vector<int>() << 64;
 static const std::vector<int> BLAS2_M = create_full_range(128, 10000, 64);
 // BLAS3 Sizes
 // M and N are single fixed values; K sweeps 128 up to (excluding)
 // ceil(5000, 64) in steps of 64.
 static const std::vector<int> BLAS3_M = make_vector<int>() << 1024;
 static const std::vector<int> BLAS3_N = make_vector<int>() << 128;
 static const std::vector<int> BLAS3_K = create_full_range(128, 5000, 64);
 /* Returns N * dtsize * 1e-9 / t, i.e. the number of bytes in N elements of
  * dtsize bytes each, scaled by 1e-9 and divided by the elapsed time t.
  * NOTE(review): this is GB/s only if t is in seconds - confirm with callers. */
 double bandwidth(std::size_t N, double t, unsigned int dtsize)
 {
  const double scaled_bytes = N * dtsize * 1e-9;
  return scaled_bytes / t;
 }
 /* Returns nops * 1e-9 / t: the operation count scaled by 1e-9 and divided
  * by the elapsed time t.
  * NOTE(review): this is GFLOP/s only if t is in seconds - confirm with callers. */
 double gflops(double nops, double t)
 {
  const double scaled_ops = nops * 1e-9;
  return scaled_ops / t;
 }
 /* Median of x. The argument is taken by value because it is sorted in place.
  * For an even number of samples the two middle values are averaged with T's
  * own division (so integer T truncates).
  * An empty input returns T() instead of reading x[size/2 - 1] out of bounds. */
 template<class T>
 T median(std::vector<T> x)
 {
  const size_t size = x.size();
  if (size == 0)
      return T();
  std::sort(x.begin(), x.end());
  const size_t mid = size / 2;
  if (size % 2 == 0)
      return (x[mid - 1] + x[mid]) / 2;
  return x[mid];
 }
 /* Arithmetic mean of x, computed in T (so integer T truncates).
  * An empty input returns T() instead of dividing by a zero element count. */
 template<class T>
 T mean(std::vector<T> x)
 {
  const int N = static_cast<int>(x.size());
  if (N == 0)
      return T();
  T res = 0;
  for(int i = 0 ; i < N ; ++i)
    res += x[i];
  return res/N;
 }
 #endif
--- a/include/CL/cl.hpp
+++ b/include/CL/cl.hpp
@@ -160,26 +160,21 @@
 #pragma push_macro("max")
 #undef max
-#if defined(USE_DX_INTEROP)
+
 #include <CL/cl_d3d10.h>
 #include <CL/cl_dx9_media_sharing.h>
 #endif
 #endif // _WIN32
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenCL/opencl.h>
 #include <libkern/OSAtomic.h>
 #else
 #include <CL/opencl.h>
 #endif // !__APPLE__
 // 
 #if defined(USE_CL_DEVICE_FISSION)
 #include <CL/cl_ext.h>
 #endif
 #if defined(__APPLE__) || defined(__MACOSX)
 #include <OpenGL/OpenGL.h>
 #include <OpenCL/opencl.h>
 #include <libkern/OSAtomic.h>
 #else
 #include <GL/gl.h>
 #include <CL/opencl.h>
 #endif // !__APPLE__
 // To avoid accidentally taking ownership of core OpenCL types
 // such as cl_kernel constructors are made explicit
 // under OpenCL 1.2
@@ -367,9 +362,6 @@ static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
 #define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
 #define __COPY_ERR                          __ERR_STR(cl::copy)
 #define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
 #define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
 #define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLBuffer)
 #define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
 #if defined(CL_VERSION_1_2)
 #define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
 #define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
@@ -3219,266 +3211,6 @@ public:
 #endif
 };
 #if defined (USE_DX_INTEROP)
 /*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
 *
 *  This is provided to facilitate interoperability with Direct3D.
 * 
 *  See Memory for details about copy semantics, etc.
 *
 *  \see Memory
 */
 class BufferD3D10 : public Buffer
 {
 public:
    // Signature of the extension entry point that is looked up and called by
    // the constructor below.
    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
    cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
    cl_int* errcode_ret);
    /*! \brief Constructs a BufferD3D10, in a specified context, from a
     *         given ID3D10Buffer.
     *
     *  Wraps clCreateFromD3D10BufferKHR().
     *
     *  \param context Context the buffer is created in.
     *  \param flags   Memory flags forwarded unchanged to the extension call.
     *  \param bufobj  Direct3D 10 buffer forwarded unchanged to the extension call.
     *  \param err     Optional; when non-NULL receives the error code
     *                 reported by the extension call.
     */
    BufferD3D10(
        const Context& context,
        cl_mem_flags flags,
        ID3D10Buffer* bufobj,
        cl_int * err = NULL)
    {
        // Function-local static: the pointer value persists across calls.
        // The name must stay pfn_<function> - presumably the
        // __INIT_CL_EXT_FCN_PTR* macros below build it by token pasting
        // (macro definitions are not visible here; confirm before renaming).
        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
 #if defined(CL_VERSION_1_2)
        // Scan the context properties for the platform entry; the value
        // following CL_CONTEXT_PLATFORM is taken as the platform.
        // NOTE(review): `cl_platform` is not a type declared anywhere in the
        // visible code (the OpenCL C API type is cl_platform_id), and `i` is a
        // signed int compared against props.size() - confirm this branch
        // actually builds with USE_DX_INTEROP on OpenCL 1.2 headers.
        vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
        cl_platform platform = -1;
        for( int i = 0; i < props.size(); ++i ) {
            if( props[i] == CL_CONTEXT_PLATFORM ) {
                platform = props[i+1];
            }
        }
        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
 #endif
        // NOTE(review): this block is not an #else/#elif of the one above, so
        // both lookups run whenever both version macros are defined.
 #if defined(CL_VERSION_1_1)
        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
 #endif
        cl_int error;
        object_ = pfn_clCreateFromD3D10BufferKHR(
            context(),
            flags,
            bufobj,
            &error);
        // NOTE(review): the error string is the GL-buffer one
        // (__CREATE_GL_BUFFER_ERR) although the call is the D3D10 entry point.
        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
        if (err != NULL) {
            *err = error;
        }
    }
    //! \brief Default constructor - initializes to NULL.
    BufferD3D10() : Buffer() { }
    /*! \brief Copy constructor - performs shallow copy.
     *
     *  See Memory for further details.
     */
    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
    /*! \brief Constructor from cl_mem - takes ownership.
     *
     *  See Memory for further details.
     */
    __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
    /*! \brief Assignment from BufferD3D10 - performs shallow copy.
     *
     *  Self-assignment is detected and skipped.
     *  See Memory for further details.
     */
    BufferD3D10& operator = (const BufferD3D10& rhs)
    {
        if (this != &rhs) {
            Buffer::operator=(rhs);
        }
        return *this;
    }
    /*! \brief Assignment from cl_mem - forwards to Buffer::operator=.
     *
     *  See Memory for further details.
     */
    BufferD3D10& operator = (const cl_mem& rhs)
    {
        Buffer::operator=(rhs);
        return *this;
    }
 };
 #endif
 /*! \brief Class interface for GL Buffer Memory Objects.
 *
 *  This is provided to facilitate interoperability with OpenGL.
 * 
 *  See Memory for details about copy semantics, etc.
 * 
 *  \see Memory
 */
 class BufferGL : public Buffer
 {
 public:
    /*! \brief Constructs a BufferGL in a specified context, from a given
     *         GL buffer.
     *
     *  Wraps clCreateFromGLBuffer().
     */
    BufferGL(
        const Context& context,
        cl_mem_flags flags,
        GLuint bufobj,
        cl_int * err = NULL)
    {
        cl_int error;
        object_ = ::clCreateFromGLBuffer(
            context(),
            flags,
            bufobj,
            &error);
        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
        if (err != NULL) {
            *err = error;
        }
    }
    //! \brief Default constructor - initializes to NULL.
    BufferGL() : Buffer() { }
    /*! \brief Copy constructor - performs shallow copy.
     *
     *  See Memory for further details.
     */
    BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
    /*! \brief Constructor from cl_mem - takes ownership.
     *
     *  See Memory for further details.
     */
    __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
    /*! \brief Assignment from BufferGL - performs shallow copy.
     *
     *  See Memory for further details.
     */
    BufferGL& operator = (const BufferGL& rhs)
    {
        if (this != &rhs) {
            Buffer::operator=(rhs);
        }
        return *this;
    }
    /*! \brief Assignment from cl_mem - performs shallow copy.
     *
     *  See Memory for further details.
     */
    BufferGL& operator = (const cl_mem& rhs)
    {
        Buffer::operator=(rhs);
        return *this;
    }
    //! \brief Wrapper for clGetGLObjectInfo().
    cl_int getObjectInfo(
        cl_gl_object_type *type,
        GLuint * gl_object_name)
    {
        return detail::errHandler(
            ::clGetGLObjectInfo(object_,type,gl_object_name),
            __GET_GL_OBJECT_INFO_ERR);
    }
 };
 /*! \brief Class interface for GL Render Buffer Memory Objects.
 *
 *  This is provided to facilitate interoperability with OpenGL.
 * 
 *  See Memory for details about copy semantics, etc.
 * 
 *  \see Memory
 */
 class BufferRenderGL : public Buffer
 {
 public:
    /*! \brief Constructs a BufferRenderGL in a specified context, from a given
     *         GL Renderbuffer.
     *
     *  Wraps clCreateFromGLRenderbuffer(). The error code is passed through
     *  detail::errHandler() and, when \p err is non-NULL, also stored in it.
     */
    BufferRenderGL(
        const Context& context,
        cl_mem_flags flags,
        GLuint bufobj,
        cl_int * err = NULL)
    {
        cl_int error;
        object_ = ::clCreateFromGLRenderbuffer(
            context(),
            flags,
            bufobj,
            &error);
        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
        if (err != NULL) {
            *err = error;
        }
    }
    //! \brief Default constructor - initializes to NULL.
    BufferRenderGL() : Buffer() { }
    /*! \brief Copy constructor - performs shallow copy.
     *
     *  Declared explicitly so the class has a real copy constructor, like its
     *  sibling wrappers; it copies the Buffer base exactly as the implicitly
     *  generated one did.
     *
     *  See Memory for further details.
     */
    BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { }
    /*! \brief Conversion from BufferGL - performs shallow copy.
     *
     *  This overload was previously documented as the copy constructor but
     *  takes a BufferGL; it is kept so existing callers keep compiling.
     *  NOTE(review): it lets a GL buffer object be wrapped as a render
     *  buffer - confirm whether that conversion is intended.
     */
    BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { }
    /*! \brief Constructor from cl_mem - takes ownership.
     *
     *  See Memory for further details.
     */
    __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
    /*! \brief Assignment from BufferRenderGL - performs shallow copy.
     *
     *  Self-assignment is detected and skipped.
     *  See Memory for further details.
     */
    BufferRenderGL& operator = (const BufferRenderGL& rhs)
    {
        if (this != &rhs) {
            Buffer::operator=(rhs);
        }
        return *this;
    }
    /*! \brief Assignment from cl_mem - forwards to Buffer::operator=.
     *
     *  See Memory for further details.
     */
    BufferRenderGL& operator = (const cl_mem& rhs)
    {
        Buffer::operator=(rhs);
        return *this;
    }
    //! \brief Wrapper for clGetGLObjectInfo().
    cl_int getObjectInfo(
        cl_gl_object_type *type,
        GLuint * gl_object_name)
    {
        return detail::errHandler(
            ::clGetGLObjectInfo(object_,type,gl_object_name),
            __GET_GL_OBJECT_INFO_ERR);
    }
 };
 /*! \brief C++ base class for Image Memory objects.
 *
@@ -3869,86 +3601,6 @@ public:
 };
 #if !defined(CL_VERSION_1_2)
 /*! \brief Class interface for GL 2D Image Memory objects.
 *
 *  This is provided to facilitate interoperability with OpenGL.
 * 
 *  See Memory for details about copy semantics, etc.
 * 
 *  \see Memory
 *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
 */
 class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
 {
 public:
    //! \brief Default constructor - delegates to Image2D().
    Image2DGL() : Image2D() { }

    /*! \brief Creates an Image2DGL in \p context from the GL texture \p texobj.
     *
     *  Wraps clCreateFromGLTexture2D(). The error code is passed through
     *  detail::errHandler() and, when \p err is non-NULL, also stored in it.
     */
    Image2DGL(
        const Context& context,
        cl_mem_flags flags,
        GLenum target,
        GLint  miplevel,
        GLuint texobj,
        cl_int * err = NULL)
    {
        cl_int status;
        object_ = ::clCreateFromGLTexture2D(
            context(), flags, target, miplevel, texobj, &status);
        detail::errHandler(status, __CREATE_GL_TEXTURE_2D_ERR);
        if (err != NULL) {
            *err = status;
        }
    }

    /*! \brief Copy constructor - shallow copy through Image2D.
     *
     *  See Memory for further details.
     */
    Image2DGL(const Image2DGL& image) : Image2D(image) { }

    /*! \brief Constructor from cl_mem - takes ownership.
     *
     *  See Memory for further details.
     */
    __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }

    /*! \brief Assignment from Image2DGL - shallow copy, no-op on self-assignment.
     *
     *  See Memory for further details.
     */
    Image2DGL& operator = (const Image2DGL& rhs)
    {
        if (this == &rhs) {
            return *this;
        }
        Image2D::operator=(rhs);
        return *this;
    }

    /*! \brief Assignment from cl_mem - forwards to Image2D::operator=.
     *
     *  See Memory for further details.
     */
    Image2DGL& operator = (const cl_mem& rhs)
    {
        Image2D::operator=(rhs);
        return *this;
    }
 };
 #endif // #if !defined(CL_VERSION_1_2)
 #if defined(CL_VERSION_1_2)
 /*! \class Image2DArray
@@ -4138,246 +3790,6 @@ public:
    }
 };
 #if !defined(CL_VERSION_1_2)
 /*! \brief Class interface for GL 3D Image Memory objects.
 *
 *  This is provided to facilitate interoperability with OpenGL.
 * 
 *  See Memory for details about copy semantics, etc.
 * 
 *  \see Memory
 */
 class Image3DGL : public Image3D
 {
 public:
    /*! \brief Constructs an Image3DGL in a specified context, from a given
     *         GL Texture.
     *
     *  Wraps clCreateFromGLTexture3D().
     */
    Image3DGL(
        const Context& context,
        cl_mem_flags flags,
        GLenum target,
        GLint  miplevel,
        GLuint texobj,
        cl_int * err = NULL)
    {
        cl_int error;
        object_ = ::clCreateFromGLTexture3D(
            context(),
            flags,
            target,
            miplevel,
            texobj,
            &error);
        detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
        if (err != NULL) {
            *err = error;
        }
    }
    //! \brief Default constructor - initializes to NULL.
    Image3DGL() : Image3D() { }
    /*! \brief Copy constructor - performs shallow copy.
     *
     *  See Memory for further details.
     */
    Image3DGL(const Image3DGL& image) : Image3D(image) { }
    /*! \brief Constructor from cl_mem - takes ownership.
     *
     *  See Memory for further details.
     */
    __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
    /*! \brief Assignment from Image3DGL - performs shallow copy.
     *
     *  See Memory for further details.
     */
    Image3DGL& operator = (const Image3DGL& rhs)
    {
        if (this != &rhs) {
            Image3D::operator=(rhs);
        }
        return *this;
    }
    /*! \brief Assignment from cl_mem - performs shallow copy.
     *
     *  See Memory for further details.
     */
    Image3DGL& operator = (const cl_mem& rhs)
    {
        Image3D::operator=(rhs);
        return *this;
    }
 };
 #endif // #if !defined(CL_VERSION_1_2)
 #if defined(CL_VERSION_1_2)
 /*! \class ImageGL
 * \brief general image interface for GL interop.
 * We abstract the 2D and 3D GL images into a single instance here
 * that wraps all GL sourced images on the grounds that setup information
 * was performed by OpenCL anyway.
 */
 class ImageGL : public Image
 {
 public:
    ImageGL(
        const Context& context,
        cl_mem_flags flags,
        GLenum target,
        GLint  miplevel,
        GLuint texobj,
        cl_int * err = NULL)
    {
        cl_int error;
        object_ = ::clCreateFromGLTexture(
            context(), 
            flags, 
            target,
            miplevel,
            texobj,
            &error);
        detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
        if (err != NULL) {
            *err = error;
        }
    }
    ImageGL() : Image() { }
    ImageGL(const ImageGL& image) : Image(image) { }
    __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
    ImageGL& operator = (const ImageGL& rhs)
    {
        if (this != &rhs) {
            Image::operator=(rhs);
        }
        return *this;
    }
    ImageGL& operator = (const cl_mem& rhs)
    {
        Image::operator=(rhs);
        return *this;
    }
 };
 #endif // #if defined(CL_VERSION_1_2)
 /*! \brief Class interface for cl_sampler.
 *
 *  \note Copies of these objects are shallow, meaning that the copy will refer
 *        to the same underlying cl_sampler as the original.  For details, see
 *        clRetainSampler() and clReleaseSampler().
 *
 *  \see cl_sampler 
 */
 class Sampler : public detail::Wrapper<cl_sampler>
 {
 public:
    //! \brief Default constructor - initializes to NULL.
    Sampler() { }

    /*! \brief Creates a Sampler in \p context.
     *
     *  Wraps clCreateSampler(). The error code is passed through
     *  detail::errHandler() and, when \p err is non-NULL, also stored in it.
     */
    Sampler(
        const Context& context,
        cl_bool normalized_coords,
        cl_addressing_mode addressing_mode,
        cl_filter_mode filter_mode,
        cl_int* err = NULL)
    {
        cl_int status;
        object_ = ::clCreateSampler(
            context(), normalized_coords, addressing_mode, filter_mode, &status);
        detail::errHandler(status, __CREATE_SAMPLER_ERR);
        if (err != NULL) {
            *err = status;
        }
    }

    /*! \brief Copy constructor - performs shallow copy.
     *
     *  This calls clRetainSampler() on the parameter's cl_sampler.
     */
    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }

    /*! \brief Constructor from cl_sampler - takes ownership.
     *
     *  Ownership of one refcount on the cl_sampler moves into the new
     *  Sampler object.
     */
    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }

    /*! \brief Destructor.
     *
     *  This calls clReleaseSampler() on the value held by this instance.
     */
    ~Sampler() { }

    /*! \brief Assignment operator from Sampler, no-op on self-assignment.
     *
     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
     *  on the previous value held by this instance.
     */
    Sampler& operator = (const Sampler& rhs)
    {
        if (this == &rhs) {
            return *this;
        }
        detail::Wrapper<cl_type>::operator=(rhs);
        return *this;
    }

    /*! \brief Assignment operator from cl_sampler - takes ownership.
     *
     *  Ownership of one refcount on rhs moves into this object and
     *  clReleaseSampler() is called on the value previously held.
     */
    Sampler& operator = (const cl_sampler& rhs)
    {
        detail::Wrapper<cl_type>::operator=(rhs);
        return *this;
    }

    //! \brief Wrapper for clGetSamplerInfo() that writes into \p param.
    template <typename T>
    cl_int getInfo(cl_sampler_info name, T* param) const
    {
        const cl_int status =
            detail::getInfo(&::clGetSamplerInfo, object_, name, param);
        return detail::errHandler(status, __GET_SAMPLER_INFO_ERR);
    }

    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
    template <cl_int name> typename
    detail::param_traits<detail::cl_sampler_info, name>::param_type
    getInfo(cl_int* err = NULL) const
    {
        typedef typename detail::param_traits<
            detail::cl_sampler_info, name>::param_type value_type;
        value_type value;
        const cl_int status = getInfo(name, &value);
        if (err != NULL) {
            *err = status;
        }
        return value;
    }
 };
 class Program;
 class CommandQueue;
 class Kernel;
@@ -6012,126 +5424,8 @@ public:
    }
 #endif // #if defined(CL_VERSION_1_1)
    /*! \brief Wrapper for clEnqueueAcquireGLObjects().
     *
     *  \param mem_objects Optional list of memory objects to acquire. A NULL
     *                     pointer or an empty vector is passed on as
     *                     (0, NULL).
     *  \param events      Optional wait list. A NULL pointer or an empty
     *                     vector is passed on as (0, NULL).
     *  \param event       Optional; on CL_SUCCESS it receives the event
     *                     produced by the command.
     *  \return The status of the call after it has been passed through
     *          detail::errHandler() with __ENQUEUE_ACQUIRE_GL_ERR.
     */
    cl_int enqueueAcquireGLObjects(
         const VECTOR_CLASS<Memory>* mem_objects = NULL,
         const VECTOR_CLASS<Event>* events = NULL,
         Event* event = NULL) const
     {
        // front() on an empty vector is undefined behaviour, so both lists
        // are only dereferenced when they actually hold elements.
        const bool has_mem_objects = (mem_objects != NULL && mem_objects->size() > 0);
        const bool has_events = (events != NULL && events->size() > 0);
        cl_event tmp;
        cl_int err = detail::errHandler(
             ::clEnqueueAcquireGLObjects(
                 object_,
                 has_mem_objects ? (cl_uint) mem_objects->size() : 0,
                 has_mem_objects ? (const cl_mem *) &mem_objects->front() : NULL,
                 has_events ? (cl_uint) events->size() : 0,
                 has_events ? (cl_event*) &events->front() : NULL,
                 (event != NULL) ? &tmp : NULL),
             __ENQUEUE_ACQUIRE_GL_ERR);
        // tmp is only meaningful when the enqueue succeeded.
        if (event != NULL && err == CL_SUCCESS)
            *event = tmp;
        return err;
     }
    /*! \brief Wrapper for clEnqueueReleaseGLObjects().
     *
     *  \param mem_objects Optional list of memory objects to release. A NULL
     *                     pointer or an empty vector is passed on as
     *                     (0, NULL).
     *  \param events      Optional wait list. A NULL pointer or an empty
     *                     vector is passed on as (0, NULL).
     *  \param event       Optional; on CL_SUCCESS it receives the event
     *                     produced by the command.
     *  \return The status of the call after it has been passed through
     *          detail::errHandler() with __ENQUEUE_RELEASE_GL_ERR.
     */
    cl_int enqueueReleaseGLObjects(
         const VECTOR_CLASS<Memory>* mem_objects = NULL,
         const VECTOR_CLASS<Event>* events = NULL,
         Event* event = NULL) const
     {
        // front() on an empty vector is undefined behaviour, so both lists
        // are only dereferenced when they actually hold elements.
        const bool has_mem_objects = (mem_objects != NULL && mem_objects->size() > 0);
        const bool has_events = (events != NULL && events->size() > 0);
        cl_event tmp;
        cl_int err = detail::errHandler(
             ::clEnqueueReleaseGLObjects(
                 object_,
                 has_mem_objects ? (cl_uint) mem_objects->size() : 0,
                 has_mem_objects ? (const cl_mem *) &mem_objects->front() : NULL,
                 has_events ? (cl_uint) events->size() : 0,
                 has_events ? (cl_event*) &events->front() : NULL,
                 (event != NULL) ? &tmp : NULL),
             __ENQUEUE_RELEASE_GL_ERR);
        // tmp is only meaningful when the enqueue succeeded.
        if (event != NULL && err == CL_SUCCESS)
            *event = tmp;
        return err;
     }
 #if defined (USE_DX_INTEROP)
 typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
    cl_command_queue command_queue, cl_uint num_objects,
    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list, cl_event* event);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
    cl_command_queue command_queue, cl_uint num_objects,
    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
    const cl_event* event_wait_list, cl_event* event);
    /*! \brief Enqueues an acquire of D3D10 objects through the
     *         clEnqueueAcquireD3D10ObjectsKHR extension entry point.
     *
     *  \param mem_objects Optional list of memory objects to acquire. A NULL
     *                     pointer or an empty vector is passed on as
     *                     (0, NULL).
     *  \param events      Optional wait list. A NULL pointer or an empty
     *                     vector is passed on as (0, NULL).
     *  \param event       Optional; on CL_SUCCESS it receives the event
     *                     produced by the command.
     *  \return The status of the call after it has been passed through
     *          detail::errHandler().
     */
    cl_int enqueueAcquireD3D10Objects(
         const VECTOR_CLASS<Memory>* mem_objects = NULL,
         const VECTOR_CLASS<Event>* events = NULL,
         Event* event = NULL) const
    {
        // Function-local static: the pointer survives across calls.
        // NOTE(review): presumably filled in by the __INIT_CL_EXT_FCN_PTR*
        // macros below - confirm against their definitions.
        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
 #if defined(CL_VERSION_1_2)
        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
 #endif
 #if defined(CL_VERSION_1_1)
        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
 #endif
        // front() on an empty vector is undefined behaviour, so both lists
        // are only dereferenced when they actually hold elements.
        const bool has_mem_objects = (mem_objects != NULL && mem_objects->size() > 0);
        const bool has_events = (events != NULL && events->size() > 0);
        cl_event tmp;
        // NOTE(review): the error string is the GL acquire one; kept as-is,
        // confirm whether a D3D10-specific string exists.
        cl_int err = detail::errHandler(
             pfn_clEnqueueAcquireD3D10ObjectsKHR(
                 object_,
                 has_mem_objects ? (cl_uint) mem_objects->size() : 0,
                 has_mem_objects ? (const cl_mem *) &mem_objects->front() : NULL,
                 has_events ? (cl_uint) events->size() : 0,
                 has_events ? (cl_event*) &events->front() : NULL,
                 (event != NULL) ? &tmp : NULL),
             __ENQUEUE_ACQUIRE_GL_ERR);
        // tmp is only meaningful when the enqueue succeeded.
        if (event != NULL && err == CL_SUCCESS)
            *event = tmp;
        return err;
     }
    /*! \brief Enqueues a release of D3D10 objects through the
     *         clEnqueueReleaseD3D10ObjectsKHR extension entry point.
     *
     *  \param mem_objects Optional list of memory objects to release. A NULL
     *                     pointer or an empty vector is passed on as
     *                     (0, NULL).
     *  \param events      Optional wait list. A NULL pointer or an empty
     *                     vector is passed on as (0, NULL).
     *  \param event       Optional; on CL_SUCCESS it receives the event
     *                     produced by the command.
     *  \return The status of the call after it has been passed through
     *          detail::errHandler().
     */
    cl_int enqueueReleaseD3D10Objects(
         const VECTOR_CLASS<Memory>* mem_objects = NULL,
         const VECTOR_CLASS<Event>* events = NULL,
         Event* event = NULL) const
    {
        // Function-local static: the pointer survives across calls.
        // NOTE(review): presumably filled in by the __INIT_CL_EXT_FCN_PTR*
        // macros below - confirm against their definitions.
        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
 #if defined(CL_VERSION_1_2)
        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
 #endif // #if defined(CL_VERSION_1_2)
 #if defined(CL_VERSION_1_1)
        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
 #endif // #if defined(CL_VERSION_1_1)
        // front() on an empty vector is undefined behaviour, so both lists
        // are only dereferenced when they actually hold elements.
        const bool has_mem_objects = (mem_objects != NULL && mem_objects->size() > 0);
        const bool has_events = (events != NULL && events->size() > 0);
        cl_event tmp;
        // NOTE(review): the error string is the GL release one; kept as-is,
        // confirm whether a D3D10-specific string exists.
        cl_int err = detail::errHandler(
            pfn_clEnqueueReleaseD3D10ObjectsKHR(
                object_,
                has_mem_objects ? (cl_uint) mem_objects->size() : 0,
                has_mem_objects ? (const cl_mem *) &mem_objects->front() : NULL,
                has_events ? (cl_uint) events->size() : 0,
                has_events ? (cl_event*) &events->front() : NULL,
                (event != NULL) ? &tmp : NULL),
            __ENQUEUE_RELEASE_GL_ERR);
        // tmp is only meaningful when the enqueue succeeded.
        if (event != NULL && err == CL_SUCCESS)
            *event = tmp;
        return err;
    }
 #endif
 /**
 * Deprecated APIs for 1.2
--- a/include/atidlas/array.h
+++ b/include/atidlas/array.h
@@ -2,7 +2,6 @@
 #define ATIDLAS_ARRAY_H_
 #include <iostream>
 #include <type_traits>
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
 #include "atidlas/cl_ext/backend.h"
@@ -17,8 +16,7 @@ class scalar;
 class array: public array_base
 {
  friend array reshape(array const &, int_t, int_t);
-  template<class T>
+
  struct is_array { enum{ value = std::is_same<T, array>::value || std::is_same<T, array_expression>::value}; };
 public:
  //1D Constructors
  array(int_t size1, numeric_type dtype, cl::Context context = cl_ext::default_context());
--- a/include/atidlas/backend/mapped_object.h
+++ b/include/atidlas/backend/mapped_object.h
@@ -20,7 +20,7 @@ enum leaf_t
 class mapped_object;
 typedef std::pair<int_t, leaf_t> mapping_key;
-typedef std::map<mapping_key, std::shared_ptr<mapped_object> > mapping_type;
+typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;
 /** @brief Mapped Object
 *
--- a/include/atidlas/backend/templates/base.h
+++ b/include/atidlas/backend/templates/base.h
@@ -4,11 +4,11 @@
 #include <list>
 #include <set>
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
 #include "atidlas/backend/parse.h"
 #include "atidlas/backend/stream.h"
 #include <CL/cl.hpp>
 #include "atidlas/cl_ext/lazy_compiler.h"
 #include "atidlas/symbolic/expression.h"
@@ -75,15 +75,15 @@ protected:
    /** @brief Accessor for the numeric type */
    numeric_type get_numeric_type(atidlas::array_expression const * array_expression, int_t root_idx) const;
    /** @brief Creates a binary leaf */
-    template<class T> std::shared_ptr<mapped_object> binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const;
+    template<class T> tools::shared_ptr<mapped_object> binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const;
    /** @brief Creates a value scalar mapping */
-    std::shared_ptr<mapped_object> create(numeric_type dtype, values_holder) const;
+    tools::shared_ptr<mapped_object> create(numeric_type dtype, values_holder) const;
    /** @brief Creates a vector mapping */
-    std::shared_ptr<mapped_object> create(array_infos const &) const;
+    tools::shared_ptr<mapped_object> create(array_infos const &) const;
    /** @brief Creates a tuple mapping */
-    std::shared_ptr<mapped_object> create(repeat_infos const &) const;
+    tools::shared_ptr<mapped_object> create(repeat_infos const &) const;
    /** @brief Creates a mapping */
-    std::shared_ptr<mapped_object> create(lhs_rhs_element const &) const;
+    tools::shared_ptr<mapped_object> create(lhs_rhs_element const &) const;
  public:
    map_functor(symbolic_binder & binder, mapping_type & mapping);
    /** @brief Functor for traversing the tree */
@@ -143,7 +143,7 @@ protected:
  static bool is_reduction(array_expression::node const & node);
  static bool is_index_reduction(op_element const & op);
-  std::shared_ptr<symbolic_binder> make_binder();
+  tools::shared_ptr<symbolic_binder> make_binder();
  static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr);
  static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr);
  static std::string append_width(std::string const & str, unsigned int width);
@@ -164,7 +164,7 @@ public:
  virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0;
  virtual void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
                       unsigned int label, controller<expressions_tuple> const & expressions) = 0;
-  virtual std::shared_ptr<base> clone() const = 0;
+  virtual tools::shared_ptr<base> clone() const = 0;
 private:
  binding_policy_t binding_policy_;
 };
@@ -180,7 +180,7 @@ public:
  base_impl(parameters_type const & parameters, binding_policy_t binding_policy);
  int_t local_size_0() const;
  int_t local_size_1() const;
-  std::shared_ptr<base> clone() const;
+  tools::shared_ptr<base> clone() const;
  /** @brief returns whether or not the profile has undefined behavior on particular device */
  int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const;
 protected:
--- a/include/atidlas/model/model.h
+++ b/include/atidlas/model/model.h
@@ -16,7 +16,7 @@ namespace atidlas
  class model
  {
-    typedef std::vector< std::shared_ptr<base> > templates_container;
+    typedef std::vector< tools::shared_ptr<base> > templates_container;
  private:
    std::string define_extension(std::string const & extensions, std::string const & ext);
@@ -24,21 +24,21 @@ namespace atidlas
    std::vector<cl_ext::lazy_compiler>& init(controller<expressions_tuple> const &);
  public:
-    model(predictors::random_forest const &, std::vector< std::shared_ptr<base> > const &, cl::CommandQueue &);
+    model(predictors::random_forest const &, std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
-    model(std::vector< std::shared_ptr<base> > const &, cl::CommandQueue &);
+    model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
    model(base const &, cl::CommandQueue &);
    void execute(controller<expressions_tuple> const &);
    templates_container const & templates() const;
  private:
    templates_container templates_;
-    std::shared_ptr<predictors::random_forest> predictor_;
+    tools::shared_ptr<predictors::random_forest> predictor_;
    std::map<std::vector<int_t>, int> hardcoded_;
    std::map<cl_context, std::map<std::string, std::vector<cl_ext::lazy_compiler> > > lazy_programs_;
    cl::CommandQueue & queue_;
  };
-  typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<model> > model_map_t;
+  typedef std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<model> > model_map_t;
  model_map_t init_models(cl::CommandQueue const & queue);
  model_map_t& get_model_map(cl::CommandQueue & queue);
--- a/include/atidlas/symbolic/expression.h
+++ b/include/atidlas/symbolic/expression.h
@@ -6,7 +6,8 @@
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
 #include "atidlas/value_scalar.h"
-#include <memory>
+#include "atidlas/tools/shared_ptr.hpp"
 #include <iostream>
 namespace atidlas
 {
@@ -221,21 +222,14 @@ class operation_cache
  };
 public:
-  void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector<cl::Event>* dependencies)
+  void push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector<cl::Event>* dependencies);
-  { l_.push_back({queue, kernel, offset, global, local, dependencies});  }
+  void enqueue(std::list<cl::Event>* events = NULL);
  void enqueue(std::list<cl::Event>* events = NULL)
  {
    for(infos & i : l_){
      events->push_back(cl::Event());
      i.queue.enqueueNDRangeKernel(i.kernel, i.offset, i.global, i.local, i.dependencies, &events->back());
    }
  }
 private:
  std::list<infos> l_;
 };
 struct execution_options_type
 {
  execution_options_type(unsigned int _queue_id = 0, std::list<cl::Event>* _events = NULL, operation_cache* _cache = NULL, std::vector<cl::Event>* _dependencies = NULL) : queue_id(_queue_id), events(_events), cache(_cache), dependencies(_dependencies){}
@@ -299,9 +293,9 @@ controller<TYPE> control(TYPE const & x, execution_options_type const& execution
 class expressions_tuple
 {
 private:
-  std::shared_ptr<array_expression> create(array_expression const & s);
+  tools::shared_ptr<array_expression> create(array_expression const & s);
 public:
-  typedef std::list<std::shared_ptr<array_expression> > data_type;
+  typedef std::list<tools::shared_ptr<array_expression> > data_type;
  enum order_type { SEQUENTIAL, INDEPENDENT };
  expressions_tuple(array_expression const & s0);
--- a/include/atidlas/tools/shared_ptr.hpp
+++ b/include/atidlas/tools/shared_ptr.hpp
@@ -0,0 +1,162 @@
 #ifndef ATIDLAS_TOOLS_SHARED_PTR_HPP
 #define ATIDLAS_TOOLS_SHARED_PTR_HPP
 /* =========================================================================
   Copyright (c) 2010-2012, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.
                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------
   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
   (A list of authors and contributors can be found in the PDF manual)
   License:         MIT (X11), see file LICENSE in the base directory
 ============================================================================= */
 /** @file tools/shared_ptr.hpp
    @brief Implementation of a shared pointer class (cf. std::shared_ptr, boost::shared_ptr). Will be used until C++11 is widely available.
    Contributed by Philippe Tillet.
 */
 #include <cstdlib>
 #include <algorithm>
 namespace atidlas
 {
 namespace tools
 {
 namespace detail
 {
  /** @brief Reference counting class for the shared_ptr implementation.
  *
  * Thin wrapper around a plain unsigned int; increments and decrements are
  * ordinary (non-atomic) operations.
  */
  class count
  {
  public:
    /** @brief Starts the counter at val. */
    count(unsigned int val) : val_(val){ }
    /** @brief Decrements the counter by one. */
    void dec(){ --val_; }
    /** @brief Increments the counter by one. */
    void inc(){ ++val_; }
    /** @brief Returns true once the counter has reached zero. */
    bool is_null() const { return val_ == 0; }
    /** @brief Returns the current counter value. */
    unsigned int val() const { return val_; }
  private:
    unsigned int val_;
  };
  /** @brief Interface for the reference counter inside the shared_ptr.
  *
  * Holds the counter (initialised to 1) and exposes a type-erased destroy() hook.
  */
  struct aux
  {
    detail::count count;
    aux() :count(1) {}
    /** @brief Releases the managed object. */
    virtual void destroy()=0;
    virtual ~aux() {}
  };
  /** @brief Implementation helper for the reference counting mechanism inside shared_ptr.
  *
  * Stores the pointer with its original static type U together with the deleter
  * that destroy() applies to it.
  */
  template<class U, class Deleter>
  struct auximpl: public detail::aux
  {
    U* p;
    Deleter d;
    auximpl(U* pu, Deleter x) :p(pu), d(x) {}
    virtual void destroy() { d(p); }
  };
  /** @brief Default deleter class for a pointer. The default is to just call 'delete' on the pointer. Provide your own implementations for 'delete[]' and 'free'. */
  template<class U>
  struct default_deleter
  {
    void operator()(U* p) const { delete p; }
  };
 }
 /** @brief Type-independent part of shared_ptr: owns the pointer to the control block. */
 class shared_ptr_base
 {
 protected:
  /** @brief Control block shared by all owners; NULL for an empty pointer. */
  detail::aux* pa;
  /** @brief Adopts an existing control block (may be NULL). */
  explicit shared_ptr_base(detail::aux* a) : pa(a) {}
 public:
  /** @brief Creates a base that refers to no control block. */
  shared_ptr_base() : pa(NULL) {}
  /** @brief Returns the number of owners, or 0 when no control block is attached. */
  unsigned int count() const { return pa ? pa->count.val() : 0; }
 };
 /** @brief A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependency. Will be replaced by std::shared_ptr as soon as C++11 is widely available.
 *
 * The control block lives in shared_ptr_base::pa. It must not be re-declared
 * here, otherwise shared_ptr_base::count() would read a member that no
 * constructor ever initialises.
 */
 template<class T>
 class shared_ptr : public shared_ptr_base
 {
  template<class U>
  friend class shared_ptr;
  T* pt;
  /** @brief Allocates the control block for pu; if the allocation throws, pu is released through d before rethrowing so that it cannot leak. */
  template<class U, class Deleter>
  static detail::aux* make_aux(U* pu, Deleter d)
  {
    try
    {
      return new detail::auximpl<U, Deleter>(pu, d);
    }
    catch (...)
    {
      d(pu);
      throw;
    }
  }
 public:
  /** @brief Creates an empty pointer: get() returns NULL and count() returns 0. */
  shared_ptr() : shared_ptr_base(), pt(NULL) {}
  /** @brief Takes ownership of pu; d(pu) is invoked when the last owner goes away. */
  template<class U, class Deleter>
  shared_ptr(U* pu, Deleter d) : shared_ptr_base(make_aux(pu, d)), pt(pu) {}
  /** @brief Takes ownership of pu; 'delete' is applied to it (as a U*) when the last owner goes away. */
  template<class U>
  explicit shared_ptr(U* pu) : shared_ptr_base(make_aux(pu, detail::default_deleter<U>())), pt(pu) {}
  /** @brief Shares ownership with a pointer to a convertible type. */
  template<class U>
  shared_ptr(const shared_ptr<U>& s) : shared_ptr_base(s.pa), pt(s.pt)  { inc(); }
  /** @brief Shares ownership with s. */
  shared_ptr(const shared_ptr& s) : shared_ptr_base(s.pa), pt(s.pt)  { inc(); }
  /** @brief Drops this owner; the managed object is destroyed when it was the last one. */
  ~shared_ptr() { dec(); }
  /** @brief Returns the stored raw pointer (NULL when empty). */
  T* get() const {  return pt; }
  T* operator->() const {  return pt; }
  T& operator*() const { return *pt; }
  /** @brief Releases ownership and becomes empty. */
  void reset() { shared_ptr<T>().swap(*this); }
  /** @brief Releases ownership and takes ownership of ptr instead. */
  void reset(T * ptr) { shared_ptr<T>(ptr).swap(*this); }
  /** @brief Exchanges pointee and control block with other. */
  void swap(shared_ptr<T> & other)
  {
    std::swap(pt,other.pt);
    std::swap(pa, other.pa);
  }
  /** @brief Shares ownership with s and drops the previous pointee.
  *
  * Implemented as copy-and-swap: s is copied before the old pointee is
  * released, so the assignment stays valid when s is stored inside the object
  * that is about to be destroyed (e.g. p = p->next), and self-assignment
  * needs no special case.
  */
  shared_ptr& operator=(const shared_ptr& s)
  {
    shared_ptr<T>(s).swap(*this);
    return *this;
  }
  /** @brief Registers one more owner on the control block, if there is one. */
  void inc()
  {
    if (pa) pa->count.inc();
  }
  /** @brief Drops one owner; destroys the pointee and the control block once the counter reaches zero. */
  void dec()
  {
    if (pa)
    {
      pa->count.dec();
      if (pa->count.is_null())
      {
        pa->destroy();
        delete pa;
        pa = NULL;
      }
    }
  }
 };
 }
 }
 #endif
--- a/lib/backend/templates/base.cpp
+++ b/lib/backend/templates/base.cpp
@@ -30,44 +30,44 @@ numeric_type base::map_functor::get_numeric_type(atidlas::array_expression const
 /** @brief Binary leaf */
 template<class T>
-std::shared_ptr<mapped_object> base::map_functor::binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const
+tools::shared_ptr<mapped_object> base::map_functor::binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const
 {
-  return std::shared_ptr<mapped_object>(new T(numeric_type_to_string(array_expression->dtype()), binder_.get(NULL), mapped_object::node_info(mapping, array_expression, root_idx)));
+  return tools::shared_ptr<mapped_object>(new T(numeric_type_to_string(array_expression->dtype()), binder_.get(NULL), mapped_object::node_info(mapping, array_expression, root_idx)));
 }
 /** @brief Scalar mapping */
-std::shared_ptr<mapped_object> base::map_functor::create(numeric_type dtype, values_holder) const
+tools::shared_ptr<mapped_object> base::map_functor::create(numeric_type dtype, values_holder) const
 {
  std::string strdtype = numeric_type_to_string(dtype);
-  return std::shared_ptr<mapped_object>(new mapped_host_scalar(strdtype, binder_.get(NULL)));
+  return tools::shared_ptr<mapped_object>(new mapped_host_scalar(strdtype, binder_.get(NULL)));
 }
 /** @brief Vector mapping */
-std::shared_ptr<mapped_object> base::map_functor::create(array_infos const & a) const
+tools::shared_ptr<mapped_object> base::map_functor::create(array_infos const & a) const
 {
  std::string dtype = numeric_type_to_string(a.dtype);
  unsigned int id = binder_.get(a.data);
  //Scalar
  if(a.shape1==1 && a.shape2==1)
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 's'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 's'));
  //Column vector
  else if(a.shape1>1 && a.shape2==1)
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'c'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'c'));
  //Row vector
  else if(a.shape1==1 && a.shape2>1)
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'r'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'r'));
  //Matrix
  else
-    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'm'));
+    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'm'));
 }
-std::shared_ptr<mapped_object> base::map_functor::create(repeat_infos const &) const
+tools::shared_ptr<mapped_object> base::map_functor::create(repeat_infos const &) const
 {
  //TODO: Make it less specific!
-  return std::shared_ptr<mapped_object>(new mapped_tuple("int",binder_.get(NULL),4));
+  return tools::shared_ptr<mapped_object>(new mapped_tuple("int",binder_.get(NULL),4));
 }
-std::shared_ptr<mapped_object> base::map_functor::create(lhs_rhs_element const & lhs_rhs) const
+tools::shared_ptr<mapped_object> base::map_functor::create(lhs_rhs_element const & lhs_rhs) const
 {
  switch(lhs_rhs.type_family)
  {
@@ -111,7 +111,7 @@ void base::map_functor::operator()(atidlas::array_expression const & array_expre
    else if (root_node.op.type == OPERATOR_OUTER_PROD_TYPE)
      mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_outer>(&array_expression, root_idx, &mapping_)));
    else if (detail::is_cast(root_node.op))
-      mapping_.insert(mapping_type::value_type(key, std::shared_ptr<mapped_object>(new mapped_cast(root_node.op.type, binder_.get(NULL)))));
+      mapping_.insert(mapping_type::value_type(key, tools::shared_ptr<mapped_object>(new mapped_cast(root_node.op.type, binder_.get(NULL)))));
  }
 }
@@ -280,7 +280,7 @@ std::string base::generate_arguments(std::string const & data_type, std::vector<
 void base::set_arguments(expressions_tuple const & expressions, cl::Kernel & kernel, unsigned int & current_arg)
 {
-  std::shared_ptr<symbolic_binder> binder = make_binder();
+  tools::shared_ptr<symbolic_binder> binder = make_binder();
  for (const auto & elem : expressions.data())
    traverse(*elem, (elem)->root(), set_arguments_functor(*binder, current_arg, kernel), true);
 }
@@ -478,12 +478,12 @@ unsigned int base::align(unsigned int to_round, unsigned int base)
  return (to_round + base - 1)/base * base;
 }
-std::shared_ptr<symbolic_binder> base::make_binder()
+tools::shared_ptr<symbolic_binder> base::make_binder()
 {
  if (binding_policy_==BIND_TO_HANDLE)
-    return std::shared_ptr<symbolic_binder>(new bind_to_handle());
+    return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
  else
-    return std::shared_ptr<symbolic_binder>(new bind_all_unique());
+    return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
 }
@@ -509,7 +509,7 @@ std::vector<std::string> base::generate(unsigned int label, expressions_tuple co
  //Create mapping
  std::vector<mapping_type> mappings(expressions.data().size());
-  std::shared_ptr<symbolic_binder> binder = make_binder();
+  tools::shared_ptr<symbolic_binder> binder = make_binder();
  for (mit = mappings.begin(), sit = expressions.data().begin(); sit != expressions.data().end(); ++sit, ++mit)
    traverse(**sit, (*sit)->root(), map_functor(*binder,*mit), true);
@@ -533,8 +533,8 @@ int_t base_impl<TType, PType>::local_size_1() const
 { return p_.local_size_1; }
 template<class TType, class PType>
-std::shared_ptr<base> base_impl<TType, PType>::clone() const
+tools::shared_ptr<base> base_impl<TType, PType>::clone() const
-{ return std::shared_ptr<base>(new TType(*dynamic_cast<TType const *>(this))); }
+{ return tools::shared_ptr<base>(new TType(*dynamic_cast<TType const *>(this))); }
 template<class TType, class PType>
 int base_impl<TType, PType>::check_invalid(expressions_tuple const & expressions, cl::Device const & device) const
--- a/lib/backend/templates/mproduct.cpp
+++ b/lib/backend/templates/mproduct.cpp
@@ -587,7 +587,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
    kernel.setArg(current_arg++, cl_uint(N));
    kernel.setArg(current_arg++, cl_uint(K));
-    std::shared_ptr<symbolic_binder> binder = make_binder();
+    tools::shared_ptr<symbolic_binder> binder = make_binder();
    set_arguments_functor fun(*binder, current_arg, kernel);
    fun.set_arguments(C);
    fun.set_arguments(alpha.dtype(), alpha.values());
--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -82,11 +82,11 @@ std::vector<cl_ext::lazy_compiler>& model::init(controller<expressions_tuple> co
  return to_init;
 }
-model::model(predictors::random_forest const & predictor, std::vector< std::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :
+model::model(predictors::random_forest const & predictor, std::vector< tools::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :
  templates_(templates), predictor_(new predictors::random_forest(predictor)), queue_(queue)
 {}
-model::model(std::vector< std::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :  templates_(templates), queue_(queue)
+model::model(std::vector< tools::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :  templates_(templates), queue_(queue)
 {}
 model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
@@ -158,27 +158,27 @@ namespace detail
    throw std::invalid_argument("Invalid datatype: " + name);
  }
-  static std::shared_ptr<base> create(std::string const & template_name, std::vector<int> const & a)
+  static tools::shared_ptr<base> create(std::string const & template_name, std::vector<int> const & a)
  {
    fetching_policy_type fetch[] = {FETCH_FROM_LOCAL, FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_CONTIGUOUS};
    if(template_name=="vaxpy")
-      return std::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
+      return tools::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="dot")
-      return std::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
+      return tools::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="maxpy")
-      return std::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
+      return tools::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
    else if(template_name.find("gemvN")!=std::string::npos)
-      return std::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
+      return tools::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
    else if(template_name.find("gemvT")!=std::string::npos)
-      return std::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
+      return tools::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
    else if(template_name.find("gemmNN")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmTN")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmNT")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmTT")!=std::string::npos)
-      return std::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return tools::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else
      throw std::invalid_argument("Invalid expression: " + template_name);
  }
@@ -214,7 +214,7 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
          numeric_type dtype = detail::get_dtype(elem);
          // Get profiles
-          std::vector<std::shared_ptr<base> > templates;
+          std::vector<tools::shared_ptr<base> > templates;
          js::Value const & profiles = document[opcstr][dtcstr]["profiles"];
          for (js::SizeType id = 0 ; id < profiles.Size() ; ++id)
            templates.push_back(detail::create(operation, tools::to_int_array<int>(profiles[id])));
@@ -222,10 +222,10 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
          {
            // Get predictor
            predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);
-            result[std::make_pair(etype, dtype)] = std::shared_ptr<model>(new model(predictor, templates, queue));
+            result[std::make_pair(etype, dtype)] = tools::shared_ptr<model>(new model(predictor, templates, queue));
          }
          else
-            result[std::make_pair(etype, dtype)] = std::shared_ptr<model>(new model(templates, queue));
+            result[std::make_pair(etype, dtype)] = tools::shared_ptr<model>(new model(templates, queue));
        }
      }
    }
@@ -235,7 +235,7 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
 model_map_t init_models(cl::CommandQueue & queue)
 {
  model_map_t res;
-  typedef std::shared_ptr<model> ptr_t;
+  typedef tools::shared_ptr<model> ptr_t;
  numeric_type types[] = {CHAR_TYPE, UCHAR_TYPE, SHORT_TYPE, USHORT_TYPE, INT_TYPE, UINT_TYPE, LONG_TYPE, ULONG_TYPE, FLOAT_TYPE, DOUBLE_TYPE};
  for(auto DTYPE : types){
--- a/lib/symbolic/execute.cpp
+++ b/lib/symbolic/execute.cpp
@@ -173,30 +173,30 @@ namespace atidlas
    /*----Parse required temporaries-----*/
    detail::parse(tree, rootidx, current_type, breakpoints, final_type);
-    std::vector<std::shared_ptr<array> > temporaries_;
+    std::vector<tools::shared_ptr<array> > temporaries_;
    /*----Compute required temporaries----*/
    for(detail::breakpoints_t::reverse_iterator rit = breakpoints.rbegin() ; rit != breakpoints.rend() ; ++rit)
    {
-      std::shared_ptr<model> const & pmodel = models[std::make_pair(rit->first, dtype)];
+      tools::shared_ptr<model> const & pmodel = models[std::make_pair(rit->first, dtype)];
      array_expression::node const & node = tree[rit->second->node_index];
      array_expression::node const & lmost = lhs_most(tree, node);
      //Creates temporary
-      std::shared_ptr<array> tmp;
+      tools::shared_ptr<array> tmp;
      switch(rit->first){
        case SCALAR_AXPY_TYPE:
-        case REDUCTION_TYPE:           tmp = std::shared_ptr<array>(new array(1, dtype, context));                                                        break;
+        case REDUCTION_TYPE:           tmp = tools::shared_ptr<array>(new array(1, dtype, context));                                                        break;
-        case VECTOR_AXPY_TYPE:         tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
+        case VECTOR_AXPY_TYPE:         tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
-        case ROW_WISE_REDUCTION_TYPE:  tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
+        case ROW_WISE_REDUCTION_TYPE:  tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
-        case COL_WISE_REDUCTION_TYPE:  tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape2, dtype, context));                              break;
+        case COL_WISE_REDUCTION_TYPE:  tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape2, dtype, context));                              break;
-        case MATRIX_AXPY_TYPE:         tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, lmost.lhs.array.shape2, dtype, context)); break;
+        case MATRIX_AXPY_TYPE:         tmp = tools::shared_ptr<array>(new array(lmost.lhs.array.shape1, lmost.lhs.array.shape2, dtype, context)); break;
-        case MATRIX_PRODUCT_NN_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape2, dtype, context));   break;
+        case MATRIX_PRODUCT_NN_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape2, dtype, context));   break;
-        case MATRIX_PRODUCT_NT_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape1, dtype, context));   break;
+        case MATRIX_PRODUCT_NT_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape1, dtype, context));   break;
-        case MATRIX_PRODUCT_TN_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape2, dtype, context));   break;
+        case MATRIX_PRODUCT_TN_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape2, dtype, context));   break;
-        case MATRIX_PRODUCT_TT_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape1, dtype, context));   break;
+        case MATRIX_PRODUCT_TT_TYPE:   tmp = tools::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape1, dtype, context));   break;
        default: throw std::invalid_argument("Unrecognized operation");
      }
--- a/lib/symbolic/expression.cpp
+++ b/lib/symbolic/expression.cpp
@@ -176,11 +176,24 @@ array_expression array_expression::operator-()
 array_expression array_expression::operator!()
 { return array_expression(*this, invalid_node(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_NEGATE_TYPE), context_, INT_TYPE, shape_); }
 //
 // Records one kernel launch for later replay: the queue, the kernel, the three
 // NDRange arguments and the dependency-list pointer are bundled into a single
 // brace-initialized entry that is appended to l_.
 // NOTE(review): `dependencies` is forwarded as a raw pointer. If the stored
 // entry keeps that pointer (enqueue() reads i.dependencies), the pointed-to
 // vector presumably has to outlive this cache -- confirm against `infos`.
 void operation_cache::push_back(cl::CommandQueue & queue, cl::Kernel const & kernel, cl::NDRange const & offset, cl::NDRange const & global, cl::NDRange const & local, std::vector<cl::Event>* dependencies)
 {
  l_.push_back({queue, kernel, offset, global, local, dependencies});
 }
 // Replays every launch recorded in l_, in the iteration order of l_.
 // For each entry a default-constructed cl::Event is appended to *events and
 // its address is passed as the final argument of enqueueNDRangeKernel, so
 // *events grows by one element per recorded launch.
 // `events` is dereferenced unconditionally (callers must not pass a null
 // pointer) and the value returned by enqueueNDRangeKernel is not inspected.
 void operation_cache::enqueue(std::list<cl::Event>* events)
 {
  for(infos & i : l_){
    // Append first, then take the address of the new back element: std::list
    // keeps element addresses stable while further events are appended.
    events->push_back(cl::Event());
    i.queue.enqueueNDRangeKernel(i.kernel, i.offset, i.global, i.local, i.dependencies, &events->back());
  }
 }
 //
-std::shared_ptr<array_expression> expressions_tuple::create(array_expression const & s)
+tools::shared_ptr<array_expression> expressions_tuple::create(array_expression const & s)
 {
  // Heap-allocates a copy of `s` (array_expression copy constructor) and hands
  // ownership of the copy to the returned shared pointer.
  // The static_cast targets the type `s` already has, so it does not convert
  // anything here.
-  return std::shared_ptr<array_expression>(new array_expression(static_cast<array_expression const &>(s)));
+  return tools::shared_ptr<array_expression>(new array_expression(static_cast<array_expression const &>(s)));
 }
 expressions_tuple::expressions_tuple(data_type const & data, order_type order) : data_(data), order_(order)
--- a/python/autotune/pysrc/autotune.py
+++ b/python/autotune/pysrc/autotune.py
@@ -143,7 +143,7 @@ def do_tuning(args):
                      A = atd.empty(sizes, datatype, context=context)
                      C = atd.empty(sizes, datatype, context=context)
                      return execute(A + C, sizes, Template, parameters, fname)
-                  tune(execution_handler, 100, 5000, 2, (),'log', 'log')
+                  tune(execution_handler, 64, 5000, 2, (),'log', 'log')
              #Row-wise dot
              if operation=='gemv':
                  for A_trans in  args.gemv_layouts:
@@ -152,7 +152,7 @@ def do_tuning(args):
                          x = atd.empty(sizes[1], datatype, context=context)
                          LHS = A if A_trans=='N' else A.T
                          return execute(atd.dot(LHS, x), sizes, Template[A_trans], parameters, fname)
-                      tune(execution_handler, 100, 5000, 2, (A_trans,),'log', 'log')
+                      tune(execution_handler, 64, 6000, 2, (A_trans,),'log', 'log')
              #Matrix Product
              if operation=='gemm':
                  for L in args.gemm_layouts:
@@ -194,7 +194,7 @@ class ArgumentsHandler:
        full_parser = tune_subparsers.add_parser('full', help = 'Tune each operation for randomly chosen sizes')
        full_parser.add_argument("--build-model", default=True, type=bool)
-        full_parser.add_argument("--sample-size", default=30, type=int)
+        full_parser.add_argument("--sample-size", default=60, type=int)
        args = parser.parse_args()
        self.__dict__ = args.__dict__.copy()
--- a/python/autotune/pysrc/misc_tools.py
+++ b/python/autotune/pysrc/misc_tools.py
@@ -218,11 +218,16 @@ def benchmark(template, symbolic):
        raise ValueError("Template has too low occupancy")
    else:
        queue.models[template, atd.float32] = atd.model(template, queue)
        x = atd.array(symbolic)
        atd.synchronize(symbolic.context)
        x, events, cache = atd.flush(symbolic)
        atd.synchronize(symbolic.context)
-        return 1e-9*sum([e.end - e.start for e in events])
+        timings = []
        current_time = 0
        while current_time < 1e-3:
            x, events, cache = atd.flush(symbolic)
            atd.synchronize(symbolic.context)
            timings.append(1e-9*sum([e.end - e.start for e in events]))
            current_time = current_time + timings[-1]
        return np.median(timings)
 def sanitize_string(string, keep_chars = ['_']):
--- a/python/pyatidlas/external/boost/include/boost/functional/hash/extensions.hpp
+++ b/python/pyatidlas/external/boost/include/boost/functional/hash/extensions.hpp
@@ -193,7 +193,7 @@ namespace boost
 #if !defined(BOOST_NO_CXX11_SMART_PTR)
    // Hashes a shared pointer through the raw pointer it holds (x.get()), by
    // delegating to boost::hash_value.
    // NOTE(review): this overload sits under `#if !defined(BOOST_NO_CXX11_SMART_PTR)`
    // in a vendored Boost header, yet the new line names `tools::shared_ptr`.
    // No `tools` namespace is declared in the visible part of this header, so
    // this looks like an over-broad std:: -> tools:: search/replace -- confirm
    // whether this file was meant to be modified at all.
    template <typename T>
-    inline std::size_t hash_value(std::shared_ptr<T> const& x) {
+    inline std::size_t hash_value(tools::shared_ptr<T> const& x) {
        return boost::hash_value(x.get());
    }
--- a/python/pyatidlas/external/boost/include/boost/get_pointer.hpp
+++ b/python/pyatidlas/external/boost/include/boost/get_pointer.hpp
@@ -36,7 +36,7 @@ template<class T> T * get_pointer( std::unique_ptr<T> const& p )
    return p.get();
 }
-template<class T> T * get_pointer( std::shared_ptr<T> const& p )
+template<class T> T * get_pointer( tools::shared_ptr<T> const& p )
 {
    // Returns the raw pointer held by `p`.
    // NOTE(review): the neighbouring overload in this vendored Boost header
    // takes std::unique_ptr, while the new line switches this one to
    // `tools::shared_ptr`; no `tools` namespace is visible here, so this looks
    // like an unintended side effect of a global std:: -> tools:: rename --
    // confirm before relying on this overload.
    return p.get();
 }
--- a/python/pyatidlas/src/_atidlas.cpp
+++ b/python/pyatidlas/src/_atidlas.cpp
@@ -315,15 +315,14 @@ namespace detail
  boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
  { return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }
-  bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
+  bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
  {
      // Evaluates `expression` into a newly allocated atd::array and returns a
      // Python tuple of (array, list of events, operation cache).
      //  - `dependencies` is converted to a std::vector<cl::Event> via to_vector.
      //  - atd::control receives execution options (queue_id plus the addresses
      //    of the local events list, cache and dependency vector), dispatcher
      //    options (tune, label) and compilation options (program_name,
      //    force_recompile).
      // NOTE(review): `tune` is inserted before `label`, so positional callers
      // that passed `label` as the fourth argument would now bind it to `tune`
      // -- check call sites.
      // NOTE(review): as rendered, two return statements follow each other; the
      // second one (`*parray`) can never execute. It is presumably the removed
      // line of this change whose '-' marker was lost -- confirm against the
      // repository.
      // NOTE(review): `&cdependencies` is the address of a local, handed over
      // together with `&cache`, and `cache` is returned to the caller. If the
      // cache retains that pointer it would dangle after this function returns
      // -- confirm.
      std::list<cl::Event> events;
      atd::operation_cache cache;
      std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
      boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &events, &cache, &cdependencies),
-                                                                       atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));
+                                                                       atd::dispatcher_options_type(tune, label), atd::compilation_options_type(program_name, force_recompile))));
-
+      return bp::make_tuple(parray, to_list(events.begin(), events.end()), cache);
      return bp::make_tuple(*parray, to_list(events.begin(), events.end()), cache);
  }
 }
@@ -402,7 +401,7 @@ void export_cl()
  bp::def("synchronize", &atd::cl_ext::synchronize);
  bp::def("get_platforms", &detail::get_platforms);
-  bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
+  bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("tune") = false, bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
  bp::class_<state_type>("state_type")
          .def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
--- a/tests/model.cpp
+++ b/tests/model.cpp
@@ -10,7 +10,7 @@ namespace ad = atidlas;
 int main()
 {
  viennacl::vector<float> x(10000), y(10000), z(10000);
-  std::map<std::string, ad::std::shared_ptr<ad::model> > models = ad::import("geforce_gt_540m.json");
+  std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import("geforce_gt_540m.json");
  models["vector-axpy-float32"]->tune(viennacl::symbolic_expression(z, viennacl::op_assign(), x));
  models["vector-axpy-float32"]->execute(viennacl::symbolic_expression(z, viennacl::op_assign(), x));
  return EXIT_SUCCESS;