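Now using the system OpenCL C++ header (<CL/cl.hpp>) instead of the bundled atidlas/cl/cl.hpp; rename namespace atidlas::cl to atidlas::cl_ext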

2015-01-27 16:14:02 -05:00
parent 53c9bef85d
commit c37d8a2a81
39 changed files with 154 additions and 12608 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -16,7 +16,7 @@


 namespace ad = atidlas;
-typedef atidlas::int_t int_t;
+typedef ad::int_t int_t;

 template<class T>
 void bench(ad::numeric_type dtype)
@@ -31,11 +31,11 @@ void bench(ad::numeric_type dtype)
  times.clear();\
  total_time = 0;\
  OP;\
-  ad::cl::synchronize(ad::cl::default_context());\
+  ad::cl_ext::synchronize(ad::cl_ext::default_context());\
  while(total_time < 1e-2){\
    timer.start(); \
    OP;\
-    ad::cl::synchronize(ad::cl::default_context());\
+    ad::cl_ext::synchronize(ad::cl_ext::default_context());\
    times.push_back(timer.get());\
    total_time += times.back();\
  }\
@@ -52,17 +52,17 @@ void bench(ad::numeric_type dtype)
    int_t N = *it;
    std::cout << N;
    /* ATIDLAS */
-    atidlas::array x(N, dtype), y(N, dtype);
+    ad::array x(N, dtype), y(N, dtype);
    BENCHMARK(y = x + y, bandwidth(3*N, tres, dtsize));
    /* clAmdBlas */
 #ifdef BENCH_CLAMDBLAS
-    BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &atidlas::cl::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))
+    BENCHMARK(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(3*N, tres, dtsize))
 #endif
    /* BLAS */
 #ifdef BENCH_CBLAS
    std::vector<float> cx(N), cy(N);
-    atidlas::copy(x, cx);
-    atidlas::copy(y, cy);
+    ad::copy(x, cx);
+    ad::copy(y, cy);
    BENCHMARK(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), bandwidth(3*N, tres, dtsize));
 #endif
    /* CuBLAS */
@@ -84,19 +84,19 @@ void bench(ad::numeric_type dtype)
    int_t N = *it;
    std::cout << N;
    /* ATIDLAS */
-    atidlas::array x(N, dtype), y(N, dtype);
-    atidlas::array scratch(N, dtype);
-    atidlas::scalar s(dtype);
+    ad::array x(N, dtype), y(N, dtype);
+    ad::array scratch(N, dtype);
+    ad::scalar s(dtype);
    BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
    /* clAmdBlas */
 #ifdef BENCH_CLAMDBLAS
-    BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &atidlas::cl::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
+    BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
 #endif
    /* BLAS */
 #ifdef BENCH_CBLAS
    std::vector<float> cx(N), cy(N);
-    atidlas::copy(x, cx);
-    atidlas::copy(y, cy);
+    ad::copy(x, cx);
+    ad::copy(y, cy);
    BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
 #endif
    std::cout << std::endl;
@@ -115,18 +115,18 @@ void bench(ad::numeric_type dtype)
      int_t N = *Nit;
      std::cout << M << "," << N;
      /* ATIDLAS */
-      atidlas::array A(N, M, dtype), y(M, dtype), x(N, dtype);
+      ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
      BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
      /* clAmdBlas */
  #ifdef BENCH_CLAMDBLAS
-      BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &atidlas::cl::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
+      BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
  #endif
      /* BLAS */
  #ifdef BENCH_CBLAS
      std::vector<float> cA(N*M), cx(N), cy(M);
-      atidlas::copy(x, cx);
-      atidlas::copy(y, cy);
-      atidlas::copy(A, cA);
+      ad::copy(x, cx);
+      ad::copy(y, cy);
+      ad::copy(A, cA);
      BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
  #endif
      std::cout << std::endl;
@@ -144,19 +144,19 @@ void bench(ad::numeric_type dtype)
      int_t M = *Mit, N = *Nit, K = *Kit;
      std::cout << M << "," << N << "," << K;
      /* ATIDLAS */
-      atidlas::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
+      ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
      BENCHMARK(C = dot(A,trans(B)), gflops((double)2*M*N*K, tres));
      /* clAmdBlas */
  #ifdef BENCH_CLAMDBLAS
      BENCHMARK(clAmdBlasSgemm(clAmdBlasColumnMajor, clAmdBlasNoTrans, clAmdBlasTrans, M, N, K, 1, A.data()(), A.ld(), B.data()(), B.ld(),
-                               0, C.data()(), C.ld(), 1, &atidlas::cl::get_queue(C.context(), 0)(),0, NULL, NULL), gflops((double)2*M*N*K, tres))
+                               0, C.data()(), C.ld(), 1, &ad::cl_ext::get_queue(C.context(), 0)(),0, NULL, NULL), gflops((double)2*M*N*K, tres))
  #endif
      /* BLAS */
  #ifdef BENCH_CBLAS
      std::vector<float> cC(M*N), cA(M*K), cB(N*K);
-      atidlas::copy(C, cC);
-      atidlas::copy(A, cA);
-      atidlas::copy(B, cB);
+      ad::copy(C, cC);
+      ad::copy(A, cA);
+      ad::copy(B, cB);
      BENCHMARK(cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, cA.data(), M, cB.data(), N, 1, cC.data(), M), gflops((double)2*M*N*K, tres));
  #endif
      std::cout << std::endl;
@@ -171,16 +171,16 @@ int main(int argc, char* argv[])
 #endif

  int device_idx = 0;
-  if(atidlas::cl::queues.size()>1){
-    atidlas::cl::queues_t & queues = atidlas::cl::queues;
+  if(ad::cl_ext::queues.size()>1){
+    ad::cl_ext::queues_t & queues = ad::cl_ext::queues;
    if(argc!=2)
    {
      std::cerr << "usage : blas-bench [DEVICE_IDX]" << std::endl;
      std::cout << "Devices available: " << std::endl;
      unsigned int current=0;
-      for(atidlas::cl::queues_t::const_iterator it = queues.begin() ; it != queues.end() ; ++it){
-        atidlas::cl::Device device = it->first.getInfo<CL_CONTEXT_DEVICES>()[0];
-        std::cout << current++ << ": " << device.getInfo<CL_DEVICE_NAME>() << "(" << atidlas::cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_NAME>() << ")" << std::endl;
+      for(ad::cl_ext::queues_t::const_iterator it = queues.begin() ; it != queues.end() ; ++it){
+        cl::Device device = it->first.getInfo<CL_CONTEXT_DEVICES>()[0];
+        std::cout << current++ << ": " << device.getInfo<CL_DEVICE_NAME>() << "(" << cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_NAME>() << ")" << std::endl;
      }
      exit(EXIT_FAILURE);
    }
@@ -188,7 +188,7 @@ int main(int argc, char* argv[])
      device_idx = atoi(argv[1]);
  }

-  atidlas::cl::default_context_idx = device_idx;
+  ad::cl_ext::default_context_idx = device_idx;
  std::cout << "#Benchmark : BLAS" << std::endl;
  std::cout << "#----------------" << std::endl;
  bench<float>(ad::FLOAT_TYPE);
--- a/bench/overhead.cpp
+++ b/bench/overhead.cpp
@@ -7,19 +7,19 @@ namespace ad = atidlas;

 int main()
 {
-  for(ad::cl::queues_t::iterator it = ad::cl::queues.begin() ; it != ad::cl::queues.end() ; ++it)
+  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
  {
    ad::array x(10, ad::FLOAT_TYPE, it->first);
-    ad::cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
    ad::tools::timer t;
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "-------------------------" << std::endl;
    x = x + x;
-    ad::cl::synchronize(x.context());
+    ad::cl_ext::synchronize(x.context());
    t.start();\
    for(unsigned int i = 0 ; i < 100 ; ++i){
      x = x + x;
-      ad::cl::synchronize(x.context());
+      ad::cl_ext::synchronize(x.context());
    }
    std::cout << "Kernel launch overhead: " << t.get()/100 << std::endl;
    std::cout << "Expression tree creation:" << std::endl;
--- a/include/atidlas/array.h
+++ b/include/atidlas/array.h
@@ -3,7 +3,7 @@

 #include <iostream>
 #include "atidlas/types.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/cl/queues.h"
 #include "atidlas/symbolic/expression.h"

@@ -18,19 +18,19 @@ class array: public obj_base
  friend array reshape(array const &, int_t, int_t);
 public:
  //1D Constructors
-  array(int_t size1, numeric_type dtype, cl::Context context = cl::default_context());
+  array(int_t size1, numeric_type dtype, cl::Context context = cl_ext::default_context());
  template<typename DT>
-  array(std::vector<DT> const & data, cl::Context context = cl::default_context());
+  array(std::vector<DT> const & data, cl::Context context = cl_ext::default_context());
  array(array & v, slice const & s1);

  //2D Constructors
-  array(int_t size1, int_t size2, numeric_type dtype, cl::Context context = cl::default_context());
+  array(int_t size1, int_t size2, numeric_type dtype, cl::Context context = cl_ext::default_context());
  template<typename DT>
-  array(int_t size1, int_t size2, std::vector<DT> const & data, cl::Context context = cl::default_context());
+  array(int_t size1, int_t size2, std::vector<DT> const & data, cl::Context context = cl_ext::default_context());
  array(array & M, slice const & s1, slice const & s2);

  //General constructor
-  array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl::default_context());
+  array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
  array(array_expression const & proxy);
  array(array const &);

@@ -91,9 +91,9 @@ class scalar : public array
 private:
  template<class T> T cast() const;
 public:
-  explicit scalar(numeric_type dtype, cl::Buffer const & data, int_t offset, cl::Context context = cl::default_context());
-  explicit scalar(value_scalar value, cl::Context context = cl::default_context());
-  explicit scalar(numeric_type dtype, cl::Context context = cl::default_context());
+  explicit scalar(numeric_type dtype, cl::Buffer const & data, int_t offset, cl::Context context = cl_ext::default_context());
+  explicit scalar(value_scalar value, cl::Context context = cl_ext::default_context());
+  explicit scalar(numeric_type dtype, cl::Context context = cl_ext::default_context());
  scalar(array_expression const & proxy);
  scalar& operator=(value_scalar const &);
 //  scalar& operator=(scalar const & s);
@@ -209,8 +209,8 @@ ATIDLAS_DECLARE_REDUCTION(max)
 ATIDLAS_DECLARE_REDUCTION(min)
 ATIDLAS_DECLARE_REDUCTION(argmin)

-atidlas::array_expression eye(std::size_t, std::size_t, atidlas::numeric_type, cl::Context ctx = cl::default_context());
-array_expression zeros(std::size_t M, std::size_t N, numeric_type dtype, cl::Context ctx = cl::default_context());
+atidlas::array_expression eye(std::size_t, std::size_t, atidlas::numeric_type, cl::Context ctx = cl_ext::default_context());
+array_expression zeros(std::size_t M, std::size_t N, numeric_type dtype, cl::Context ctx = cl_ext::default_context());
 array reshape(array const &, int_t, int_t);

 //
--- a/include/atidlas/backend/binder.h
+++ b/include/atidlas/backend/binder.h
@@ -2,7 +2,7 @@
 #define ATIDLAS_BACKEND_BINDER_H

 #include <map>
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>

 namespace atidlas
 {
--- a/include/atidlas/backend/templates/base.h
+++ b/include/atidlas/backend/templates/base.h
@@ -8,7 +8,7 @@
 #include "atidlas/types.h"
 #include "atidlas/backend/parse.h"
 #include "atidlas/backend/stream.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/cl/lazy_compiler.h"
 #include "atidlas/symbolic/expression.h"

@@ -163,7 +163,7 @@ public:
  std::vector<std::string> generate(unsigned int label, symbolic_expressions_container const & symbolic_expressions, cl::Device const & device);
  virtual int check_invalid(symbolic_expressions_container const & symbolic_expressions, cl::Device const & device) const = 0;
  virtual void enqueue(cl::CommandQueue & queue,
-                       std::vector<cl::lazy_compiler> & programs,
+                       std::vector<cl_ext::lazy_compiler> & programs,
                       unsigned int label, symbolic_expressions_container const & symbolic_expressions) = 0;
  virtual tools::shared_ptr<base> clone() const = 0;
 private:
--- a/include/atidlas/backend/templates/maxpy.h
+++ b/include/atidlas/backend/templates/maxpy.h
@@ -27,7 +27,7 @@ public:
  maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
  maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2,  unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
  std::vector<int_t> input_sizes(symbolic_expressions_container const & symbolic_expressions);
-  void enqueue(cl::CommandQueue & queue, std::vector<cl::lazy_compiler> & programs,  unsigned int label, symbolic_expressions_container const & symbolic_expressions);
+  void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,  unsigned int label, symbolic_expressions_container const & symbolic_expressions);
 };

 }
--- a/include/atidlas/backend/templates/mproduct.h
+++ b/include/atidlas/backend/templates/mproduct.h
@@ -41,7 +41,7 @@ private:
  void enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
                     array_infos const & A, array_infos const & B, array_infos const & C,
                     value_scalar const & alpha, value_scalar const & beta,
-                     std::vector<cl::lazy_compiler> & programs, unsigned int label, int id);
+                     std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id);
  array_infos create_slice(array_infos & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
  std::vector<int_t> infos(symbolic_expressions_container const & symbolic_expressions,
                                   lhs_rhs_element & C, lhs_rhs_element & A, lhs_rhs_element & B);
@@ -49,7 +49,7 @@ public:
  mproduct(mproduct::parameters_type const & parameters, char A_trans, char B_trans);
  std::vector<int_t> input_sizes(symbolic_expressions_container const & symbolic_expressions);
  void enqueue(cl::CommandQueue & queue,
-               std::vector<cl::lazy_compiler> & programs,
+               std::vector<cl_ext::lazy_compiler> & programs,
               unsigned int label,
               symbolic_expressions_container const & symbolic_expressions);

--- a/include/atidlas/backend/templates/mreduction.h
+++ b/include/atidlas/backend/templates/mreduction.h
@@ -35,7 +35,7 @@ private:
  std::vector<std::string> generate_impl(unsigned int, symbolic_expressions_container const &, std::vector<mapping_type> const &) const;
 public:
  virtual std::vector<int_t> input_sizes(symbolic_expressions_container const & symbolic_expressions);
-  void enqueue(cl::CommandQueue & queue,std::vector<cl::lazy_compiler> & programs,unsigned int label, symbolic_expressions_container const & symbolic_expressions);
+  void enqueue(cl::CommandQueue & queue,std::vector<cl_ext::lazy_compiler> & programs,unsigned int label, symbolic_expressions_container const & symbolic_expressions);
 private:
  reduction_type reduction_type_;
 };
--- a/include/atidlas/backend/templates/reduction.h
+++ b/include/atidlas/backend/templates/reduction.h
@@ -30,7 +30,7 @@ public:
  reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
  std::vector<int_t> input_sizes(symbolic_expressions_container const & symbolic_expressions);
  void enqueue(cl::CommandQueue & queue,
-               std::vector<cl::lazy_compiler> & programs,
+               std::vector<cl_ext::lazy_compiler> & programs,
               unsigned int label,
               symbolic_expressions_container const & symbolic_expressions);
 private:
--- a/include/atidlas/backend/templates/vaxpy.h
+++ b/include/atidlas/backend/templates/vaxpy.h
@@ -23,7 +23,7 @@ public:
  vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
  vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
  std::vector<int_t> input_sizes(symbolic_expressions_container const & symbolic_expressions);
-  void enqueue(cl::CommandQueue & queue, std::vector<cl::lazy_compiler> & programs,
+  void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
               unsigned int label, symbolic_expressions_container const & symbolic_expressions);
 };

--- a/include/atidlas/cl/cl.hpp
+++ b/include/atidlas/cl/cl.hpp
--- a/include/atidlas/cl/compare.hpp
+++ b/include/atidlas/cl/compare.hpp
@@ -3,7 +3,8 @@

 namespace atidlas
 {
-namespace cl
+
+namespace cl_ext
 {

 struct compare{
--- a/include/atidlas/cl/lazy_compiler.h
+++ b/include/atidlas/cl/lazy_compiler.h
@@ -1,13 +1,13 @@
 #ifndef ATIDLAS_CL_LAZY_COMPILER_H
 #define ATIDLAS_CL_LAZY_COMPILER_H

-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/cl/program_map.h"

 namespace atidlas
 {

-namespace cl
+namespace cl_ext
 {

 class lazy_compiler
--- a/include/atidlas/cl/program_map.h
+++ b/include/atidlas/cl/program_map.h
@@ -2,12 +2,12 @@
 #define ATIDLAS_CL_PROGRAM_MAP_H

 #include <map>
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>

 namespace atidlas
 {

-namespace cl
+namespace cl_ext
 {

 class program_map
--- a/include/atidlas/cl/queues.h
+++ b/include/atidlas/cl/queues.h
@@ -2,16 +2,16 @@
 #define ATIDLAS_CL_QUEUES_H

 #include <map>
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/cl/compare.hpp"

 namespace atidlas
 {

-namespace cl
+namespace cl_ext
 {

-typedef std::map<cl::Program, cl::Kernel, cl::compare> kernels_t;
+typedef std::map<cl::Program, cl::Kernel, cl_ext::compare> kernels_t;
 typedef std::vector<std::pair<cl::Context, std::vector<cl::CommandQueue> > > queues_t;

 queues_t init_queues();
--- a/include/atidlas/model/model.h
+++ b/include/atidlas/model/model.h
@@ -21,7 +21,7 @@ namespace atidlas
  private:
    std::string define_extension(std::string const & extensions, std::string const & ext);
    inline void fill_program_name(char* program_name, symbolic_expressions_container const & symbolic_expressions, binding_policy_t binding_policy);
-    std::vector<cl::lazy_compiler>& init(symbolic_expressions_container const & symbolic_expressions, cl::Context const & context, cl::Device const & device, bool force_recompilation);
+    std::vector<cl_ext::lazy_compiler>& init(symbolic_expressions_container const & symbolic_expressions, cl::Context const & context, cl::Device const & device, bool force_recompilation);

  public:
    model(predictors::random_forest const &, std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
@@ -36,7 +36,7 @@ namespace atidlas
    templates_container templates_;
    tools::shared_ptr<predictors::random_forest> predictor_;
    std::map<std::vector<int_t>, int> hardcoded_;
-    std::map<cl_context, std::map<std::string, std::vector<cl::lazy_compiler> > > lazy_programs_;
+    std::map<cl_context, std::map<std::string, std::vector<cl_ext::lazy_compiler> > > lazy_programs_;
    cl::CommandQueue & queue_;
  };

@@ -46,7 +46,7 @@ namespace atidlas
  model_map_t& get_model_map(cl::CommandQueue & queue);
  model& get_model(cl::CommandQueue & queue, expression_type, numeric_type);

-  extern std::map<cl::CommandQueue, model_map_t, cl::compare> models;
+  extern std::map<cl::CommandQueue, model_map_t, cl_ext::compare> models;

 }

--- a/include/atidlas/symbolic/execute.h
+++ b/include/atidlas/symbolic/execute.h
@@ -1,7 +1,7 @@
 #ifndef _ATIDLAS_SCHEDULER_EXECUTE_H
 #define _ATIDLAS_SCHEDULER_EXECUTE_H

-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/model/model.h"
 #include "atidlas/symbolic/expression.h"

--- a/include/atidlas/symbolic/expression.h
+++ b/include/atidlas/symbolic/expression.h
@@ -5,7 +5,7 @@
 #include <list>
 #include "atidlas/types.h"
 #include "atidlas/value_scalar.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/tools/shared_ptr.hpp"

 namespace atidlas
--- a/include/atidlas/types.h
+++ b/include/atidlas/types.h
@@ -1,7 +1,7 @@
 #ifndef ATIDLAS_TYPES_H
 #define ATIDLAS_TYPES_H

-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/exception/unknown_datatype.h"

 namespace atidlas
--- a/include/atidlas/value_scalar.h
+++ b/include/atidlas/value_scalar.h
@@ -2,7 +2,7 @@
 #define ATIDLAS_VALUE_SCALAR_H

 #include "atidlas/types.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>

 namespace atidlas
 {
--- a/lib/array.cpp
+++ b/lib/array.cpp
@@ -1,7 +1,7 @@
 #include <cassert>

 #include "atidlas/array.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/exception/unknown_datatype.h"
 #include "atidlas/model/model.h"
 #include "atidlas/symbolic/execute.h"
@@ -131,7 +131,7 @@ int_t array::dsize() const
 array & array::operator=(array const & rhs)
 {
  array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), context_, dtype_, shape_);
-  cl::CommandQueue & queue = cl::get_queue(context_, 0);
+  cl::CommandQueue & queue = cl_ext::get_queue(context_, 0);
  model_map_t & mmap = atidlas::get_model_map(queue);
  execute(expression, mmap);
  return *this;
@@ -140,7 +140,7 @@ array & array::operator=(array const & rhs)
 array & array::operator=(array_expression const & rhs)
 {
  array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), shape_);
-  cl::CommandQueue & queue = cl::get_queue(context_, 0);
+  cl::CommandQueue & queue = cl_ext::get_queue(context_, 0);
  model_map_t & mmap = atidlas::get_model_map(queue);
  execute(expression, mmap);
  return *this;
@@ -243,7 +243,7 @@ namespace detail
 template<class T>
 void copy(cl::Context & ctx, cl::Buffer const & data, T value)
 {
-  cl::get_queue(ctx, 0).enqueueWriteBuffer(data, CL_TRUE, 0, sizeof(T), (void*)&value);
+  cl_ext::get_queue(ctx, 0).enqueueWriteBuffer(data, CL_TRUE, 0, sizeof(T), (void*)&value);
 }

 }
@@ -282,7 +282,7 @@ T scalar::cast() const
  int_t dtsize = size_of(dtype_);
 #define HANDLE_CASE(DTYPE, VAL) \
 case DTYPE:\
-  cl::get_queue(context_, 0).enqueueReadBuffer(data_, CL_TRUE, start_._1*dtsize, dtsize, (void*)&v.VAL);\
+  cl_ext::get_queue(context_, 0).enqueueReadBuffer(data_, CL_TRUE, start_._1*dtsize, dtsize, (void*)&v.VAL);\
  return v.VAL

  switch(dtype_)
@@ -305,7 +305,7 @@ case DTYPE:\

 scalar& scalar::operator=(value_scalar const & s)
 {
-  cl::CommandQueue& queue = cl::get_queue(context_, 0);
+  cl::CommandQueue& queue = cl_ext::get_queue(context_, 0);
  int_t dtsize = size_of(dtype_);

 #define HANDLE_CASE(TYPE, CLTYPE) case TYPE:\
@@ -727,7 +727,7 @@ void copy(void const * data, array& x, cl::CommandQueue & queue, bool blocking)
    x = tmp;
  }
  if(blocking)
-    cl::synchronize(x.context());
+    cl_ext::synchronize(x.context());
 }

 void copy(array const & x, void* data, cl::CommandQueue & queue, bool blocking)
@@ -744,14 +744,14 @@ void copy(array const & x, void* data, cl::CommandQueue & queue, bool blocking)
    queue.enqueueReadBuffer(tmp.data(), CL_FALSE, 0, tmp.dsize()*dtypesize, data);
  }
  if(blocking)
-    cl::synchronize(x.context());
+    cl_ext::synchronize(x.context());
 }

 void copy(void const *data, array &x, bool blocking)
-{ copy(data, x, cl::get_queue(x.context(), 0), blocking); }
+{ copy(data, x, cl_ext::get_queue(x.context(), 0), blocking); }

 void copy(array const & x, void* data, bool blocking)
-{ copy(x, data, cl::get_queue(x.context(), 0), blocking); }
+{ copy(x, data, cl_ext::get_queue(x.context(), 0), blocking); }

 //std::vector<>
 template<class T>
@@ -776,11 +776,11 @@ void copy(array const & x, std::vector<T> & cx, cl::CommandQueue & queue, bool b

 template<class T>
 void copy(std::vector<T> const & cx, array & x, bool blocking)
-{ copy(cx, x, cl::get_queue(x.context(), 0), blocking); }
+{ copy(cx, x, cl_ext::get_queue(x.context(), 0), blocking); }

 template<class T>
 void copy(array const & x, std::vector<T> & cx, bool blocking)
-{ copy(x, cx, cl::get_queue(x.context(), 0), blocking); }
+{ copy(x, cx, cl_ext::get_queue(x.context(), 0), blocking); }

 #define INSTANTIATE(T) \
  template void copy<T>(std::vector<T> const &, array &, cl::CommandQueue&, bool);\
--- a/lib/backend/templates/maxpy.cpp
+++ b/lib/backend/templates/maxpy.cpp
@@ -105,7 +105,7 @@ std::vector<int_t> maxpy::input_sizes(symbolic_expressions_container const & sym
 }

 void maxpy::enqueue(cl::CommandQueue & queue,
-             std::vector<cl::lazy_compiler> & programs,
+             std::vector<cl_ext::lazy_compiler> & programs,
             unsigned int label,
             symbolic_expressions_container const & symbolic_expressions)
 {
--- a/lib/backend/templates/mproduct.cpp
+++ b/lib/backend/templates/mproduct.cpp
@@ -568,7 +568,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
  void mproduct::enqueue_block(cl::CommandQueue & queue, int_t M, int_t N, int_t K,
                     array_infos const & A, array_infos const & B, array_infos const & C,
                     value_scalar const & alpha, value_scalar const & beta,
-                     std::vector<cl::lazy_compiler> & programs, unsigned int label, int id)
+                     std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, int id)
  {
    if (A.shape1==0 || A.shape2==0 || B.shape1==0 || B.shape2==0 || C.shape1==0 || C.shape2==0)
      return;
@@ -646,7 +646,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
    return infos(symbolic_expressions, d0, d1, d2);
  }

-  void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl::lazy_compiler> & programs, unsigned int label, symbolic_expressions_container const & symbolic_expressions)
+  void mproduct::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs, unsigned int label, symbolic_expressions_container const & symbolic_expressions)
  {
    using namespace tools;

--- a/lib/backend/templates/mreduction.cpp
+++ b/lib/backend/templates/mreduction.cpp
@@ -215,7 +215,7 @@ std::vector<int_t> mreduction::input_sizes(symbolic_expressions_container const
 }

 void mreduction::enqueue(cl::CommandQueue & queue,
-             std::vector<cl::lazy_compiler> & programs,
+             std::vector<cl_ext::lazy_compiler> & programs,
             unsigned int label,
             symbolic_expressions_container const & symbolic_expressions)
 {
--- a/lib/backend/templates/reduction.cpp
+++ b/lib/backend/templates/reduction.cpp
@@ -1,6 +1,6 @@
 #include <iostream>
 #include "atidlas/backend/templates/reduction.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/tools/to_string.hpp"
 #include "atidlas/tools/make_map.hpp"
 #include "atidlas/tools/make_vector.hpp"
@@ -281,7 +281,7 @@ std::vector<int_t> reduction::input_sizes(symbolic_expressions_container const &
 }

 void reduction::enqueue(cl::CommandQueue & queue,
-             std::vector<cl::lazy_compiler> & programs,
+             std::vector<cl_ext::lazy_compiler> & programs,
             unsigned int label,
             symbolic_expressions_container const & symbolic_expressions)
 {
--- a/lib/backend/templates/vaxpy.cpp
+++ b/lib/backend/templates/vaxpy.cpp
@@ -106,7 +106,7 @@ std::vector<int_t> vaxpy::input_sizes(symbolic_expressions_container const & sym
 }

 void vaxpy::enqueue(cl::CommandQueue & queue,
-             std::vector<cl::lazy_compiler> & programs,
+             std::vector<cl_ext::lazy_compiler> & programs,
             unsigned int label,
             symbolic_expressions_container const & symbolic_expressions)
 {
--- a/lib/cl/lazy_compiler.cpp
+++ b/lib/cl/lazy_compiler.cpp
@@ -3,7 +3,7 @@
 namespace atidlas
 {

-namespace cl
+namespace cl_ext
 {

 lazy_compiler::lazy_compiler(cl::Context const & ctx, std::string const & name, std::string const & src, bool force_recompilation) :
--- a/lib/cl/program_map.cpp
+++ b/lib/cl/program_map.cpp
@@ -9,7 +9,7 @@
 namespace atidlas
 {

-namespace cl
+namespace cl_ext
 {

 program_map::program_map()
--- a/lib/cl/queues.cpp
+++ b/lib/cl/queues.cpp
@@ -5,7 +5,7 @@
 namespace atidlas
 {

-namespace cl
+namespace cl_ext
 {

 void synchronize(cl::Context const & context)
--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -42,20 +42,20 @@ void model::fill_program_name(char* program_name, symbolic_expressions_container
  delete binder;
 }

-std::vector<cl::lazy_compiler>& model::init(symbolic_expressions_container const & symbolic_expressions, cl::Context const & context, cl::Device const & device, bool force_recompilation)
+std::vector<cl_ext::lazy_compiler>& model::init(symbolic_expressions_container const & symbolic_expressions, cl::Context const & context, cl::Device const & device, bool force_recompilation)
 {
  char program_name[256];
  fill_program_name(program_name, symbolic_expressions, BIND_TO_HANDLE);
  std::string pname(program_name);
-  std::vector<cl::lazy_compiler> & to_init = lazy_programs_[context()][pname];
+  std::vector<cl_ext::lazy_compiler> & to_init = lazy_programs_[context()][pname];
  if(to_init.empty())
  {
    std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();

-    to_init.push_back(cl::lazy_compiler(context, pname, force_recompilation));
+    to_init.push_back(cl_ext::lazy_compiler(context, pname, force_recompilation));
    to_init.back().add(define_extension(extensions, "cl_khr_fp64"));

-    to_init.push_back(cl::lazy_compiler(context, pname + "_fb", force_recompilation));
+    to_init.push_back(cl_ext::lazy_compiler(context, pname + "_fb", force_recompilation));
    to_init.back().add(define_extension(extensions, "cl_khr_fp64"));

    for(size_t i = 0 ; i < templates_.size() ; ++i)
@@ -86,7 +86,7 @@ void model::execute(symbolic_expressions_container const & symbolic_expressions,
  assert(context() == queue_.getInfo<CL_QUEUE_CONTEXT>()());
  cl::Device const & device = queue_.getInfo<CL_QUEUE_DEVICE>();

-  std::vector<cl::lazy_compiler> & compilers = init(symbolic_expressions, context, device, force_recompilation);
+  std::vector<cl_ext::lazy_compiler> & compilers = init(symbolic_expressions, context, device, force_recompilation);

  //Prediction
  std::vector<int_t> x = templates_[0]->input_sizes(symbolic_expressions);
@@ -114,7 +114,7 @@ void model::tune(symbolic_expressions_container const & symbolic_expressions)
  assert(context() == queue_.getInfo<CL_QUEUE_CONTEXT>()());
  cl::Device device = queue_.getInfo<CL_QUEUE_DEVICE>();

-  std::vector<cl::lazy_compiler> & compilers = init(symbolic_expressions, context, device, false);
+  std::vector<cl_ext::lazy_compiler> & compilers = init(symbolic_expressions, context, device, false);

  //Collect the timings
  std::vector<float> timings(templates_.size());
@@ -265,7 +265,7 @@ model_map_t init_models(cl::CommandQueue & queue)

 model_map_t& get_model_map(cl::CommandQueue & queue)
 {
-  std::map<cl::CommandQueue, model_map_t, cl::compare>::iterator it = models.find(queue);
+  std::map<cl::CommandQueue, model_map_t, cl_ext::compare>::iterator it = models.find(queue);
  if(it == models.end())
    return models.insert(std::make_pair(queue, init_models(queue))).first->second;
  return it->second;
@@ -277,6 +277,6 @@ model& get_model(cl::CommandQueue & queue, expression_type expression, numeric_t
  return *get_model_map(queue).at(key);
 }

-std::map<cl::CommandQueue, model_map_t, cl::compare> models;
+std::map<cl::CommandQueue, model_map_t, cl_ext::compare> models;

 }
--- a/lib/symbolic/execute.cpp
+++ b/lib/symbolic/execute.cpp
@@ -3,7 +3,7 @@
 #include <vector>
 #include "atidlas/types.h"
 #include "atidlas/array.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/model/model.h"
 #include "atidlas/symbolic/expression.h"

--- a/lib/symbolic/expression.cpp
+++ b/lib/symbolic/expression.cpp
@@ -2,7 +2,7 @@
 #include <vector>
 #include "atidlas/array.h"
 #include "atidlas/value_scalar.h"
-#include "atidlas/cl/cl.hpp"
+#include <CL/cl.hpp>
 #include "atidlas/symbolic/expression.h"

 namespace atidlas
--- a/python/pyatidlas/external/boost/include/boost/intrusive/detail/clear_on_destructor_base.hpp
+++ b/python/pyatidlas/external/boost/include/boost/intrusive/detail/clear_on_destructor_base.hpp
@@ -17,7 +17,7 @@ namespace boost {
 namespace intrusive {
 namespace detail {

-template<class Derived, bool DoClear = true>
+template<class Derived, bool DoClear = true>
 class clear_on_destructor_base
 {
   protected:
--- a/python/pyatidlas/src/_atidlas.cpp
+++ b/python/pyatidlas/src/_atidlas.cpp
@@ -77,12 +77,12 @@ bp::tuple get_shape(atd::array const & x)
 //  x.reshape(size1, size2);
 //}

-//boost::python::dict create_queues(atd::cl::queues_t queues)
+//boost::python::dict create_queues(atd::cl_ext::queues_t queues)
 //{
 //  boost::python::dict dictionary;
-//  for (atd::cl::queues_t::iterator it = queues.begin(); it != queues.end(); ++it) {
+//  for (atd::cl_ext::queues_t::iterator it = queues.begin(); it != queues.end(); ++it) {
 //    bp::list list;
-//    for (atd::cl::queues_t::mapped_type::iterator itt = it->second.begin(); itt != it->second.end(); ++itt)
+//    for (atd::cl_ext::queues_t::mapped_type::iterator itt = it->second.begin(); itt != it->second.end(); ++itt)
 //      list.append(*itt);
 //    dictionary[it->first] = list;
 //  }
@@ -175,7 +175,7 @@ namespace detail
    return res;
  }

-  bp::list nv_compute_capability(atd::cl::Device const & device)
+  bp::list nv_compute_capability(cl::Device const & device)
  {
    bp::list res;
    res.append(device.getInfo<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>());
@@ -185,20 +185,20 @@ namespace detail

  bp::list get_platforms()
  {
-    std::vector<atd::cl::Platform> platforms;
-    atd::cl::Platform::get(&platforms);
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
    return to_list(platforms.begin(), platforms.end());
  }

-  bp::list get_devices(atd::cl::Platform const & platform)
+  bp::list get_devices(cl::Platform const & platform)
  {
-    std::vector<atd::cl::Device> devices;
+    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    return to_list(devices.begin(), devices.end());
  }

-  std::vector<atd::cl::CommandQueue> & get_queue(atd::cl::Context const & ctx)
-  { return atd::cl::get_queues(ctx); }
+  std::vector<cl::CommandQueue> & get_queue(cl::Context const & ctx)
+  { return atd::cl_ext::get_queues(ctx); }

  atd::numeric_type extract_dtype(bp::object const & odtype)
  {
@@ -272,27 +272,27 @@ namespace detail
      }
  };

-  atd::cl::Platform get_platform(atd::cl::Device const & device)
-  {  return atd::cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>());  }
+  cl::Platform get_platform(cl::Device const & device)
+  {  return cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>());  }

  template<cl_int INFO>
-  typename atd::cl::detail::param_traits<atd::cl::detail::cl_device_info, INFO>::param_type
-  wrap_device_info(atd::cl::Device const & x)
+  typename cl::detail::param_traits<cl::detail::cl_device_info, INFO>::param_type
+  wrap_device_info(cl::Device const & x)
  { return x.getInfo<INFO>(NULL); }

  template<cl_int INFO>
-  typename atd::cl::detail::param_traits<atd::cl::detail::cl_context_info, INFO>::param_type
-  wrap_context_info(atd::cl::Context const & x)
+  typename cl::detail::param_traits<cl::detail::cl_context_info, INFO>::param_type
+  wrap_context_info(cl::Context const & x)
  { return x.getInfo<INFO>(NULL); }

  template<cl_int INFO>
-  typename atd::cl::detail::param_traits<atd::cl::detail::cl_platform_info, INFO>::param_type
-  wrap_platform_info(atd::cl::Platform const & x)
+  typename cl::detail::param_traits<cl::detail::cl_platform_info, INFO>::param_type
+  wrap_platform_info(cl::Platform const & x)
  { return x.getInfo<INFO>(NULL); }

  template<cl_int INFO>
-  typename atd::cl::detail::param_traits<atd::cl::detail::cl_command_queue_info, INFO>::param_type
-  wrap_command_queue_info(atd::cl::CommandQueue const & x)
+  typename cl::detail::param_traits<cl::detail::cl_command_queue_info, INFO>::param_type
+  wrap_command_queue_info(cl::CommandQueue const & x)
  { return x.getInfo<INFO>(NULL); }


@@ -309,7 +309,7 @@ namespace detail

 void export_cl()
 {
-  typedef std::vector<atd::cl::CommandQueue> queues_t;
+  typedef std::vector<cl::CommandQueue> queues_t;
  bp::class_<queues_t>("queues")
      .def("__len__", &queues_t::size)
      .def("__getitem__", &bp::vector_indexing_suite<queues_t>::get_item, bp::return_internal_reference<>())
@@ -333,14 +333,14 @@ void export_cl()
  bp::def("device_type_to_string", &detail::to_string);


-  bp::class_<atd::cl::Platform>("platform", bp::no_init)
+  bp::class_<cl::Platform>("platform", bp::no_init)
    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_platform_info<NAME>)
      WRAP("name", CL_PLATFORM_NAME)
    #undef WRAP
      .def("get_devices", &detail::get_devices)
      ;

-  bp::class_<atd::cl::Device>("device", bp::no_init)
+  bp::class_<cl::Device>("device", bp::no_init)
    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_device_info<NAME>)
      .add_property("nv_compute_capability", &detail::nv_compute_capability)
      .add_property("platform", &detail::get_platform)
@@ -351,20 +351,20 @@ void export_cl()
    #undef WRAP
      ;

-  bp::class_<atd::cl::Context>("context", bp::init<atd::cl::Device>())
+  bp::class_<cl::Context>("context", bp::init<cl::Device>())
    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_context_info<NAME>)
    #undef WRAP
      .add_property("queues", bp::make_function(&detail::get_queue, bp::return_internal_reference<>()))
      ;

-  bp::class_<atd::cl::CommandQueue>("command_queue", bp::init<atd::cl::Context, atd::cl::Device>())
+  bp::class_<cl::CommandQueue>("command_queue", bp::init<cl::Context, cl::Device>())
    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_command_queue_info<NAME>)
      WRAP("device", CL_QUEUE_DEVICE)
    #undef WRAP
      .add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
      ;

-  bp::def("synchronize", &atd::cl::synchronize);
+  bp::def("synchronize", &atd::cl_ext::synchronize);
  bp::def("get_platforms", &detail::get_platforms);

 }
@@ -372,7 +372,7 @@ void export_cl()
 namespace detail
 {
  boost::shared_ptr<atd::array>
-  ndarray_to_atdarray(const np::ndarray& array, const atd::cl::Context& ctx)
+  ndarray_to_atdarray(const np::ndarray& array, const cl::Context& ctx)
  {

    int d = array.get_nd();
@@ -393,12 +393,12 @@ namespace detail



-  boost::shared_ptr<atd::array> create_array(bp::object const & obj, bp::object odtype, atd::cl::Context context)
+  boost::shared_ptr<atd::array> create_array(bp::object const & obj, bp::object odtype, cl::Context context)
  {
    return ndarray_to_atdarray(np::from_object(obj, to_np_dtype(extract_dtype(odtype))), context);
  }

-  boost::shared_ptr<atd::array> create_empty_array(bp::object sizes, bp::object odtype, atd::cl::Context context)
+  boost::shared_ptr<atd::array> create_empty_array(bp::object sizes, bp::object odtype, cl::Context context)
  {
      typedef boost::shared_ptr<atd::array> result_type;

@@ -435,7 +435,7 @@ namespace detail
      return bp::extract<std::string>(obj.attr("__class__").attr("__name__"))();
  }

-  boost::shared_ptr<atd::scalar> construct_scalar(bp::object obj, atd::cl::Context const & context)
+  boost::shared_ptr<atd::scalar> construct_scalar(bp::object obj, cl::Context const & context)
  {
    typedef boost::shared_ptr<atd::scalar> result_type;
    std::string name = type_name(obj);
@@ -504,7 +504,7 @@ void export_array()
  bp::class_<atd::array,
          boost::shared_ptr<atd::array> >
  ( "array", bp::no_init)
-      .def("__init__", bp::make_constructor(detail::create_array, bp::default_call_policies(), (bp::arg("obj"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=atd::cl::default_context())))
+      .def("__init__", bp::make_constructor(detail::create_array, bp::default_call_policies(), (bp::arg("obj"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=atd::cl_ext::default_context())))
      .def(bp::init<atd::array_expression>())
      .add_property("dtype", &atd::array::dtype)
      .add_property("context", bp::make_function(&atd::array::context, bp::return_internal_reference<>()))
@@ -527,11 +527,11 @@ void export_array()

  bp::class_<atd::scalar, bp::bases<atd::array> >
      ("scalar", bp::no_init)
-      .def("__init__", bp::make_constructor(detail::construct_scalar, bp::default_call_policies(), (bp::arg(""), bp::arg("context")=atd::cl::default_context())))
+      .def("__init__", bp::make_constructor(detail::construct_scalar, bp::default_call_policies(), (bp::arg(""), bp::arg("context")=atd::cl_ext::default_context())))
      ;

  //Other numpy-like initializers
-  bp::def("empty", &detail::create_empty_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=atd::cl::default_context()));
+  bp::def("empty", &detail::create_empty_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=atd::cl_ext::default_context()));

 //Binary
 #define MAP_FUNCTION(name) \
@@ -593,7 +593,7 @@ void export_scalar()
 void export_model()
 {

-  bp::class_<atidlas::model>("model", bp::init<atd::base const &, atd::cl::CommandQueue&>())
+  bp::class_<atidlas::model>("model", bp::init<atd::base const &, cl::CommandQueue&>())
                  .def("execute", &atd::model::execute);
  
  bp::enum_<atidlas::fetching_policy_type>
--- a/tests/maxpy.cpp
+++ b/tests/maxpy.cpp
@@ -13,7 +13,7 @@ void test(T epsilon, simple_matrix_base<T> & cA, simple_matrix_base<T>& cB, simp
  using namespace std;

  int failure_count = 0;
-  ad::cl::Context const & ctx = C.context();
+  cl::Context const & ctx = C.context();

  int_t M = cC.size1();
  int_t N = cC.size2();
@@ -94,7 +94,7 @@ void test(T epsilon, simple_matrix_base<T> & cA, simple_matrix_base<T>& cB, simp
 }

 template<typename T>
-void test_impl(T epsilon, ad::cl::Context const & ctx)
+void test_impl(T epsilon, cl::Context const & ctx)
 {
  using atidlas::_;

@@ -119,9 +119,9 @@ void test_impl(T epsilon, ad::cl::Context const & ctx)

 int main()
 {
-  for(ad::cl::queues_t::iterator it = ad::cl::queues.begin() ; it != ad::cl::queues.end() ; ++it)
+  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
  {
-    ad::cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
--- a/tests/mproduct.cpp
+++ b/tests/mproduct.cpp
@@ -53,7 +53,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
 }

 template<typename T>
-void test_impl(T epsilon, ad::cl::Context const & ctx)
+void test_impl(T epsilon, cl::Context const & ctx)
 {
  int_t M = 412;
  int_t N = 245;
@@ -75,9 +75,9 @@ void test_impl(T epsilon, ad::cl::Context const & ctx)

 int main()
 {
-  for(ad::cl::queues_t::iterator it = ad::cl::queues.begin() ; it != ad::cl::queues.end() ; ++it)
+  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
  {
-    ad::cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
--- a/tests/mreduction.cpp
+++ b/tests/mreduction.cpp
@@ -46,7 +46,7 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
 }

 template<typename T>
-void test_impl(T epsilon, ad::cl::Context const & ctx)
+void test_impl(T epsilon, cl::Context const & ctx)
 {
  int_t M = 1324;
  int_t N = 1143;
@@ -65,9 +65,9 @@ void test_impl(T epsilon, ad::cl::Context const & ctx)

 int main()
 {
-  for(ad::cl::queues_t::iterator it = ad::cl::queues.begin() ; it != ad::cl::queues.end() ; ++it)
+  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
  {
-    ad::cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
--- a/tests/reduction.cpp
+++ b/tests/reduction.cpp
@@ -12,7 +12,7 @@ void test_reduction(T epsilon,  simple_vector_base<T> & cx, simple_vector_base<T
                                ad::array & x, ad::array & y)
 {
  using namespace std;
-  ad::cl::Context const & ctx = x.context();
+  cl::Context const & ctx = x.context();
  int_t N = cx.size();
  unsigned int failure_count = 0;

@@ -52,7 +52,7 @@ void test_reduction(T epsilon,  simple_vector_base<T> & cx, simple_vector_base<T
 }

 template<typename T>
-void test_impl(T epsilon, ad::cl::Context const & ctx)
+void test_impl(T epsilon, cl::Context const & ctx)
 {
  using atidlas::_;

@@ -74,9 +74,9 @@ void test_impl(T epsilon, ad::cl::Context const & ctx)

 int main()
 {
-  for(ad::cl::queues_t::iterator it = ad::cl::queues.begin() ; it != ad::cl::queues.end() ; ++it)
+  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
  {
-    ad::cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
--- a/tests/vaxpy.cpp
+++ b/tests/vaxpy.cpp
@@ -14,7 +14,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect

  int failure_count = 0;
  ad::numeric_type dtype = x.dtype();
-  ad::cl::Context const & ctx = x.context();
+  cl::Context const & ctx = x.context();

  int_t N = cz.size();

@@ -89,7 +89,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
 }

 template<typename T>
-void test_impl(T epsilon, ad::cl::Context const & ctx)
+void test_impl(T epsilon, cl::Context const & ctx)
 {
  using atidlas::_;

@@ -114,9 +114,9 @@ void test_impl(T epsilon, ad::cl::Context const & ctx)

 int main()
 {
-  for(ad::cl::queues_t::iterator it = ad::cl::queues.begin() ; it != ad::cl::queues.end() ; ++it)
+  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
  {
-    ad::cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;