Ported to C++11

2015-02-04 22:06:15 -05:00
parent 3a296ae3b7
commit bbf2f0188e
40 changed files with 413 additions and 563 deletions
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -46,13 +46,13 @@ void bench(ad::numeric_type dtype)
  /*--BLAS1--*/
  /*---------*/
  std::cout << "#AXPY" << std::endl;
-  for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
+  for(auto N : BLAS1_N)
  {
-    int_t N = *it;
+    
    std::cout << N;
    /* ATIDLAS */
    ad::array x(N, dtype), y(N, dtype);
-    cl::CommandQueue & queue = ad::cl_ext::get_queue(x.context(), 0);
+    cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
    ad::model & model = ad::get_model(queue, ad::VECTOR_AXPY_TYPE, dtype);
    ad::array_expression E = ad::detail::assign(y, x + y);
    model.tune(E);
@@ -178,15 +178,16 @@ int main(int argc, char* argv[])
 #endif

  int device_idx = 0;
-  if(ad::cl_ext::queues.size()>1){
-    ad::cl_ext::queues_t & queues = ad::cl_ext::queues;
+  ad::cl_ext::queues_type::data_type const & queues = ad::cl_ext::queues.data();
+
+  if(queues.size()>1){
    if(argc!=2)
    {
      std::cerr << "usage : blas-bench [DEVICE_IDX]" << std::endl;
      std::cout << "Devices available: " << std::endl;
      unsigned int current=0;
-      for(ad::cl_ext::queues_t::const_iterator it = queues.begin() ; it != queues.end() ; ++it){
-        cl::Device device = it->first.getInfo<CL_CONTEXT_DEVICES>()[0];
+      for(const auto & queue : queues){
+        cl::Device device = queue.first.getInfo<CL_CONTEXT_DEVICES>()[0];
        std::cout << current++ << ": " << device.getInfo<CL_DEVICE_NAME>() << "(" << cl::Platform(device.getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_NAME>() << ")" << std::endl;
      }
      exit(EXIT_FAILURE);
--- a/bench/common.hpp
+++ b/bench/common.hpp
@@ -3,6 +3,7 @@

 #include "vector"
 #include <cmath>
+#include <algorithm>

 int ceil(int N, int pad)
 {
--- a/bench/overhead.cpp
+++ b/bench/overhead.cpp
@@ -12,9 +12,9 @@ __global__ void dummy(){}

 int main()
 {
-  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
+  for(const auto & elem : ad::cl_ext::queues.data())
  {
-    cl::CommandQueue queue = it->second[0];
+    cl::CommandQueue queue = elem.second[0];
    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    cl::Program program("__kernel void dummy(){}");
                program.build();
--- a/include/atidlas/array.h
+++ b/include/atidlas/array.h
@@ -4,7 +4,7 @@
 #include <iostream>
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
-#include "atidlas/cl/queues.h"
+#include "atidlas/cl_ext/backend.h"
 #include "atidlas/symbolic/expression.h"


@@ -12,11 +12,10 @@ namespace atidlas
 {

 class scalar;
-class array: public obj_base
+
+class array: public array_base
 {
  friend array reshape(array const &, int_t, int_t);
-  friend array reshape(array_expression const &, int_t, int_t);
-  static array_infos init_infos(numeric_type dtype, cl_mem data, int_t shape1, int_t shape2, int_t start1, int_t start2, int_t stride1, int_t stride2, int_t ld);
 public:
  //1D Constructors
  array(int_t size1, numeric_type dtype, cl::Context context = cl_ext::default_context());
@@ -32,15 +31,15 @@ public:

  //General constructor
  array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
-  array(control const & proxy);
+  array(array_expression const & proxy);
  array(array const &);

  //Getters
  numeric_type dtype() const;
  size4 shape() const;
+  int_t nshape() const;
  size4 start() const;
  size4 stride() const;
-  int_t nshape() const;
  int_t ld() const;
  cl::Context const & context() const;
  cl::Buffer const & data() const;
@@ -51,7 +50,8 @@ public:

  //Numeric operators
  array& operator=(array const &);
-  array& operator=(control const &);
+  array& operator=(array_expression const &);
+
  template<class T> array & operator=(std::vector<T> const & rhs);

  array_expression operator-();
@@ -78,9 +78,15 @@ public:

  array_expression T() const;
 protected:
+  numeric_type dtype_;
+
+  size4 shape_;
+  size4 start_;
+  size4 stride_;
+  int_t ld_;
+
  cl::Context context_;
  cl::Buffer data_;
-  array_infos infos_;
 };

 class scalar : public array
@@ -91,7 +97,7 @@ public:
  explicit scalar(numeric_type dtype, cl::Buffer const & data, int_t offset, cl::Context context = cl_ext::default_context());
  explicit scalar(value_scalar value, cl::Context context = cl_ext::default_context());
  explicit scalar(numeric_type dtype, cl::Context context = cl_ext::default_context());
-  scalar(control const & proxy);
+  scalar(array_expression const & proxy);
  scalar& operator=(value_scalar const &);
 //  scalar& operator=(scalar const & s);
  using array::operator =;
--- a/include/atidlas/backend/mapped_object.h
+++ b/include/atidlas/backend/mapped_object.h
@@ -20,7 +20,7 @@ enum leaf_t
 class mapped_object;

 typedef std::pair<int_t, leaf_t> mapping_key;
-typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;
+typedef std::map<mapping_key, std::shared_ptr<mapped_object> > mapping_type;

 /** @brief Mapped Object
 *
--- a/include/atidlas/backend/templates/base.h
+++ b/include/atidlas/backend/templates/base.h
@@ -9,7 +9,7 @@
 #include "atidlas/backend/parse.h"
 #include "atidlas/backend/stream.h"
 #include <CL/cl.hpp>
-#include "atidlas/cl/lazy_compiler.h"
+#include "atidlas/cl_ext/lazy_compiler.h"
 #include "atidlas/symbolic/expression.h"

 namespace atidlas
@@ -75,15 +75,15 @@ protected:
    /** @brief Accessor for the numeric type */
    numeric_type get_numeric_type(atidlas::array_expression const * array_expression, int_t root_idx) const;
    /** @brief Creates a binary leaf */
-    template<class T> tools::shared_ptr<mapped_object> binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const;
+    template<class T> std::shared_ptr<mapped_object> binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const;
    /** @brief Creates a value scalar mapping */
-    tools::shared_ptr<mapped_object> create(numeric_type dtype, values_holder) const;
+    std::shared_ptr<mapped_object> create(numeric_type dtype, values_holder) const;
    /** @brief Creates a vector mapping */
-    tools::shared_ptr<mapped_object> create(array_infos const &) const;
+    std::shared_ptr<mapped_object> create(array_infos const &) const;
    /** @brief Creates a tuple mapping */
-    tools::shared_ptr<mapped_object> create(repeat_infos const &) const;
+    std::shared_ptr<mapped_object> create(repeat_infos const &) const;
    /** @brief Creates a mapping */
-    tools::shared_ptr<mapped_object> create(lhs_rhs_element const &) const;
+    std::shared_ptr<mapped_object> create(lhs_rhs_element const &) const;
  public:
    map_functor(symbolic_binder & binder, mapping_type & mapping);
    /** @brief Functor for traversing the tree */
@@ -143,7 +143,7 @@ protected:
  static bool is_reduction(array_expression::node const & node);
  static bool is_index_reduction(op_element const & op);

-  tools::shared_ptr<symbolic_binder> make_binder();
+  std::shared_ptr<symbolic_binder> make_binder();
  static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr);
  static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr);
  static std::string append_width(std::string const & str, unsigned int width);
@@ -164,7 +164,7 @@ public:
  virtual int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const = 0;
  virtual void enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler> & programs,
                       unsigned int label, expressions_tuple const & expressions, operation_cache* cache = NULL) = 0;
-  virtual tools::shared_ptr<base> clone() const = 0;
+  virtual std::shared_ptr<base> clone() const = 0;
 private:
  binding_policy_t binding_policy_;
 };
@@ -180,7 +180,7 @@ public:
  base_impl(parameters_type const & parameters, binding_policy_t binding_policy);
  int_t local_size_0() const;
  int_t local_size_1() const;
-  tools::shared_ptr<base> clone() const;
+  std::shared_ptr<base> clone() const;
  /** @brief returns whether or not the profile has undefined behavior on particular device */
  int check_invalid(expressions_tuple const & expressions, cl::Device const & device) const;
 protected:
--- a/include/atidlas/cl/queues.h
+++ b/include/atidlas/cl/queues.h
@@ -1,31 +0,0 @@
-#ifndef ATIDLAS_CL_QUEUES_H
-#define ATIDLAS_CL_QUEUES_H
-
-#include <map>
-#include <CL/cl.hpp>
-#include "atidlas/cl/compare.hpp"
-
-namespace atidlas
-{
-namespace cl_ext
-{
-
-typedef std::map<std::pair<cl_program, unsigned int>, cl::Kernel> kernels_t;
-typedef std::vector<std::pair<cl::Context, std::vector<cl::CommandQueue> > > queues_t;
-
-extern kernels_t kernels;
-extern queues_t queues;
-extern unsigned int default_context_idx;
-extern cl_command_queue_properties queue_properties;
-
-
-void synchronize(cl::Context const & context);
-cl::Context default_context();
-cl::CommandQueue & get_queue(cl::Context const &, std::size_t);
-cl::Device get_device(cl::CommandQueue &);
-std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx);
-
-}
-}
-
-#endif
--- a/include/atidlas/cl_ext/backend.h
+++ b/include/atidlas/cl_ext/backend.h
@@ -0,0 +1,40 @@
+#ifndef ATIDLAS_CL_QUEUES_H
+#define ATIDLAS_CL_QUEUES_H
+
+#include <map>
+#include <list>
+#include <CL/cl.hpp>
+
+namespace atidlas
+{
+namespace cl_ext
+{
+
+class queues_type
+{
+private:
+  void append(cl::Context const &);
+  void init();
+public:
+  typedef std::list<std::pair<cl::Context, std::vector<cl::CommandQueue> > > data_type;
+  std::vector<cl::CommandQueue> & operator[](cl::Context const &);
+  cl::Context default_context();
+  data_type const & data();
+private:
+  data_type data_;
+};
+
+typedef std::map<std::pair<cl_program, unsigned int>, cl::Kernel> kernels_type;
+
+extern kernels_type kernels;
+extern queues_type queues;
+extern unsigned int default_context_idx;
+extern cl_command_queue_properties queue_properties;
+
+void synchronize(cl::Context const & context);
+cl::Context default_context();
+
+}
+}
+
+#endif
--- a/include/atidlas/cl_ext/compare.hpp
+++ b/include/atidlas/cl_ext/compare.hpp
--- a/include/atidlas/cl_ext/lazy_compiler.h
+++ b/include/atidlas/cl_ext/lazy_compiler.h
@@ -2,7 +2,7 @@
 #define ATIDLAS_CL_LAZY_COMPILER_H

 #include <CL/cl.hpp>
-#include "atidlas/cl/program_map.h"
+#include "atidlas/cl_ext/program_map.h"

 namespace atidlas
 {
--- a/include/atidlas/cl_ext/program_map.h
+++ b/include/atidlas/cl_ext/program_map.h
--- a/include/atidlas/model/model.h
+++ b/include/atidlas/model/model.h
@@ -6,8 +6,8 @@
 #include <map>

 #include "atidlas/backend/templates/base.h"
-#include "atidlas/cl/compare.hpp"
-#include "atidlas/cl/lazy_compiler.h"
+#include "atidlas/cl_ext/compare.hpp"
+#include "atidlas/cl_ext/lazy_compiler.h"
 #include "atidlas/model/predictors/random_forest.h"
 #include "atidlas/symbolic/expression.h"

@@ -16,7 +16,7 @@ namespace atidlas

  class model
  {
-    typedef std::vector< tools::shared_ptr<base> > templates_container;
+    typedef std::vector< std::shared_ptr<base> > templates_container;
  public:
    struct runtime_options
    {
@@ -34,8 +34,8 @@ namespace atidlas
    std::vector<cl_ext::lazy_compiler>& init(expressions_tuple const & expressions, runtime_options const & opt = runtime_options());

  public:
-    model(predictors::random_forest const &, std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
-    model(std::vector< tools::shared_ptr<base> > const &, cl::CommandQueue &);
+    model(predictors::random_forest const &, std::vector< std::shared_ptr<base> > const &, cl::CommandQueue &);
+    model(std::vector< std::shared_ptr<base> > const &, cl::CommandQueue &);
    model(base const &, cl::CommandQueue &);

    void execute(expressions_tuple const &, operation_cache * cache = NULL, runtime_options const & opt = runtime_options());
@@ -44,13 +44,13 @@ namespace atidlas
    templates_container const & templates() const;
  private:
    templates_container templates_;
-    tools::shared_ptr<predictors::random_forest> predictor_;
+    std::shared_ptr<predictors::random_forest> predictor_;
    std::map<std::vector<int_t>, int> hardcoded_;
    std::map<cl_context, std::map<std::string, std::vector<cl_ext::lazy_compiler> > > lazy_programs_;
    cl::CommandQueue & queue_;
  };

-  typedef std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<model> > model_map_t;
+  typedef std::map<std::pair<expression_type, numeric_type>, std::shared_ptr<model> > model_map_t;

  model_map_t init_models(cl::CommandQueue const & queue);
  model_map_t& get_model_map(cl::CommandQueue & queue);
--- a/include/atidlas/symbolic/expression.h
+++ b/include/atidlas/symbolic/expression.h
@@ -6,7 +6,7 @@
 #include <CL/cl.hpp>
 #include "atidlas/types.h"
 #include "atidlas/value_scalar.h"
-#include "atidlas/tools/shared_ptr.hpp"
+#include <memory>

 namespace atidlas
 {
@@ -168,7 +168,7 @@ void fill(lhs_rhs_element & x, array const & a);
 void fill(lhs_rhs_element & x, value_scalar const & v);
 void fill(lhs_rhs_element & x, repeat_infos const & r);

-class array_expression
+class array_expression : public array_base
 {
 public:
  struct node
@@ -208,21 +208,21 @@ private:
  size4 shape_;
 };

-class control
+template<class TYPE>
+class controller
 {
 public:
-  control(array_expression const & x, cl::Event* event = NULL, std::vector<cl::Event>* dependencies = NULL,
+  controller(TYPE const & x, cl::Event* event = NULL, std::vector<cl::Event>* dependencies = NULL,
          cl::CommandQueue* queue = NULL, operation_cache* cache = NULL) : x_(x), event_(event), dependencies_(dependencies), queue_(queue), cache_(cache){}

-  array_expression const & expression() const { return x_; }
+  TYPE const & x() const { return x_; }
  cl::Event* event() const { return event_; }
  std::vector<cl::Event>* dependencies() const { return dependencies_; }
  cl::CommandQueue* queue() const { return queue_; }
  operation_cache* cache() const { return cache_; }

 private:
-  array_expression const & x_;
-
+  TYPE const & x_;
  cl::Event* event_;
  std::vector<cl::Event>* dependencies_;
  cl::CommandQueue* queue_;
@@ -232,9 +232,9 @@ private:
 class expressions_tuple
 {
 private:
-  tools::shared_ptr<array_expression> create(array_expression const & s);
+  std::shared_ptr<array_expression> create(array_expression const & s);
 public:
-  typedef std::list<tools::shared_ptr<array_expression> > data_type;
+  typedef std::list<std::shared_ptr<array_expression> > data_type;
  enum order_type { SEQUENTIAL, INDEPENDENT };

  expressions_tuple(array_expression const & s0);
--- a/include/atidlas/tools/shared_ptr.hpp
+++ b/include/atidlas/tools/shared_ptr.hpp
@@ -1,162 +0,0 @@
-#ifndef ATIDLAS_TOOLS_SHARED_PTR_HPP
-#define ATIDLAS_TOOLS_SHARED_PTR_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2012, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file tools/shared_ptr.hpp
-    @brief Implementation of a shared pointer class (cf. std::shared_ptr, boost::shared_ptr). Will be used until C++11 is widely available.
-
-    Contributed by Philippe Tillet.
-*/
-
-#include <cstdlib>
-#include <algorithm>
-
-namespace atidlas
-{
-namespace tools
-{
-namespace detail
-{
-
-  /** @brief Reference counting class for the shared_ptr implementation */
-  class count
-  {
-  public:
-    count(unsigned int val) : val_(val){ }
-    void dec(){ --val_; }
-    void inc(){ ++val_; }
-    bool is_null(){ return val_ == 0; }
-    unsigned int val(){ return val_; }
-  private:
-    unsigned int val_;
-  };
-
-  /** @brief Interface for the reference counter inside the shared_ptr */
-  struct aux
-  {
-    detail::count count;
-
-    aux() :count(1) {}
-    virtual void destroy()=0;
-    virtual ~aux() {}
-  };
-
-  /** @brief Implementation helper for the reference counting mechanism inside shared_ptr. */
-  template<class U, class Deleter>
-  struct auximpl: public detail::aux
-  {
-    U* p;
-    Deleter d;
-
-    auximpl(U* pu, Deleter x) :p(pu), d(x) {}
-    virtual void destroy() { d(p); }
-  };
-
-  /** @brief Default deleter class for a pointer. The default is to just call 'delete' on the pointer. Provide your own implementations for 'delete[]' and 'free'. */
-  template<class U>
-  struct default_deleter
-  {
-    void operator()(U* p) const { delete p; }
-  };
-
-}
-
-class shared_ptr_base
-{
-protected:
-  detail::aux* pa;
-public:
-  unsigned int count() { return pa->count.val(); }
-};
-
-/** @brief A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependency. Will be replaced by std::shared_ptr as soon as C++11 is widely available. */
-template<class T>
-class shared_ptr : public shared_ptr_base
-{
-  template<class U>
-  friend class shared_ptr;
-
-  detail::aux* pa;
-  T* pt;
-
-public:
-
-  shared_ptr() :pa(NULL), pt(NULL) {}
-
-  template<class U, class Deleter>
-  shared_ptr(U* pu, Deleter d) : pa(new detail::auximpl<U, Deleter>(pu, d)), pt(pu) {}
-
-  template<class U>
-  explicit shared_ptr(U* pu) : pa(new detail::auximpl<U, detail::default_deleter<U> >(pu, detail::default_deleter<U>())), pt(pu) {}
-
-  template<class U>
-  shared_ptr(const shared_ptr<U>& s) :pa(s.pa), pt(s.pt)  { inc(); }
-
-  shared_ptr(const shared_ptr& s) :pa(s.pa), pt(s.pt)  { inc(); }
-  ~shared_ptr() { dec(); }
-
-  T* get() const {  return pt; }
-  T* operator->() const {  return pt; }
-  T& operator*() const { return *pt; }
-
-  void reset() { shared_ptr<T>().swap(*this); }
-  void reset(T * ptr) { shared_ptr<T>(ptr).swap(*this); }
-
-  void swap(shared_ptr<T> & other)
-  {
-    std::swap(pt,other.pt);
-    std::swap(pa, other.pa);
-  }
-
-  shared_ptr& operator=(const shared_ptr& s)
-  {
-    if (this!=&s)
-    {
-      dec();
-      pa = s.pa;
-      pt = s.pt;
-      inc();
-    }
-    return *this;
-  }
-
-  void inc()
-  {
-    if (pa) pa->count.inc();
-  }
-
-  void dec()
-  {
-    if (pa)
-    {
-      pa->count.dec();
-      if (pa->count.is_null())
-      {
-        pa->destroy();
-        delete pa;
-        pa = NULL;
-      }
-    }
-  }
-};
-
-}
-}
-
-#endif
--- a/include/atidlas/types.h
+++ b/include/atidlas/types.h
@@ -83,8 +83,8 @@ public:

  void enqueue()
  {
-    for(std::list<infos>::iterator it = l_.begin() ; it != l_.end() ; ++it)
-      it->queue.enqueueNDRangeKernel(it->kernel, it->offset, it->grange, it->lrange);
+    for(infos & elem : l_)
+      elem.queue.enqueueNDRangeKernel(elem.kernel, elem.offset, elem.grange, elem.lrange);
  }

 private:
@@ -173,7 +173,8 @@ struct slice
 };
 typedef slice _;

-class obj_base{};
+class array_base{ };
+

 }
 #endif
--- a/lib/array.cpp
+++ b/lib/array.cpp
@@ -11,40 +11,27 @@
 namespace atidlas
 {

-array_infos array::init_infos(numeric_type dtype, cl_mem data, int_t shape1, int_t shape2, int_t start1, int_t start2, int_t stride1, int_t stride2, int_t ld)
-{
-  array_infos res;
-  res.dtype = dtype;
-  res.data = data;
-  res.shape1 = shape1;
-  res.shape2 = shape2;
-  res.start1 = start1;
-  res.start2 = start2;
-  res.stride1 = stride1;
-  res.stride2 = stride2;
-  res.ld = ld;
-  return res;
-}
+
 /*--- Constructors ---*/

 //1D Constructors

 array::array(int_t size1, numeric_type dtype, cl::Context context) :
-  context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype)*size1),
-  infos_(init_infos(dtype, data_(), size1, 1, 0, 0, 1, 1, size1))
-{}
+  dtype_(dtype), shape_(size1, 1), start_(0, 0), stride_(1, 1), ld_(shape_._1),
+  context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype)*dsize())
+{ }

 template<class DT>
 array::array(std::vector<DT> const & x, cl::Context context):
-  context_(context), data_(context, CL_MEM_READ_WRITE, size_of(to_numeric_type<DT>::value)*x.size()),
-  infos_(init_infos(to_numeric_type<DT>::value, data_(), x.size(), 1, 0, 0, 1, 1, x.size()))
+  dtype_(to_numeric_type<DT>::value), shape_(x.size(), 1), start_(0, 0), stride_(1, 1), ld_(shape_._1),
+  context_(context), data_(context, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 { *this = x; }

-array::array(array & v, slice const & s1) : context_(v.data_.getInfo<CL_MEM_CONTEXT>()), data_(v.data_),
- infos_(init_infos(v.infos_.dtype, data_(), s1.size, 1, v.infos_.start1 + v.infos_.stride1*s1.start, 0, v.infos_.stride1*s1.stride, 1, v.infos_.ld))
+array::array(array & v, slice const & s1) : dtype_(v.dtype_), shape_(s1.size, 1), start_(v.start_._1 + v.stride_._1*s1.start, 0), stride_(v.stride_._1*s1.stride, 1),
+                                            ld_(v.ld_), context_(v.data_.getInfo<CL_MEM_CONTEXT>()), data_(v.data_)
 {}

-#define INSTANTIATE(T) template array::array<T>(std::vector<T> const &, cl::Context)
+#define INSTANTIATE(T) template array::array(std::vector<T> const &, cl::Context)
 INSTANTIATE(cl_char);
 INSTANTIATE(cl_uchar);
 INSTANTIATE(cl_short);
@@ -58,26 +45,26 @@ INSTANTIATE(cl_double);
 #undef INSTANTIATE

 // 2D
-array::array(int_t size1, int_t size2, numeric_type dtype, cl::Context context) :
-context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype)*size1*size2),
-infos_(init_infos(dtype, data_(), size1, size2, 0, 0, 1, 1, size1))
+array::array(int_t size1, int_t size2, numeric_type dtype, cl::Context context) : dtype_(dtype), shape_(size1, size2), start_(0, 0), stride_(1, 1), ld_(size1),
+                                                                              context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {}

-array::array(array & M, slice const & s1, slice const & s2) :
-context_(M.data_.getInfo<CL_MEM_CONTEXT>()), data_(M.data_),
-infos_(init_infos(M.dtype(), data_(), s1.size, s2.size, M.start()._1 + M.stride()._1*s1.start, M.start()._2 + M.stride()._2*s2.start,
-       M.stride()._1*s1.stride, M.stride()._2*s2.stride, M.ld()))
+array::array(array & M, slice const & s1, slice const & s2) :  dtype_(M.dtype_), shape_(s1.size, s2.size),
+                                                          start_(M.start_._1 + M.stride_._1*s1.start, M.start_._2 + M.stride_._2*s2.start),
+                                                          stride_(M.stride_._1*s1.stride, M.stride_._2*s2.stride), ld_(M.ld_),
+                                                          context_(M.data_.getInfo<CL_MEM_CONTEXT>()), data_(M.data_)
 { }

 template<typename DT>
-array::array(int_t size1, int_t size2, std::vector<DT> const & data, cl::Context context):
-context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(to_numeric_type<DT>::value)*size1*size2),
-infos_(init_infos(to_numeric_type<DT>::value, data_(), size1, size2, 0, 0, 1, 1, size1))
+array::array(int_t size1, int_t size2, std::vector<DT> const & data, cl::Context context)
+  : dtype_(to_numeric_type<DT>::value),
+    shape_(size1, size2), start_(0, 0), stride_(1, 1), ld_(size1),
+    context_(context), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {
  atidlas::copy(data, *this);
 }

-#define INSTANTIATE(T) template array::array<T>(int_t, int_t, std::vector<T> const &, cl::Context)
+#define INSTANTIATE(T) template array::array(int_t, int_t, std::vector<T> const &, cl::Context)
 INSTANTIATE(cl_char);
 INSTANTIATE(cl_uchar);
 INSTANTIATE(cl_short);
@@ -92,43 +79,45 @@ INSTANTIATE(cl_double);

 // General
 array::array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context):
-context_(context), data_(data),
-infos_(init_infos(dtype, data_(), s1.size, s2.size, s1.start, s2.start, s1.stride, s2.stride, ld))
+  dtype_(dtype), shape_(s1.size, s2.size), start_(s1.start, s2.start), stride_(s1.stride, s2.stride),
+   ld_(ld), context_(context), data_(data)
 { }

-array::array(control const & x):
-context_(x.expression().context()), data_(context_, CL_MEM_READ_WRITE, size_of(x.expression().dtype())*prod(x.expression().shape())),
-infos_(init_infos(x.expression().dtype(), data_(), x.expression().shape()._1, x.expression().shape()._2, 0, 0, 1, 1, x.expression().shape()._1))
+array::array(array_expression const & proxy) :
+  dtype_(proxy.dtype()),
+  shape_(proxy.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
+  context_(proxy.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {
-  *this = x;
+  *this = proxy;
 }

-array::array(array const & x) :
-context_(x.context()), data_(context_, CL_MEM_READ_WRITE, size_of(x.dtype())*x.shape()._1*x.shape()._2),
-infos_(init_infos(x.dtype(), data_(), x.shape()._1, x.shape()._2, 0, 0, 1, 1, x.shape()._1))
+array::array(array const & other) :
+  dtype_(other.dtype()),
+  shape_(other.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
+  context_(other.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
 {
-  *this = x;
+  *this = other;
 }


 /*--- Getters ---*/
 numeric_type array::dtype() const
-{ return infos_.dtype; }
+{ return dtype_; }

 size4 array::shape() const
-{ return size4(infos_.shape1, infos_.shape2); }
+{ return shape_; }

 int_t array::nshape() const
-{ return int_t((infos_.shape1 > 1) + (infos_.shape2 > 1)); }
+{ return int_t((shape_._1 > 1) + (shape_._2 > 1)); }

 size4 array::start() const
-{ return size4(infos_.start1, infos_.start2); }
+{ return start_; }

 size4 array::stride() const
-{ return size4(infos_.stride1, infos_.stride2); }
+{ return stride_; }

 int_t array::ld() const
-{ return infos_.ld; }
+{ return ld_; }

 cl::Context const & array::context() const
 { return context_; }
@@ -137,27 +126,25 @@ cl::Buffer const & array::data() const
 { return data_; }

 int_t array::dsize() const
-{ return infos_.ld*infos_.shape2; }
+{ return ld_*shape_._2; }

 /*--- Assignment Operators ----*/
 //---------------------------------------
 array & array::operator=(array const & rhs)
 {
-  assert(dtype() == rhs.dtype());
-  array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), context_, dtype(), shape());
-  cl::CommandQueue & queue = cl_ext::get_queue(context_, 0);
+  assert(dtype_ == rhs.dtype());
+  array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), context_, dtype_, shape_);
+  cl::CommandQueue & queue = cl_ext::queues[context_][0];
  model_map_t & mmap = atidlas::get_model_map(queue);
  execute(expression, mmap);
  return *this;
 }

-array & array::operator=(control const & x)
+array & array::operator=(array_expression const & rhs)
 {
-  array_expression const & rhs = x.expression();
-
-  assert(dtype() == rhs.dtype());
-  array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), dtype(), shape());
-  cl::CommandQueue & queue = cl_ext::get_queue(context_, 0);
+  assert(dtype_ == rhs.dtype());
+  array_expression expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ASSIGN_TYPE), dtype_, shape_);
+  cl::CommandQueue & queue = cl_ext::queues[context_][0];
  model_map_t & mmap = atidlas::get_model_map(queue);
  execute(expression, mmap);
  return *this;
@@ -186,47 +173,47 @@ INSTANTIATE(cl_double);
 #undef INSTANTIATE

 array_expression array::operator-()
-{ return array_expression(*this, invalid_node(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), context_, dtype(), shape()); }
+{ return array_expression(*this, invalid_node(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), context_, dtype_, shape_); }

 array_expression array::operator!()
-{ return array_expression(*this, invalid_node(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_NEGATE_TYPE), context_, INT_TYPE, shape()); }
+{ return array_expression(*this, invalid_node(), op_element(OPERATOR_UNARY_TYPE_FAMILY, OPERATOR_NEGATE_TYPE), context_, INT_TYPE, shape_); }

 //
 array & array::operator+=(value_scalar const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), context_, dtype_, shape_); }

 array & array::operator+=(array const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), context_, dtype_, shape_); }

 array & array::operator+=(array_expression const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ADD_TYPE), dtype_, shape_); }
 //----
 array & array::operator-=(value_scalar const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), context_, dtype_, shape_); }

 array & array::operator-=(array const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), context_, dtype_, shape_); }

 array & array::operator-=(array_expression const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_SUB_TYPE), dtype_, shape_); }
 //----
 array & array::operator*=(value_scalar const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_MULT_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_MULT_TYPE), context_, dtype_, shape_); }

 array & array::operator*=(array const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_MULT_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_MULT_TYPE), context_, dtype_, shape_); }

 array & array::operator*=(array_expression const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_MULT_TYPE), dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_MULT_TYPE), dtype_, shape_); }
 //----
 array & array::operator/=(value_scalar const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), context_, dtype_, shape_); }

 array & array::operator/=(array const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), context_, dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), context_, dtype_, shape_); }

 array & array::operator/=(array_expression const & rhs)
-{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), dtype(), shape()); }
+{ return *this = array_expression(*this, rhs, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_DIV_TYPE), dtype_, shape_); }

 array_expression array::T() const
 { return atidlas::trans(*this) ;}
@@ -236,13 +223,13 @@ array_expression array::T() const
 scalar array::operator [](int_t idx)
 {
  assert(nshape()==1);
-  return scalar(dtype(), data_, idx, context_);
+  return scalar(dtype_, data_, idx, context_);
 }

 const scalar array::operator [](int_t idx) const
 {
  assert(nshape()==1);
-  return scalar(dtype(), data_, idx, context_);
+  return scalar(dtype_, data_, idx, context_);
 }


@@ -263,7 +250,7 @@ namespace detail
 template<class T>
 void copy(cl::Context & ctx, cl::Buffer const & data, T value)
 {
-  cl_ext::get_queue(ctx, 0).enqueueWriteBuffer(data, CL_TRUE, 0, sizeof(T), (void*)&value);
+  cl_ext::queues[ctx][0].enqueueWriteBuffer(data, CL_TRUE, 0, sizeof(T), (void*)&value);
 }

 }
@@ -273,7 +260,7 @@ scalar::scalar(numeric_type dtype, const cl::Buffer &data, int_t offset, cl::Con

 scalar::scalar(value_scalar value, cl::Context context) : array(1, value.dtype(), context)
 {
-  switch(dtype())
+  switch(dtype_)
  {
 //    case BOOL_TYPE: detail::copy(context_, data_, (cl_bool)value); break;
    case CHAR_TYPE: detail::copy(context_, data_, (cl_char)value); break;
@@ -287,7 +274,7 @@ scalar::scalar(value_scalar value, cl::Context context) : array(1, value.dtype()
 //    case HALF_TYPE: detail::copy(context_, data_, (cl_float)value); break;
    case FLOAT_TYPE: detail::copy(context_, data_, (cl_float)value); break;
    case DOUBLE_TYPE: detail::copy(context_, data_, (cl_double)value); break;
-    default: throw unknown_datatype(dtype());
+    default: throw unknown_datatype(dtype_);
  }
 }

@@ -295,19 +282,19 @@ scalar::scalar(value_scalar value, cl::Context context) : array(1, value.dtype()
 scalar::scalar(numeric_type dtype, cl::Context context) : array(1, dtype, context)
 { }

-scalar::scalar(control const &proxy) : array(proxy){ }
+scalar::scalar(array_expression const & proxy) : array(proxy){ }

 template<class T>
 T scalar::cast() const
 {
  values_holder v;
-  int_t dtsize = size_of(dtype());
+  int_t dtsize = size_of(dtype_);
 #define HANDLE_CASE(DTYPE, VAL) \
 case DTYPE:\
-  cl_ext::get_queue(context_, 0).enqueueReadBuffer(data_, CL_TRUE, infos_.start1*dtsize, dtsize, (void*)&v.VAL);\
+  cl_ext::queues[context_][0].enqueueReadBuffer(data_, CL_TRUE, start_._1*dtsize, dtsize, (void*)&v.VAL);\
  return v.VAL

-  switch(dtype())
+  switch(dtype_)
  {
 //    HANDLE_CASE(BOOL_TYPE, bool8);
    HANDLE_CASE(CHAR_TYPE, int8);
@@ -321,7 +308,7 @@ case DTYPE:\
 //    HANDLE_CASE(HALF_TYPE, float16);
    HANDLE_CASE(FLOAT_TYPE, float32);
    HANDLE_CASE(DOUBLE_TYPE, float64);
-    default: throw unknown_datatype(dtype());
+    default: throw unknown_datatype(dtype_);
  }
 #undef HANDLE_CASE

@@ -329,16 +316,16 @@ case DTYPE:\

 scalar& scalar::operator=(value_scalar const & s)
 {
-  cl::CommandQueue& queue = cl_ext::get_queue(context_, 0);
-  int_t dtsize = size_of(dtype());
+  cl::CommandQueue& queue = cl_ext::queues[context_][0];
+  int_t dtsize = size_of(dtype_);

 #define HANDLE_CASE(TYPE, CLTYPE) case TYPE:\
                            {\
                              CLTYPE v = s;\
-                              queue.enqueueWriteBuffer(data_, CL_TRUE, infos_.start1*dtsize, dtsize, (void*)&v);\
+                              queue.enqueueWriteBuffer(data_, CL_TRUE, start_._1*dtsize, dtsize, (void*)&v);\
                              return *this;\
                            }
-  switch(dtype())
+  switch(dtype_)
  {
 //    HANDLE_CASE(BOOL_TYPE, cl_bool)
    HANDLE_CASE(CHAR_TYPE, cl_char)
@@ -352,7 +339,7 @@ scalar& scalar::operator=(value_scalar const & s)
 //    HANDLE_CASE(HALF_TYPE, cl_half)
    HANDLE_CASE(FLOAT_TYPE, cl_float)
    HANDLE_CASE(DOUBLE_TYPE, cl_double)
-    default: throw unknown_datatype(dtype());
+    default: throw unknown_datatype(dtype_);
  }
 }

@@ -707,18 +694,11 @@ namespace detail
 array reshape(array const & a, int_t size1, int_t size2)
 {
  array tmp(a);
-  tmp.infos_.shape1 = size1;
-  tmp.infos_.shape2 = size2;
+  tmp.shape_._1 = size1;
+  tmp.shape_._2 = size2;
  return tmp;
 }

-array reshape(array_expression const & a, int_t size1, int_t size2)
-{
-  array tmp(a);
-  tmp.infos_.shape1 = size1;
-  tmp.infos_.shape2 = size2;
-  return tmp;
-}

 #define DEFINE_DOT(LTYPE, RTYPE) \
 array_expression dot(LTYPE const & x, RTYPE const & y)\
@@ -804,10 +784,10 @@ void copy(array const & x, void* data, cl::CommandQueue & queue, bool blocking)
 }

 void copy(void const *data, array &x, bool blocking)
-{ copy(data, x, cl_ext::get_queue(x.context(), 0), blocking); }
+{ copy(data, x, cl_ext::queues[x.context()][0], blocking); }

 void copy(array const & x, void* data, bool blocking)
-{ copy(x, data, cl_ext::get_queue(x.context(), 0), blocking); }
+{ copy(x, data, cl_ext::queues[x.context()][0], blocking); }

 //std::vector<>
 template<class T>
@@ -832,11 +812,11 @@ void copy(array const & x, std::vector<T> & cx, cl::CommandQueue & queue, bool b

 template<class T>
 void copy(std::vector<T> const & cx, array & x, bool blocking)
-{ copy(cx, x, cl_ext::get_queue(x.context(), 0), blocking); }
+{ copy(cx, x, cl_ext::queues[x.context()][0], blocking); }

 template<class T>
 void copy(array const & x, std::vector<T> & cx, bool blocking)
-{ copy(x, cx, cl_ext::get_queue(x.context(), 0), blocking); }
+{ copy(x, cx, cl_ext::queues[x.context()][0], blocking); }

 #define INSTANTIATE(T) \
  template void copy<T>(std::vector<T> const &, array &, cl::CommandQueue&, bool);\
--- a/lib/backend/mapped_object.cpp
+++ b/lib/backend/mapped_object.cpp
@@ -74,8 +74,8 @@ std::string mapped_object::process(std::string const & in) const
 {
  std::string res(in);
  preprocess(res);
-  for (std::map<std::string,std::string>::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it)
-    tools::find_and_replace(res, it->first, it->second);
+  for (const auto & elem : keywords_)
+    tools::find_and_replace(res, elem.first, elem.second);
  postprocess(res);
  return res;
 }
--- a/lib/backend/templates/base.cpp
+++ b/lib/backend/templates/base.cpp
@@ -30,44 +30,44 @@ numeric_type base::map_functor::get_numeric_type(atidlas::array_expression const

 /** @brief Binary leaf */
 template<class T>
-tools::shared_ptr<mapped_object> base::map_functor::binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const
+std::shared_ptr<mapped_object> base::map_functor::binary_leaf(atidlas::array_expression const * array_expression, int_t root_idx, mapping_type const * mapping) const
 {
-  return tools::shared_ptr<mapped_object>(new T(numeric_type_to_string(array_expression->dtype()), binder_.get(NULL), mapped_object::node_info(mapping, array_expression, root_idx)));
+  return std::shared_ptr<mapped_object>(new T(numeric_type_to_string(array_expression->dtype()), binder_.get(NULL), mapped_object::node_info(mapping, array_expression, root_idx)));
 }

 /** @brief Scalar mapping */
-tools::shared_ptr<mapped_object> base::map_functor::create(numeric_type dtype, values_holder) const
+std::shared_ptr<mapped_object> base::map_functor::create(numeric_type dtype, values_holder) const
 {
  std::string strdtype = numeric_type_to_string(dtype);
-  return tools::shared_ptr<mapped_object>(new mapped_host_scalar(strdtype, binder_.get(NULL)));
+  return std::shared_ptr<mapped_object>(new mapped_host_scalar(strdtype, binder_.get(NULL)));
 }

 /** @brief Vector mapping */
-tools::shared_ptr<mapped_object> base::map_functor::create(array_infos const & a) const
+std::shared_ptr<mapped_object> base::map_functor::create(array_infos const & a) const
 {
  std::string dtype = numeric_type_to_string(a.dtype);
  unsigned int id = binder_.get(a.data);
  //Scalar
  if(a.shape1==1 && a.shape2==1)
-    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 's'));
+    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 's'));
  //Column vector
  else if(a.shape1>1 && a.shape2==1)
-    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'c'));
+    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'c'));
  //Row vector
  else if(a.shape1==1 && a.shape2>1)
-    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'r'));
+    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'r'));
  //Matrix
  else
-    return tools::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'm'));
+    return std::shared_ptr<mapped_object>(new mapped_array(dtype, id, 'm'));
 }

-tools::shared_ptr<mapped_object> base::map_functor::create(repeat_infos const &) const
+std::shared_ptr<mapped_object> base::map_functor::create(repeat_infos const &) const
 {
  //TODO: Make it less specific!
-  return tools::shared_ptr<mapped_object>(new mapped_tuple("int",binder_.get(NULL),4));
+  return std::shared_ptr<mapped_object>(new mapped_tuple("int",binder_.get(NULL),4));
 }

-tools::shared_ptr<mapped_object> base::map_functor::create(lhs_rhs_element const & lhs_rhs) const
+std::shared_ptr<mapped_object> base::map_functor::create(lhs_rhs_element const & lhs_rhs) const
 {
  switch(lhs_rhs.type_family)
  {
@@ -111,7 +111,7 @@ void base::map_functor::operator()(atidlas::array_expression const & array_expre
    else if (root_node.op.type == OPERATOR_OUTER_PROD_TYPE)
      mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_outer>(&array_expression, root_idx, &mapping_)));
    else if (detail::is_cast(root_node.op))
-      mapping_.insert(mapping_type::value_type(key, tools::shared_ptr<mapped_object>(new mapped_cast(root_node.op.type, binder_.get(NULL)))));
+      mapping_.insert(mapping_type::value_type(key, std::shared_ptr<mapped_object>(new mapped_cast(root_node.op.type, binder_.get(NULL)))));
  }
 }

@@ -219,8 +219,8 @@ void base::compute_index_reduction(kernel_generation_stream & os, std::string ac
 void base::process_all(std::string const & type_key, std::string const & str,
                        kernel_generation_stream & stream, std::vector<mapping_type> const & mappings)
 {
-  for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
-    for (mapping_type::const_iterator mmit = mit->begin(); mmit != mit->end(); ++mmit)
+  for (const auto & mapping : mappings)
+    for (mapping_type::const_iterator mmit = mapping.begin(); mmit != mapping.end(); ++mmit)
      if (mmit->second->type_key()==type_key)
        stream << mmit->second->process(str) << std::endl;
 }
@@ -230,9 +230,9 @@ void base::base::process_all_at(std::string const & type_key, std::string const
                           kernel_generation_stream & stream, std::vector<mapping_type> const & mappings,
                           size_t root_idx, leaf_t leaf)
 {
-  for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
+  for (const auto & mapping : mappings)
  {
-    mapped_object * obj = mit->at(mapping_key(root_idx, leaf)).get();
+    mapped_object * obj = mapping.at(mapping_key(root_idx, leaf)).get();
    if (obj->type_key()==type_key)
      stream << obj->process(str) << std::endl;
  }
@@ -280,9 +280,9 @@ std::string base::generate_arguments(std::string const & data_type, std::vector<

 void base::set_arguments(expressions_tuple const & expressions, cl::Kernel & kernel, unsigned int & current_arg)
 {
-  tools::shared_ptr<symbolic_binder> binder = make_binder();
-  for (expressions_tuple::data_type::const_iterator itt = expressions.data().begin(); itt != expressions.data().end(); ++itt)
-    traverse(**itt, (*itt)->root(), set_arguments_functor(*binder, current_arg, kernel), true);
+  std::shared_ptr<symbolic_binder> binder = make_binder();
+  for (const auto & elem : expressions.data())
+    traverse(*elem, (elem)->root(), set_arguments_functor(*binder, current_arg, kernel), true);
 }

 void base::fill_kernel_name(char * ptr, unsigned int label, const char * suffix)
@@ -372,8 +372,8 @@ bool base::is_strided(array_expression::node const & node)

 bool base::requires_fallback(expressions_tuple const & expressions)
 {
-  for (expressions_tuple::data_type::const_iterator it = expressions.data().begin(); it != expressions.data().end(); ++it)
-    for(array_expression::container_type::const_iterator itt = (*it)->tree().begin(); itt != (*it)->tree().end() ; ++itt)
+  for (const auto & elem : expressions.data())
+    for(array_expression::container_type::const_iterator itt = (elem)->tree().begin(); itt != (elem)->tree().end() ; ++itt)
      if(   (itt->lhs.subtype==DENSE_ARRAY_TYPE && (std::max(itt->lhs.array.stride1, itt->lhs.array.stride2)>1 || std::max(itt->lhs.array.start1,itt->lhs.array.start2)>0))
         || (itt->rhs.subtype==DENSE_ARRAY_TYPE && (std::max(itt->rhs.array.stride1, itt->rhs.array.stride2)>1 || std::max(itt->rhs.array.start1,itt->rhs.array.start2)>0)))
        return true;
@@ -478,12 +478,12 @@ unsigned int base::align(unsigned int to_round, unsigned int base)
  return (to_round + base - 1)/base * base;
 }

-tools::shared_ptr<symbolic_binder> base::make_binder()
+std::shared_ptr<symbolic_binder> base::make_binder()
 {
  if (binding_policy_==BIND_TO_HANDLE)
-    return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
+    return std::shared_ptr<symbolic_binder>(new bind_to_handle());
  else
-    return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
+    return std::shared_ptr<symbolic_binder>(new bind_all_unique());
 }


@@ -509,7 +509,7 @@ std::vector<std::string> base::generate(unsigned int label, expressions_tuple co

  //Create mapping
  std::vector<mapping_type> mappings(expressions.data().size());
-  tools::shared_ptr<symbolic_binder> binder = make_binder();
+  std::shared_ptr<symbolic_binder> binder = make_binder();
  for (mit = mappings.begin(), sit = expressions.data().begin(); sit != expressions.data().end(); ++sit, ++mit)
    traverse(**sit, (*sit)->root(), map_functor(*binder,*mit), true);

@@ -533,8 +533,8 @@ int_t base_impl<TType, PType>::local_size_1() const
 { return p_.local_size_1; }

 template<class TType, class PType>
-tools::shared_ptr<base> base_impl<TType, PType>::clone() const
-{ return tools::shared_ptr<base>(new TType(*dynamic_cast<TType const *>(this))); }
+std::shared_ptr<base> base_impl<TType, PType>::clone() const
+{ return std::shared_ptr<base>(new TType(*dynamic_cast<TType const *>(this))); }

 template<class TType, class PType>
 int base_impl<TType, PType>::check_invalid(expressions_tuple const & expressions, cl::Device const & device) const
--- a/lib/backend/templates/mproduct.cpp
+++ b/lib/backend/templates/mproduct.cpp
@@ -1,6 +1,6 @@
 #include "atidlas/array.h"
 #include "atidlas/backend/templates/mproduct.h"
-#include "atidlas/cl/lazy_compiler.h"
+#include "atidlas/cl_ext/lazy_compiler.h"
 #include "atidlas/tools/make_vector.hpp"
 #include "atidlas/tools/to_string.hpp"

@@ -587,7 +587,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
    kernel.setArg(current_arg++, cl_uint(N));
    kernel.setArg(current_arg++, cl_uint(K));

-    tools::shared_ptr<symbolic_binder> binder = make_binder();
+    std::shared_ptr<symbolic_binder> binder = make_binder();
    set_arguments_functor fun(*binder, current_arg, kernel);
    fun.set_arguments(C);
    fun.set_arguments(alpha.dtype(), alpha.values());
--- a/lib/backend/templates/mreduction.cpp
+++ b/lib/backend/templates/mreduction.cpp
@@ -50,8 +50,8 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
                                                                            ("array2", "#pointer += #start1 + #start2*#ld; "
                                                                                       "#ld *= #nldstride; "), expressions, mappings);

-  for (std::vector<mapped_mreduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
-    stream << (*it)->process("__local #scalartype #name_buf[" + to_string(lsize0*lsize1) + "];") << std::endl;
+  for (const auto & expr : exprs)
+    stream << (expr)->process("__local #scalartype #name_buf[" + to_string(lsize0*lsize1) + "];") << std::endl;

  stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
  stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
@@ -59,8 +59,8 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
  stream << "for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl;
  stream.inc_tab();

-  for (std::vector<mapped_mreduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
-    stream << (*it)->process("#scalartype #name_acc = " + neutral_element((*it)->root_op()) + ";") << std::endl;
+  for (const auto & expr : exprs)
+    stream << (expr)->process("#scalartype #name_acc = " + neutral_element((expr)->root_op()) + ";") << std::endl;

  stream << "if (r < M)" << std::endl;
  stream << "{" << std::endl;
@@ -75,7 +75,7 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
    {
      std::string data_type = append_width("#scalartype",simd_width);

-      for (std::vector<mapped_mreduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
+      for (const auto & elem : exprs)
      {
        std::map<std::string, std::string> accessors;
        if(reduction==REDUCE_COLUMNS)
@@ -88,7 +88,7 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
          accessors["array2"] = "#scalartype #namereg = #pointer[r*#stride1 + c*#ld];";
          accessors["repeat"] = "#scalartype #namereg = $VALUE{(r%#tuplearg0)*#stride, (c%#tuplearg1)*#stride};";
        }
-        (*it)->process_recursive(stream, PARENT_NODE_TYPE, accessors);
+        (elem)->process_recursive(stream, PARENT_NODE_TYPE, accessors);
      }


@@ -101,7 +101,7 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
          str[a] = append_simd_suffix("#namereg.s",a);


-      for (unsigned int k = 0; k < exprs.size(); ++k)
+      for (auto & elem : exprs)
      {
        for (unsigned int a = 0; a < simd_width; ++a)
        {
@@ -109,11 +109,11 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
          accessors["array2"] = str[a];
          accessors["repeat"] = "#namereg";
          accessors["array0"] = "#namereg";
-          std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
-          if (exprs[k]->is_index_reduction())
-            compute_index_reduction(stream, exprs[k]->process("#name_acc"), "c*"+to_string(simd_width) + to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
+          std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, accessors);
+          if (elem->is_index_reduction())
+            compute_index_reduction(stream, elem->process("#name_acc"), "c*"+to_string(simd_width) + to_string(a), elem->process("#name_acc_value"), value,elem->root_op());
          else
-            compute_reduction(stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
+            compute_reduction(stream, elem->process("#name_acc"), value,elem->root_op());
        }
      }
    }
@@ -126,8 +126,8 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
  stream.dec_tab();
  stream << "}" << std::endl;

-  for (unsigned int k = 0; k < exprs.size(); ++k)
-    stream << exprs[k]->process("#name_buf[lid0*" + lsize1str + "+ lid1] = #name_acc;") << std::endl;
+  for (auto & expr : exprs)
+    stream << expr->process("#name_buf[lid0*" + lsize1str + "+ lid1] = #name_acc;") << std::endl;

  stream << "#pragma unroll" << std::endl;
  stream << "for(unsigned int stride = " << p_.local_size_1/2 << "; stride >0; stride /=2)" << std::endl;
@@ -139,13 +139,13 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons
  stream << "{" << std::endl;
  stream.inc_tab();

-  for (unsigned int k = 0; k < exprs.size(); k++)
-    if (exprs[k]->is_index_reduction())
-      compute_index_reduction(stream, exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]")
-                              , exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1 + stride]"),
-                              exprs[k]->root_op());
+  for (auto & expr : exprs)
+    if (expr->is_index_reduction())
+      compute_index_reduction(stream, expr->process("#name_buf[lid0*" + lsize1str + " + lid1]"), expr->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]")
+                              , expr->process("#name_buf_value[lid0*" + lsize1str + " + lid1]"), expr->process("#name_buf_value[lid0*" + lsize1str + " + lid1 + stride]"),
+                              expr->root_op());
    else
-      compute_reduction(stream,exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]"), exprs[k]->root_op());
+      compute_reduction(stream,expr->process("#name_buf[lid0*" + lsize1str + " + lid1]"), expr->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]"), expr->root_op());

  stream.dec_tab();
  stream << "}" << std::endl;
@@ -183,8 +183,8 @@ std::vector<std::string> mreduction::generate_impl(unsigned int label, expressio
  {
    array_expression const & first_expression = *expressions.data().front();
    std::vector<size_t> idx = filter_nodes(&is_reduction, first_expression, false);
-    for (unsigned int j = 0; j < idx.size(); ++j)
-      exprs.push_back((mapped_mreduction*)(mit->at(mapping_key(idx[j], PARENT_NODE_TYPE)).get()));
+    for (auto & elem : idx)
+      exprs.push_back((mapped_mreduction*)(mit->at(mapping_key(elem, PARENT_NODE_TYPE)).get()));
  }

  std::vector<std::string> res;
--- a/lib/backend/templates/reduction.cpp
+++ b/lib/backend/templates/reduction.cpp
@@ -16,9 +16,9 @@ reduction_parameters::reduction_parameters(unsigned int _simd_width,
 unsigned int reduction::lmem_usage(expressions_tuple const & expressions) const
 {
  unsigned int res = 0;
-  for(expressions_tuple::data_type::const_iterator it = expressions.data().begin() ; it != expressions.data().end() ; ++it)
+  for(const auto & elem : expressions.data())
  {
-    numeric_type numeric_t= lhs_most((*it)->tree(), (*it)->root()).lhs.dtype;
+    numeric_type numeric_t= lhs_most((elem)->tree(), (elem)->root()).lhs.dtype;
    res += p_.local_size_0*size_of(numeric_t);
  }
  return res;
@@ -43,13 +43,13 @@ inline void reduction::reduce_1d_local_memory(kernel_generation_stream & stream,
  stream << "{" << std::endl;
  stream.inc_tab();

-  for (unsigned int k = 0; k < exprs.size(); k++)
-    if (exprs[k]->is_index_reduction())
-      compute_index_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]")
-                              , exprs[k]->process(buf_value_str+"[lid]"), exprs[k]->process(buf_value_str+"[lid+stride]"),
-                              exprs[k]->root_op());
+  for (auto & expr : exprs)
+    if (expr->is_index_reduction())
+      compute_index_reduction(stream, expr->process(buf_str+"[lid]"), expr->process(buf_str+"[lid+stride]")
+                              , expr->process(buf_value_str+"[lid]"), expr->process(buf_value_str+"[lid+stride]"),
+                              expr->root_op());
    else
-      compute_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]"), exprs[k]->root_op());
+      compute_reduction(stream, expr->process(buf_str+"[lid]"), expr->process(buf_str+"[lid+stride]"), expr->root_op());
  stream.dec_tab();
  stream << "}" << std::endl;
  stream.dec_tab();
@@ -61,8 +61,8 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr
  kernel_generation_stream stream;

  std::vector<mapped_scalar_reduction*> exprs;
-  for (std::vector<mapping_type>::const_iterator it = mappings.begin(); it != mappings.end(); ++it)
-    for (mapping_type::const_iterator iit = it->begin(); iit != it->end(); ++iit)
+  for (const auto & mapping : mappings)
+    for (mapping_type::const_iterator iit = mapping.begin(); iit != mapping.end(); ++iit)
      if (mapped_scalar_reduction * p = dynamic_cast<mapped_scalar_reduction*>(iit->second.get()))
        exprs.push_back(p);
  std::size_t N = exprs.size();
@@ -122,8 +122,8 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr
    {
      std::string i = (simd_width==1)?"i*#stride":"i";
      //Fetch vector entry
-      for (std::vector<mapped_scalar_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
-        (*it)->process_recursive(stream, PARENT_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array1",  append_width("#scalartype",simd_width) + " #namereg = " + vload(simd_width,i,"#pointer")+";")
+      for (const auto & elem : exprs)
+        (elem)->process_recursive(stream, PARENT_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array1",  append_width("#scalartype",simd_width) + " #namereg = " + vload(simd_width,i,"#pointer")+";")
                                                                                         ("matrix_row",  "#scalartype #namereg = #pointer[$OFFSET{#row*#stride, i*#stride2}];")
                                                                                         ("matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride,#column*#stride2}];")
                                                                                         ("matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride, i*#stride2}:$OFFSET{i*#stride, (i + #diag_offset)*#stride2}];"));
@@ -137,7 +137,7 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr
        for (unsigned int a = 0; a < simd_width; ++a)
          str[a] = append_simd_suffix("#namereg.s", a);

-      for (unsigned int k = 0; k < exprs.size(); ++k)
+      for (auto & elem : exprs)
      {
        for (unsigned int a = 0; a < simd_width; ++a)
        {
@@ -147,12 +147,12 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr
          accessors["matrix_column"] = str[a];
          accessors["matrix_diag"] = str[a];
          accessors["array0"] = "#namereg";
-          std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
-          if (exprs[k]->is_index_reduction())
-            compute_index_reduction(stream, exprs[k]->process("#name_acc"),  "i*" + tools::to_string(simd_width) + "+"
-                                    + tools::to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
+          std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, accessors);
+          if (elem->is_index_reduction())
+            compute_index_reduction(stream, elem->process("#name_acc"),  "i*" + tools::to_string(simd_width) + "+"
+                                    + tools::to_string(a), elem->process("#name_acc_value"), value,elem->root_op());
          else
-            compute_reduction(stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
+            compute_reduction(stream, elem->process("#name_acc"), value,elem->root_op());
        }
      }
    }
@@ -286,11 +286,11 @@ void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compi
  //Preprocessing
  int_t size = input_sizes(expressions)[0];
  std::vector<array_expression::node const *> reductions;
-  for (expressions_tuple::data_type::const_iterator it = expressions.data().begin(); it != expressions.data().end(); ++it)
+  for (const auto & elem : expressions.data())
  {
-    std::vector<size_t> reductions_idx = filter_nodes(&is_reduction, **it, false);
-    for (std::vector<size_t>::iterator itt = reductions_idx.begin(); itt != reductions_idx.end(); ++itt)
-      reductions.push_back(&(*it)->tree()[*itt]);
+    std::vector<size_t> reductions_idx = filter_nodes(&is_reduction, *elem, false);
+    for (auto & reductions_idx_itt : reductions_idx)
+      reductions.push_back(&(elem)->tree()[reductions_idx_itt]);
  }

  //Kernel
@@ -314,10 +314,10 @@ void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compi
  cl::Context context = expressions.context();
  array_expression const & s = *(expressions.data().front());
  unsigned int dtype_size = size_of(lhs_most(s.tree(), s.root()).lhs.dtype);
-  for (unsigned int k = 0; k < 2; k++)
+  for (auto & kernel : kernels)
  {
    unsigned int n_arg = 0;
-    kernels[k].setArg(n_arg++, cl_uint(size));
+    kernel.setArg(n_arg++, cl_uint(size));

    //Temporary buffers
    unsigned int i = 0;
@@ -328,15 +328,15 @@ void reduction::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compi
      {
        if (tmpidx_.size() <= j)
          tmpidx_.push_back(cl::Buffer(context, CL_MEM_READ_WRITE, p_.num_groups*4));
-        kernels[k].setArg(n_arg++, tmpidx_[j]);
+        kernel.setArg(n_arg++, tmpidx_[j]);
        j++;
      }
      if (tmp_.size() <= i)
        tmp_.push_back(cl::Buffer(context, CL_MEM_READ_WRITE, p_.num_groups*dtype_size));
-      kernels[k].setArg(n_arg++, tmp_[i]);
+      kernel.setArg(n_arg++, tmp_[i]);
      i++;
    }
-    set_arguments(expressions, kernels[k], n_arg);
+    set_arguments(expressions, kernel, n_arg);
  }

  for (unsigned int k = 0; k < 2; k++)
--- a/lib/backend/templates/vaxpy.cpp
+++ b/lib/backend/templates/vaxpy.cpp
@@ -1,5 +1,5 @@
 #include "atidlas/backend/templates/vaxpy.h"
-#include "atidlas/cl/queues.h"
+#include "atidlas/cl_ext/backend.h"
 #include "atidlas/tools/make_map.hpp"
 #include "atidlas/tools/make_vector.hpp"
 #include "atidlas/tools/to_string.hpp"
@@ -121,8 +121,8 @@ void vaxpy::enqueue(cl::CommandQueue & queue, std::vector<cl_ext::lazy_compiler>
  bool fallback = p_.simd_width > 1 && (requires_fallback(expressions) || (size%p_.simd_width>0));

  cl::Program const & program = programs[fallback?0:1].program();
-  cl_ext::kernels_t::key_type key(program(), label);
-  cl_ext::kernels_t::iterator it = cl_ext::kernels.find(key);
+  cl_ext::kernels_type::key_type key(program(), label);
+  cl_ext::kernels_type::iterator it = cl_ext::kernels.find(key);
  if(it==cl_ext::kernels.end())
    it = cl_ext::kernels.insert(std::make_pair(key, cl::Kernel(program, fallback?kfb:kopt))).first;
  cl::Kernel & kernel = it->second;
--- a/lib/cl/queues.cpp
+++ b/lib/cl/queues.cpp
@@ -1,63 +0,0 @@
-#include "atidlas/cl/queues.h"
-#include <assert.h>
-#include <stdexcept>
-
-namespace atidlas
-{
-
-namespace cl_ext
-{
-
-cl_command_queue_properties queue_properties = 0;
-unsigned int default_context_idx = 0;
-queues_t queues;
-kernels_t kernels;
-
-void synchronize(cl::Context const & context)
-{
-  std::vector<cl::CommandQueue> & q = get_queues(context);
-  for(std::vector<cl::CommandQueue>::iterator it = q.begin() ; it != q.end() ; ++it)
-    it->finish();
-}
-
-void init_queues()
-{
-    std::vector<cl::Platform> platforms;
-    cl::Platform::get(&platforms);
-    for(std::vector<cl::Platform>::iterator it = platforms.begin() ; it != platforms.end() ; ++it)
-    {
-      std::vector<cl::Device> devices;
-      it->getDevices(CL_DEVICE_TYPE_ALL, &devices);
-      for(std::vector<cl::Device>::iterator itt = devices.begin() ; itt != devices.end() ; ++itt)
-        queues.push_back(std::make_pair(cl::Context(std::vector<cl::Device>(1, *itt)), std::vector<cl::CommandQueue>()));
-    }
-    for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it)
-      it->second.push_back(cl::CommandQueue(it->first, it->first.getInfo<CL_CONTEXT_DEVICES>()[0], queue_properties));
-}
-
-cl::Context default_context()
-{
-  if(queues.empty())
-    init_queues();
-  return queues.begin()->first;
-}
-
-std::vector<cl::CommandQueue> & get_queues(cl::Context const & ctx)
-{
-  if(queues.empty())
-    init_queues();
-  for(queues_t::iterator it = queues.begin() ; it != queues.end() ; ++it)
-    if(it->first()==ctx()) return it->second;
-  throw std::out_of_range("No such context registered in the backend. Please run atidlas::cl_ext:;register(context, queues)");
-}
-
-cl::CommandQueue & get_queue(cl::Context const & ctx, std::size_t idx)
-{
-  return get_queues(ctx)[idx];
-}
-
-
-
-}
-
-}
--- a/lib/cl_ext/backend.cpp
+++ b/lib/cl_ext/backend.cpp
@@ -0,0 +1,78 @@
+#include "atidlas/cl_ext/backend.h"
+#include <assert.h>
+#include <stdexcept>
+
+namespace atidlas
+{
+
+namespace cl_ext
+{
+
+void synchronize(cl::Context const & context)
+{
+  for(std::vector<cl::CommandQueue>::const_iterator it = queues[context].begin() ; it != queues[context].end() ; ++it)
+    it->finish();
+}
+
+void queues_type::append(const cl::Context & context)
+{
+  data_.push_back(std::make_pair(context, std::vector<cl::CommandQueue>()));
+  std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+  for(auto & device : devices)
+    data_.back().second.push_back(cl::CommandQueue(context, device, queue_properties));
+}
+
+void queues_type::init()
+{
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+    for(auto & platform : platforms)
+    {
+      std::vector<cl::Device> devices;
+      platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+      for(auto & device : devices)
+        data_.push_back(std::make_pair(cl::Context(std::vector<cl::Device>(1, device)), std::vector<cl::CommandQueue>()));
+    }
+    for(auto & elem : data_)
+      elem.second.push_back(cl::CommandQueue(elem.first, elem.first.getInfo<CL_CONTEXT_DEVICES>()[0], queue_properties));
+}
+
+std::vector<cl::CommandQueue> & queues_type::operator [](cl::Context const & ctx)
+{
+  if(data_.empty())
+    init();
+  for(auto & elem : data_)
+    if(elem.first()==ctx()) return elem.second;
+  append(ctx);
+  return data_.back().second;
+}
+
+cl::Context queues_type::default_context()
+{
+  if(data_.empty())
+    init();
+  data_type::iterator it = data_.begin();
+  std::advance(it, default_context_idx);
+  return it->first;
+}
+
+cl::Context default_context()
+{
+  return queues.default_context();
+}
+
+queues_type::data_type const & queues_type::data()
+{
+  if(data_.empty())
+    init();
+  return data_;
+}
+
+cl_command_queue_properties queue_properties = 0;
+unsigned int default_context_idx = 0;
+queues_type queues;
+kernels_type kernels;
+
+}
+
+}
--- a/lib/cl_ext/lazy_compiler.cpp
+++ b/lib/cl_ext/lazy_compiler.cpp
@@ -1,4 +1,4 @@
-#include "atidlas/cl/lazy_compiler.h"
+#include "atidlas/cl_ext/lazy_compiler.h"

 namespace atidlas
 {
--- a/lib/cl_ext/program_map.cpp
+++ b/lib/cl_ext/program_map.cpp
@@ -3,7 +3,7 @@
 #include <iostream>
 #include <fstream>

-#include "atidlas/cl/program_map.h"
+#include "atidlas/cl_ext/program_map.h"
 #include "sha1.hpp"

 namespace atidlas
--- a/lib/cl_ext/sha1.hpp
+++ b/lib/cl_ext/sha1.hpp
@@ -213,8 +213,8 @@ inline std::string sha1(std::string const & src)
  sha1.getDigest(hash);

  std::ostringstream oss;
-  for (int i = 0; i < 5; ++i)
-    oss << std::hex << std::setfill('0') << std::setw(8) << hash[i];
+  for (auto & elem : hash)
+    oss << std::hex << std::setfill('0') << std::setw(8) << elem;

  return oss.str();
 }
--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -1,6 +1,7 @@
 #include <set>
 #include <fstream>
 #include <stdexcept>
+#include <algorithm>

 #include "rapidjson/document.h"
 #include "atidlas/backend/parse.h"
@@ -39,8 +40,8 @@ void model::fill_program_name(char* program_name, expressions_tuple const & expr
    binder = new bind_to_handle();
  else
    binder = new bind_all_unique();
-  for (expressions_tuple::data_type::const_iterator it = expressions.data().begin(); it != expressions.data().end(); ++it)
-    traverse(**it, (*it)->root(), array_expression_representation_functor(*binder, program_name),true);
+  for (const auto & elem : expressions.data())
+    traverse(*elem, elem->root(), array_expression_representation_functor(*binder, program_name),true);
  *program_name='\0';
  delete binder;
 }
@@ -80,11 +81,11 @@ std::vector<cl_ext::lazy_compiler>& model::init(expressions_tuple const & expres
  return to_init;
 }

-model::model(predictors::random_forest const & predictor, std::vector< tools::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :
+model::model(predictors::random_forest const & predictor, std::vector< std::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :
  templates_(templates), predictor_(new predictors::random_forest(predictor)), queue_(queue)
 {}

-model::model(std::vector< tools::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :  templates_(templates), queue_(queue)
+model::model(std::vector< std::shared_ptr<base> > const & templates, cl::CommandQueue & queue) :  templates_(templates), queue_(queue)
 {}

 model::model(base const & tp, cl::CommandQueue & queue) : templates_(1,tp.clone()), queue_(queue)
@@ -166,27 +167,27 @@ namespace detail
    throw std::invalid_argument("Invalid datatype: " + name);
  }

-  static tools::shared_ptr<base> create(std::string const & template_name, std::vector<int> const & a)
+  static std::shared_ptr<base> create(std::string const & template_name, std::vector<int> const & a)
  {
    fetching_policy_type fetch[] = {FETCH_FROM_LOCAL, FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_CONTIGUOUS};
    if(template_name=="vaxpy")
-      return tools::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
+      return std::shared_ptr<base>(new vaxpy(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="dot")
-      return tools::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
+      return std::shared_ptr<base>(new reduction(a[0], a[1], a[2], fetch[a[3]]));
    else if(template_name=="maxpy")
-      return tools::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
+      return std::shared_ptr<base>(new maxpy(a[0], a[1], a[2], a[3], a[4], fetch[a[5]]));
    else if(template_name.find("gemvN")!=std::string::npos)
-      return tools::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
+      return std::shared_ptr<base>(new mreduction_rows(a[0], a[1], a[2], a[3], fetch[a[4]]));
    else if(template_name.find("gemvT")!=std::string::npos)
-      return tools::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
+      return std::shared_ptr<base>(new mreduction_cols(a[0], a[1], a[2], a[3], fetch[a[4]]));
    else if(template_name.find("gemmNN")!=std::string::npos)
-      return tools::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return std::shared_ptr<base>(new mproduct_nn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmTN")!=std::string::npos)
-      return tools::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return std::shared_ptr<base>(new mproduct_tn(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmNT")!=std::string::npos)
-      return tools::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return std::shared_ptr<base>(new mproduct_nt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else if(template_name.find("gemmTT")!=std::string::npos)
-      return tools::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
+      return std::shared_ptr<base>(new mproduct_tt(a[0], a[1], a[2], a[3], a[4], a[5], a[6], fetch[a[7]], fetch[a[8]], a[9], a[10]));
    else
      throw std::invalid_argument("Invalid expression: " + template_name);
  }
@@ -208,32 +209,32 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
  //Deserialize
  std::vector<std::string> operations = tools::make_vector<std::string>() << "vaxpy" << "dot"  << "maxpy" << "gemvN" << "gemvT"  << "gemmNN" << "gemmTN" << "gemmTT";
  std::vector<std::string> dtype = tools::make_vector<std::string>() << "float32" << "float64";
-  for(std::vector<std::string>::iterator op = operations.begin() ; op != operations.end() ; ++op)
+  for(auto & operation : operations)
  {
-    const char * opcstr = op->c_str();
+    const char * opcstr = operation.c_str();
    if(document.HasMember(opcstr))
    {
-      expression_type etype = detail::get_expression_type(*op);
-      for(std::vector<std::string>::iterator dt = dtype.begin() ; dt != dtype.end() ; ++dt)
+      expression_type etype = detail::get_expression_type(operation);
+      for(auto & elem : dtype)
      {
-        const char * dtcstr = dt->c_str();
+        const char * dtcstr = elem.c_str();
        if(document[opcstr].HasMember(dtcstr))
        {
-          numeric_type dtype = detail::get_dtype(*dt);
+          numeric_type dtype = detail::get_dtype(elem);

          // Get profiles
-          std::vector<tools::shared_ptr<base> > templates;
+          std::vector<std::shared_ptr<base> > templates;
          js::Value const & profiles = document[opcstr][dtcstr]["profiles"];
          for (js::SizeType id = 0 ; id < profiles.Size() ; ++id)
-            templates.push_back(detail::create(*op, tools::to_int_array<int>(profiles[id])));
+            templates.push_back(detail::create(operation, tools::to_int_array<int>(profiles[id])));
          if(templates.size()>1)
          {
            // Get predictor
            predictors::random_forest predictor(document[opcstr][dtcstr]["predictor"]);
-            result[std::make_pair(etype, dtype)] = tools::shared_ptr<model>(new model(predictor, templates, queue));
+            result[std::make_pair(etype, dtype)] = std::shared_ptr<model>(new model(predictor, templates, queue));
          }
          else
-            result[std::make_pair(etype, dtype)] = tools::shared_ptr<model>(new model(templates, queue));
+            result[std::make_pair(etype, dtype)] = std::shared_ptr<model>(new model(templates, queue));
        }
      }
    }
@@ -243,11 +244,11 @@ void import(std::string const & fname, cl::CommandQueue & queue, model_map_t& re
 model_map_t init_models(cl::CommandQueue & queue)
 {
  model_map_t res;
-  typedef tools::shared_ptr<model> ptr_t;
+  typedef std::shared_ptr<model> ptr_t;
  numeric_type types[] = {CHAR_TYPE, UCHAR_TYPE, SHORT_TYPE, USHORT_TYPE, INT_TYPE, UINT_TYPE, LONG_TYPE, ULONG_TYPE, FLOAT_TYPE, DOUBLE_TYPE};

-  for(size_t i = 0 ; i < 10 ; ++i){
-    numeric_type DTYPE = types[i];
+  for(auto DTYPE : types){
+    
    res[std::make_pair(SCALAR_AXPY_TYPE, DTYPE)] = ptr_t(new model(vaxpy(1,64,128,FETCH_FROM_GLOBAL_STRIDED), queue));
    res[std::make_pair(VECTOR_AXPY_TYPE, DTYPE)] = ptr_t (new model(vaxpy(1,64,128,FETCH_FROM_GLOBAL_STRIDED), queue));
    res[std::make_pair(REDUCTION_TYPE, DTYPE)] = ptr_t(new model(reduction(1,64,128,FETCH_FROM_GLOBAL_STRIDED), queue));
--- a/lib/model/predictors/random_forest.cpp
+++ b/lib/model/predictors/random_forest.cpp
@@ -38,9 +38,9 @@ random_forest::random_forest(rapidjson::Value const & estimators)
 std::vector<float> random_forest::predict(std::vector<int_t> const & x) const
 {
  std::vector<float> res(D_, 0);
-  for(std::vector<tree>::const_iterator it = estimators_.begin() ; it != estimators_.end() ; ++it)
+  for(const auto & elem : estimators_)
  {
-    std::vector<float> const & subres = it->predict(x);
+    std::vector<float> const & subres = elem.predict(x);
    for(int_t i = 0 ; i < D_ ; ++i)
      res[i] += subres[i];
  }
--- a/lib/symbolic/execute.cpp
+++ b/lib/symbolic/execute.cpp
@@ -172,30 +172,30 @@ namespace atidlas

    /*----Parse required temporaries-----*/
    detail::parse(tree, rootidx, current_type, breakpoints, final_type);
-    std::vector<tools::shared_ptr<obj_base> > temporaries_;
+    std::vector<std::shared_ptr<array> > temporaries_;

    /*----Compute required temporaries----*/
    for(detail::breakpoints_t::reverse_iterator rit = breakpoints.rbegin() ; rit != breakpoints.rend() ; ++rit)
    {
-      tools::shared_ptr<model> const & pmodel = models[std::make_pair(rit->first, dtype)];
+      std::shared_ptr<model> const & pmodel = models[std::make_pair(rit->first, dtype)];
      array_expression::node const & node = tree[rit->second->node_index];
      array_expression::node const & lmost = lhs_most(tree, node);

      //Creates temporary
-      tools::shared_ptr<obj_base> tmp;
+      std::shared_ptr<array> tmp;
      switch(rit->first){
        case SCALAR_AXPY_TYPE:
-        case REDUCTION_TYPE:           tmp = tools::shared_ptr<obj_base>(new array(1, dtype, context));                                                        break;
+        case REDUCTION_TYPE:           tmp = std::shared_ptr<array>(new array(1, dtype, context));                                                        break;

-        case VECTOR_AXPY_TYPE:         tmp = tools::shared_ptr<obj_base>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
-        case ROW_WISE_REDUCTION_TYPE:  tmp = tools::shared_ptr<obj_base>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
-        case COL_WISE_REDUCTION_TYPE:  tmp = tools::shared_ptr<obj_base>(new array(lmost.lhs.array.shape2, dtype, context));                              break;
+        case VECTOR_AXPY_TYPE:         tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
+        case ROW_WISE_REDUCTION_TYPE:  tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, dtype, context));                              break;
+        case COL_WISE_REDUCTION_TYPE:  tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape2, dtype, context));                              break;

-        case MATRIX_AXPY_TYPE:         tmp = tools::shared_ptr<obj_base>(new array(lmost.lhs.array.shape1, lmost.lhs.array.shape2, dtype, context)); break;
-        case MATRIX_PRODUCT_NN_TYPE:   tmp = tools::shared_ptr<obj_base>(new array(node.lhs.array.shape1, node.rhs.array.shape2, dtype, context));   break;
-        case MATRIX_PRODUCT_NT_TYPE:   tmp = tools::shared_ptr<obj_base>(new array(node.lhs.array.shape1, node.rhs.array.shape1, dtype, context));   break;
-        case MATRIX_PRODUCT_TN_TYPE:   tmp = tools::shared_ptr<obj_base>(new array(node.lhs.array.shape2, node.rhs.array.shape2, dtype, context));   break;
-        case MATRIX_PRODUCT_TT_TYPE:   tmp = tools::shared_ptr<obj_base>(new array(node.lhs.array.shape2, node.rhs.array.shape1, dtype, context));   break;
+        case MATRIX_AXPY_TYPE:         tmp = std::shared_ptr<array>(new array(lmost.lhs.array.shape1, lmost.lhs.array.shape2, dtype, context)); break;
+        case MATRIX_PRODUCT_NN_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape2, dtype, context));   break;
+        case MATRIX_PRODUCT_NT_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape1, node.rhs.array.shape1, dtype, context));   break;
+        case MATRIX_PRODUCT_TN_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape2, dtype, context));   break;
+        case MATRIX_PRODUCT_TT_TYPE:   tmp = std::shared_ptr<array>(new array(node.lhs.array.shape2, node.rhs.array.shape1, dtype, context));   break;

        default: throw std::invalid_argument("Unrecognized operation");
      }
--- a/lib/symbolic/expression.cpp
+++ b/lib/symbolic/expression.cpp
@@ -176,9 +176,9 @@ array_expression array_expression::operator!()


 //
-tools::shared_ptr<array_expression> expressions_tuple::create(array_expression const & s)
+std::shared_ptr<array_expression> expressions_tuple::create(array_expression const & s)
 {
-  return tools::shared_ptr<array_expression>(new array_expression(static_cast<array_expression const &>(s)));
+  return std::shared_ptr<array_expression>(new array_expression(static_cast<array_expression const &>(s)));
 }

 expressions_tuple::expressions_tuple(data_type const & data, order_type order) : data_(data), order_(order)
--- a/python/pyatidlas/setup.py
+++ b/python/pyatidlas/setup.py
@@ -45,7 +45,7 @@ def main():
        return optlist

    cvars = sysconfig.get_config_vars()
-    cvars['OPT'] = "-DNDEBUG -O3 " + str.join(' ', remove_prefixes(cvars['OPT'].split(), ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG']))
+    cvars['OPT'] = "-DNDEBUG -O3 -std=c++11 " + str.join(' ', remove_prefixes(cvars['OPT'].split(), ['-g', '-O', '-Wstrict-prototypes', '-DNDEBUG']))
    cvars["CFLAGS"] = cvars["BASECFLAGS"] + " " + cvars["OPT"]
    cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
    
--- a/python/pyatidlas/src/_atidlas.cpp
+++ b/python/pyatidlas/src/_atidlas.cpp
@@ -1,4 +1,5 @@
 #include <list>
+#include <functional>

 #include <boost/python.hpp>
 #include <boost/python/suite/indexing/vector_indexing_suite.hpp>
@@ -190,9 +191,6 @@ namespace detail
    return to_list(devices.begin(), devices.end());
  }

-  std::vector<cl::CommandQueue> & get_queue(cl::Context const & ctx)
-  { return atd::cl_ext::get_queues(ctx); }
-
  atd::numeric_type extract_dtype(bp::object const & odtype)
  {
      std::string name = bp::extract<std::string>(odtype.attr("__class__").attr("__name__"))();
@@ -352,7 +350,7 @@ void export_cl()
      .def("__init__", bp::make_constructor(&detail::make_context))
    #define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_context_info<NAME>)
    #undef WRAP
-      .add_property("queues", bp::make_function(&detail::get_queue, bp::return_internal_reference<>()))
+      .add_property("queues", bp::make_function(static_cast<std::vector<cl::CommandQueue> & (*)(const cl::Context&)>( [](const cl::Context & ctx) -> std::vector<cl::CommandQueue> & { return atd::cl_ext::queues[ctx]; }) , bp::return_internal_reference<>()))
      ;

  bp::class_<cl::CommandQueue>("command_queue", bp::init<cl::Context, cl::Device>())
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,5 +3,5 @@ foreach(PROG maxpy vaxpy reduction mreduction mproduct)
     add_executable(${PROG}-test ${PROG}.cpp)
     add_test(${PROG} ${PROG}-test)
     target_link_libraries(${PROG}-test atidlas ${OPENCL_LIBRARIES})
-     set_target_properties(${PROG}-test PROPERTIES COMPILE_FLAGS "-Wall -Wextra")
+     set_target_properties(${PROG}-test PROPERTIES COMPILE_FLAGS "-Wall -Wextra -std=c++11")
 endforeach(PROG)
--- a/tests/maxpy.cpp
+++ b/tests/maxpy.cpp
@@ -125,15 +125,15 @@ void test_impl(T epsilon, cl::Context const & ctx)

 int main()
 {
-  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
+  for(const auto & elem : ad::cl_ext::queues.data())
  {
-    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = elem.second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
-    test_impl<float>(1e-4, it->first);
+    test_impl<float>(1e-4, elem.first);
    std::cout << ">> double" << std::endl;
-    test_impl<double>(1e-9, it->first);
+    test_impl<double>(1e-9, elem.first);
    std::cout << "---" << std::endl;
  }
  return EXIT_SUCCESS;
--- a/tests/model.cpp
+++ b/tests/model.cpp
@@ -2,7 +2,7 @@

 #include "viennacl/vector.hpp"

-#include "atidlas/tools/shared_ptr.hpp"
+#include <memory>
 #include "atidlas/model/import.hpp"

 namespace ad = atidlas;
@@ -10,7 +10,7 @@ namespace ad = atidlas;
 int main()
 {
  viennacl::vector<float> x(10000), y(10000), z(10000);
-  std::map<std::string, ad::tools::shared_ptr<ad::model> > models = ad::import("geforce_gt_540m.json");
+  std::map<std::string, ad::std::shared_ptr<ad::model> > models = ad::import("geforce_gt_540m.json");
  models["vector-axpy-float32"]->tune(viennacl::symbolic_expression(z, viennacl::op_assign(), x));
  models["vector-axpy-float32"]->execute(viennacl::symbolic_expression(z, viennacl::op_assign(), x));
  return EXIT_SUCCESS;
--- a/tests/mproduct.cpp
+++ b/tests/mproduct.cpp
@@ -75,15 +75,15 @@ void test_impl(T epsilon, cl::Context const & ctx)

 int main()
 {
-  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
+  for(const auto & elem : ad::cl_ext::queues.data())
  {
-    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = elem.second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
-    test_impl<float>(1e-4, it->first);
+    test_impl<float>(1e-4, elem.first);
    std::cout << ">> double" << std::endl;
-    test_impl<double>(1e-9, it->first);
+    test_impl<double>(1e-9, elem.first);
    std::cout << "---" << std::endl;
  }
  return EXIT_SUCCESS;
--- a/tests/mreduction.cpp
+++ b/tests/mreduction.cpp
@@ -65,15 +65,15 @@ void test_impl(T epsilon, cl::Context const & ctx)

 int main()
 {
-  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
+  for(const auto & elem : ad::cl_ext::queues.data())
  {
-    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = elem.second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
-    test_impl<float>(1e-4, it->first);
+    test_impl<float>(1e-4, elem.first);
    std::cout << ">> double" << std::endl;
-    test_impl<double>(1e-9, it->first);
+    test_impl<double>(1e-9, elem.first);
    std::cout << "---" << std::endl;
  }
  return EXIT_SUCCESS;
--- a/tests/reduction.cpp
+++ b/tests/reduction.cpp
@@ -74,15 +74,15 @@ void test_impl(T epsilon, cl::Context const & ctx)

 int main()
 {
-  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
+  for(const auto & elem : ad::cl_ext::queues.data())
  {
-    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = elem.second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
-    test_impl<float>(1e-4, it->first);
+    test_impl<float>(1e-4, elem.first);
    std::cout << ">> double" << std::endl;
-    test_impl<double>(1e-9, it->first);
+    test_impl<double>(1e-9, elem.first);
    std::cout << "---" << std::endl;
  }
  return EXIT_SUCCESS;
--- a/tests/vaxpy.cpp
+++ b/tests/vaxpy.cpp
@@ -120,15 +120,15 @@ void test_impl(T epsilon, cl::Context const & ctx)

 int main()
 {
-  for(ad::cl_ext::queues_t::iterator it = ad::cl_ext::queues.begin() ; it != ad::cl_ext::queues.end() ; ++it)
+  for(const auto & elem : ad::cl_ext::queues.data())
  {
-    cl::Device device = it->second[0].getInfo<CL_QUEUE_DEVICE>();
+    cl::Device device = elem.second[0].getInfo<CL_QUEUE_DEVICE>();
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "---" << std::endl;
    std::cout << ">> float" << std::endl;
-    test_impl<float>(1e-4, it->first);
+    test_impl<float>(1e-4, elem.first);
    std::cout << ">> double" << std::endl;
-    test_impl<double>(1e-9, it->first);
+    test_impl<double>(1e-9, elem.first);
    std::cout << "---" << std::endl;
  }
  return EXIT_SUCCESS;