Tuning: Merged tune branch.
- Much cleaner and more concise source - Better exceptions handling - Checks local minima to see if retuning is needed. Resolved conflicts: bench/blas.cpp include/isaac/backend/templates/mproduct.h include/isaac/driver/buffer.h lib/array.cpp lib/backend/templates/mproduct.cpp lib/driver/buffer.cpp python/setup.py tune/pysrc/autotune.py tune/pysrc/dataset.py tune/pysrc/misc_tools.py
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
cmake_minimum_required(VERSION 2.8.10)
|
cmake_minimum_required(VERSION 2.8.10)
|
||||||
|
|
||||||
# Add visibility of headers
|
# Add visibility of headers
|
||||||
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.hpp *.h)
|
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.cpp *.hpp *.h)
|
||||||
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
|
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
|
||||||
|
|
||||||
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
||||||
|
@@ -27,9 +27,9 @@ if(MKL_FOUND)
|
|||||||
else()
|
else()
|
||||||
find_package(OpenBlas)
|
find_package(OpenBlas)
|
||||||
if(OPENBLAS_FOUND)
|
if(OPENBLAS_FOUND)
|
||||||
#set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CBLAS")
|
set(BLAS_DEF ${BLAS_DEF} "-DBENCH_CBLAS")
|
||||||
#include_directories(${OPENBLAS_INCLUDE_DIR})
|
include_directories(${OPENBLAS_INCLUDE_DIR})
|
||||||
#set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
|
set(BLAS_LIBS ${BLAS_LIBS} ${OPENBLAS_LIBRARIES} )
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@@ -112,10 +112,17 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
#define BENCHMARK_HOST(OP, PERF) \
|
#define BENCHMARK_HOST(OP, PERF) \
|
||||||
{\
|
{\
|
||||||
ad::tools::timer tmr;\
|
ad::tools::timer tmr;\
|
||||||
|
double total_time = 0;\
|
||||||
|
std::vector<double> times;\
|
||||||
|
while(total_time < 1e-2){\
|
||||||
std::vector<int> cache_flusher(10000000, 0);\
|
std::vector<int> cache_flusher(10000000, 0);\
|
||||||
tmr.start();\
|
tmr.start();\
|
||||||
OP;\
|
OP;\
|
||||||
double t = 1e9*tmr.get();\
|
double time = tmr.get();\
|
||||||
|
times.push_back(time);\
|
||||||
|
total_time += time;\
|
||||||
|
}\
|
||||||
|
double t = 1e9*median(times);\
|
||||||
std::cout << " " << PERF << std::flush;\
|
std::cout << " " << PERF << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,6 +134,8 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
cudaEvent_t start, stop;\
|
cudaEvent_t start, stop;\
|
||||||
cudaEventCreate(&start);\
|
cudaEventCreate(&start);\
|
||||||
cudaEventCreate(&stop);\
|
cudaEventCreate(&stop);\
|
||||||
|
OP;\
|
||||||
|
cudaThreadSynchronize();\
|
||||||
while(total_time*1e-3 < 1e-3){\
|
while(total_time*1e-3 < 1e-3){\
|
||||||
flush = ad::zeros(1e6, 1, dtype);\
|
flush = ad::zeros(1e6, 1, dtype);\
|
||||||
cudaEventRecord(start,0);\
|
cudaEventRecord(start,0);\
|
||||||
@@ -290,15 +299,15 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
if(operation.substr(0,4)=="gemm")
|
if(operation.substr(0,4)=="gemm")
|
||||||
{
|
{
|
||||||
std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
|
std::vector<std::tuple<int_t, int_t, int_t> > MNKs;
|
||||||
// MNKs.push_back(std::make_tuple(896,896,896));
|
MNKs.push_back(std::make_tuple(896,896,896));
|
||||||
// MNKs.push_back(std::make_tuple(3072,3072,3072));
|
MNKs.push_back(std::make_tuple(3072,3072,3072));
|
||||||
// MNKs.push_back(std::make_tuple(1024,64,768));
|
MNKs.push_back(std::make_tuple(1024,64,768));
|
||||||
// MNKs.push_back(std::make_tuple(768,64,128));
|
MNKs.push_back(std::make_tuple(768,64,128));
|
||||||
// MNKs.push_back(std::make_tuple(64,64,32000));
|
MNKs.push_back(std::make_tuple(64,64,32000));
|
||||||
// MNKs.push_back(std::make_tuple(1024,1024,32000));
|
MNKs.push_back(std::make_tuple(1024,1024,32000));
|
||||||
|
|
||||||
for(unsigned int N = 1 ; N <10 ; ++N)
|
// for(unsigned int N = 1 ; N <10 ; ++N)
|
||||||
MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
|
// MNKs.push_back(std::make_tuple(128*N, 128*N, 128*N));
|
||||||
/*---------*/
|
/*---------*/
|
||||||
/*--BLAS3--*/
|
/*--BLAS3--*/
|
||||||
/*---------*/
|
/*---------*/
|
||||||
@@ -308,6 +317,7 @@ void bench(ad::numeric_type dtype, std::string operation)
|
|||||||
int_t N = std::get<1>(MNK);
|
int_t N = std::get<1>(MNK);
|
||||||
int_t K = std::get<2>(MNK);
|
int_t K = std::get<2>(MNK);
|
||||||
std::cout << M << "," << N << "," << K;
|
std::cout << M << "," << N << "," << K;
|
||||||
|
std::cout << std::flush;
|
||||||
/* ISAAC */
|
/* ISAAC */
|
||||||
ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
|
ad::array C(M, N, dtype), A(M, K, dtype), B(N, K, dtype);
|
||||||
#if HAS_A_BLAS
|
#if HAS_A_BLAS
|
||||||
|
@@ -49,13 +49,44 @@ def main():
|
|||||||
return optlist
|
return optlist
|
||||||
|
|
||||||
def find_library(name, cmake_glob_list):
|
def find_library(name, cmake_glob_list):
|
||||||
compiler=new_compiler()
|
cvars = sysconfig.get_config_vars()
|
||||||
|
compiler = new_compiler()
|
||||||
dirs = []
|
dirs = []
|
||||||
for gpath in cmake_glob_list.split(';'):
|
for gpath in cmake_glob_list.split(';'):
|
||||||
path = glob(gpath)
|
path = glob(gpath)
|
||||||
if path:
|
if path:
|
||||||
dirs += [path[0]]
|
dirs += [path[0]]
|
||||||
return compiler.find_library_file(dirs, name)
|
return compiler.find_library_file(cvars['LIBDIR'].split(';') + dirs, name)
|
||||||
|
|
||||||
|
def find_opencl():
|
||||||
|
cvars = sysconfig.get_config_vars()
|
||||||
|
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
|
||||||
|
lib = find_library('OpenCL', '${ANDROID_CL_GLOB_HINTS}' if is_on_android else '${X86_CL_GLOB_HINTS}')
|
||||||
|
return {'include': '', 'lib': dirname(lib)} if lib else None
|
||||||
|
|
||||||
|
def find_in_path(name, path):
|
||||||
|
"Find a file in a search path"
|
||||||
|
#adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
|
||||||
|
for dir in path.split(os.pathsep):
|
||||||
|
binpath = os.path.join(dir, name)
|
||||||
|
if os.path.exists(binpath):
|
||||||
|
return os.path.abspath(binpath)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def find_cuda():
|
||||||
|
if 'CUDAHOME' in os.environ:
|
||||||
|
home = os.environ['CUDAHOME']
|
||||||
|
nvcc = os.path.join(home, 'bin', 'nvcc')
|
||||||
|
else:
|
||||||
|
nvcc = find_in_path('nvcc', os.environ['PATH'])
|
||||||
|
|
||||||
|
if nvcc:
|
||||||
|
home = dirname(os.path.dirname(nvcc))
|
||||||
|
return {'include': os.path.join(home, 'include'),
|
||||||
|
'lib': os.path.join(home, 'lib64')}
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
#Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
|
#Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
|
||||||
cvars = sysconfig.get_config_vars()
|
cvars = sysconfig.get_config_vars()
|
||||||
@@ -63,14 +94,27 @@ def main():
|
|||||||
cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
|
cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
|
||||||
cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
|
cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
|
||||||
|
|
||||||
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
|
#OpenCL
|
||||||
opencl = find_library('OpenCL', '${ANDROID_CL_GLOB_HINTS}' if is_on_android else '${X86_CL_GLOB_HINTS}')
|
opencl_config = find_opencl()
|
||||||
|
|
||||||
library_dirs = [dirname(library) for library in [opencl] if library is not None]
|
#CUDA
|
||||||
|
cuda_config = find_cuda()
|
||||||
|
|
||||||
#Includes
|
#Libraries
|
||||||
|
libraries = ['OpenCL']
|
||||||
|
if cuda_config: libraries += ['cuda', 'nvrtc']
|
||||||
|
|
||||||
|
#Backends:
|
||||||
|
backend_defines = ['-DISAAC_WITH_OPENCL']
|
||||||
|
if cuda_config: backend_defines += ['-DISAAC_WITH_CUDA']
|
||||||
|
|
||||||
|
#Library directories
|
||||||
|
library_dirs = [config['lib'] for config in [opencl_config, cuda_config] if config is not None]
|
||||||
|
|
||||||
|
#Include directories
|
||||||
include ='${INCLUDE_DIRECTORIES_STR}'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
include ='${INCLUDE_DIRECTORIES_STR}'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
||||||
#Sources
|
|
||||||
|
#Source files
|
||||||
src = '${LIBISAAC_SRC_STR}'.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
src = '${LIBISAAC_SRC_STR}'.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
||||||
boostsrc = 'external/boost/libs/'
|
boostsrc = 'external/boost/libs/'
|
||||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||||
@@ -84,7 +128,7 @@ def main():
|
|||||||
src += glob(boostsrc + "/thread/src/pthread/*.cpp")
|
src += glob(boostsrc + "/thread/src/pthread/*.cpp")
|
||||||
src= [f for f in src if not f.endswith("once_atomic.cpp")]
|
src= [f for f in src if not f.endswith("once_atomic.cpp")]
|
||||||
|
|
||||||
|
#Setup
|
||||||
setup(
|
setup(
|
||||||
name='isaac',
|
name='isaac',
|
||||||
version='1.0',
|
version='1.0',
|
||||||
@@ -96,12 +140,12 @@ def main():
|
|||||||
ext_package="isaac",
|
ext_package="isaac",
|
||||||
ext_modules=[Extension(
|
ext_modules=[Extension(
|
||||||
'_isaac',src,
|
'_isaac',src,
|
||||||
extra_compile_args= ['-D__CL_ENABLE_EXCEPTIONS', '-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
|
extra_compile_args= backend_defines + ['-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
|
||||||
extra_link_args=['-Wl,-soname=_isaac.so'],
|
extra_link_args=['-Wl,-soname=_isaac.so'],
|
||||||
undef_macros=[],
|
undef_macros=[],
|
||||||
include_dirs=include,
|
include_dirs=include,
|
||||||
library_dirs=library_dirs,
|
library_dirs=library_dirs,
|
||||||
libraries=['OpenCL']
|
libraries=libraries
|
||||||
)],
|
)],
|
||||||
cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
|
cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
@@ -179,10 +179,8 @@ ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(pow)
|
|||||||
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(dot)
|
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(dot)
|
||||||
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(outer)
|
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(outer)
|
||||||
|
|
||||||
namespace detail
|
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(assign)
|
||||||
{
|
|
||||||
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(assign)
|
|
||||||
}
|
|
||||||
|
|
||||||
#undef ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR
|
#undef ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR
|
||||||
|
|
||||||
|
@@ -15,6 +15,7 @@ namespace detail
|
|||||||
bool is_node_leaf(op_element const & op);
|
bool is_node_leaf(op_element const & op);
|
||||||
bool is_scalar_reduction(array_expression::node const & node);
|
bool is_scalar_reduction(array_expression::node const & node);
|
||||||
bool is_vector_reduction(array_expression::node const & node);
|
bool is_vector_reduction(array_expression::node const & node);
|
||||||
|
bool is_assignment(op_element const & op);
|
||||||
bool is_elementwise_operator(op_element const & op);
|
bool is_elementwise_operator(op_element const & op);
|
||||||
bool is_elementwise_function(op_element const & op);
|
bool is_elementwise_function(op_element const & op);
|
||||||
bool is_cast(op_element const & op);
|
bool is_cast(op_element const & op);
|
||||||
|
@@ -175,7 +175,7 @@ public:
|
|||||||
base(binding_policy_t binding_policy);
|
base(binding_policy_t binding_policy);
|
||||||
virtual unsigned int lmem_usage(expressions_tuple const &) const;
|
virtual unsigned int lmem_usage(expressions_tuple const &) const;
|
||||||
virtual unsigned int registers_usage(expressions_tuple const &) const;
|
virtual unsigned int registers_usage(expressions_tuple const &) const;
|
||||||
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions) = 0;
|
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions) const = 0;
|
||||||
virtual ~base();
|
virtual ~base();
|
||||||
std::string generate(const char * suffix, expressions_tuple const & expressions, driver::Device const & device);
|
std::string generate(const char * suffix, expressions_tuple const & expressions, driver::Device const & device);
|
||||||
virtual int is_invalid(expressions_tuple const & expressions, driver::Device const & device) const = 0;
|
virtual int is_invalid(expressions_tuple const & expressions, driver::Device const & device) const = 0;
|
||||||
|
@@ -25,7 +25,7 @@ private:
|
|||||||
public:
|
public:
|
||||||
maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
maxpy(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
|
||||||
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -48,10 +48,10 @@ private:
|
|||||||
void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, array const & A, array const & B, array const & C,
|
void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, array const & A, array const & B, array const & C,
|
||||||
value_scalar const &alpha, value_scalar const &beta, driver::Program & program, const char * suffix, execution_options_type const & options);
|
value_scalar const &alpha, value_scalar const &beta, driver::Program & program, const char * suffix, execution_options_type const & options);
|
||||||
array create_slice(array & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
|
array create_slice(array & M, int_t s0_0, int_t s0_1, int_t s1_0, int_t s1_1, bool swap);
|
||||||
std::vector<int_t> infos(expressions_tuple const & expressions, isaac::symbolic::preset::gemm::args &arguments);
|
std::vector<int_t> infos(expressions_tuple const & expressions, isaac::symbolic::preset::gemm::args &arguments) const;
|
||||||
public:
|
public:
|
||||||
mproduct(mproduct::parameters_type const & parameters, bool check_bound, char A_trans, char B_trans);
|
mproduct(mproduct::parameters_type const & parameters, bool check_bound, char A_trans, char B_trans);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
|
||||||
void cleanup(values_holder beta, controller<expressions_tuple> const & ctr, model & fallback,
|
void cleanup(values_holder beta, controller<expressions_tuple> const & ctr, model & fallback,
|
||||||
lhs_rhs_element* eA, lhs_rhs_element* eB, lhs_rhs_element* eC, lhs_rhs_element* ebeta, array const & A, array const & B, array const & C);
|
lhs_rhs_element* eA, lhs_rhs_element* eB, lhs_rhs_element* eC, lhs_rhs_element* ebeta, array const & A, array const & B, array const & C);
|
||||||
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &ctr);
|
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &ctr);
|
||||||
|
@@ -34,7 +34,7 @@ private:
|
|||||||
unsigned int lmem_usage() const;
|
unsigned int lmem_usage() const;
|
||||||
std::string generate_impl(const char * suffix, expressions_tuple const &, driver::Device const & device, std::vector<mapping_type> const &) const;
|
std::string generate_impl(const char * suffix, expressions_tuple const &, driver::Device const & device, std::vector<mapping_type> const &) const;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
virtual std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
|
||||||
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
||||||
private:
|
private:
|
||||||
reduction_type reduction_type_;
|
reduction_type reduction_type_;
|
||||||
|
@@ -27,7 +27,7 @@ private:
|
|||||||
public:
|
public:
|
||||||
reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
reduction(reduction::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
reduction(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, binding_policy_t bind = BIND_ALL_UNIQUE);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
|
||||||
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
||||||
private:
|
private:
|
||||||
std::vector< driver::Buffer > tmp_;
|
std::vector< driver::Buffer > tmp_;
|
||||||
|
@@ -22,7 +22,7 @@ private:
|
|||||||
public:
|
public:
|
||||||
vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
vaxpy(vaxpy::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
vaxpy(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, binding_policy_t binding_policy = BIND_ALL_UNIQUE);
|
||||||
std::vector<int_t> input_sizes(expressions_tuple const & expressions);
|
std::vector<int_t> input_sizes(expressions_tuple const & expressions) const;
|
||||||
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
void enqueue(driver::CommandQueue & queue, driver::Program & program, const char * suffix, base & fallback, controller<expressions_tuple> const &);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
#ifndef ISAAC_DRIVER_BUFFER_H
|
#ifndef ISAAC_DRIVER_BUFFER_H
|
||||||
#define ISAAC_DRIVER_BUFFER_H
|
#define ISAAC_DRIVER_BUFFER_H
|
||||||
|
|
||||||
|
#include "isaac/types.h"
|
||||||
|
|
||||||
#include "isaac/driver/common.h"
|
#include "isaac/driver/common.h"
|
||||||
#include "isaac/driver/context.h"
|
#include "isaac/driver/context.h"
|
||||||
#include "isaac/driver/handle.h"
|
#include "isaac/driver/handle.h"
|
||||||
|
@@ -27,6 +27,7 @@ enum device_type
|
|||||||
DEVICE_TYPE_ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR
|
DEVICE_TYPE_ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#ifdef ISAAC_WITH_CUDA
|
#ifdef ISAAC_WITH_CUDA
|
||||||
|
|
||||||
namespace nvrtc
|
namespace nvrtc
|
||||||
@@ -34,7 +35,7 @@ namespace nvrtc
|
|||||||
namespace exception
|
namespace exception
|
||||||
{
|
{
|
||||||
|
|
||||||
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { const char * what() const throw(){ return "NVRTC: Error- " msg; } }
|
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
|
||||||
|
|
||||||
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory exception");
|
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory exception");
|
||||||
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
|
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
|
||||||
@@ -59,7 +60,7 @@ namespace cuda
|
|||||||
|
|
||||||
class base: public std::exception{};
|
class base: public std::exception{};
|
||||||
|
|
||||||
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { const char * what() const throw(){ return "CUDA: Error- " msg; } }
|
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
|
||||||
|
|
||||||
|
|
||||||
ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value");
|
ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value");
|
||||||
@@ -129,6 +130,72 @@ void check(CUresult);
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
namespace ocl
|
||||||
|
{
|
||||||
|
namespace exception
|
||||||
|
{
|
||||||
|
|
||||||
|
class base: public std::exception{};
|
||||||
|
|
||||||
|
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
|
||||||
|
|
||||||
|
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(device_not_found, "device not found");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(device_not_available, "device not available");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available, "compiler not available");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure, "object allocation failure");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(out_of_resources, "launch out of resources");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory, "out of host memory");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available, "profiling info not available");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap, "mem copy overlap");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch, "image format mismatch");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported, "image format not supported");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(build_program_failure, "build program failure");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(map_failure, "map failure");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_value, "invalid value");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type, "invalid device type");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_platform, "invalid platform");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_device, "invalid device");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_context, "invalid context");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties, "invalid queue properties");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue, "invalid command queue");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr, "invalid host pointer");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object, "invalid mem object");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size, "invalid image size");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler, "invalid sampler");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_binary, "invalid binary");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options, "invalid build options");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_program, "invalid program");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable, "invalid program executable");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name, "invalid kernel name");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition, "invalid kernel definition");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel, "invalid kernel");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index, "invalid arg index");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value, "invalid arg value");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size, "invalid arg size");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args, "invalid kernel args");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension, "invalid work dimension");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size, "invalid work group size");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size, "invalid work item size");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset, "invalid global offset");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list, "invalid event wait list");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_event, "invalid event");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_operation, "invalid operation");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object, "invalid GL object");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size, "invalid buffer size");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level, "invalid MIP level");
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size, "invalid global work size");
|
||||||
|
#ifdef CL_INVALID_PROPERTY
|
||||||
|
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void check(cl_int err);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -64,6 +64,7 @@ array::array(array & M, slice const & s0, slice const & s1) : dtype_(M.dtype_),
|
|||||||
context_(M.data_.context()), data_(M.data_)
|
context_(M.data_.context()), data_(M.data_)
|
||||||
{ }
|
{ }
|
||||||
|
|
||||||
|
|
||||||
template<typename DT>
|
template<typename DT>
|
||||||
array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::Context context)
|
array::array(int_t shape0, int_t shape1, std::vector<DT> const & data, driver::Context context)
|
||||||
: dtype_(to_numeric_type<DT>::value),
|
: dtype_(to_numeric_type<DT>::value),
|
||||||
@@ -471,8 +472,7 @@ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_MAX_TYPE, maximum, x.dtype())
|
|||||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_MIN_TYPE, minimum, x.dtype())
|
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_MIN_TYPE, minimum, x.dtype())
|
||||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_POW_TYPE, pow, x.dtype())
|
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_POW_TYPE, pow, x.dtype())
|
||||||
|
|
||||||
namespace detail
|
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ASSIGN_TYPE, assign, x.dtype())
|
||||||
{ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ASSIGN_TYPE, assign, x.dtype()) }
|
|
||||||
|
|
||||||
|
|
||||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_GREATER_TYPE, operator >, INT_TYPE)
|
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_GREATER_TYPE, operator >, INT_TYPE)
|
||||||
|
@@ -21,11 +21,16 @@ namespace detail
|
|||||||
|| node.op.type_family==OPERATOR_COLUMNS_REDUCTION_TYPE_FAMILY;
|
|| node.op.type_family==OPERATOR_COLUMNS_REDUCTION_TYPE_FAMILY;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_elementwise_operator(op_element const & op)
|
bool is_assignment(op_element const & op)
|
||||||
{
|
{
|
||||||
return op.type== OPERATOR_ASSIGN_TYPE
|
return op.type== OPERATOR_ASSIGN_TYPE
|
||||||
|| op.type== OPERATOR_INPLACE_ADD_TYPE
|
|| op.type== OPERATOR_INPLACE_ADD_TYPE
|
||||||
|| op.type== OPERATOR_INPLACE_SUB_TYPE
|
|| op.type== OPERATOR_INPLACE_SUB_TYPE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_elementwise_operator(op_element const & op)
|
||||||
|
{
|
||||||
|
return is_assignment(op)
|
||||||
|| op.type== OPERATOR_ADD_TYPE
|
|| op.type== OPERATOR_ADD_TYPE
|
||||||
|| op.type== OPERATOR_SUB_TYPE
|
|| op.type== OPERATOR_SUB_TYPE
|
||||||
|| op.type== OPERATOR_ELEMENT_PROD_TYPE
|
|| op.type== OPERATOR_ELEMENT_PROD_TYPE
|
||||||
|
@@ -97,7 +97,7 @@ maxpy::maxpy(unsigned int simd, unsigned int ls1, unsigned int ls2,
|
|||||||
base_impl<maxpy, maxpy_parameters>(maxpy_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind)
|
base_impl<maxpy, maxpy_parameters>(maxpy_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions)
|
std::vector<int_t> maxpy::input_sizes(expressions_tuple const & expressions) const
|
||||||
{
|
{
|
||||||
isaac::array_expression const & array_expression = *(expressions.data().front());
|
isaac::array_expression const & array_expression = *(expressions.data().front());
|
||||||
std::pair<int_t, int_t> size = matrix_size(lhs_most(array_expression.tree(), array_expression.root()));
|
std::pair<int_t, int_t> size = matrix_size(lhs_most(array_expression.tree(), array_expression.root()));
|
||||||
|
@@ -3,6 +3,7 @@
|
|||||||
#include "isaac/backend/keywords.h"
|
#include "isaac/backend/keywords.h"
|
||||||
#include "isaac/model/model.h"
|
#include "isaac/model/model.h"
|
||||||
#include "isaac/symbolic/preset.h"
|
#include "isaac/symbolic/preset.h"
|
||||||
|
#include "isaac/exception/operation_not_supported.h"
|
||||||
#include "isaac/tools/make_vector.hpp"
|
#include "isaac/tools/make_vector.hpp"
|
||||||
#include "isaac/tools/to_string.hpp"
|
#include "isaac/tools/to_string.hpp"
|
||||||
#include "isaac/tools/miscellaneous.hpp"
|
#include "isaac/tools/miscellaneous.hpp"
|
||||||
@@ -42,10 +43,13 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
return N*size_of(numeric_t);
|
return N*size_of(numeric_t);
|
||||||
}
|
}
|
||||||
|
|
||||||
int mproduct::is_invalid_impl(driver::Device const &, expressions_tuple const &) const
|
int mproduct::is_invalid_impl(driver::Device const &, expressions_tuple const & expressions) const
|
||||||
{
|
{
|
||||||
if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL&& (p_.local_fetch_0!=0 || p_.local_fetch_1!=0))
|
std::vector<int_t> MNK = input_sizes(expressions);
|
||||||
return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
|
int_t M = MNK[0]; int_t N = MNK[1];
|
||||||
|
|
||||||
|
if(p_.depth > 1 && M*N*p_.depth > 1e6)
|
||||||
|
throw operation_not_supported_exception("This would necessitate a temporary larger than 1MB");
|
||||||
|
|
||||||
if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
|
if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
|
||||||
return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
|
return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
|
||||||
@@ -642,7 +646,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
return array(M, s0, s1);
|
return array(M, s0, s1);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int_t> mproduct::infos(expressions_tuple const & expressions, symbolic::preset::gemm::args& arguments)
|
std::vector<int_t> mproduct::infos(expressions_tuple const & expressions, symbolic::preset::gemm::args& arguments) const
|
||||||
{
|
{
|
||||||
isaac::array_expression & array_expression = (*expressions.data().front());
|
isaac::array_expression & array_expression = (*expressions.data().front());
|
||||||
array_expression::container_type & array = array_expression.tree();
|
array_expression::container_type & array = array_expression.tree();
|
||||||
@@ -663,7 +667,7 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
else throw;
|
else throw;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int_t> mproduct::input_sizes(expressions_tuple const & expressions)
|
std::vector<int_t> mproduct::input_sizes(expressions_tuple const & expressions) const
|
||||||
{
|
{
|
||||||
symbolic::preset::gemm::args dummy;
|
symbolic::preset::gemm::args dummy;
|
||||||
return infos(expressions, dummy);
|
return infos(expressions, dummy);
|
||||||
|
@@ -26,7 +26,7 @@ int mreduction::is_invalid_impl(driver::Device const &, expressions_tuple const
|
|||||||
|
|
||||||
unsigned int mreduction::lmem_usage() const
|
unsigned int mreduction::lmem_usage() const
|
||||||
{
|
{
|
||||||
return p_.local_size_0*(p_.local_size_1+1);
|
return (p_.local_size_0+1)*p_.local_size_1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string mreduction::generate_impl(const char * suffix, expressions_tuple const & expressions, driver::Device const & device, std::vector<mapping_type> const & mappings) const
|
std::string mreduction::generate_impl(const char * suffix, expressions_tuple const & expressions, driver::Device const & device, std::vector<mapping_type> const & mappings) const
|
||||||
@@ -83,7 +83,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
|||||||
{"array2", "#pointer += #start1 + #start2*#ld; "
|
{"array2", "#pointer += #start1 + #start2*#ld; "
|
||||||
"#ld *= #nldstride; "}}, expressions, mappings);
|
"#ld *= #nldstride; "}}, expressions, mappings);
|
||||||
|
|
||||||
unsigned int local_size_0_ld = p_.local_size_0+1;
|
unsigned int local_size_0_ld = p_.local_size_0;
|
||||||
std::string local_size_0_ld_str = to_string(local_size_0_ld);
|
std::string local_size_0_ld_str = to_string(local_size_0_ld);
|
||||||
|
|
||||||
for (const auto & e : reductions)
|
for (const auto & e : reductions)
|
||||||
@@ -321,7 +321,7 @@ mreduction::mreduction(mreduction::parameters_type const & parameters,
|
|||||||
base_impl<mreduction, mreduction_parameters>(parameters, binding_policy),
|
base_impl<mreduction, mreduction_parameters>(parameters, binding_policy),
|
||||||
reduction_type_(rtype){ }
|
reduction_type_(rtype){ }
|
||||||
|
|
||||||
std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions)
|
std::vector<int_t> mreduction::input_sizes(expressions_tuple const & expressions) const
|
||||||
{
|
{
|
||||||
array_expression const & first_expression = *expressions.data().front();
|
array_expression const & first_expression = *expressions.data().front();
|
||||||
std::vector<std::size_t> idx = filter_nodes(&is_reduction, first_expression, false);
|
std::vector<std::size_t> idx = filter_nodes(&is_reduction, first_expression, false);
|
||||||
|
@@ -35,7 +35,7 @@ inline void reduction::reduce_1d_local_memory(kernel_generation_stream & stream,
|
|||||||
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const
|
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const
|
||||||
{
|
{
|
||||||
stream << "#pragma unroll" << std::endl;
|
stream << "#pragma unroll" << std::endl;
|
||||||
stream << "for(unsigned int stride = " << size/2 << "; stride >0; stride /=2)" << std::endl;
|
stream << "for(unsigned int stride = " << size/2 << "; stride > 0; stride /=2)" << std::endl;
|
||||||
stream << "{" << std::endl;
|
stream << "{" << std::endl;
|
||||||
stream.inc_tab();
|
stream.inc_tab();
|
||||||
stream << LocalBarrier(backend) << ";" << std::endl;
|
stream << LocalBarrier(backend) << ";" << std::endl;
|
||||||
@@ -269,7 +269,7 @@ reduction::reduction(unsigned int simd, unsigned int ls, unsigned int ng,
|
|||||||
base_impl<reduction, reduction_parameters>(reduction_parameters(simd,ls,ng,fetch), bind)
|
base_impl<reduction, reduction_parameters>(reduction_parameters(simd,ls,ng,fetch), bind)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions)
|
std::vector<int_t> reduction::input_sizes(expressions_tuple const & expressions) const
|
||||||
{
|
{
|
||||||
std::vector<size_t> reductions_idx = filter_nodes(&is_reduction, *(expressions.data().front()), false);
|
std::vector<size_t> reductions_idx = filter_nodes(&is_reduction, *(expressions.data().front()), false);
|
||||||
int_t N = vector_size(lhs_most(expressions.data().front()->tree(), reductions_idx[0]));
|
int_t N = vector_size(lhs_most(expressions.data().front()->tree(), reductions_idx[0]));
|
||||||
|
@@ -101,7 +101,7 @@ vaxpy::vaxpy(unsigned int simd, unsigned int ls, unsigned int ng,
|
|||||||
{}
|
{}
|
||||||
|
|
||||||
|
|
||||||
std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions)
|
std::vector<int_t> vaxpy::input_sizes(expressions_tuple const & expressions) const
|
||||||
{
|
{
|
||||||
int_t size = static_cast<array_expression const *>(expressions.data().front().get())->shape()[0];
|
int_t size = static_cast<array_expression const *>(expressions.data().front().get())->shape()[0];
|
||||||
return tools::make_vector<int_t>() << size;
|
return tools::make_vector<int_t>() << size;
|
||||||
|
@@ -13,14 +13,22 @@ Buffer::Buffer(cl::Buffer const & buffer) : backend_(OPENCL), context_(buffer.ge
|
|||||||
}
|
}
|
||||||
|
|
||||||
Buffer::Buffer(Context const & context, std::size_t size) : backend_(context.backend_), context_(context), h_(backend_)
|
Buffer::Buffer(Context const & context, std::size_t size) : backend_(context.backend_), context_(context), h_(backend_)
|
||||||
|
|
||||||
{
|
{
|
||||||
switch(backend_)
|
switch(backend_)
|
||||||
{
|
{
|
||||||
#ifdef ISAAC_WITH_CUDA
|
#ifdef ISAAC_WITH_CUDA
|
||||||
case CUDA: cuda::check(cuMemAlloc(h_.cu.get(), size)); break;
|
case CUDA:
|
||||||
|
cuda::check(cuMemAlloc(h_.cu.get(), size));
|
||||||
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OPENCL: *h_.cl = cl::Buffer(*context.h_.cl, CL_MEM_READ_WRITE, size); break;
|
case OPENCL:
|
||||||
default: throw;
|
cl_int err;
|
||||||
|
*h_.cl = cl::Buffer(*context.h_.cl, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||||
|
ocl::check(err);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -103,6 +103,70 @@ void check(CUresult err)
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
namespace ocl
|
||||||
|
{
|
||||||
|
|
||||||
|
void check(cl_int err)
|
||||||
|
{
|
||||||
|
using namespace isaac::driver::ocl::exception;
|
||||||
|
switch(err)
|
||||||
|
{
|
||||||
|
case CL_SUCCESS: break;
|
||||||
|
case CL_DEVICE_NOT_FOUND: throw device_not_found();
|
||||||
|
case CL_DEVICE_NOT_AVAILABLE: throw device_not_available();
|
||||||
|
case CL_COMPILER_NOT_AVAILABLE: throw compiler_not_available();
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE: throw mem_object_allocation_failure();
|
||||||
|
case CL_OUT_OF_RESOURCES: throw out_of_resources();
|
||||||
|
case CL_OUT_OF_HOST_MEMORY: throw out_of_host_memory();
|
||||||
|
case CL_PROFILING_INFO_NOT_AVAILABLE: throw profiling_info_not_available();
|
||||||
|
case CL_MEM_COPY_OVERLAP: throw mem_copy_overlap();
|
||||||
|
case CL_IMAGE_FORMAT_MISMATCH: throw image_format_mismatch();
|
||||||
|
case CL_IMAGE_FORMAT_NOT_SUPPORTED: throw image_format_not_supported();
|
||||||
|
case CL_BUILD_PROGRAM_FAILURE: throw build_program_failure();
|
||||||
|
case CL_MAP_FAILURE: throw map_failure();
|
||||||
|
|
||||||
|
case CL_INVALID_VALUE: throw invalid_value();
|
||||||
|
case CL_INVALID_DEVICE_TYPE: throw invalid_device_type();
|
||||||
|
case CL_INVALID_PLATFORM: throw invalid_platform();
|
||||||
|
case CL_INVALID_DEVICE: throw invalid_device();
|
||||||
|
case CL_INVALID_CONTEXT: throw invalid_context();
|
||||||
|
case CL_INVALID_QUEUE_PROPERTIES: throw invalid_queue_properties();
|
||||||
|
case CL_INVALID_COMMAND_QUEUE: throw invalid_command_queue();
|
||||||
|
case CL_INVALID_HOST_PTR: throw invalid_host_ptr();
|
||||||
|
case CL_INVALID_MEM_OBJECT: throw invalid_mem_object();
|
||||||
|
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor();
|
||||||
|
case CL_INVALID_IMAGE_SIZE: throw invalid_image_size();
|
||||||
|
case CL_INVALID_SAMPLER: throw invalid_sampler();
|
||||||
|
case CL_INVALID_BINARY: throw invalid_binary();
|
||||||
|
case CL_INVALID_BUILD_OPTIONS: throw invalid_build_options();
|
||||||
|
case CL_INVALID_PROGRAM: throw invalid_program();
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE: throw invalid_program_executable();
|
||||||
|
case CL_INVALID_KERNEL_NAME: throw invalid_kernel_name();
|
||||||
|
case CL_INVALID_KERNEL_DEFINITION: throw invalid_kernel_definition();
|
||||||
|
case CL_INVALID_KERNEL: throw invalid_kernel();
|
||||||
|
case CL_INVALID_ARG_INDEX: throw invalid_arg_index();
|
||||||
|
case CL_INVALID_ARG_VALUE: throw invalid_arg_value();
|
||||||
|
case CL_INVALID_ARG_SIZE: throw invalid_arg_size();
|
||||||
|
case CL_INVALID_KERNEL_ARGS: throw invalid_kernel_args();
|
||||||
|
case CL_INVALID_WORK_DIMENSION: throw invalid_work_dimension();
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE: throw invalid_work_group_size();
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE: throw invalid_work_item_size();
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET: throw invalid_global_offset();
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST: throw invalid_event_wait_list();
|
||||||
|
case CL_INVALID_EVENT: throw invalid_event();
|
||||||
|
case CL_INVALID_OPERATION: throw invalid_operation();
|
||||||
|
case CL_INVALID_GL_OBJECT: throw invalid_gl_object();
|
||||||
|
case CL_INVALID_BUFFER_SIZE: throw invalid_buffer_size();
|
||||||
|
case CL_INVALID_MIP_LEVEL: throw invalid_mip_level();
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE: throw invalid_global_work_size();
|
||||||
|
#ifdef CL_INVALID_PROPERTY
|
||||||
|
case CL_INVALID_PROPERTY: throw invalid_property();
|
||||||
|
#endif
|
||||||
|
default: throw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
#include "isaac/driver/command_queue.h"
|
#include "isaac/driver/command_queue.h"
|
||||||
|
#include "isaac/driver/common.h"
|
||||||
#include "isaac/driver/context.h"
|
#include "isaac/driver/context.h"
|
||||||
#include "isaac/driver/device.h"
|
#include "isaac/driver/device.h"
|
||||||
#include "isaac/driver/event.h"
|
#include "isaac/driver/event.h"
|
||||||
@@ -22,9 +23,15 @@ CommandQueue::CommandQueue(Context const & context, Device const & device, cl_co
|
|||||||
switch(backend_)
|
switch(backend_)
|
||||||
{
|
{
|
||||||
#ifdef ISAAC_WITH_CUDA
|
#ifdef ISAAC_WITH_CUDA
|
||||||
case CUDA: cuda::check(cuStreamCreate(h_.cu.get(), 0)); break;
|
case CUDA:
|
||||||
|
cuda::check(cuStreamCreate(h_.cu.get(), 0));
|
||||||
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OPENCL: *h_.cl = cl::CommandQueue(*context.h_.cl, *device.h_.cl, properties); break;
|
case OPENCL:
|
||||||
|
cl_int err;
|
||||||
|
*h_.cl = cl::CommandQueue(*context.h_.cl, *device.h_.cl, properties, &err);
|
||||||
|
ocl::check(err);
|
||||||
|
break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -61,7 +68,7 @@ Event CommandQueue::enqueue(Kernel const & kernel, NDRange global, driver::NDRan
|
|||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
h_.cl->enqueueNDRangeKernel(*kernel.h_.cl, cl::NullRange, (cl::NDRange)global, (cl::NDRange)local, NULL, event.h_.cl.get());
|
ocl::check(h_.cl->enqueueNDRangeKernel(*kernel.h_.cl, cl::NullRange, (cl::NDRange)global, (cl::NDRange)local, NULL, event.h_.cl.get()));
|
||||||
break;
|
break;
|
||||||
default: throw;
|
default: throw;
|
||||||
}
|
}
|
||||||
|
@@ -29,7 +29,9 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev
|
|||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OPENCL:
|
case OPENCL:
|
||||||
*h_.cl = cl::Context(std::vector<cl::Device>(1, *device_.h_.cl));
|
cl_int err;
|
||||||
|
*h_.cl = cl::Context(std::vector<cl::Device>(1, *device_.h_.cl), NULL, NULL, NULL, &err);
|
||||||
|
ocl::check(err);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw;
|
throw;
|
||||||
|
@@ -123,8 +123,8 @@ Program::Program(Context const & context, std::string const & source) : backend_
|
|||||||
|
|
||||||
*h_.cl = cl::Program(*context_.h_.cl, source);
|
*h_.cl = cl::Program(*context_.h_.cl, source);
|
||||||
try{
|
try{
|
||||||
h_.cl->build(devices);
|
ocl::check(h_.cl->build(devices));
|
||||||
}catch(cl::Error const & e){
|
}catch(ocl::exception::build_program_failure const & e){
|
||||||
for(std::vector< cl::Device >::const_iterator it = devices.begin(); it != devices.end(); ++it)
|
for(std::vector< cl::Device >::const_iterator it = devices.begin(); it != devices.end(); ++it)
|
||||||
std::cout << "Device : " << it->getInfo<CL_DEVICE_NAME>()
|
std::cout << "Device : " << it->getInfo<CL_DEVICE_NAME>()
|
||||||
<< "Build Status = " << h_.cl->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*it) << std::endl
|
<< "Build Status = " << h_.cl->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*it) << std::endl
|
||||||
|
@@ -59,7 +59,7 @@ extern "C"
|
|||||||
clRetainMemObject(mx); \
|
clRetainMemObject(mx); \
|
||||||
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
|
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
|
||||||
clRetainMemObject(my); \
|
clRetainMemObject(my); \
|
||||||
execute(is::detail::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
execute(is::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
||||||
return clblasSuccess; \
|
return clblasSuccess; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -75,7 +75,7 @@ extern "C"
|
|||||||
{\
|
{\
|
||||||
is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
|
is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
|
||||||
clRetainMemObject(mx);\
|
clRetainMemObject(mx);\
|
||||||
execute(is::detail::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(x, alpha*x), x.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
return clblasSuccess;\
|
return clblasSuccess;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ extern "C"
|
|||||||
clRetainMemObject(mx);\
|
clRetainMemObject(mx);\
|
||||||
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
|
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
|
||||||
clRetainMemObject(my);\
|
clRetainMemObject(my);\
|
||||||
execute(is::detail::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(y, x), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
return clblasSuccess;\
|
return clblasSuccess;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -116,7 +116,7 @@ extern "C"
|
|||||||
clRetainMemObject(my); \
|
clRetainMemObject(my); \
|
||||||
is::scalar s(TYPE_ISAAC, cl::Buffer(dotProduct), offDP); \
|
is::scalar s(TYPE_ISAAC, cl::Buffer(dotProduct), offDP); \
|
||||||
clRetainMemObject(dotProduct); \
|
clRetainMemObject(dotProduct); \
|
||||||
execute(is::detail::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
execute(is::assign(s, dot(x,y)), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
||||||
return clblasSuccess; \
|
return clblasSuccess; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,7 +134,7 @@ extern "C"
|
|||||||
clRetainMemObject(mx);\
|
clRetainMemObject(mx);\
|
||||||
is::scalar s(TYPE_ISAAC, cl::Buffer(asum), offAsum);\
|
is::scalar s(TYPE_ISAAC, cl::Buffer(asum), offAsum);\
|
||||||
clRetainMemObject(asum);\
|
clRetainMemObject(asum);\
|
||||||
execute(is::detail::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(s, sum(abs(x))), s.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
return clblasSuccess;\
|
return clblasSuccess;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -170,9 +170,9 @@ extern "C"
|
|||||||
\
|
\
|
||||||
is::driver::Context const & context = A.context();\
|
is::driver::Context const & context = A.context();\
|
||||||
if(transA==clblasTrans)\
|
if(transA==clblasTrans)\
|
||||||
execute(is::detail::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(y, alpha*dot(A.T(), x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
else\
|
else\
|
||||||
execute(is::detail::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(y, alpha*dot(A, x) + beta*y), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
return clblasSuccess;\
|
return clblasSuccess;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -215,14 +215,14 @@ extern "C"
|
|||||||
is::driver::Context const & context = C.context();\
|
is::driver::Context const & context = C.context();\
|
||||||
/*Operation*/\
|
/*Operation*/\
|
||||||
if((transA==clblasTrans) && (transB==clblasTrans)){\
|
if((transA==clblasTrans) && (transB==clblasTrans)){\
|
||||||
execute(is::detail::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
}\
|
}\
|
||||||
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
|
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
|
||||||
execute(is::detail::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
|
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
|
||||||
execute(is::detail::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(C, alpha*dot(A, B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
else\
|
else\
|
||||||
execute(is::detail::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(C, alpha*dot(A, B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
return clblasSuccess;\
|
return clblasSuccess;\
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -49,13 +49,44 @@ def main():
|
|||||||
return optlist
|
return optlist
|
||||||
|
|
||||||
def find_library(name, cmake_glob_list):
|
def find_library(name, cmake_glob_list):
|
||||||
compiler=new_compiler()
|
cvars = sysconfig.get_config_vars()
|
||||||
|
compiler = new_compiler()
|
||||||
dirs = []
|
dirs = []
|
||||||
for gpath in cmake_glob_list.split(';'):
|
for gpath in cmake_glob_list.split(';'):
|
||||||
path = glob(gpath)
|
path = glob(gpath)
|
||||||
if path:
|
if path:
|
||||||
dirs += [path[0]]
|
dirs += [path[0]]
|
||||||
return compiler.find_library_file(dirs, name)
|
return compiler.find_library_file(cvars['LIBDIR'].split(';') + dirs, name)
|
||||||
|
|
||||||
|
def find_opencl():
|
||||||
|
cvars = sysconfig.get_config_vars()
|
||||||
|
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
|
||||||
|
lib = find_library('OpenCL', '/opt/adreno-driver*/lib' if is_on_android else '/opt/AMDAPPSDK*/lib/x86_64')
|
||||||
|
return {'include': '', 'lib': dirname(lib)} if lib else None
|
||||||
|
|
||||||
|
def find_in_path(name, path):
|
||||||
|
"Find a file in a search path"
|
||||||
|
#adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
|
||||||
|
for dir in path.split(os.pathsep):
|
||||||
|
binpath = os.path.join(dir, name)
|
||||||
|
if os.path.exists(binpath):
|
||||||
|
return os.path.abspath(binpath)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def find_cuda():
|
||||||
|
if 'CUDAHOME' in os.environ:
|
||||||
|
home = os.environ['CUDAHOME']
|
||||||
|
nvcc = os.path.join(home, 'bin', 'nvcc')
|
||||||
|
else:
|
||||||
|
nvcc = find_in_path('nvcc', os.environ['PATH'])
|
||||||
|
|
||||||
|
if nvcc:
|
||||||
|
home = dirname(os.path.dirname(nvcc))
|
||||||
|
return {'include': os.path.join(home, 'include'),
|
||||||
|
'lib': os.path.join(home, 'lib64')}
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
#Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
|
#Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
|
||||||
cvars = sysconfig.get_config_vars()
|
cvars = sysconfig.get_config_vars()
|
||||||
@@ -63,14 +94,27 @@ def main():
|
|||||||
cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
|
cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
|
||||||
cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
|
cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]
|
||||||
|
|
||||||
is_on_android = '-mandroid' in cvars['PY_CFLAGS']
|
#OpenCL
|
||||||
opencl = find_library('OpenCL', '/opt/adreno-driver*/lib' if is_on_android else '/opt/AMDAPPSDK*/lib/x86_64')
|
opencl_config = find_opencl()
|
||||||
|
|
||||||
library_dirs = [dirname(library) for library in [opencl] if library is not None]
|
#CUDA
|
||||||
|
cuda_config = find_cuda()
|
||||||
|
|
||||||
#Includes
|
#Libraries
|
||||||
|
libraries = ['OpenCL']
|
||||||
|
if cuda_config: libraries += ['cuda', 'nvrtc']
|
||||||
|
|
||||||
|
#Backends:
|
||||||
|
backend_defines = ['-DISAAC_WITH_OPENCL']
|
||||||
|
if cuda_config: backend_defines += ['-DISAAC_WITH_CUDA']
|
||||||
|
|
||||||
|
#Library directories
|
||||||
|
library_dirs = [config['lib'] for config in [opencl_config, cuda_config] if config is not None]
|
||||||
|
|
||||||
|
#Include directories
|
||||||
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
||||||
#Sources
|
|
||||||
|
#Source files
|
||||||
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
||||||
boostsrc = 'external/boost/libs/'
|
boostsrc = 'external/boost/libs/'
|
||||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||||
@@ -84,7 +128,7 @@ def main():
|
|||||||
src += glob(boostsrc + "/thread/src/pthread/*.cpp")
|
src += glob(boostsrc + "/thread/src/pthread/*.cpp")
|
||||||
src= [f for f in src if not f.endswith("once_atomic.cpp")]
|
src= [f for f in src if not f.endswith("once_atomic.cpp")]
|
||||||
|
|
||||||
|
#Setup
|
||||||
setup(
|
setup(
|
||||||
name='isaac',
|
name='isaac',
|
||||||
version='1.0',
|
version='1.0',
|
||||||
@@ -96,12 +140,12 @@ def main():
|
|||||||
ext_package="isaac",
|
ext_package="isaac",
|
||||||
ext_modules=[Extension(
|
ext_modules=[Extension(
|
||||||
'_isaac',src,
|
'_isaac',src,
|
||||||
extra_compile_args= ['-D__CL_ENABLE_EXCEPTIONS', '-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
|
extra_compile_args= backend_defines + ['-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare'],
|
||||||
extra_link_args=['-Wl,-soname=_isaac.so'],
|
extra_link_args=['-Wl,-soname=_isaac.so'],
|
||||||
undef_macros=[],
|
undef_macros=[],
|
||||||
include_dirs=include,
|
include_dirs=include,
|
||||||
library_dirs=library_dirs,
|
library_dirs=library_dirs,
|
||||||
libraries=['OpenCL']
|
libraries=libraries
|
||||||
)],
|
)],
|
||||||
cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
|
cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
@@ -106,6 +106,11 @@ namespace detail
|
|||||||
return ndarray_to_iscarray(np::from_object(obj, to_np_dtype(tools::extract_dtype(odtype))), context);
|
return ndarray_to_iscarray(np::from_object(obj, to_np_dtype(tools::extract_dtype(odtype))), context);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::shared_ptr<isc::array> create_zeros_array(isc::int_t M, isc::int_t N, bp::object odtype, isc::driver::Context context)
|
||||||
|
{
|
||||||
|
return std::shared_ptr<isc::array>(new isc::array(isc::zeros(M, N, tools::extract_dtype(odtype), context)));
|
||||||
|
}
|
||||||
|
|
||||||
std::shared_ptr<isc::array> create_empty_array(bp::object sizes, bp::object odtype, isc::driver::Context context)
|
std::shared_ptr<isc::array> create_empty_array(bp::object sizes, bp::object odtype, isc::driver::Context context)
|
||||||
{
|
{
|
||||||
typedef std::shared_ptr<isc::array> result_type;
|
typedef std::shared_ptr<isc::array> result_type;
|
||||||
@@ -281,9 +286,13 @@ void export_core()
|
|||||||
.def("__init__", bp::make_constructor(detail::construct_scalar, bp::default_call_policies(), (bp::arg(""), bp::arg("context")=isc::driver::queues.default_context())))
|
.def("__init__", bp::make_constructor(detail::construct_scalar, bp::default_call_policies(), (bp::arg(""), bp::arg("context")=isc::driver::queues.default_context())))
|
||||||
;
|
;
|
||||||
|
|
||||||
//Other numpy-like initializers
|
//Other numpy-like initializers
|
||||||
bp::def("empty", &detail::create_empty_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=isc::driver::queues.default_context()));
|
bp::def("empty", &detail::create_empty_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=isc::driver::queues.default_context()));
|
||||||
|
|
||||||
|
//Assign
|
||||||
|
bp::def("assign", static_cast<isc::array_expression (*)(isc::array const &, isc::array const &)>(&isc::assign));\
|
||||||
|
bp::def("assign", static_cast<isc::array_expression (*)(isc::array const &, isc::array_expression const &)>(&isc::assign));\
|
||||||
|
|
||||||
//Binary
|
//Binary
|
||||||
#define MAP_FUNCTION(name) \
|
#define MAP_FUNCTION(name) \
|
||||||
bp::def(#name, static_cast<isc::array_expression (*)(isc::array const &, isc::array const &)>(&isc::name));\
|
bp::def(#name, static_cast<isc::array_expression (*)(isc::array const &, isc::array const &)>(&isc::name));\
|
||||||
@@ -302,6 +311,8 @@ void export_core()
|
|||||||
bp::def(#name, static_cast<isc::array_expression (*)(isc::array const &)>(&isc::name));\
|
bp::def(#name, static_cast<isc::array_expression (*)(isc::array const &)>(&isc::name));\
|
||||||
bp::def(#name, static_cast<isc::array_expression (*)(isc::array_expression const &)>(&isc::name));
|
bp::def(#name, static_cast<isc::array_expression (*)(isc::array_expression const &)>(&isc::name));
|
||||||
|
|
||||||
|
bp::def("zeros", &detail::create_zeros_array, (bp::arg("shape"), bp::arg("dtype") = bp::scope().attr("float32"), bp::arg("context")=isc::driver::queues.default_context()));
|
||||||
|
|
||||||
MAP_FUNCTION(abs)
|
MAP_FUNCTION(abs)
|
||||||
MAP_FUNCTION(acos)
|
MAP_FUNCTION(acos)
|
||||||
MAP_FUNCTION(asin)
|
MAP_FUNCTION(asin)
|
||||||
|
@@ -2,7 +2,9 @@
|
|||||||
|
|
||||||
#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
|
#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
|
||||||
#include <boost/python/suite/indexing/map_indexing_suite.hpp>
|
#include <boost/python/suite/indexing/map_indexing_suite.hpp>
|
||||||
|
|
||||||
#include "isaac/model/model.h"
|
#include "isaac/model/model.h"
|
||||||
|
#include "isaac/symbolic/execute.h"
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "driver.h"
|
#include "driver.h"
|
||||||
@@ -65,16 +67,26 @@ namespace detail
|
|||||||
std::shared_ptr<isc::driver::Context> make_context(isc::driver::Device const & dev)
|
std::shared_ptr<isc::driver::Context> make_context(isc::driver::Device const & dev)
|
||||||
{ return std::shared_ptr<isc::driver::Context>(new isc::driver::Context(dev)); }
|
{ return std::shared_ptr<isc::driver::Context>(new isc::driver::Context(dev)); }
|
||||||
|
|
||||||
bp::tuple flush(isc::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
|
bp::object enqueue(isc::array_expression const & expression, unsigned int queue_id, bp::list dependencies, bool tune, int label, std::string const & program_name, bool force_recompile)
|
||||||
{
|
{
|
||||||
std::list<isc::driver::Event> events;
|
std::list<isc::driver::Event> events;
|
||||||
std::vector<isc::driver::Event> cdependencies = tools::to_vector<isc::driver::Event>(dependencies);
|
std::vector<isc::driver::Event> cdependencies = tools::to_vector<isc::driver::Event>(dependencies);
|
||||||
std::shared_ptr<isc::array> parray(new isc::array(isc::control(expression,
|
|
||||||
isc::execution_options_type(queue_id, &events, &cdependencies),
|
isc::execution_options_type execution_options(queue_id, &events, &cdependencies);
|
||||||
isc::dispatcher_options_type(tune, label),
|
isc::dispatcher_options_type dispatcher_options(tune, label);
|
||||||
isc::compilation_options_type(program_name, force_recompile))));
|
isc::compilation_options_type compilation_options(program_name, force_recompile);
|
||||||
|
isc::array_expression::container_type::value_type root = expression.tree()[expression.root()];
|
||||||
|
if(isc::detail::is_assignment(root.op))
|
||||||
|
{
|
||||||
|
isc::execute(isc::control(expression, execution_options, dispatcher_options, compilation_options), isaac::models(execution_options.queue(expression.context())));
|
||||||
|
return bp::make_tuple(bp::ptr(root.lhs.array), tools::to_list(events.begin(), events.end()));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::shared_ptr<isc::array> parray(new isc::array(isc::control(expression, execution_options, dispatcher_options, compilation_options)));
|
||||||
return bp::make_tuple(parray, tools::to_list(events.begin(), events.end()));
|
return bp::make_tuple(parray, tools::to_list(events.begin(), events.end()));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct state_type{ };
|
struct state_type{ };
|
||||||
@@ -152,7 +164,7 @@ void export_driver()
|
|||||||
|
|
||||||
bp::def("get_platforms", &detail::get_platforms);
|
bp::def("get_platforms", &detail::get_platforms);
|
||||||
|
|
||||||
bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("tune") = false, bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
|
bp::def("enqueue", &detail::enqueue, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("tune") = false, bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
|
||||||
|
|
||||||
bp::class_<state_type>("state_type")
|
bp::class_<state_type>("state_type")
|
||||||
.def_readwrite("queue_properties",&isc::driver::queues.queue_properties)
|
.def_readwrite("queue_properties",&isc::driver::queues.queue_properties)
|
||||||
|
@@ -2,6 +2,7 @@
|
|||||||
#include <boost/python.hpp>
|
#include <boost/python.hpp>
|
||||||
|
|
||||||
#include "isaac/exception/operation_not_supported.h"
|
#include "isaac/exception/operation_not_supported.h"
|
||||||
|
#include "isaac/driver/common.h"
|
||||||
|
|
||||||
#include "common.hpp"
|
#include "common.hpp"
|
||||||
#include "exceptions.h"
|
#include "exceptions.h"
|
||||||
@@ -83,4 +84,12 @@ void export_exceptions()
|
|||||||
wrap::exception<isaac::operation_not_supported_exception>("OperationNotSupported", bp::init<std::string>())
|
wrap::exception<isaac::operation_not_supported_exception>("OperationNotSupported", bp::init<std::string>())
|
||||||
.def("__str__", &isaac::operation_not_supported_exception::what)
|
.def("__str__", &isaac::operation_not_supported_exception::what)
|
||||||
;
|
;
|
||||||
|
|
||||||
|
wrap::exception<isaac::driver::ocl::exception::out_of_resources>("LaunchOutOfResources")
|
||||||
|
.def("__str__", &isaac::driver::ocl::exception::out_of_resources::what)
|
||||||
|
;
|
||||||
|
|
||||||
|
wrap::exception<isaac::driver::ocl::exception::mem_object_allocation_failure>("MemObjectAllocationFailure")
|
||||||
|
.def("__str__", &isaac::driver::ocl::exception::mem_object_allocation_failure::what)
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
@@ -47,7 +47,7 @@ void export_model()
|
|||||||
#undef __PROP
|
#undef __PROP
|
||||||
}
|
}
|
||||||
|
|
||||||
#define WRAP_BASE(name) bp::class_<isaac::base_impl<isaac::name, isaac::name::parameters_type>, bp::bases<isaac::base>, boost::noncopyable>(#name "_base_impl", bp::no_init);
|
#define WRAP_BASE(name) bp::class_<isaac::base_impl<isaac::name, isaac::name::parameters_type>, bp::bases<isaac::base>, boost::noncopyable>(#name, bp::no_init);
|
||||||
#define WRAP_TEMPLATE(name, basename, ...) bp::class_<isaac::name, bp::bases<isaac::base_impl<isaac::basename, isaac::basename::parameters_type> > >(#name, bp::init<__VA_ARGS__>())\
|
#define WRAP_TEMPLATE(name, basename, ...) bp::class_<isaac::name, bp::bases<isaac::base_impl<isaac::basename, isaac::basename::parameters_type> > >(#name, bp::init<__VA_ARGS__>())\
|
||||||
.add_property("local_size_0", &isc::name::local_size_0)\
|
.add_property("local_size_0", &isc::name::local_size_0)\
|
||||||
.add_property("local_size_1", &isc::name::local_size_1);
|
.add_property("local_size_1", &isc::name::local_size_1);
|
||||||
|
@@ -1,18 +0,0 @@
|
|||||||
find_program(PYINSTALLER pyinstaller)
|
|
||||||
|
|
||||||
if(PYINSTALLER)
|
|
||||||
|
|
||||||
set(SPEC_IN "${CMAKE_CURRENT_SOURCE_DIR}/pyinstaller_build.spec")
|
|
||||||
set(SPEC "${CMAKE_CURRENT_BINARY_DIR}/pyinstaller_build.spec")
|
|
||||||
set(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp")
|
|
||||||
file(GLOB DEPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/pysrc/*.py")
|
|
||||||
LIST(APPEND DEPS "${CMAKE_CURRENT_SOURCE_DIR}/pyinstaller_build.spec")
|
|
||||||
|
|
||||||
configure_file(${SPEC_IN} ${SPEC})
|
|
||||||
add_custom_command(OUTPUT ${OUTPUT}
|
|
||||||
COMMAND ${PYINSTALLER} ${SPEC_IN} ${CMAKE_CURRENT_SOURCE_DIR}
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT}
|
|
||||||
DEPENDS ${DEPS} python)
|
|
||||||
add_custom_target(autotune ALL DEPENDS ${OUTPUT})
|
|
||||||
|
|
||||||
endif()
|
|
@@ -1,74 +0,0 @@
|
|||||||
Metadata-Version: 1.1
|
|
||||||
Name: pyopencl
|
|
||||||
Version: 2014.1
|
|
||||||
Summary: Python wrapper for OpenCL
|
|
||||||
Home-page: http://mathema.tician.de/software/pyopencl
|
|
||||||
Author: Andreas Kloeckner
|
|
||||||
Author-email: inform@tiker.net
|
|
||||||
License: MIT
|
|
||||||
Description: PyOpenCL lets you access GPUs and other massively parallel compute
|
|
||||||
devices from Python. It tries to offer computing goodness in the
|
|
||||||
spirit of its sister project `PyCUDA <http://mathema.tician.de/software/pycuda>`_:
|
|
||||||
|
|
||||||
* Object cleanup tied to lifetime of objects. This idiom, often
|
|
||||||
called
|
|
||||||
`RAII <http://en.wikipedia.org/wiki/Resource_Acquisition_Is_Initialization>`_
|
|
||||||
in C++, makes it much easier to write correct, leak- and
|
|
||||||
crash-free code.
|
|
||||||
|
|
||||||
* Completeness. PyOpenCL puts the full power of OpenCL's API at
|
|
||||||
your disposal, if you wish. Every obscure `get_info()` query and
|
|
||||||
all CL calls are accessible.
|
|
||||||
|
|
||||||
* Automatic Error Checking. All CL errors are automatically
|
|
||||||
translated into Python exceptions.
|
|
||||||
|
|
||||||
* Speed. PyOpenCL's base layer is written in C++, so all the niceties
|
|
||||||
above are virtually free.
|
|
||||||
|
|
||||||
* Helpful and complete `Documentation <http://documen.tician.de/pyopencl>`_
|
|
||||||
as well as a `Wiki <http://wiki.tiker.net/PyOpenCL>`_.
|
|
||||||
|
|
||||||
* Liberal license. PyOpenCL is open-source under the
|
|
||||||
`MIT license <http://en.wikipedia.org/wiki/MIT_License>`_
|
|
||||||
and free for commercial, academic, and private use.
|
|
||||||
|
|
||||||
* Broad support. PyOpenCL was tested and works with Apple's, AMD's, and Nvidia's
|
|
||||||
CL implementations.
|
|
||||||
|
|
||||||
To use PyOpenCL, you just need `numpy <http://numpy.org>`_ and an OpenCL
|
|
||||||
implementation.
|
|
||||||
(See this `howto <http://wiki.tiker.net/OpenCLHowTo>`_ for how to get one.)
|
|
||||||
|
|
||||||
Places on the web related to PyOpenCL:
|
|
||||||
|
|
||||||
* `Python package index <http://pypi.python.org/pypi/pyopencl>`_ (download releases)
|
|
||||||
|
|
||||||
.. image:: https://badge.fury.io/py/pyopencl.png
|
|
||||||
:target: http://pypi.python.org/pypi/pyopencl
|
|
||||||
* `C. Gohlke's Windows binaries <http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl>`_ (download Windows binaries)
|
|
||||||
* `Github <http://github.com/pyopencl/pyopencl>`_ (get latest source code, file bugs)
|
|
||||||
* `Documentation <http://documen.tician.de/pyopencl>`_ (read how things work)
|
|
||||||
* `Wiki <http://wiki.tiker.net/PyOpenCL>`_ (read installation tips, get examples, read FAQ)
|
|
||||||
|
|
||||||
Platform: UNKNOWN
|
|
||||||
Classifier: Environment :: Console
|
|
||||||
Classifier: Development Status :: 5 - Production/Stable
|
|
||||||
Classifier: Intended Audience :: Developers
|
|
||||||
Classifier: Intended Audience :: Other Audience
|
|
||||||
Classifier: Intended Audience :: Science/Research
|
|
||||||
Classifier: License :: OSI Approved :: MIT License
|
|
||||||
Classifier: Natural Language :: English
|
|
||||||
Classifier: Programming Language :: C++
|
|
||||||
Classifier: Programming Language :: Python
|
|
||||||
Classifier: Programming Language :: Python :: 2
|
|
||||||
Classifier: Programming Language :: Python :: 2.4
|
|
||||||
Classifier: Programming Language :: Python :: 2.5
|
|
||||||
Classifier: Programming Language :: Python :: 2.6
|
|
||||||
Classifier: Programming Language :: Python :: 2.7
|
|
||||||
Classifier: Programming Language :: Python :: 3
|
|
||||||
Classifier: Programming Language :: Python :: 3.2
|
|
||||||
Classifier: Programming Language :: Python :: 3.3
|
|
||||||
Classifier: Topic :: Scientific/Engineering
|
|
||||||
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
||||||
Classifier: Topic :: Scientific/Engineering :: Physics
|
|
1971
tune/external/pyopencl-2014.1-py2.7.egg-info/SOURCES.txt
vendored
1971
tune/external/pyopencl-2014.1-py2.7.egg-info/SOURCES.txt
vendored
File diff suppressed because it is too large
Load Diff
@@ -1 +0,0 @@
|
|||||||
|
|
@@ -1,55 +0,0 @@
|
|||||||
../pyopencl/_mymako.py
|
|
||||||
../pyopencl/array.py
|
|
||||||
../pyopencl/algorithm.py
|
|
||||||
../pyopencl/version.py
|
|
||||||
../pyopencl/cache.py
|
|
||||||
../pyopencl/clrandom.py
|
|
||||||
../pyopencl/reduction.py
|
|
||||||
../pyopencl/ipython.py
|
|
||||||
../pyopencl/_cluda.py
|
|
||||||
../pyopencl/__init__.py
|
|
||||||
../pyopencl/scan.py
|
|
||||||
../pyopencl/capture_call.py
|
|
||||||
../pyopencl/tools.py
|
|
||||||
../pyopencl/clmath.py
|
|
||||||
../pyopencl/elementwise.py
|
|
||||||
../pyopencl/characterize/performance.py
|
|
||||||
../pyopencl/characterize/__init__.py
|
|
||||||
../pyopencl/compyte/dtypes.py
|
|
||||||
../pyopencl/compyte/array.py
|
|
||||||
../pyopencl/compyte/__init__.py
|
|
||||||
../pyopencl/cl/pyopencl-ranluxcl.cl
|
|
||||||
../pyopencl/cl/pyopencl-airy.cl
|
|
||||||
../pyopencl/cl/pyopencl-eval-tbl.cl
|
|
||||||
../pyopencl/cl/pyopencl-bessel-y.cl
|
|
||||||
../pyopencl/cl/pyopencl-bessel-j.cl
|
|
||||||
../pyopencl/cl/pyopencl-complex.h
|
|
||||||
../pyopencl/_mymako.pyc
|
|
||||||
../pyopencl/array.pyc
|
|
||||||
../pyopencl/algorithm.pyc
|
|
||||||
../pyopencl/version.pyc
|
|
||||||
../pyopencl/cache.pyc
|
|
||||||
../pyopencl/clrandom.pyc
|
|
||||||
../pyopencl/reduction.pyc
|
|
||||||
../pyopencl/ipython.pyc
|
|
||||||
../pyopencl/_cluda.pyc
|
|
||||||
../pyopencl/__init__.pyc
|
|
||||||
../pyopencl/scan.pyc
|
|
||||||
../pyopencl/capture_call.pyc
|
|
||||||
../pyopencl/tools.pyc
|
|
||||||
../pyopencl/clmath.pyc
|
|
||||||
../pyopencl/elementwise.pyc
|
|
||||||
../pyopencl/characterize/performance.pyc
|
|
||||||
../pyopencl/characterize/__init__.pyc
|
|
||||||
../pyopencl/compyte/dtypes.pyc
|
|
||||||
../pyopencl/compyte/array.pyc
|
|
||||||
../pyopencl/compyte/__init__.pyc
|
|
||||||
../pyopencl/_cl.so
|
|
||||||
../pyopencl/_pvt_struct.so
|
|
||||||
./
|
|
||||||
dependency_links.txt
|
|
||||||
SOURCES.txt
|
|
||||||
top_level.txt
|
|
||||||
requires.txt
|
|
||||||
not-zip-safe
|
|
||||||
PKG-INFO
|
|
@@ -1 +0,0 @@
|
|||||||
|
|
@@ -1,3 +0,0 @@
|
|||||||
pytools>=2014.2
|
|
||||||
pytest>=2
|
|
||||||
decorator>=3.2.0
|
|
@@ -1,3 +0,0 @@
|
|||||||
_cl
|
|
||||||
_pvt_struct
|
|
||||||
pyopencl
|
|
@@ -1,6 +1,4 @@
|
|||||||
from sklearn import tree
|
|
||||||
from sklearn import ensemble
|
from sklearn import ensemble
|
||||||
from sklearn.grid_search import GridSearchCV
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
def gmean(a, axis=0, dtype=None):
|
def gmean(a, axis=0, dtype=None):
|
||||||
@@ -18,29 +16,33 @@ def gmean(a, axis=0, dtype=None):
|
|||||||
def nrmse(y_ground, y):
|
def nrmse(y_ground, y):
|
||||||
N = y.size
|
N = y.size
|
||||||
rmsd = np.sqrt(np.sum((y_ground - y)**2)/N)
|
rmsd = np.sqrt(np.sum((y_ground - y)**2)/N)
|
||||||
|
if len(y_ground) > 1:
|
||||||
return rmsd/(np.max(y_ground) - np.min(y_ground))
|
return rmsd/(np.max(y_ground) - np.min(y_ground))
|
||||||
|
else:
|
||||||
|
return rmsd
|
||||||
|
|
||||||
|
def train(X, Y, profiles):
|
||||||
|
X = np.array(X)
|
||||||
|
Y = np.array(Y)
|
||||||
|
M = X.shape[0]
|
||||||
|
|
||||||
def train_model(X, Y, profiles, perf, metric):
|
|
||||||
p = np.random.permutation(X.shape[0])
|
p = np.random.permutation(X.shape[0])
|
||||||
X = X[p,:]
|
X = X[p,:]
|
||||||
Y = Y[p,:]
|
Y = Y[p,:]
|
||||||
Y = np.array([perf(xx, yy) for xx, yy in zip(X, Y)])
|
|
||||||
Y[np.isinf(Y)] = 0
|
|
||||||
#Train the model
|
|
||||||
cut = int(0.9*X.shape[0])
|
|
||||||
|
|
||||||
|
#Train the model
|
||||||
|
cut = int(0.9*M)
|
||||||
XTr, YTr = X[:cut,:], Y[:cut,:]
|
XTr, YTr = X[:cut,:], Y[:cut,:]
|
||||||
XCv, YCv = X[cut:,:], Y[cut:,:]
|
XCv, YCv = X[cut:,:], Y[cut:,:]
|
||||||
|
|
||||||
|
|
||||||
nrmses = {}
|
nrmses = {}
|
||||||
for N in range(1,20):
|
for N in range(1,min(M+1,20)):
|
||||||
for depth in range(1,20):
|
for depth in range(1,min(M+1,20)):
|
||||||
clf = ensemble.RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
|
clf = ensemble.RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
|
||||||
t = np.argmax(clf.predict(XCv), axis = 1)
|
t = np.argmax(clf.predict(XCv), axis = 1)
|
||||||
y = np.array([YCv[i,t[i]] for i in range(t.size)])
|
y = np.array([YCv[i,t[i]] for i in range(t.size)])
|
||||||
ground = np.max(YCv[:,:], axis=1)
|
ground = np.max(YCv[:,:], axis=1)
|
||||||
nrmses[clf] = nrmse(ground, y)
|
nrmses[clf] = nrmse(ground, y)
|
||||||
|
|
||||||
clf = min(nrmses, key=nrmses.get)
|
clf = min(nrmses, key=nrmses.get)
|
||||||
print 'The optimal classifer has NRMSE = %.2g (%d estimators and the max depth is %d'%(nrmses[clf], clf.n_estimators, clf.max_depth)
|
return clf, nrmses[clf]
|
||||||
return clf
|
|
197
tune/optimize.py
Normal file
197
tune/optimize.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
import isaac as isc
|
||||||
|
import random
|
||||||
|
|
||||||
|
from copy import deepcopy
|
||||||
|
from sys import stdout
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
|
from deap import algorithms
|
||||||
|
from deap import base
|
||||||
|
from deap import creator
|
||||||
|
from deap import tools as deap_tools
|
||||||
|
|
||||||
|
from numpy import cumsum
|
||||||
|
|
||||||
|
import tools
|
||||||
|
|
||||||
|
fetch_types = [isc.fetching_policy_type.FETCH_FROM_LOCAL,
|
||||||
|
isc.fetching_policy_type.FETCH_FROM_LOCAL,
|
||||||
|
isc.fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS,
|
||||||
|
isc.fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED]
|
||||||
|
|
||||||
|
def exhaustive(template, sizes, context):
|
||||||
|
tree, _ = tools.tree_of(template, sizes, context)
|
||||||
|
metric = tools.metric_of(template)
|
||||||
|
nbits = tools.genetic_infos_of(template)['nbits']
|
||||||
|
categorical = tools.genetic_infos_of(template)['categorical']
|
||||||
|
ranges = [range(2**x) for x in nbits]
|
||||||
|
ranges = list(product(*ranges))
|
||||||
|
timings = {}
|
||||||
|
best = None
|
||||||
|
for idx, r in enumerate(ranges):
|
||||||
|
parameters = tuple([fetch_types[x] if i in categorical else 2**x for i,x in enumerate(r)])
|
||||||
|
try:
|
||||||
|
time = tools.benchmark(template, parameters, tree)
|
||||||
|
if not best or time < best[1]:
|
||||||
|
best = parameters, time
|
||||||
|
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
|
||||||
|
pass
|
||||||
|
if best:
|
||||||
|
stdout.write('%.2f %% | Best %.2f [ for %s ]\r'%(float(idx*100)/len(ranges),metric(sizes, best[1]), best[0]))
|
||||||
|
return best[0]
|
||||||
|
|
||||||
|
|
||||||
|
def genetic(template, sizes, context, naccept=200, niter = 1000, cxpb=0.4, mutpb=0.4, popsize = 10, initializer = None, prior = None):
|
||||||
|
tree, _ = tools.tree_of(template, sizes, context)
|
||||||
|
metric = tools.metric_of(template)
|
||||||
|
genetic_infos = tools.genetic_infos_of(template)
|
||||||
|
nbits = genetic_infos['nbits']
|
||||||
|
offsets = cumsum([0] + nbits)
|
||||||
|
|
||||||
|
def bin2gray(A):
|
||||||
|
g = [int(A[0])]
|
||||||
|
for i in range(1, len(A)):
|
||||||
|
g += [int(A[i-1] != A[i])]
|
||||||
|
return g
|
||||||
|
|
||||||
|
def gray2int(A):
|
||||||
|
b = [A[0]]
|
||||||
|
for i in range(1, len(A)):
|
||||||
|
b += [int(b[i-1] != A[i])]
|
||||||
|
return int(''.join(map(str,b)), 2)
|
||||||
|
|
||||||
|
def encode(genome):
|
||||||
|
encoded = [bin2gray(bin(x)[2:].zfill(nb)) for x, nb in zip(genome, nbits)]
|
||||||
|
return sum(encoded, [])
|
||||||
|
|
||||||
|
def decode(genome):
|
||||||
|
result = []
|
||||||
|
for off1,off2 in zip(offsets[:-1],offsets[1:]):
|
||||||
|
result += [gray2int(genome[off1:off2])]
|
||||||
|
result = [fetch_types[x] if i in genetic_infos['categorical'] else 2**x for i,x in enumerate(result)]
|
||||||
|
return result
|
||||||
|
|
||||||
|
def evaluate(genome):
|
||||||
|
idx = tuple(genome)
|
||||||
|
if idx not in cache:
|
||||||
|
cache[idx] = tools.benchmark(template, decode(genome), tree)
|
||||||
|
return cache[idx],
|
||||||
|
|
||||||
|
cache = {}
|
||||||
|
hof = deap_tools.HallOfFame(1)
|
||||||
|
|
||||||
|
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
|
||||||
|
creator.create("Individual", list, fitness=creator.FitnessMin)
|
||||||
|
|
||||||
|
toolbox = base.Toolbox()
|
||||||
|
toolbox.register("evaluate", evaluate)
|
||||||
|
toolbox.register("mate", deap_tools.cxTwoPoint)
|
||||||
|
toolbox.register("mutate", deap_tools.mutFlipBit)
|
||||||
|
toolbox.register("select", deap_tools.selNSGA2)
|
||||||
|
|
||||||
|
#Initialization
|
||||||
|
if initializer is None:
|
||||||
|
initializer = ([random.randint(0, 2**x) for x in nbits] for i in iter(int,1))
|
||||||
|
population = []
|
||||||
|
|
||||||
|
genome = encode(prior if prior else list(initializer.next()))
|
||||||
|
while len(population) < popsize:
|
||||||
|
individual = creator.Individual(genome)
|
||||||
|
try:
|
||||||
|
individual.fitness.values = toolbox.evaluate(genome)
|
||||||
|
population += [individual]
|
||||||
|
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure ):
|
||||||
|
pass
|
||||||
|
genome = encode(list(initializer.next()))
|
||||||
|
hof.update(population)
|
||||||
|
|
||||||
|
x = []
|
||||||
|
y = []
|
||||||
|
it = 0
|
||||||
|
|
||||||
|
while len(cache) < naccept and it<niter:
|
||||||
|
pad = len(cache) - len(x)
|
||||||
|
x += [len(cache)]*pad
|
||||||
|
y += [metric(sizes, hof[0].fitness.values[0])]*pad
|
||||||
|
|
||||||
|
offspring = []
|
||||||
|
while len(offspring) < popsize:
|
||||||
|
try:
|
||||||
|
op_choice = random.random()
|
||||||
|
#Cross-over
|
||||||
|
if op_choice < cxpb:
|
||||||
|
ind1, ind2 = map(toolbox.clone, random.sample(population, 2))
|
||||||
|
ind1, ind2 = toolbox.mate(ind1, ind2)
|
||||||
|
ind = ind1
|
||||||
|
toolbox.evaluate(ind)
|
||||||
|
offspring += [ind]
|
||||||
|
#Mutation
|
||||||
|
elif op_choice < cxpb + mutpb:
|
||||||
|
ind = toolbox.clone(random.choice(population))
|
||||||
|
ind, = toolbox.mutate(ind, 1.0/offsets[-1])
|
||||||
|
toolbox.evaluate(ind)
|
||||||
|
offspring += [ind]
|
||||||
|
#Reproduction
|
||||||
|
else:
|
||||||
|
offspring += [random.choice(population)]
|
||||||
|
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
#Update fitnesses
|
||||||
|
fitnesses = toolbox.map(toolbox.evaluate, offspring)
|
||||||
|
for ind, fit in zip(offspring, fitnesses):
|
||||||
|
ind.fitness.values = fit
|
||||||
|
|
||||||
|
#Update population
|
||||||
|
population[:] = toolbox.select(population + offspring, popsize)
|
||||||
|
hof.update(population)
|
||||||
|
|
||||||
|
optimal = '(%s)'%','.join(map(str,decode(hof[0])))
|
||||||
|
stdout.write('Iter %d | %d evaluated | Best %.2f [ for %s ]\r'%(it, x[-1], y[-1], optimal))
|
||||||
|
stdout.flush()
|
||||||
|
it += 1
|
||||||
|
stdout.write('\n')
|
||||||
|
|
||||||
|
return tuple(decode(hof[0])), x, y
|
||||||
|
|
||||||
|
def is_local_optimum(parameters, template, sizes, context):
|
||||||
|
tree, _ = tools.tree_of(template, sizes, context)
|
||||||
|
genetic_infos = tools.genetic_infos_of(template)
|
||||||
|
|
||||||
|
if issubclass(template, isc.vaxpy):
|
||||||
|
sweep_over = [0,1,2]
|
||||||
|
elif issubclass(template, isc.reduction):
|
||||||
|
sweep_over = [0,1,2]
|
||||||
|
elif issubclass(template, isc.maxpy):
|
||||||
|
sweep_over = [0,1,2,3,4]
|
||||||
|
elif issubclass(template, isc.mreduction):
|
||||||
|
sweep_over = [0,1,2,3,4]
|
||||||
|
elif issubclass(template, isc.mproduct):
|
||||||
|
sweep_over = [1,2,3,4,5,7,10,11]
|
||||||
|
|
||||||
|
#Evaluate the provided parameters guess
|
||||||
|
try:
|
||||||
|
reference = tools.benchmark(template, parameters, tree)
|
||||||
|
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
|
||||||
|
return False
|
||||||
|
|
||||||
|
#Latency bound -- ignore
|
||||||
|
if reference < 2e-5:
|
||||||
|
return True
|
||||||
|
|
||||||
|
timings = {}
|
||||||
|
domain = [[v for v in [x/2, x, x*2] if 1 <= v <= 2**2**genetic_infos['nbits'][i]] \
|
||||||
|
if i in sweep_over else [x] for i, x in enumerate(parameters)]
|
||||||
|
for x in product(*domain):
|
||||||
|
if x==parameters:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
time = tools.benchmark(template, x, tree)
|
||||||
|
if time/reference < .97:
|
||||||
|
return False
|
||||||
|
except (isc.OperationNotSupported, isc.LaunchOutOfResources, isc.MemObjectAllocationFailure):
|
||||||
|
pass
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
@@ -1,32 +0,0 @@
|
|||||||
#!/usr/bin/env
|
|
||||||
import os, sys
|
|
||||||
|
|
||||||
prefix = sys.argv[2]
|
|
||||||
|
|
||||||
sys.path.append('/home/philippe/Development/ATIDLAS/build/python/pyatidlas/build/lib.linux-x86_64-2.7/')
|
|
||||||
sys.path.append(os.path.join(prefix, 'pysrc'))
|
|
||||||
|
|
||||||
a = Analysis([os.path.join(prefix, 'pysrc','autotune.py')],
|
|
||||||
hiddenimports=['scipy.sparse.csgraph._validation',
|
|
||||||
'scipy.special._ufuncs_cxx',
|
|
||||||
'scipy.sparse.linalg.dsolve.umfpack',
|
|
||||||
'scipy.integrate.vode',
|
|
||||||
'scipy.integrate.lsoda',
|
|
||||||
'sklearn.utils.sparsetools._graph_validation',
|
|
||||||
'sklearn.utils.sparsetools._graph_tools',
|
|
||||||
'sklearn.utils.lgamma',
|
|
||||||
'sklearn.tree._utils'],
|
|
||||||
hookspath=None,
|
|
||||||
excludes=['scipy.io.matlab','matplotlib','PyQt4'],
|
|
||||||
runtime_hooks=None)
|
|
||||||
pyz = PYZ(a.pure)
|
|
||||||
exe = EXE(pyz,
|
|
||||||
a.scripts,
|
|
||||||
a.binaries,
|
|
||||||
a.zipfiles,
|
|
||||||
a.datas,
|
|
||||||
name='autotune',
|
|
||||||
debug=False,
|
|
||||||
strip=None,
|
|
||||||
upx=True,
|
|
||||||
console=True )
|
|
@@ -1,235 +0,0 @@
|
|||||||
from __future__ import division
|
|
||||||
|
|
||||||
import argparse, itertools, os, sys, json
|
|
||||||
import misc_tools, optimize, dataset
|
|
||||||
import isaac as isc
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from numpy import random
|
|
||||||
from model import train_model
|
|
||||||
|
|
||||||
|
|
||||||
TYPES = { 'vaxpy': {'template':isc.vaxpy,
|
|
||||||
'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
|
|
||||||
'perf-measure':'GB/s'},
|
|
||||||
|
|
||||||
'maxpy': {'template':isc.maxpy,
|
|
||||||
'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
|
|
||||||
'perf-measure':'GB/s'},
|
|
||||||
|
|
||||||
'dot': {'template':isc.reduction,
|
|
||||||
'perf-index':lambda x: 2*x[0]*x[1][0]/x[2]*1e-9,
|
|
||||||
'perf-measure':'GB/s'},
|
|
||||||
|
|
||||||
'gemv': {'template': {'N': isc.mreduction_rows, 'T': isc.mreduction_cols},
|
|
||||||
'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
|
|
||||||
'perf-measure':'GB/s'},
|
|
||||||
|
|
||||||
'gemm': {'template': {('N','N'): isc.mproduct_nn, ('T','N'): isc.mproduct_tn,
|
|
||||||
('N','T'): isc.mproduct_nt, ('T','T'): isc.mproduct_tt},
|
|
||||||
'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
|
|
||||||
'perf-measure': 'GFLOP/s'} }
|
|
||||||
|
|
||||||
|
|
||||||
def do_tuning(args):
|
|
||||||
device = args.device
|
|
||||||
context = isc.context(device)
|
|
||||||
context.queues.append(isc.command_queue(context, device))
|
|
||||||
if os.path.isfile(args.out):
|
|
||||||
json_out = json.load(open(args.out, 'r'))
|
|
||||||
else:
|
|
||||||
json_out = {}
|
|
||||||
json_out["version"] = "1.0"
|
|
||||||
|
|
||||||
def map_to_list(T, x):
|
|
||||||
return list(map(T, x if isinstance(x, list) else [x]))
|
|
||||||
|
|
||||||
if(args.method=='simple'):
|
|
||||||
default_tuning_sizes = {'vaxpy': args.blas1_size, 'dot': args.blas1_size,
|
|
||||||
'maxpy' : args.blas2_size, 'gemv' : args.blas2_size,
|
|
||||||
'gemm': args.blas3_size}
|
|
||||||
|
|
||||||
for operation in ['vaxpy', 'dot', 'maxpy', 'gemv', 'gemm']:
|
|
||||||
|
|
||||||
for datatype in [isc.float32, isc.float64]:
|
|
||||||
|
|
||||||
dtypestr = datatype.__name__
|
|
||||||
|
|
||||||
if operation not in args.operations and operation + '-' + dtypestr not in args.operations:
|
|
||||||
continue
|
|
||||||
|
|
||||||
#Check data-type
|
|
||||||
if datatype is isc.float64 and not device.double_fp_config:
|
|
||||||
sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...')
|
|
||||||
continue
|
|
||||||
|
|
||||||
#~ #Helper for execution
|
|
||||||
def execute(symbolic, sizes, Template, parameters = None, fname = os.devnull):
|
|
||||||
if parameters is not None:
|
|
||||||
return misc_tools.benchmark(Template(*parameters), symbolic)
|
|
||||||
with open(fname, "w+") as archive:
|
|
||||||
return optimize.genetic(symbolic, Template, lambda t: TYPES[operation]['perf-index']([datatype(0).size, sizes, t]),
|
|
||||||
TYPES[operation]['perf-measure'], archive)
|
|
||||||
|
|
||||||
def log_spaced_points(a,b,N,r=128):
|
|
||||||
t = np.ceil(np.exp(np.linspace(np.log(a), np.log(b), N))/r)*r
|
|
||||||
return t.reshape(t.size,1).astype(int)
|
|
||||||
|
|
||||||
|
|
||||||
#Helper for tuning
|
|
||||||
def tune(execution_handler, layouts, tuning_sizes, training_sizes):
|
|
||||||
print('-----')
|
|
||||||
print(' '.join(map(str, ("Now tuning:", dtypestr, '-', operation, '-'.join(layouts), '[' + device.name, '(' + device.platform.name + ')]'))))
|
|
||||||
#Update JSON
|
|
||||||
full_operation = operation + ''.join(layouts)
|
|
||||||
prefix = os.path.join('data',os.path.join(full_operation,dtypestr))
|
|
||||||
if not os.path.exists(prefix):
|
|
||||||
os.makedirs(prefix)
|
|
||||||
if full_operation not in json_out:
|
|
||||||
json_out[full_operation] = {}
|
|
||||||
json_out[full_operation][dtypestr] = {}
|
|
||||||
D = json_out[full_operation][dtypestr]
|
|
||||||
|
|
||||||
if args.method == 'simple':
|
|
||||||
print 'Size : ', ','.join(map(str, default_tuning_sizes[operation]))
|
|
||||||
profiles = [execution_handler(map(int,default_tuning_sizes[operation]))]
|
|
||||||
else:
|
|
||||||
def compute_perf(x, t):
|
|
||||||
return TYPES[operation]['perf-index']([datatype(0).size, x, t])
|
|
||||||
profiles = dataset.sample_profiles(execution_handler, tuning_sizes)
|
|
||||||
if args.build_model:
|
|
||||||
X, Y, profiles = dataset.sample_dataset(prefix, profiles, execution_handler, training_sizes)
|
|
||||||
#profiles = np.loadtxt(prefix+'/profiles.csv')
|
|
||||||
#X = np.loadtxt(prefix+'/X.csv',ndmin=2)
|
|
||||||
#Y = np.loadtxt(prefix+'/Y.csv',ndmin=2)
|
|
||||||
clf = train_model(X, Y, profiles, compute_perf, TYPES[operation]['perf-measure'])
|
|
||||||
D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
|
|
||||||
'children_right': e.tree_.children_right.tolist(),
|
|
||||||
'threshold': e.tree_.threshold.astype('float64').tolist(),
|
|
||||||
'feature': e.tree_.feature.astype('float64').tolist(),
|
|
||||||
'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
|
|
||||||
D['profiles'] = [map(int, x) for x in profiles]
|
|
||||||
|
|
||||||
|
|
||||||
Template = TYPES[operation]['template']
|
|
||||||
|
|
||||||
#Vector AXPY
|
|
||||||
if operation=='vaxpy':
|
|
||||||
def execution_handler(sizes, fname=os.devnull, parameters=None):
|
|
||||||
x = isc.empty(sizes[0], datatype, context=context)
|
|
||||||
y = isc.empty(sizes[0], datatype, context=context)
|
|
||||||
return execute(x + y, sizes, Template, parameters, fname)
|
|
||||||
tune(execution_handler, (), log_spaced_points(1e4, 1e7, 20), log_spaced_points(1e4, 1e7, 1000))
|
|
||||||
#Dot
|
|
||||||
if operation=='dot':
|
|
||||||
def execution_handler(sizes, fname=os.devnull, parameters=None):
|
|
||||||
x = isc.empty(sizes[0], datatype, context=context)
|
|
||||||
y = isc.empty(sizes[0], datatype, context=context)
|
|
||||||
s = isc.scalar(datatype)
|
|
||||||
return execute(isc.dot(x, y), sizes, Template, parameters, fname)
|
|
||||||
tune(execution_handler, (), log_spaced_points(1e4, 1e7, 50), log_spaced_points(1e4, 1e7, 1000))
|
|
||||||
#Matrix AXPY
|
|
||||||
if operation=='maxpy':
|
|
||||||
def execution_handler(sizes, fname=os.devnull, parameters=None):
|
|
||||||
A = isc.empty(sizes, datatype, context=context)
|
|
||||||
C = isc.empty(sizes, datatype, context=context)
|
|
||||||
return execute(A + C, sizes, Template, parameters, fname)
|
|
||||||
tune(execution_handler, 64, 5000, 2, (),'log', 'log')
|
|
||||||
#Row-wise dot
|
|
||||||
if operation=='gemv':
|
|
||||||
for A_trans in args.gemv_layouts:
|
|
||||||
def execution_handler(sizes, fname=os.devnull, parameters=None):
|
|
||||||
A = isc.empty(sizes if A_trans=='N' else sizes[::-1], datatype, context=context)
|
|
||||||
x = isc.empty(sizes[1], datatype, context=context)
|
|
||||||
LHS = A if A_trans=='N' else A.T
|
|
||||||
return execute(isc.dot(LHS, x), sizes, Template[A_trans], parameters, fname)
|
|
||||||
tuning_sizes = itertools.chain( itertools.product([128, 512, 2048, 8192], [128, 512, 2048, 8192]),
|
|
||||||
itertools.product([128, 512, 2048, 8192], [16384, 32768, 65536]),
|
|
||||||
itertools.product([16384, 32768, 65536], [128, 512, 2048, 8192]))
|
|
||||||
|
|
||||||
training_sizes = itertools.chain( itertools.product([2**k for k in range(4, 13)], [2**k for k in range(4, 13)]),
|
|
||||||
itertools.product([2**k for k in range(4, 13)], [2**k for k in range(13, 17)]),
|
|
||||||
itertools.product([2**k for k in range(13, 17)], [2**k for k in range(4, 13)]))
|
|
||||||
tune(execution_handler, (A_trans,), tuning_sizes, training_sizes)
|
|
||||||
#Matrix Product
|
|
||||||
if operation=='gemm':
|
|
||||||
for L in args.gemm_layouts:
|
|
||||||
A_trans = L[0]
|
|
||||||
B_trans = L[1]
|
|
||||||
def execution_handler(sizes, fname=os.devnull, parameters=None):
|
|
||||||
A = isc.empty((sizes[0], sizes[2]) if A_trans=='N' else (sizes[2], sizes[0]), datatype, context=context)
|
|
||||||
B = isc.empty((sizes[2], sizes[1]) if B_trans=='N' else (sizes[1], sizes[2]), datatype, context=context)
|
|
||||||
LHS = A if A_trans=='N' else A.T
|
|
||||||
RHS = B if B_trans=='N' else B.T
|
|
||||||
return execute(isc.dot(LHS, RHS), sizes, Template[(A_trans, B_trans)], parameters, fname)
|
|
||||||
|
|
||||||
tuning_sizes = itertools.product([64, 256, 1024, 2560], [64, 256, 1024, 2560], [256, 2560, 32768, 65536])
|
|
||||||
training_sizes = itertools.product([2**k for k in range(6, 13)], [2**k for k in range(6, 13)], [2**k for k in range(6, 17)])
|
|
||||||
tune(execution_handler,(A_trans,B_trans), tuning_sizes, training_sizes)
|
|
||||||
|
|
||||||
json.dump(json_out, open(args.out,'w'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ArgumentsHandler:
|
|
||||||
def __init__(self, devices):
|
|
||||||
#Command line arguments
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
subparsers = parser.add_subparsers(dest='action')
|
|
||||||
print_devices_parser = subparsers.add_parser('list-devices', help='List the devices available')
|
|
||||||
tune_parser = subparsers.add_parser('tune', help='Auto-tuning')
|
|
||||||
tune_parser.add_argument("--device", default=0, type=int)
|
|
||||||
tune_parser.add_argument("--operations", default = 'vaxpy,maxpy,dot,gemv,gemm-float32', type=str)
|
|
||||||
tune_parser.add_argument("--gemm-layouts", default='NN,NT,TN,TT', type=str)
|
|
||||||
tune_parser.add_argument("--gemv-layouts", default='N,T', type=str)
|
|
||||||
tune_parser.add_argument("--out", default='', type=str)
|
|
||||||
tune_parser.add_argument("--viennacl-src-path", default='', type=str)
|
|
||||||
|
|
||||||
tune_subparsers = tune_parser.add_subparsers(dest='method')
|
|
||||||
simple_parser = tune_subparsers.add_parser('simple', help = 'Tune each operation for unique sizes')
|
|
||||||
|
|
||||||
simple_parser.add_argument("--blas1-size", default = 10e6, type=int)
|
|
||||||
simple_parser.add_argument("--blas2-size", nargs=2, default=[2560,2560], type=int)
|
|
||||||
simple_parser.add_argument("--blas3-size", nargs=3, default=[1536,1536,1536],type=int)
|
|
||||||
|
|
||||||
full_parser = tune_subparsers.add_parser('full', help = 'Tune each operation for randomly chosen sizes')
|
|
||||||
full_parser.add_argument("--build-model", default=True, type=bool)
|
|
||||||
full_parser.add_argument("--sample-size", default=64, type=int)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
self.__dict__ = args.__dict__.copy()
|
|
||||||
|
|
||||||
if self.action == 'tune':
|
|
||||||
#Retypes
|
|
||||||
self.device = devices[int(self.device)]
|
|
||||||
if not self.out:
|
|
||||||
self.out = misc_tools.sanitize_string(self.device.name) + '.json'
|
|
||||||
self.operations = self.operations.split(',')
|
|
||||||
self.gemm_layouts = self.gemm_layouts.split(',')
|
|
||||||
self.gemv_layouts = self.gemv_layouts.split(',')
|
|
||||||
if self.method == 'simple':
|
|
||||||
self.blas1_size = [int(float(self.blas1_size))]
|
|
||||||
self.blas2_size = map(int, self.blas2_size)
|
|
||||||
self.blas3_size = map(int, self.blas3_size)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
isc.state.queue_properties = isc.CL_QUEUE_PROFILING_ENABLE
|
|
||||||
|
|
||||||
platforms = isc.get_platforms()
|
|
||||||
devices = [d for platform in platforms for d in platform.get_devices()]
|
|
||||||
|
|
||||||
args = ArgumentsHandler(devices)
|
|
||||||
|
|
||||||
print("----------------")
|
|
||||||
print("Devices available:")
|
|
||||||
print("----------------")
|
|
||||||
for (i, d) in enumerate(devices):
|
|
||||||
print 'Device', i, '|', isc.device_type_to_string(d.type), '|', d.name, 'on', d.platform.name
|
|
||||||
print("----------------")
|
|
||||||
|
|
||||||
if args.action=='tune':
|
|
||||||
print("------")
|
|
||||||
print("Auto-tuning")
|
|
||||||
print("------")
|
|
||||||
do_tuning(args)
|
|
@@ -1,54 +0,0 @@
|
|||||||
import os
|
|
||||||
import sys
|
|
||||||
import re
|
|
||||||
import random
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
def sample_profiles(execution_handler, generator):
|
|
||||||
print "Sampling profiles..."
|
|
||||||
t = np.empty(0)
|
|
||||||
profiles = []
|
|
||||||
for i, x in enumerate(generator):
|
|
||||||
print x
|
|
||||||
if i==0:
|
|
||||||
X = np.empty((0,len(x)))
|
|
||||||
y = execution_handler(x)
|
|
||||||
if y not in profiles:
|
|
||||||
profiles.append(y)
|
|
||||||
idx = profiles.index(y)
|
|
||||||
X = np.vstack((X, x))
|
|
||||||
t = np.append(t, idx)
|
|
||||||
idx = int(t[np.argmax(np.linalg.norm(X, axis=1))])
|
|
||||||
profiles = [profiles[idx]] + [x for i,x in enumerate(profiles) if i!=idx]
|
|
||||||
return profiles
|
|
||||||
|
|
||||||
def sample_dataset(prefix_name, profiles, execution_handler, generator):
|
|
||||||
P = len(profiles)
|
|
||||||
print "Generating the dataset..."
|
|
||||||
Y = np.empty((0, P))
|
|
||||||
for i,x in enumerate(generator):
|
|
||||||
if i==0:
|
|
||||||
X = np.empty((0,len(x)))
|
|
||||||
new_y = np.zeros(P)
|
|
||||||
for j,y in enumerate(profiles):
|
|
||||||
try:
|
|
||||||
new_y[j] = execution_handler(x, os.devnull, y)
|
|
||||||
except:
|
|
||||||
new_y[j] = float('inf')
|
|
||||||
X = np.vstack((X, x))
|
|
||||||
Y = np.vstack((Y, new_y))
|
|
||||||
if i%10==0:
|
|
||||||
sys.stdout.write('%d data points generated\r'%i)
|
|
||||||
sys.stdout.flush()
|
|
||||||
|
|
||||||
idx = np.argsort(Y[np.argmax(np.linalg.norm(X, axis=1)),:])
|
|
||||||
Y = Y[:, idx]
|
|
||||||
profiles = [profiles[i] for i in idx]
|
|
||||||
|
|
||||||
if not os.path.exists(prefix_name):
|
|
||||||
os.makedirs(prefix_name)
|
|
||||||
np.savetxt(os.path.join(prefix_name,"X.csv"), X)
|
|
||||||
np.savetxt(os.path.join(prefix_name,"Y.csv"), Y)
|
|
||||||
np.savetxt(os.path.join(prefix_name,"profiles.csv"), profiles)
|
|
||||||
|
|
||||||
return X, Y, profiles
|
|
@@ -1,205 +0,0 @@
|
|||||||
import random, time, sys, copy
|
|
||||||
import misc_tools
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import isaac as isc
|
|
||||||
from deap import algorithms
|
|
||||||
from deap import base
|
|
||||||
from deap import creator
|
|
||||||
from deap import tools as deap_tools
|
|
||||||
|
|
||||||
from collections import OrderedDict as odict
|
|
||||||
|
|
||||||
|
|
||||||
def closest_divisor(N, x):
|
|
||||||
x_low=x_high=max(1,min(round(x),N))
|
|
||||||
while N % x_low > 0 and x_low>0:
|
|
||||||
x_low = x_low - 1
|
|
||||||
while N % x_high > 0 and x_high < N:
|
|
||||||
x_high = x_high + 1
|
|
||||||
return x_low if x - x_low < x_high - x else x_high
|
|
||||||
|
|
||||||
def b_gray_to_bin(A='00000000', endian='big'):
|
|
||||||
assert type(endian) is str
|
|
||||||
assert endian == 'little' or endian == 'big'
|
|
||||||
if endian == 'little': A = A[::-1] # Make sure endianness is big before conversion
|
|
||||||
b = A[0]
|
|
||||||
for i in range(1, len(A)): b += str( int(b[i-1] != A[i]) )
|
|
||||||
if endian == 'little': b = b[::-1] # Convert back to little endian if necessary
|
|
||||||
return b
|
|
||||||
|
|
||||||
class GeneticOperators(object):
|
|
||||||
|
|
||||||
class Pow2(object):
|
|
||||||
def __init__(self, v):
|
|
||||||
self.value = v
|
|
||||||
|
|
||||||
@property
|
|
||||||
def decoded():
|
|
||||||
return 2**self.value
|
|
||||||
|
|
||||||
def __init__(self, symbolic, Template, out):
|
|
||||||
self.device = symbolic.context.queues[0].device
|
|
||||||
self.symbolic = symbolic
|
|
||||||
self.Template = Template
|
|
||||||
self.cache = {}
|
|
||||||
self.out = out
|
|
||||||
|
|
||||||
|
|
||||||
self.genome_info = {
|
|
||||||
isc.vaxpy: [2,4,4,isc.fetching_policy_type],
|
|
||||||
isc.reduction: [2,4,4,isc.fetching_policy_type],
|
|
||||||
isc.maxpy: [2,3,3,3,3,isc.fetching_policy_type],
|
|
||||||
isc.mreduction_rows: [2,3,3,3,3,isc.fetching_policy_type],
|
|
||||||
isc.mreduction_cols: [2,3,3,3,3,isc.fetching_policy_type],
|
|
||||||
isc.mproduct_nn: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3],
|
|
||||||
isc.mproduct_nt: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3],
|
|
||||||
isc.mproduct_tn: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3],
|
|
||||||
isc.mproduct_tt: [2,3,3,3,3,3,3,3,isc.fetching_policy_type,isc.fetching_policy_type,3]
|
|
||||||
}[Template]
|
|
||||||
self.indpb = 1.0/sum([1 if x==isc.fetching_policy_type else x for x in self.genome_info])
|
|
||||||
|
|
||||||
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
|
|
||||||
creator.create("Individual", list, fitness=creator.FitnessMin)
|
|
||||||
|
|
||||||
self.toolbox = base.Toolbox()
|
|
||||||
self.toolbox.register("population", self.init)
|
|
||||||
self.toolbox.register("evaluate", self.evaluate)
|
|
||||||
self.toolbox.register("mate", deap_tools.cxTwoPoint)
|
|
||||||
self.toolbox.register("mutate", self.mutate)
|
|
||||||
self.toolbox.register("select", deap_tools.selNSGA2)
|
|
||||||
|
|
||||||
def decode(self, genome):
|
|
||||||
fetching_policy_type = isc.fetching_policy_type
|
|
||||||
fetch = [fetching_policy_type.FETCH_FROM_LOCAL, fetching_policy_type.FETCH_FROM_GLOBAL_STRIDED, fetching_policy_type.FETCH_FROM_GLOBAL_CONTIGUOUS]
|
|
||||||
is_gemm = self.Template in [isc.mproduct_nn, isc.mproduct_nt, isc.mproduct_tn, isc.mproduct_tt]
|
|
||||||
result = []
|
|
||||||
offset = 0
|
|
||||||
for i, x in enumerate(self.genome_info):
|
|
||||||
if x==isc.fetching_policy_type:
|
|
||||||
result.append(fetch[genome[offset]])
|
|
||||||
offset = offset + 1
|
|
||||||
else:
|
|
||||||
decoded = int(b_gray_to_bin(''.join(genome[offset:offset+x])), 2)
|
|
||||||
result.append(decoded if is_gemm and i in [11, 12] else 2**decoded)
|
|
||||||
offset = offset + x
|
|
||||||
#GEMM peculiarities
|
|
||||||
if is_gemm:
|
|
||||||
if fetching_policy_type.FETCH_FROM_LOCAL in result:
|
|
||||||
lf1 = result[1]*result[3]/result[10]
|
|
||||||
else:
|
|
||||||
result[10] = 0
|
|
||||||
lf1 = 0
|
|
||||||
result.append(lf1)
|
|
||||||
return result
|
|
||||||
|
|
||||||
def init(self, N):
|
|
||||||
result = []
|
|
||||||
allowed_idx = [0] if self.Template in [isc.mproduct_nn, isc.mproduct_nt, isc.mproduct_tn, isc.mproduct_tt] else [1,2]
|
|
||||||
for idx in allowed_idx:
|
|
||||||
current = []
|
|
||||||
while len(current) < N/len(allowed_idx):
|
|
||||||
while True:
|
|
||||||
bincode = []
|
|
||||||
for i, x in enumerate(self.genome_info):
|
|
||||||
if x==isc.fetching_policy_type:
|
|
||||||
bincode = bincode + [idx]
|
|
||||||
else:
|
|
||||||
bincode = bincode + [str(random.randint(0,1)) for i in range(x)]
|
|
||||||
parameters = self.decode(bincode)
|
|
||||||
template = self.Template(*parameters)
|
|
||||||
array_expressions = isc.array_expression_container(self.symbolic)
|
|
||||||
registers_usage = template.registers_usage(array_expressions)/4
|
|
||||||
lmem_usage = template.lmem_usage(array_expressions)
|
|
||||||
local_size = parameters[1]*parameters[3]
|
|
||||||
occupancy_record = misc_tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
|
|
||||||
if not misc_tools.skip(template, self.symbolic):
|
|
||||||
current.append(creator.Individual(bincode))
|
|
||||||
break
|
|
||||||
result = result + current
|
|
||||||
return result
|
|
||||||
|
|
||||||
def mutate(self, individual):
|
|
||||||
while True:
|
|
||||||
new_individual = copy.deepcopy(individual)
|
|
||||||
for i in range(len(new_individual)):
|
|
||||||
if isinstance(individual[i], int) and random.random() < 0.1:
|
|
||||||
while new_individual[i] == individual[i]:
|
|
||||||
new_individual[i] = random.randint(0, 2)
|
|
||||||
elif not isinstance(individual[i], int) and random.random() < self.indpb:
|
|
||||||
new_individual[i] = '1' if new_individual[i]=='0' else '0'
|
|
||||||
parameters = self.decode(new_individual)
|
|
||||||
template = self.Template(*parameters)
|
|
||||||
if not misc_tools.skip(template, self.symbolic):
|
|
||||||
break
|
|
||||||
return new_individual,
|
|
||||||
|
|
||||||
def evaluate(self, individual):
|
|
||||||
if tuple(individual) not in self.cache:
|
|
||||||
parameters = self.decode(individual)
|
|
||||||
template = self.Template(*parameters)
|
|
||||||
tt = misc_tools.benchmark(template, self.symbolic)
|
|
||||||
self.out.write(','.join([str(tt)]+map(str,map(int,parameters)))+'\n')
|
|
||||||
self.cache[tuple(individual)] = tt
|
|
||||||
return self.cache[tuple(individual)],
|
|
||||||
|
|
||||||
def optimize(self, maxtime, maxgen, compute_perf, perf_metric):
|
|
||||||
hof = deap_tools.HallOfFame(1)
|
|
||||||
# Begin the generational process
|
|
||||||
gen = 0
|
|
||||||
maxtime = time.strptime(maxtime, '%Mm%Ss')
|
|
||||||
maxtime = maxtime.tm_min*60 + maxtime.tm_sec
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
mu = 30
|
|
||||||
cxpb = 0.2
|
|
||||||
mutpb = 0.7
|
|
||||||
|
|
||||||
population = self.init(mu)
|
|
||||||
invalid_ind = [ind for ind in population if not ind.fitness.valid]
|
|
||||||
fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
|
|
||||||
for ind, fit in zip(invalid_ind, fitnesses):
|
|
||||||
ind.fitness.values = fit
|
|
||||||
hof.update(population)
|
|
||||||
|
|
||||||
while time.time() - start_time < maxtime and gen < maxgen:
|
|
||||||
# Vary the population
|
|
||||||
offspring = []
|
|
||||||
for _ in xrange(mu):
|
|
||||||
op_choice = random.random()
|
|
||||||
if op_choice < cxpb: # Apply crossover
|
|
||||||
while True:
|
|
||||||
ind1, ind2 = map(self.toolbox.clone, random.sample(population, 2))
|
|
||||||
ind1, ind2 = self.toolbox.mate(ind1, ind2)
|
|
||||||
del ind1.fitness.values
|
|
||||||
parameters = self.decode(ind1)
|
|
||||||
template = self.Template(*parameters)
|
|
||||||
if not misc_tools.skip(template, self.symbolic):
|
|
||||||
break
|
|
||||||
offspring.append(ind1)
|
|
||||||
elif op_choice < cxpb + mutpb: # Apply mutation
|
|
||||||
ind = self.toolbox.clone(random.choice(population))
|
|
||||||
ind, = self.toolbox.mutate(ind)
|
|
||||||
del ind.fitness.values
|
|
||||||
offspring.append(ind)
|
|
||||||
else: # Apply reproduction
|
|
||||||
offspring.append(random.choice(population))
|
|
||||||
#for x in offspring:
|
|
||||||
#print self.decode(x)
|
|
||||||
# Evaluate the individuals with an invalid fitness
|
|
||||||
invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
|
|
||||||
fitnesses = self.toolbox.map(self.evaluate, invalid_ind)
|
|
||||||
for ind, fit in zip(invalid_ind, fitnesses):
|
|
||||||
ind.fitness.values = fit
|
|
||||||
# Update the hall of fame with the generated individuals
|
|
||||||
hof.update(offspring)
|
|
||||||
# Select the next generation population
|
|
||||||
population[:] = self.toolbox.select(population + offspring, mu)
|
|
||||||
#Update
|
|
||||||
gen = gen + 1
|
|
||||||
best_profile = '(%s)'%','.join(map(str,self.decode(hof[0])))
|
|
||||||
best_performance = compute_perf(hof[0].fitness.values[0])
|
|
||||||
sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\r'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
|
|
||||||
sys.stdout.flush()
|
|
||||||
sys.stdout.write('\n')
|
|
||||||
return self.decode(hof[0])
|
|
@@ -1,246 +0,0 @@
|
|||||||
from __future__ import division
|
|
||||||
|
|
||||||
import time
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import isaac as isc
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
class PhysicalLimitsNV:
|
|
||||||
def __init__(self, dev):
|
|
||||||
self.compute_capability = dev.nv_compute_capability
|
|
||||||
if self.compute_capability[0]==1:
|
|
||||||
if self.compute_capability[1]<=1:
|
|
||||||
self.warps_per_mp = 24
|
|
||||||
self.threads_per_mp = 768
|
|
||||||
self.num_32b_reg_per_mp = 8192
|
|
||||||
self.reg_alloc_unit_size = 256
|
|
||||||
else:
|
|
||||||
self.warps_per_mp = 32
|
|
||||||
self.threads_per_mp = 1024
|
|
||||||
self.num_32b_reg_per_mp = 16384
|
|
||||||
self.reg_alloc_unit_size = 512
|
|
||||||
self.threads_per_warp = 32
|
|
||||||
self.thread_blocks_per_mp = 8
|
|
||||||
self.reg_alloc_granularity = 'block'
|
|
||||||
self.reg_per_thread = 124
|
|
||||||
self.shared_mem_per_mp = 16384
|
|
||||||
self.shared_mem_alloc_unit_size = 512
|
|
||||||
self.warp_alloc_granularity = 2
|
|
||||||
self.max_thread_block_size = 512
|
|
||||||
|
|
||||||
elif self.compute_capability[0]==2:
|
|
||||||
self.threads_per_warp = 32
|
|
||||||
self.warps_per_mp = 48
|
|
||||||
self.threads_per_mp = 1536
|
|
||||||
self.thread_blocks_per_mp = 8
|
|
||||||
self.num_32b_reg_per_mp = 32768
|
|
||||||
self.reg_alloc_unit_size = 64
|
|
||||||
self.reg_alloc_granularity = 'warp'
|
|
||||||
self.reg_per_thread = 63
|
|
||||||
self.shared_mem_per_mp = 49152
|
|
||||||
self.shared_mem_alloc_unit_size = 128
|
|
||||||
self.warp_alloc_granularity = 2
|
|
||||||
self.max_thread_block_size = 1024
|
|
||||||
|
|
||||||
elif self.compute_capability[0]==3:
|
|
||||||
self.threads_per_warp = 32
|
|
||||||
self.warps_per_mp = 64
|
|
||||||
self.threads_per_mp = 2048
|
|
||||||
self.thread_blocks_per_mp = 16
|
|
||||||
self.num_32b_reg_per_mp = 65536
|
|
||||||
self.reg_alloc_unit_size = 256
|
|
||||||
self.reg_alloc_granularity = 'warp'
|
|
||||||
if(self.compute_capability[1]==5):
|
|
||||||
self.reg_per_thread = 255
|
|
||||||
else:
|
|
||||||
self.reg_per_thread = 63
|
|
||||||
self.shared_mem_per_mp = 49152
|
|
||||||
self.shared_mem_alloc_unit_size = 256
|
|
||||||
self.warp_alloc_granularity = 4
|
|
||||||
self.max_thread_block_size = 1024
|
|
||||||
|
|
||||||
elif self.compute_capability[0]==5: #[KR]: copy-pasted from Kepler and adjusted according to http://en.wikipedia.org/wiki/CUDA
|
|
||||||
self.threads_per_warp = 32
|
|
||||||
self.warps_per_mp = 64
|
|
||||||
self.threads_per_mp = 2048
|
|
||||||
self.thread_blocks_per_mp = 32
|
|
||||||
self.num_32b_reg_per_mp = 65536
|
|
||||||
self.reg_alloc_unit_size = 256
|
|
||||||
self.reg_alloc_granularity = 'warp'
|
|
||||||
self.reg_per_thread = 255
|
|
||||||
self.shared_mem_per_mp = 65536
|
|
||||||
self.shared_mem_alloc_unit_size = 256
|
|
||||||
self.warp_alloc_granularity = 4
|
|
||||||
self.max_thread_block_size = 1024
|
|
||||||
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise Exception('Compute capability not supported!')
|
|
||||||
|
|
||||||
class PhysicalLimitsAMD:
|
|
||||||
def __init__(self, dev):
|
|
||||||
|
|
||||||
infos =\
|
|
||||||
{
|
|
||||||
#APU:
|
|
||||||
'Devastator': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
|
|
||||||
'Scrapper': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
|
|
||||||
|
|
||||||
#HD5000
|
|
||||||
'Cedar': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 8192},
|
|
||||||
'Redwood': {'arch': 'VLIW', 'WFmax_cu': 62, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Juniper': {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Cypress': {'arch': 'VLIW', 'WFmax_cu': 27.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Hemlock': {'arch': 'VLIW', 'WFmax_cu': 24.8, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
|
|
||||||
#HD6000
|
|
||||||
'Seymour': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Caicos': {'arch': 'VLIW', 'WFmax_cu': 96, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Turks': {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Whistler': {'arch': 'VLIW', 'WFmax_cu': 41.3, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
'Barts': {'arch': 'VLIW', 'WFmax_cu': 49.6, 'LDS_cu': 32768, 'GPR_cu': 16384},
|
|
||||||
|
|
||||||
#HD7000
|
|
||||||
'Capeverde': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
|
|
||||||
'Pitcairn': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
|
|
||||||
'Bonaire': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
|
|
||||||
'Tahiti': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
|
|
||||||
|
|
||||||
#Rx 200
|
|
||||||
'Oland': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
|
|
||||||
'Tonga': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536},
|
|
||||||
'Hawaii': {'arch': 'GCN', 'WFmax_cu': 40, 'LDS_cu': 65536, 'GPR_cu': 65536}
|
|
||||||
}
|
|
||||||
|
|
||||||
self.WFsize = 64
|
|
||||||
self.WFmax_cu = infos[dev.name]['WFmax_cu']
|
|
||||||
self.LDS_cu = infos[dev.name]['LDS_cu']
|
|
||||||
self.GPR_cu = infos[dev.name]['GPR_cu']
|
|
||||||
self.arch = infos[dev.name]['arch']
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _int_floor(value, multiple_of=1):
|
|
||||||
"""Round C{value} down to be a C{multiple_of} something."""
|
|
||||||
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
|
|
||||||
from math import floor
|
|
||||||
return int(floor(value/multiple_of))*multiple_of
|
|
||||||
|
|
||||||
def _int_ceiling(value, multiple_of=1):
|
|
||||||
"""Round C{value} up to be a C{multiple_of} something."""
|
|
||||||
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
|
|
||||||
from math import ceil
|
|
||||||
return int(ceil(value/multiple_of))*multiple_of
|
|
||||||
|
|
||||||
class OccupancyRecord:
|
|
||||||
|
|
||||||
def init_nvidia(self, dev, threads, shared_mem, registers):
|
|
||||||
pl = PhysicalLimitsNV(dev)
|
|
||||||
limits = []
|
|
||||||
allocated_warps = max(1,_int_ceiling(threads/pl.threads_per_warp))
|
|
||||||
max_warps_per_mp = pl.warps_per_mp
|
|
||||||
limits.append((min(pl.thread_blocks_per_mp, _int_floor(max_warps_per_mp/allocated_warps)), 'warps'))
|
|
||||||
|
|
||||||
if registers>0:
|
|
||||||
if registers > pl.reg_per_thread:
|
|
||||||
limits.append((0, 'registers'))
|
|
||||||
else:
|
|
||||||
allocated_regs = {'warp': allocated_warps,
|
|
||||||
'block': _int_ceiling(_int_ceiling(allocated_warps, pl.warp_alloc_granularity)*registers*pl.threads_per_warp,allocated_warps)}[pl.reg_alloc_granularity]
|
|
||||||
max_reg_per_mp = {'warp': _int_floor(pl.num_32b_reg_per_mp/_int_ceiling(registers*pl.threads_per_warp, pl.reg_alloc_unit_size), pl.warp_alloc_granularity),
|
|
||||||
'block':pl.num_32b_reg_per_mp}[pl.reg_alloc_granularity]
|
|
||||||
limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))
|
|
||||||
|
|
||||||
if shared_mem>0:
|
|
||||||
allocated_shared_mem = _int_ceiling(shared_mem, pl.shared_mem_alloc_unit_size)
|
|
||||||
max_shared_mem_per_mp = pl.shared_mem_per_mp
|
|
||||||
limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))
|
|
||||||
|
|
||||||
limit, limited_by = min(limits)
|
|
||||||
warps_per_mp = limit*allocated_warps
|
|
||||||
self.occupancy = 100*warps_per_mp/pl.warps_per_mp
|
|
||||||
|
|
||||||
def init_amd(self, dev, threads, shared_mem, NReg):
|
|
||||||
pl = PhysicalLimitsAMD(dev)
|
|
||||||
limits = {}
|
|
||||||
|
|
||||||
WFwg = _int_ceiling(threads/pl.WFsize)
|
|
||||||
#WFmax without constraint
|
|
||||||
if pl.arch=='VLIW':
|
|
||||||
limits['wg'] = pl.WFmax_cu if WFwg > pl.WFmax_cu else _int_floor(pl.WFmax_cu,WFwg)
|
|
||||||
else:
|
|
||||||
limits['wg'] = min(16*WFwg, pl.WFmax_cu)
|
|
||||||
#WFmax with LDS constraints
|
|
||||||
if shared_mem > 0:
|
|
||||||
WGmax = _int_floor(pl.LDS_cu/shared_mem)
|
|
||||||
limits['lds'] = WGmax*WFwg
|
|
||||||
#WFmax with GPR constraints
|
|
||||||
if NReg > 0:
|
|
||||||
#Amount of work group per CU
|
|
||||||
NRegWG = NReg*pl.WFsize*WFwg
|
|
||||||
WGmax = _int_floor(pl.GPR_cu/NRegWG)
|
|
||||||
limits['gpr'] = WFwg*WGmax
|
|
||||||
|
|
||||||
self.occupancy = 100.0*min(list(limits.values()))/pl.WFmax_cu
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, dev, threads, shared_mem=0, registers=0):
|
|
||||||
vendor = dev.vendor
|
|
||||||
if vendor == isc.vendor.AMD:
|
|
||||||
self.init_amd(dev, threads, shared_mem, registers)
|
|
||||||
elif vendor == isc.vendor.NVIDIA:
|
|
||||||
self.init_nvidia(dev, threads, shared_mem, registers)
|
|
||||||
elif vendor == isc.vendor.INTEL:
|
|
||||||
if registers>128:
|
|
||||||
self.occupancy = 0
|
|
||||||
else:
|
|
||||||
self.occupancy = 100
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def skip(template, symbolic):
|
|
||||||
device = symbolic.context.queues[0].device
|
|
||||||
local_size = template.local_size_0*template.local_size_1
|
|
||||||
vendor = device.vendor
|
|
||||||
if vendor == isc.vendor.AMD and local_size%64!=0:
|
|
||||||
return True
|
|
||||||
elif vendor == isc.vendor.NVIDIA and local_size%32!=0:
|
|
||||||
return True
|
|
||||||
elif vendor == isc.vendor.INTEL and local_size%8!=0:
|
|
||||||
return True
|
|
||||||
array_expressions = isc.array_expression_container(symbolic)
|
|
||||||
registers_usage = template.registers_usage(array_expressions)/4
|
|
||||||
lmem_usage = template.lmem_usage(array_expressions)
|
|
||||||
occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
|
|
||||||
if template.is_invalid(array_expressions, device) or occupancy_record.occupancy < 10:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def benchmark(template, symbolic):
|
|
||||||
queue = symbolic.context.queues[0]
|
|
||||||
device = queue.device
|
|
||||||
array_expressions = isc.array_expression_container(symbolic)
|
|
||||||
registers_usage = template.registers_usage(array_expressions)/4
|
|
||||||
lmem_usage = template.lmem_usage(array_expressions)
|
|
||||||
local_size = template.local_size_0*template.local_size_1
|
|
||||||
occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
|
|
||||||
if occupancy_record.occupancy < 15 :
|
|
||||||
return float("inf")
|
|
||||||
else:
|
|
||||||
queue.models[template, isc.float32] = isc.model(isc.float32, template, queue)
|
|
||||||
timings = []
|
|
||||||
current_time = 0
|
|
||||||
x, events = isc.flush(symbolic)
|
|
||||||
symbolic.context.queues[0].synchronize()
|
|
||||||
while current_time < 1e-3:
|
|
||||||
x, events = isc.flush(symbolic)
|
|
||||||
symbolic.context.queues[0].synchronize()
|
|
||||||
timings.append(1e-9*sum([e.elapsed_time for e in events]))
|
|
||||||
current_time = current_time + timings[-1]
|
|
||||||
return np.max(timings)
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_string(string, keep_chars = ['_']):
|
|
||||||
string = string.replace(' ', '_').replace('-', '_').lower()
|
|
||||||
string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
|
|
||||||
return string
|
|
@@ -1,9 +0,0 @@
|
|||||||
import array, random, itertools
|
|
||||||
import deap.tools
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from genetic import GeneticOperators
|
|
||||||
|
|
||||||
def genetic(symbolic, Template, compute_perf, perf_metric, out):
|
|
||||||
GA = GeneticOperators(symbolic, Template, out)
|
|
||||||
return GA.optimize(maxtime='5m0s', maxgen=10000, compute_perf=compute_perf, perf_metric=perf_metric)
|
|
102
tune/tools.py
Normal file
102
tune/tools.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
import isaac as isc
|
||||||
|
from numpy import mean, median
|
||||||
|
from math import ceil, exp, log, sqrt
|
||||||
|
|
||||||
|
def sanitize(string, keep_chars = ['_']):
|
||||||
|
string = string.replace(' ', '_').replace('-', '_').lower()
|
||||||
|
string = "".join(c for c in string if c.isalnum() or c in keep_chars).rstrip()
|
||||||
|
return string
|
||||||
|
|
||||||
|
def distance(x, y):
|
||||||
|
return sqrt(sum([(a - b)**2 for a, b in zip(x, y)]))
|
||||||
|
|
||||||
|
def linspace(a, b, n=100):
|
||||||
|
if n < 2:
|
||||||
|
return b
|
||||||
|
diff = (float(b) - a)/(n - 1)
|
||||||
|
return [diff * i + a for i in range(n)]
|
||||||
|
|
||||||
|
def expspace(a,b,N,r=128):
|
||||||
|
return [int(ceil(exp(x)/r)*r) for x in linspace(log(a), log(b), N)]
|
||||||
|
|
||||||
|
def benchmark(template, setting, tree):
|
||||||
|
queue = tree.context.queues[0]
|
||||||
|
queue.models[template, isc.float32] = isc.model(isc.float32, template(*setting), queue)
|
||||||
|
times = []
|
||||||
|
total = 0
|
||||||
|
i = 0
|
||||||
|
while total < 1e-2:
|
||||||
|
#z = isc.zeros(1, 10000000, isc.float32, tree.context)
|
||||||
|
z, events = isc.enqueue(tree)
|
||||||
|
tree.context.queues[0].synchronize()
|
||||||
|
times.append(1e-9*sum([e.elapsed_time for e in events]))
|
||||||
|
total += times[-1]
|
||||||
|
i+=1
|
||||||
|
return mean(times)
|
||||||
|
|
||||||
|
|
||||||
|
def tree_of(template, sizes, context):
|
||||||
|
if issubclass(template, isc.vaxpy):
|
||||||
|
N, = sizes
|
||||||
|
x = isc.empty(N, dtype=isc.float32, context=context)
|
||||||
|
y = isc.empty(N, dtype=isc.float32, context=context)
|
||||||
|
return x + y, (x, y)
|
||||||
|
elif issubclass(template, isc.reduction):
|
||||||
|
N, = sizes
|
||||||
|
x = isc.empty(N, context=context)
|
||||||
|
y = isc.empty(N, context=context)
|
||||||
|
return isc.dot(x, y), (x, y)
|
||||||
|
elif issubclass(template, isc.maxpy):
|
||||||
|
M, N = sizes
|
||||||
|
A = isc.empty((M,N), context=context)
|
||||||
|
B = isc.empty((M,N), context=context)
|
||||||
|
return A + B, (A, B)
|
||||||
|
elif issubclass(template, isc.mreduction):
|
||||||
|
T = template is isc.mreduction_cols
|
||||||
|
M, N = sizes[::-1] if T else sizes
|
||||||
|
A = isc.empty((M,N), context=context)
|
||||||
|
x = isc.empty(N, context=context)
|
||||||
|
return isc.dot(A.T, x) if T else isc.dot(A, x), (A, x)
|
||||||
|
elif issubclass(template, isc.mproduct):
|
||||||
|
AT = template is isc.mproduct_tn or template is isc.mproduct_tt
|
||||||
|
BT = template is isc.mproduct_nt or template is isc.mproduct_tt
|
||||||
|
M, N, K = sizes
|
||||||
|
A = isc.empty((K, M) if AT else (M, K), context=context)
|
||||||
|
B = isc.empty((N, K) if BT else (K, N), context=context)
|
||||||
|
AA = A.T if AT else A
|
||||||
|
BB = B.T if BT else B
|
||||||
|
return isc.dot(AA, BB), (A, B)
|
||||||
|
|
||||||
|
def memory_footprint(template, sizes):
|
||||||
|
if issubclass(template, isc.vaxpy):
|
||||||
|
return 4*3*sizes[0]*1e-9
|
||||||
|
elif issubclass(template, isc.reduction):
|
||||||
|
return 4*2*sizes[0]*1e-9
|
||||||
|
elif issubclass(template, isc.maxpy):
|
||||||
|
return 4*3*sizes[0]*sizes[1]*1e-9
|
||||||
|
elif issubclass(template, isc.mreduction):
|
||||||
|
return 4*sizes[0]*sizes[1]*1e-9
|
||||||
|
elif issubclass(template, isc.mproduct):
|
||||||
|
return 4*(sizes[0]*sizes[1] + sizes[0]*sizes[2] + sizes[1]*sizes[2])*1e-9
|
||||||
|
|
||||||
|
def metric_of(template):
|
||||||
|
memory_bound = [isc.vaxpy, isc.reduction, isc.maxpy, isc.mreduction]
|
||||||
|
compute_bound = [isc.mproduct]
|
||||||
|
if any([issubclass(template, x) for x in memory_bound]):
|
||||||
|
return lambda sizes, t: memory_footprint(template, sizes)/t
|
||||||
|
elif any([issubclass(template, x) for x in compute_bound]):
|
||||||
|
return lambda sizes, t: 2*sizes[0]*sizes[1]*sizes[2]*1e-9/t
|
||||||
|
|
||||||
|
def genetic_infos_of(template):
|
||||||
|
if issubclass(template, isc.vaxpy):
|
||||||
|
return {'categorical': [3], 'nbits': [3,4,4,2] }
|
||||||
|
elif issubclass(template, isc.reduction):
|
||||||
|
return {'categorical': [3], 'nbits':[3,4,4,2]}
|
||||||
|
elif issubclass(template, isc.maxpy):
|
||||||
|
return {'categorical': [5], 'nbits': [3,3,3,3,4,2]}
|
||||||
|
elif issubclass(template, isc.mreduction):
|
||||||
|
return {'categorical': [5], 'nbits': [3,3,3,3,4,2]}
|
||||||
|
elif issubclass(template, isc.mproduct):
|
||||||
|
return {'categorical': [8,9], 'nbits': [3,3,3,3,3,2,2,2,2,2,3,3]}
|
||||||
|
|
||||||
|
|
133
tune/tune.py
Normal file
133
tune/tune.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
import random, argparse, json, os
|
||||||
|
from math import log, isinf
|
||||||
|
from itertools import chain, product
|
||||||
|
from numpy import argsort, argmax
|
||||||
|
from operator import mul
|
||||||
|
from sklearn import ensemble
|
||||||
|
import isaac as isc
|
||||||
|
import optimize, tools, model
|
||||||
|
|
||||||
|
def unique(L):
    """Return the items of ``L`` with duplicates removed, keeping the first
    occurrence of each item in its original order."""
    seen = set()
    result = []
    for item in L:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result
def pow2range(a, b):
    """Powers of two ``2**a, 2**(a+1), ..., 2**(b-1)`` (empty when a >= b)."""
    return [2 ** exponent for exponent in range(a, b)]
def tune(device, operation, json_path):
    """Autotune ``operation`` on ``device`` and dump the discovered profiles
    (plus, when several profiles exist, a random-forest predictor) to
    ``json_path``.

    For each benchmark size, the current model's best predicted profile is
    first checked for local optimality; a genetic search is re-run only when
    the prediction is not a local optimum.
    """
    context = isc.context(device)

    # List of size tuples to benchmark: deduplicated, and restricted to
    # problems whose estimated footprint lies between 1e-4 and 1e-1 GB.
    sizes = list({isc.vaxpy: [(x,) for x in tools.expspace(1e3, 1e7, 4)],
                  isc.mreduction_cols: product(pow2range(4, 17), pow2range(4, 17)),
                  isc.mproduct_nt: product(pow2range(4, 17), pow2range(4, 17), pow2range(4, 17))}[operation])
    sizes = unique(sizes)
    sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 1e-1]

    # Training data: X holds size tuples; Y[i][j] is the measured performance
    # of profiles[j] on size X[i].
    performance = tools.metric_of(operation)
    profiles = []
    X = []
    Y = []
    for idx, x in enumerate(sizes):
        print(x)
        nparams = len(profiles)
        tree, operands = tools.tree_of(operation, x, context)
        # Check whether the current best prediction is already a local optimum
        if idx == 0:
            # renamed from `tune` to avoid shadowing this function
            retune = True
            predicted = None
        else:
            if nparams == 1:
                predicted = profiles[0]
            else:
                clf = ensemble.RandomForestRegressor(min(10, idx+1), max_depth=min(10, idx+1)).fit(X, Y)
                #clf, nrmse = model.train(X, Y, profiles)
                predperf = clf.predict(x)[0]
                # Benchmark the 5 best predicted profiles; keep the actual winner
                best = (-predperf).argsort()[:5]
                perf = [performance(x, tools.benchmark(operation, profiles[b], tree)) for b in best]
                predicted = profiles[best[argmax(perf)]]
            retune = not optimize.is_local_optimum(predicted, operation, x, context)
        # Retune if necessary
        if retune:
            #new = optimize.exhaustive(operation, x, context)
            new = optimize.genetic(operation, x, context, niter=1000, naccept=1000, popsize=20, prior=predicted)[0]
            if new not in profiles:
                profiles.append(new)
                if idx > 0:
                    # Backfill the new profile's performance on every size
                    # already present in the training set.
                    for xx, yy in zip(X, Y):
                        _tree, _operands = tools.tree_of(operation, xx, context)
                        time = tools.benchmark(operation, new, _tree)
                        perf = performance(xx, time)
                        yy.append(0 if isinf(perf) else perf)
        # Update the dataset with this size's performance for every known
        # profile. Profiles predicted to reach < 10% of the best predicted
        # performance are not benchmarked and simply scored 0.
        y = []
        fastest = max(predperf) if nparams > 1 else None
        for ip, p in enumerate(profiles):
            perf = 0 if fastest and ip < nparams and predperf[ip]/fastest < .1 else performance(x, tools.benchmark(operation, p, tree))
            y.append(0 if isinf(perf) else perf)
        X.append(x)
        Y.append(y)

    # Build the final predictive model
    clf, nrmse = model.train(X, Y, profiles)
    print('The optimal classifier has NRMSE = %.2g (%d estimators and the max depth is %d)' % (nrmse, clf.n_estimators, clf.max_depth))

    # Export to JSON, merging into an existing file when one is present.
    # BUG FIX: the original read `args.out`, but `args` is not defined in this
    # scope (NameError whenever json_path already exists) — read json_path.
    if os.path.isfile(json_path):
        with open(json_path, 'r') as fp:
            json_data = json.load(fp)
    else:
        json_data = {}
    json_data["version"] = "1.0"
    operation_name = operation.__name__
    if operation_name not in json_data:
        json_data[operation_name] = {}
    json_data[operation_name]['float32'] = {}
    D = json_data[operation_name]['float32']
    if len(profiles) > 1:
        # Serialize every decision tree of the random forest
        D['predictor'] = [{'children_left': e.tree_.children_left.tolist(),
                           'children_right': e.tree_.children_right.tolist(),
                           'threshold': e.tree_.threshold.astype('float64').tolist(),
                           'feature': e.tree_.feature.astype('float64').tolist(),
                           'value': e.tree_.value[:,:,0].astype('float64').tolist()} for e in clf.estimators_]
    # list comprehension rather than map() so the result stays JSON-serializable
    # on Python 3 as well
    D['profiles'] = [[int(v) for v in x] for x in profiles]
    with open(json_path, 'w') as fp:
        json.dump(json_data, fp)
def parse_arguments():
    """Parse the command line and return ``(device, operation, json_path)``
    suitable for :func:`tune`.

    Also prints the list of available devices, marking the selected one
    with an ``[x]``.
    """
    platforms = isc.get_platforms()
    devices = [d for platform in platforms for d in platform.get_devices()]
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--device", default=0, type=int, help='Device to tune for')
    parser.add_argument("-o", "--operation", type=str, required=True, help='Operation to tune for')
    parser.add_argument("-j", "--json", default='', type=str)
    args = parser.parse_args()

    device = devices[int(args.device)]
    print("----------------")
    print("Devices available:")
    print("----------------")
    for d in devices:
        selected = '[' + ('x' if device == d else '') + ']'
        print('%s - %s - %s on %s' % (selected, isc.device_type_to_string(d.type), d.name, d.platform.name))
    print("----------------")

    # BUG FIX: 'gemv_tn' was a typo for 'gemm_tn' — it mapped to
    # isc.mproduct_tn (a matrix-product template) and every sibling
    # matrix-product key is spelled gemm_*.
    operation = {'vaxpy': isc.vaxpy, 'dot': isc.reduction,
                 'maxpy': isc.maxpy, 'gemv_n': isc.mreduction_rows, 'gemv_t': isc.mreduction_cols,
                 'gemm_nn': isc.mproduct_nn, 'gemm_tn': isc.mproduct_tn, 'gemm_nt': isc.mproduct_nt, 'gemm_tt': isc.mproduct_tt}[args.operation]
    # BUG FIX: the original assigned the default path to a local named `json`
    # (shadowing the json module) and, when -j was supplied, returned the
    # json *module* object instead of the requested path.
    json_path = args.json if args.json else tools.sanitize(device.name) + '.json'
    return (device, operation, json_path)
# Script entry point: enable queue profiling so kernel execution times can
# be measured during benchmarking, then parse the command line and run the
# autotuner.
if __name__ == "__main__":
    # NOTE(review): presumably this must be set before any command queue is
    # created so that later queues inherit the profiling flag — confirm.
    isc.state.queue_properties = isc.CL_QUEUE_PROFILING_ENABLE
    args = parse_arguments()
    tune(*args)
|
||||||
|
|
Reference in New Issue
Block a user