ISAAC-V2.0: INITIAL COMMIT

2017-05-07 16:51:51 -07:00
parent 911f1fdb71
commit e99759d3b3
2882 changed files with 73975 additions and 1087661 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,84 +1,28 @@
 cmake_minimum_required(VERSION 2.8.7)
 project(isaac-research)
 include(CTest)
 #QtCreator: add visibility of headers
 file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.cpp *.hpp *.h)
 add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
 #Default build type
 if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Default build type: Release")
  set(CMAKE_BUILD_TYPE "Release")
 endif()
 if(WIN32)
 SET(CMAKE_FIND_LIBRARY_PREFIXES "")
 SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib" ".dll")
 endif()
 # Add visibility of headers
 file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.cpp *.hpp *.h)
 add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
 #Modules
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/lib/external/)
 #Compiler flags
-add_definitions(${BACKEND_DEFINES})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
-if(WIN32)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic")
 	add_definitions("-DNOMINMAX")
 else()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic")
 endif()
-#Includes
+#Source
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/lib/tools/ ${CMAKE_CURRENT_SOURCE_DIR}/include/external/ ${CMAKE_CURRENT_SOURCE_DIR}/include/external/cuda)
 #Binary to convert .cu files to const char *
 if(NOT ANDROID)
    add_executable(bin2cpp ${CMAKE_MODULE_PATH}/helpers/bin2cpp.cpp)
    include("${CMAKE_MODULE_PATH}/helpers/CodeToH.cmake")
 endif()
 #Source files
 file(GLOB_RECURSE LIBISAAC_SRC lib/*.cpp)
 add_library(isaac SHARED ${LIBISAAC_SRC})
 target_link_libraries(isaac "dl")
-#Python wrapper
+#Examples
 set(SETUP_PY_IN "${CMAKE_MODULE_PATH}/python/setup.py")
 set(SETUP_PY    "${CMAKE_SOURCE_DIR}/python/setup.py")
 set(LIBISAAC_SRC_STR)
 foreach(FILE ${LIBISAAC_SRC})
    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}" "src" _TMP ${FILE})
    set(LIBISAAC_SRC_STR "${_TMP} ${LIBISAAC_SRC_STR}")
 endforeach()
 #Include directories
 set(INCLUDE_DIRECTORIES_STR)
 get_property(INCLUDE_DIRECTORIES_LST DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
 set(INCLUDE_DIRECTORIES_STR)
 foreach(FILE ${INCLUDE_DIRECTORIES_LST})
    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}" "src" _TMP ${FILE})
    set(INCLUDE_DIRECTORIES_STR "${INCLUDE_DIRECTORIES_STR} ${_TMP}")
 endforeach()
 configure_file(${SETUP_PY_IN} ${SETUP_PY})
 add_custom_command(OUTPUT "${CMAKE_BINARY_DIR}/build/timestamp"
                    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/python ${CMAKE_BINARY_DIR}/python
                    COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/python/src/lib/CMakeLists.txt
                    COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/python/build
                    COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/python/src/lib
                    COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/python/src/include
                    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/lib ${CMAKE_BINARY_DIR}/python/src/lib
                    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/include ${CMAKE_BINARY_DIR}/python/src/include
                    COMMAND ${CMAKE_COMMAND} -E tar czf isaac-1.0.tar.gz ${CMAKE_BINARY_DIR}/python
                    )
 add_custom_target(package-python DEPENDS "${CMAKE_BINARY_DIR}/build/timestamp")
 #Isaac
 include(CTest)
 add_subdirectory(lib)
 add_subdirectory(tests)
 add_subdirectory(bench)
 add_subdirectory(examples)
 #Tests
 add_subdirectory(tests)
--- a/1
+++ b/1
@@ -19,3 +19,4 @@
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
--- a/README.md
+++ b/README.md
@@ -1,77 +1,40 @@
 # ISAAC
-This is the developer repository for ISAAC, a library that uses machine learning to find input-aware kernels for element-wise operations, 1D/2D reductions and GEMM. It works with both cuBLAS and clBLAS, and fallbacks on those when appropriate (typically large square matrices).
+This is the development branch for ISAAC v2.0. This is a major rewrite more targeted at compute-bound applications, with major performance gains at the expense of portability.
 ### License
-ISAAC is distributed under the MIT License.
+ISAAC is distributed under the MIT/X11 license.
 ### Installation
-ISAAC is dependency-free, and will load either OpenCL and/or CUDA 7.0+ _dynamically_ depending on which GPUs are detected at runtime.
+ISAAC only requires an NVIDIA GPU with compute-capability > 5.0 and the corresponding proprietary driver. 
 You only need CMake 2.8.7+ and a C++11 compliant compiler:  
 The CUDA SDK is *not* required.
 ```
 git clone https://github.com/ptillet/isaac.git
 mkdir -p isaac/build && cd isaac/build
-cmake ../ && make -j4
+cmake ../ && make -j8
 ./examples/bench
 ```
-Link against libisaac.so instead of libcublas.so or libclblas.so, and you're good to go! 
+### Benchmarks
 Below is the TFLOPS you get for sGEMM on a Pascal Titan X vs cuBLAS 8.0.
 ![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/GEMM.png)
-The C++ and Python API does some kernel fusion, but is not entirely stable. It works well to compose element-wise operations, though.
+Below is the TFLOPS you get for FCONV on a Pascal Titan X vs cuDNN v6.
 ![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/CONV.png)
 There's still room for improvement.
-### Benchmark
+### APIs
-```
+ISAAC implements both GEMM and FCONV for fp16x2, fp32, and fp64. Half-precision with 32-bit accumulation and complex data-types are not yet supported.
 Usage : blas-bench [--op {axpy, dot, gemv, gemm}] [--dtype {float32, float64}] [--device DEVICE_IDX] [--help]
 --op: operation to benchmark (default = gemm)
 --dtype: data-type to benchmark (default = float32)
 --device: index of isaac device in [0, ..., ndevices - 1] (default = 0)
 --help: display this message
 ```
 It detects clBLAS or cuBLAS and compares it against ISAAC for e.g., DeepBench, Covariance, LAPACK (packed rank1 updates), etc.
-Below is the TFLOPS you get for GEMM on a Pascal Titan X (cuBLAS 8.0). Numbers in bold represent speed-ups greater than 5%.
+### Future Plans
 ![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/bench-cuBLAS.png)
-For AMD Fury (clBLAS-2.10-Fiji):
+Future plans include (but are not limited to):
-![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/bench-clBLAS.png)
+* Transparent use over cuBLAS/cuDNN using LD_PRELOAD
-
+* Backward Convolution
-Same trend on Intel Broadwell iGPU
+* Complex data-types for GEMM
 ### BLAS routines supported
 Currently supported functions are:
 | BLAS1         | BLAS2         | BLAS3         |
 | --------------| --------------| --------------|
 | xAXPY         | xGEMV         | xGEMM         |
 | xCOPY         | xGER          |               |
 | xSCAL         |               |               |
 | xDOT          |               |               |
 | xASUM         |               |               |
 For x in {S, D}
 ### Contributing
 You can contribute to further tuning isaac if you have one of the following architectures:
 - NVidia: SM 2.x ; SM 3.5 ; SM 5.0
 If you have one of the following architectures you can contribute by running:
 ```
 git clone https://github.com/ptillet/isaac.git
 cd isaac/python ;
 python setup.py build;
 cd ../tune
 PYTHONPATH=../python/build/lib.linux-x86_64-2.7/ python main.py --float64 --float32 --elementwise_1d --elementwise_2d --reduce_1d --reduce_2d_rows --reduce_2d_cols --gemm_nn --gemm_nt --gemm_tn --gemm_tt
 ```
 This will output a .json file that you can submit for integration.
 Bug reports are more than welcome!
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -1,44 +0,0 @@
 # Benchmarks are always compiled in Release mode.
 set(CMAKE_BUILD_TYPE Release)

 # BLAS_DEF  : preprocessor flags describing which reference BLAS back-ends were found
 # BLAS_LIBS : libraries needed to link against those back-ends
 unset(BLAS_DEF)
 unset(BLAS_LIBS)

 # Reference back-end: cuBLAS (optional)
 find_package(CUDA QUIET)
 if(CUDA_FOUND)
    list(APPEND BLAS_DEF "-DHAS_A_BLAS -DBENCH_CUBLAS")
    include_directories(${CUDA_INCLUDE_DIRS})
    list(APPEND BLAS_LIBS ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
 endif()

 # Reference back-end: clBLAS (optional)
 find_package(CLBLAS QUIET)
 if(CLBLAS_FOUND)
    list(APPEND BLAS_DEF "-DHAS_A_BLAS -DBENCH_CLBLAS")
    include_directories(${CLBLAS_INCLUDE_DIR})
    list(APPEND BLAS_LIBS ${CLBLAS_LIBRARIES} OpenCL pthread)
 endif()

 # Reference back-end: CPU BLAS (MKL / OpenBLAS) -- currently disabled
 ##CBLAS
 #find_package(MKL QUIET)
 #if(MKL_FOUND)
 #    set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_MKL")
 #    include_directories(${MKL_INCLUDE_DIR})
 #    set(BLAS_LIBS ${BLAS_LIBS}  ${MKL_LIBRARIES} )
 #else()
 # find_package(OpenBlas)
 # if(OPENBLAS_FOUND)
 #     set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_CBLAS")
 #     include_directories(${OPENBLAS_INCLUDE_DIR})
 #     set(BLAS_LIBS ${BLAS_LIBS}  ${OPENBLAS_LIBRARIES} )
 # endif()
 #endif()

 # COMPILE_FLAGS expects one space-separated string rather than a CMake list
 string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")

 include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${PROJECT_SOURCE_DIR}/tests/common
 )

 # One executable "bench-<name>" per benchmark source "<name>.cpp"
 foreach(BENCH_NAME blas)
   add_executable(bench-${BENCH_NAME} ${BENCH_NAME}.cpp)
   set_target_properties(bench-${BENCH_NAME}
                         PROPERTIES COMPILE_FLAGS "${BLAS_DEF_STR}")
   target_link_libraries(bench-${BENCH_NAME} ${BLAS_LIBS} isaac)
 endforeach()
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -1,412 +0,0 @@
 #include "isaac/array.h"
 #include "isaac/runtime/execute.h"
 #ifdef BENCH_CLBLAS
 #include "clBLAS.h"
 #endif
 #ifdef BENCH_MKL
 #include "mkl_cblas.h"
 #elif defined(BENCH_CBLAS)
 #include "cblas.h"
 #endif
 #ifdef BENCH_CUBLAS
 #include <cublas.h>
 #endif
 #include <iomanip>
 #include <stdlib.h>
 #include <cmath>
 #include <numeric>
 #include <regex>
 #include <string>
 #include "common.hpp"
 #include "half.hpp"
 typedef sc::int_t int_t;
 Timer tmr;
 /*
  * C++ wrappers for the reference BLAS libraries.
  *
  * Each wrapper takes a dummy first argument (float or double) that is used only
  * for overload resolution: it selects the single-precision (S/s) or
  * double-precision (D/d) routine and forwards every remaining argument to it
  * unchanged. This lets the templated benchmark code call e.g. clblasGemm(T(), ...)
  * without knowing the precision-specific function name.
  */
 // clBLAS wrappers (only compiled when clBLAS was detected at configure time)
 #ifdef BENCH_CLBLAS
 template<typename... Args> void clblasAxpy(float, Args... args){ clblasSaxpy(args...); }
 template<typename... Args> void clblasAxpy(double, Args... args){ clblasDaxpy(args...); }
 template<typename... Args> void clblasDot(float, Args... args){ clblasSdot(args...); }
 template<typename... Args> void clblasDot(double, Args... args){ clblasDdot(args...); }
 template<typename... Args> void clblasGemv(float, Args... args){ clblasSgemv(args...); }
 template<typename... Args> void clblasGemv(double, Args... args){ clblasDgemv(args...); }
 template<typename... Args> void clblasGemm(float, Args... args){ clblasSgemm(args...); }
 template<typename... Args> void clblasGemm(double, Args... args){ clblasDgemm(args...); }
 #endif
 // CBLAS (CPU) wrappers; the value returned by cblas_?dot is discarded
 #ifdef BENCH_CBLAS
 template<typename... Args> void cblasAxpy(float, Args... args){ cblas_saxpy(args...); }
 template<typename... Args> void cblasAxpy(double, Args... args){ cblas_daxpy(args...); }
 template<typename... Args> void cblasDot(float, Args... args){ cblas_sdot(args...); }
 template<typename... Args> void cblasDot(double, Args... args){ cblas_ddot(args...); }
 template<typename... Args> void cblasGemv(float, Args... args){ cblas_sgemv(args...); }
 template<typename... Args> void cblasGemv(double, Args... args){ cblas_dgemv(args...); }
 template<typename... Args> void cblasGemm(float, Args... args){ cblas_sgemm(args...); }
 template<typename... Args> void cblasGemm(double, Args... args){ cblas_dgemm(args...); }
 #endif
 // cuBLAS wrappers (only compiled when CUDA was detected at configure time)
 #ifdef BENCH_CUBLAS
 template<typename... Args> void cublasAxpy(float, Args... args){ cublasSaxpy(args...); }
 template<typename... Args> void cublasAxpy(double, Args... args){ cublasDaxpy(args...); }
 template<typename... Args> void cublasDot(float, Args... args){ cublasSdot(args...); }
 template<typename... Args> void cublasDot(double, Args... args){ cublasDdot(args...); }
 template<typename... Args> void cublasGemv(float, Args... args){ cublasSgemv(args...); }
 template<typename... Args> void cublasGemv(double, Args... args){ cublasDgemv(args...); }
 template<typename... Args> void cublasGemm(float, Args... args){ cublasSgemm(args...); }
 template<typename... Args> void cublasGemm(double, Args... args){ cublasDgemm(args...); }
 #endif
 //
 /*
  * Times a callable and returns its best (smallest) run time.
  *
  * op   : the operation to benchmark
  * sync : called after every op() so that the measurement covers completion
  *
  * One untimed warm-up run is performed first. Timed runs are then repeated
  * until their accumulated duration reaches 0.2 s (the global timer `tmr`
  * reports nanoseconds). Returns the minimum sample, in nanoseconds.
  */
 template<class OP, class SYNC>
 double bench(OP const & op, SYNC const & sync)
 {
  // Warm-up run, not measured
  op();
  sync();

  std::vector<long> samples;
  double elapsed_ns = 0;
  // The budget check can never fail before the first sample, so a
  // post-tested loop performs exactly the same iterations.
  do{
    tmr.start();
    op();
    sync();
    long sample = tmr.get().count();
    samples.push_back(sample);
    elapsed_ns += samples.back();
  }while(elapsed_ns*1e-9 < 2e-1);

  return min(samples);
 }
 /*
  * Prints the (bold, italic) header row of a result table to std::cout.
  *
  * sections : titles of the leading columns, each followed by a tab
  * on_cl    : when true (and clBLAS support is compiled in) a "clBLAS" column is added
  * on_cu    : when true (and cuBLAS support is compiled in) a "cuBLAS" column is added
  *
  * The two bool parameters are only named when the matching BENCH_* macro is
  * defined, so they are unnamed (hence not reported as unused) otherwise.
  */
 void print_results_header(std::vector<std::string> sections, bool
                          #ifdef BENCH_CLBLAS
                          on_cl
                          #endif
                          , bool
                          #ifdef BENCH_CUBLAS
                          on_cu
                          #endif
                          ){
    std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
    std::copy(sections.begin(), sections.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
    // The ISAAC column is always present; reference libraries follow
    std::cout << "ISAAC";
 #ifdef BENCH_CLBLAS
    if(on_cl)
    std::cout << "\tclBLAS";
 #endif
 #ifdef BENCH_CBLAS
    std::cout << "\tBLAS";
 #endif
 #ifdef BENCH_CUBLAS
    if(on_cu)
    std::cout << "\tcuBLAS";
 #endif
    // Restore default terminal attributes before ending the line
    std::cout << color_stream(RESET) << std::endl;
 }
 /*
  * Prints one row of a result table to std::cout.
  *
  * times  : raw timings, one per benchmarked implementation
  * prefix : leading cells of the row, each followed by a tab
  * fn     : converts a timing into the performance figure that is displayed
  *
  * A figure is highlighted when it is at least 5% better than the second-best
  * figure of the row. When fewer than two implementations were benchmarked
  * there is no runner-up to compare against, so nothing is highlighted
  * (previously this case read past the end of the vector).
  */
 void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<double(double)> fn){
    std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
    std::vector<double> perf;
    std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
    // Second-best performance of the row, if there is one
    bool has_runner_up = perf.size() > 1;
    double runner_up = 0;
    if(has_runner_up){
      auto ranked = perf;
      std::sort(ranked.begin(), ranked.end(), std::greater<double>());
      runner_up = ranked[1];
    }
    for(auto x: perf){
      if(has_runner_up && x/runner_up >= 1.05)
        std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
      else
        std::cout << x;
      std::cout << "\t";
    }
    std::cout << std::endl;
 }
 /* Shorthand for std::to_string, used to build table cells. */
 template<class T>
 std::string str(T const & value)
 {
  return std::to_string(value);
 }
 /*
  * Runs the benchmark suite of one BLAS operation and prints a result table.
  *
  * T         : host scalar type matching dtype (float or double); selects the
  *            precision of the reference BLAS routines
  * dtype     : isaac numeric type used for the device arrays
  * operation : "axpy", "dot", "gemv..." or "gemm..." (only the first four
  *            characters are checked for the last two)
  *
  * ISAAC is always timed on the default context's queue 0. Reference libraries
  * are timed when compiled in and, for clBLAS/cuBLAS, when the queue uses the
  * matching backend. Timings come from bench(op, sync), i.e. nanoseconds.
  */
 template<class T>
 void bench(sc::numeric_type dtype, std::string operation)
 {
  using std::get;
  using std::make_tuple;

  sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
  auto sync = [&](){ queue.synchronize(); };
 #ifdef BENCH_CUBLAS
  auto cusync = [&](){ cudaDeviceSynchronize(); };
 #endif
  bool on_cl = queue.backend()==sc::driver::OPENCL;
  bool on_cu = queue.backend()==sc::driver::CUDA;
  size_t dtsize = sc::size_of(dtype);
  /*---------*/
  /*--BLAS1--*/
  /*---------*/
  if(operation=="axpy")
  {
    float alpha = 1;
    // Rows are labelled with the data volume in MB (same as "dot"), not with N
    print_results_header({"MB"}, on_cl, on_cu);
    for(int_t MB: std::vector<int_t>{1, 10, 100, 1000})
    {
      // axpy touches 3 vectors' worth of data (read x, read y, write y)
      int_t N = MB*1e6/dtsize/3;
      std::vector<double> times;
      sc::array x(N, dtype), y(N, dtype);
      //Bench
      times.push_back(bench([&](){y = x + alpha*y;}, sync));
 #ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasAxpy(T(), N, alpha, cl(x), 0, 1, cl(y), 0, 1, 1, &cl(queue), 0, nullptr, nullptr);}, sync));
 #endif
 #ifdef BENCH_CBLAS
      // Host buffers must use T so that the double-precision routines get double*
      std::vector<T> cx(N), cy(N);
      sc::copy(x, cx);
      sc::copy(y, cy);
      times.push_back(bench([&](){cblasAxpy(T(), N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasAxpy(T(), N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
 #endif
      // bytes / ns
      print_results(times, {str(MB)}, [&](double t){return MB*1e6/t;});
    }
  }
  if(operation=="dot")
  {
    print_results_header({"MB"}, on_cl, on_cu);
    for(int_t MB: std::vector<int_t>{1, 10, 100, 1000})
    {
      // dot reads 2 vectors
      int_t N = MB*1e6/dtsize/2;
      std::vector<double> times;
      sc::array x(N, dtype), y(N, dtype);
      sc::array scratch(N, dtype);
      sc::scalar s(dtype);
      //Bench
      times.push_back(bench([&](){s = dot(x,y);}, sync));
 #ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasDot(T(), N, cl(s), 0, cl(x), 0, 1, cl(y), 0, 1, cl(scratch), 1, &cl(queue), 0, nullptr, nullptr);}, sync));
 #endif
 #ifdef BENCH_CBLAS
      std::vector<T> cx(N), cy(N);
      sc::copy(x, cx);
      sc::copy(y, cy);
      times.push_back(bench([&](){cblasDot(T(), N, cx.data(), 1, cy.data(), 1);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasDot(T(), N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
 #endif
      // bytes / ns
      print_results(times, {str(MB)}, [&](double t){return MB*1e6/t;});
    }
  }
  /*---------*/
  /*--BLAS2--*/
  /*---------*/
  if(operation.substr(0, 4)=="gemv")
  {
    // (benchmark name, "N" or "T" for A, M, N)
    std::vector<std::tuple<std::string, std::string,int_t, int_t> > MNs;
    //Linear System
    MNs.push_back(make_tuple("Square", "N",153,153));
    MNs.push_back(make_tuple("Square", "N",1024, 1024));
    MNs.push_back(make_tuple("Square", "N",2867,2867));
    MNs.push_back(make_tuple("Square", "T",153,153));
    MNs.push_back(make_tuple("Square", "T",1024,1024));
    MNs.push_back(make_tuple("Square", "T",2867,2867));
    //Normalization
    MNs.push_back(make_tuple("Short", "N", 64, 60000));
    MNs.push_back(make_tuple("Short", "N", 256, 60000));
    MNs.push_back(make_tuple("Short", "N", 1024, 60000));
    MNs.push_back(make_tuple("Short", "T", 64, 60000));
    MNs.push_back(make_tuple("Short", "T", 256, 60000));
    MNs.push_back(make_tuple("Short", "T", 1024, 60000));
    //Householder
    MNs.push_back(make_tuple("Tall", "N", 10, 60000));
    MNs.push_back(make_tuple("Tall", "N", 30, 60000));
    MNs.push_back(make_tuple("Tall", "T", 10, 60000));
    MNs.push_back(make_tuple("Tall", "T", 30, 60000));
    print_results_header({"BENCH", "M", "N", "AT"}, on_cl, on_cu);
    for(auto MN: MNs)
    {
      std::vector<double> times;
      std::string name = get<0>(MN);
      std::string cAT = get<1>(MN);
      int_t M = get<2>(MN);
      int_t N = get<3>(MN);
      // Storage shape of A: swapped when A is used transposed
      int_t As1 = M, As2 = N;
      bool AT = (cAT == "T");
      if(AT) std::swap(As1, As2);
      sc::array A(As1, As2, dtype), y(M, dtype), x(N, dtype);
 #ifdef HAS_A_BLAS
      int_t lda = A.stride()[1];
 #endif
      //Bench
      times.push_back(bench([&](){y = AT?dot(A.T,x):dot(A,x);}, sync));
 #ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasGemv(T(), clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, cl(A), 0, lda, cl(x), 0, 1, 0, cl(y), 0, 1, 1, &cl(queue),0, nullptr, nullptr);}, sync));
 #endif
 #ifdef BENCH_CBLAS
      std::vector<T> cA(M*N), cx(N), cy(M);
      sc::copy(x, cx);
      sc::copy(y, cy);
      sc::copy(A, cA);
      times.push_back(bench([&](){cblasGemv(T(), CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasGemv(T(), AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
 #endif
      // bytes moved (matrix + both vectors) / ns
      print_results(times, {name, str(M), str(N), cAT}, [&](double t){ return (M*N + M + N)*dtsize/t;});
    }
  }
  /*---------*/
  /*--BLAS3--*/
  /*---------*/
  if(operation.substr(0,4)=="gemm")
  {
    // (benchmark name, M, N, K, "N" or "T" for A, "N" or "T" for B)
    std::vector<std::tuple<std::string, int_t, int_t, int_t, std::string, std::string> > MNKs;
    //DeepBench
    for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
      for(size_t N: std::vector<size_t>{16, 32, 64, 128, 7000})
        MNKs.push_back(make_tuple("Deep", MK, N, MK, "N", "N"));
    for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
      for(size_t N: std::vector<size_t>{16, 32, 64, 128, 7000})
        MNKs.push_back(make_tuple("Deep", MK, N, MK, "T", "N"));
    for(size_t MK: std::vector<size_t>{1760, 4096})
      MNKs.push_back(make_tuple("Deep", MK, 7133, MK, "N", "T"));
    //Covariance (e.g., ICA, 10minutes/100Hz)
    MNKs.push_back(make_tuple("Cov",32,32,60000,"N","T"));
    MNKs.push_back(make_tuple("Cov",256,256,60000,"N","T"));
    //Bi-diagonalization
    MNKs.push_back(make_tuple("Lapack",4096,4096,32,"N","T"));
    MNKs.push_back(make_tuple("Lapack",3456,3456,32,"N","T"));
    MNKs.push_back(make_tuple("Lapack",896,896,32,"N","T"));
    print_results_header({"BENCH", "M", "N", "K", "AT", "BT"}, on_cl, on_cu);
    for(auto MNK: MNKs)
    {
      std::vector<double> times;
      std::string name = get<0>(MNK);
      int_t M = get<1>(MNK);
      int_t N = get<2>(MNK);
      int_t K = get<3>(MNK);
      std::string cAT = get<4>(MNK);
      std::string cBT = get<5>(MNK);
      bool AT = cAT=="T";
      bool BT = cBT=="T";
      // Storage shapes of A and B: swapped when the operand is used transposed
      int_t As1 = M, As2 = K;
      if(AT) std::swap(As1, As2);
      int_t Bs1 = K, Bs2 = N;
      if(BT) std::swap(Bs1, Bs2);
      sc::array C(M, N, dtype), A(As1, As2, dtype), B(Bs1, Bs2, dtype);
 #ifdef HAS_A_BLAS
      int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
 #endif
      //bench
      times.push_back(bench([&](){C = AT?(BT?dot(A.T,B.T)
                                            :dot(A.T,B))
                                        :(BT?dot(A,B.T)
                                            :dot(A,B));}, sync));
 #ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasGemm(T(), clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans,
                                                 M, N, K, 1, cl(A), 0, lda, cl(B), 0, ldb,
                                                 0, cl(C), 0, ldc, 1, &cl(queue),0, nullptr, nullptr);}, sync));
 #endif
 #ifdef BENCH_CBLAS
      std::vector<T> cC(M*N), cA(M*K), cB(N*K);
      sc::copy(C, cC);
      sc::copy(A, cA);
      sc::copy(B, cB);
      times.push_back(bench([&](){cblasGemm(T(), CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
 #endif
 #ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasGemm(T(), AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 0, (T*)cu(C), ldc);}, cusync));
 #endif
      // 2*M*N*K flops per ns, scaled by 1e-3 => TFLOPS. The product is formed
      // in double so it cannot overflow whatever the width of int_t is.
      print_results(times, {name, str(M), str(N), str(K), cAT, cBT}, [&](double t){ return 2.0*M*N*K/t*1e-3;});
    }
  }
 }
 /* Prints the command-line usage to std::cerr and terminates with EXIT_FAILURE. */
 void handle_misusage(){
  std::cerr
    << "Usage : blas-bench [--op {axpy, dot, gemv, gemm}] [--dtype {float32, float64}] [--device DEVICE_IDX] [--help]" << std::endl
    << "--op: operation to benchmark (default = gemm)" << std::endl
    << "--dtype: data-type to benchmark (default = float32)" << std::endl
    << "--device: index of isaac device in [0, ..., ndevices - 1] (default = 0)" << std::endl
    << "--help: display this message" << std::endl;
  exit(EXIT_FAILURE);
 }
 /*
  * Returns the value following `key` on the command line.
  *
  * args : all command-line tokens
  * key  : option name to look for (e.g. "--op")
  * set  : accepted values; an empty set accepts anything
  * dft  : value returned when `key` is absent; an empty default makes the
  *        option mandatory
  *
  * Calls handle_misusage() (which terminates the program) when a mandatory
  * option is absent, when `key` is the last token or is followed by another
  * "--" option, or when the value is not in `set`.
  */
 std::string getopt(std::vector<std::string> const & args,
            std::string const & key,
            std::vector<std::string> const & set = {},
            std::string dft = "")
 {
  std::vector<std::string>::const_iterator pos = std::find(args.begin(), args.end(), key);
  bool key_present = (pos != args.end());
  if(!key_present){
    if(dft.empty())
      handle_misusage();
    return dft;
  }
  // Move on to the token that should hold the value
  ++pos;
  bool value_missing = (pos==args.end()) || (pos->compare(0, 2, "--")==0);
  if(value_missing)
    handle_misusage();
  bool restricted = !set.empty();
  if(restricted && std::find(set.begin(), set.end(), *pos)==set.end())
    handle_misusage();
  return *pos;
 }
 /*
  * Entry point of the BLAS benchmark.
  *
  * Parses --op, --dtype, --device and --help, lists the available devices
  * (marking the selected one with an 'x') and runs the requested benchmark.
  * Invalid usage prints the help text and exits with EXIT_FAILURE.
  */
 int main(int argc, char* argv[])
 {
  std::vector<std::string> args(argv, argv + argc);
 #ifdef BENCH_CLBLAS
  clblasSetup();
 #endif
  sc::driver::backend::default_queue_properties = CL_QUEUE_PROFILING_ENABLE;
  if(std::find(args.begin(), args.end(), "--help") != args.end())
    handle_misusage();
  std::string operation = getopt(args, "--op", {"axpy", "dot", "gemv", "gemm"}, "gemm");
  // Only the data-types that are actually benchmarked below are accepted, in
  // line with the usage message. "float16" used to be accepted here and then
  // silently ran nothing.
  std::string dtype = getopt(args, "--dtype", {"float32", "float64"}, "float32");
  // Initialized so that it is never read indeterminate, whatever the catch path does
  int device = 0;
  try{
    device = std::stoi(getopt(args, "--device", {}, "0"));
  }catch(...){ handle_misusage(); }
  sc::driver::backend::default_device = device;
  /* List devices */
  std::cout << "Devices available:" << std::endl;
  std::cout << "------------------" << std::endl;
  size_t i = 0;
  std::vector<sc::driver::Platform> platforms;
  sc::driver::backend::platforms(platforms);
  for(sc::driver::Platform const & pf: platforms){
    std::vector<sc::driver::Device> devices;
    pf.devices(devices);
    // `dev` (not `device`) so that the selected index above is not shadowed
    for(sc::driver::Device const & dev: devices)
      std::cout << "[" << (i++==sc::driver::backend::default_device?"x":" ") << "]"
                << " - " << dev.name()
                << " on " << pf.name() << std::endl;
  }
  std::cout << "------------------" << std::endl;
  std::cout << std::fixed << std::setprecision(2);
  // Half-precision benchmarking is currently disabled:
  //if(dtype=="float16")
  //  bench<half_float::half>(sc::HALF_TYPE, operation);
  if(dtype=="float32")
    bench<float>(sc::FLOAT_TYPE, operation);
  if(dtype=="float64")
    bench<double>(sc::DOUBLE_TYPE, operation);
 #ifdef BENCH_CLBLAS
  clblasTeardown();
 #endif
 }
--- a/bench/common.hpp
+++ b/bench/common.hpp
@@ -1,152 +0,0 @@
 #ifndef ISAAC_BENCH_COMMON_HPP_
 #define ISAAC_BENCH_COMMON_HPP_
 #include <chrono>
 #include <algorithm>
 #include "isaac/array.h"
 namespace sc = isaac;
 /* Compile-time index tag used to pick a print_tuple overload. */
 template<std::size_t> struct int_{};
 /*
  * Recursive case: prints the element at index (size - Pos) followed by ','
  * and then the remaining Pos-1 elements.
  */
 template <class Tuple, size_t Pos>
 std::ostream& print_tuple(std::ostream& out, const Tuple& t, int_<Pos> )
 {
    out << std::get< std::tuple_size<Tuple>::value-Pos >(t) << ',';
    return print_tuple(out, t, int_<Pos-1>());
 }
 /* Terminal case: prints the last element, without a trailing ','. */
 template <class Tuple>
 std::ostream& print_tuple(std::ostream& out, const Tuple& t, int_<1> )
 {
    return out << std::get<std::tuple_size<Tuple>::value-1>(t);
 }
 /*
  * Empty tuple: nothing to print. Without this overload an empty tuple would
  * select the recursive case with Pos == 0 and fail to compile.
  */
 template <class Tuple>
 std::ostream& print_tuple(std::ostream& out, const Tuple&, int_<0> )
 {
    return out;
 }
 /* Streams a tuple as its comma-separated elements, e.g. "1,2,3". */
 template <class... Args>
 std::ostream& operator<<(std::ostream& out, const std::tuple<Args...>& t)
 {
    print_tuple(out, t, int_<sizeof...(Args)>());
    return out;
 }
 /*
  * The helpers below are defined in a header, so they are declared `inline`:
  * otherwise every translation unit including this file would emit its own
  * external definition and linking two of them would fail.
  */

 /* Returns N if it is a multiple of pad, otherwise (N+pad-1)/pad*pad. */
 inline int ceil(int N, int pad)
 {
    return (N%pad==0)?N:(N+pad-1)/pad*pad;
 }
 /*
  * Returns N values spaced logarithmically from min towards max (max itself is
  * not reached, since the i-th value uses the fraction i/N), each truncated to
  * int and then padded with ceil(value, pad).
  */
 inline std::vector<int> create_log_range(int min, int max, int N, int pad)
 {
  std::vector<int> res(N);
  for(int i = 0 ; i < N ; ++i)
  {
    res[i] = static_cast<int>(std::exp(std::log(min) + (float)(std::log(max) - std::log(min))*i/N));
    res[i] = ceil(res[i], pad);
  }
  return res;
 }
 /* Returns every multiple of pad in [ceil(min, pad), ceil(max, pad)). */
 inline std::vector<int> create_full_range(int min, int max, int pad)
 {
    std::vector<int> N;
    for(int i = ceil(min, pad) ; i < ceil(max, pad) ; i+=pad)
        N.push_back(i);
    return N;
 }
 /*
  * Median of x. The vector is taken by value and sorted locally. For an even
  * number of elements the two middle values are averaged using T's own
  * arithmetic (so integer types truncate).
  */
 template<class T>
 T median(std::vector<T> x)
 {
  std::sort(x.begin(), x.end());
  const size_t mid = x.size() / 2;
  const bool odd_count = (x.size() % 2 != 0);
  if (odd_count)
      return x[mid];
  return (x[mid - 1] + x[mid]) / 2;
 }
 /* Smallest element of x. */
 template<class T>
 T min(std::vector<T> x)
 {
  auto smallest = std::min_element(x.begin(), x.end());
  return *smallest;
 }
 /* Largest element of x. */
 template<class T>
 T max(std::vector<T> x)
 {
  auto largest = std::max_element(x.begin(), x.end());
  return *largest;
 }
 /*
  * Arithmetic mean of x, computed with T's own arithmetic: the sum is
  * accumulated in a T and divided by the element count held in an int.
  */
 template<class T>
 T mean(std::vector<T> x)
 {
  int count = x.size();
  T sum = 0;
  for(T const & value: x)
    sum += value;
  return sum/count;
 }
 /*
  * Minimal stopwatch built on std::chrono::high_resolution_clock.
  * start() records the current time; get() returns the time elapsed since the
  * last start(), in nanoseconds.
  */
 class Timer
 {
    typedef std::chrono::high_resolution_clock clock_type;
 public:
    /* When run is true the stopwatch is started on construction. */
    explicit Timer(bool run = false)
    {
        if (run)
            start();
    }

    void start()
    {
        begin_ = clock_type::now();
    }

    std::chrono::nanoseconds get() const
    {
        clock_type::duration elapsed = clock_type::now() - begin_;
        return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed);
    }
 private:
    clock_type::time_point begin_;
 };
 /*
  * Shorthand accessors for the raw back-end handles of isaac objects.
  * cl(...) returns the OpenCL handle, cu(...) the CUDA one.
  * They are defined in a header and therefore declared `inline`, so that
  * including this file from several translation units does not produce
  * duplicate definitions at link time.
  */
 inline cl_mem& cl(sc::array& x)
 { return x.data().handle().cl(); }
 inline cl_mem& cl(sc::scalar& x)
 { return x.data().handle().cl(); }
 inline cl_command_queue& cl(sc::driver::CommandQueue& x)
 { return x.handle().cl(); }
 inline CUdeviceptr& cu(sc::array& x)
 { return x.data().handle().cu(); }
 inline CUdeviceptr& cu(sc::scalar& x)
 { return x.data().handle().cu(); }
 inline CUstream& cu(sc::driver::CommandQueue& x)
 { return x.handle().cu(); }
 /*
  * Numeric codes emitted by color_stream between "\033[" and "m".
  */
 enum Code {
  // text attributes
  RESET = 0,
  BOLD = 1,
  ITALIC = 3,
  // foreground colors
  FG_RED = 31,
  FG_GREEN = 32,
  FG_YELLOW = 33,
  FG_BLUE = 34,
  FG_MAGENTA = 35,
  FG_CYAN = 36,
  FG_LIGHT_GRAY = 37,
  // light foreground colors
  FG_DARK_GRAY = 90,
  FG_LIGHT_RED = 91,
  FG_LIGHT_GREEN = 92,
  FG_LIGHT_YELLOW = 93,
  FG_LIGHT_BLUE = 94,
  FG_LIGHT_MAGENTA = 95,
  FG_LIGHT_CYAN = 96,
  FG_WHITE = 97
 };
 /*
  * Stream manipulator: inserting color_stream(c) into an ostream writes the
  * escape sequence "\033[<c>m", where <c> is the numeric value of the code.
  */
 class color_stream {
 public:
    color_stream(Code pCode) : code_(pCode) {}

    friend std::ostream&
    operator<<(std::ostream& os, const color_stream& cs) {
        os << "\033[";
        os << cs.code_;
        os << "m";
        return os;
    }
 private:
    Code code_;
 };
 #endif
--- a/bench/overhead.cpp
+++ b/bench/overhead.cpp
@@ -1,54 +0,0 @@
 #include "isaac/array.h"
 #include <vector>
 namespace sc = isaac;
 #ifdef BENCH_CUBLAS
 /* Empty CUDA kernel: launching it measures pure launch overhead. */
 __global__ void dummy(){}
 #endif
 /*
  * For every queue entry, builds and launches an empty OpenCL kernel and
  * prints the difference between its profiled end and start times. When
  * BENCH_CUBLAS is defined, an empty CUDA kernel is timed as well, using
  * CUDA events.
  */
 int main()
 {
  for(sc::driver::backend::data_type::const_iterator it = sc::driver::queues.data().begin() ; it != sc::driver::queues.data().end() ; ++it)
  {
    cl::CommandQueue queue = it->second[0];
    cl::Context context = it->first;
    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    cl::Program program(context,"__kernel void dummy(){}");
    program.build();
    cl::Kernel kernel(program, "dummy");
    cl::NDRange offset = cl::NullRange;
    cl::NDRange global(1);
    cl::NDRange local(1);
    cl::Event event;
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "-------------------------" << std::endl;
    queue.enqueueNDRangeKernel(kernel, offset, global, local, NULL, &event);
    queue.flush();
    queue.finish();
    {
    // Scoped so that this `time` does not clash with the CUDA one below
    long time = event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    std::cout << "Kernel launch overhead: " << time << std::endl;
    }
 #ifdef BENCH_CUBLAS
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    dummy<<<1, 1>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "CUDA Kernel launch overhead: " << time << std::endl;
    // Events are created on every iteration: release them to avoid leaking
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
 #endif
    std::cout << "-------------------------" << std::endl;
  }
 }
--- a/cmake/FindCLBLAS.cmake
+++ b/cmake/FindCLBLAS.cmake
@@ -1,15 +0,0 @@
 # Locates clBLAS (header clBLAS.h, library clBLAS).
 # Every /opt/clBLAS* directory is used as a search hint.
 #
 # Result variables:
 #   CLBLAS_FOUND        - set by find_package_handle_standard_args
 #   CLBLAS_INCLUDE_DIR  - directory containing clBLAS.h
 #   CLBLAS_LIBRARIES    - the clBLAS library
 file(GLOB CLBLAS_ROOT /opt/clBLAS*)

 # The glob may match several installations (or none): build one hint per
 # match instead of appending the suffix to the whole list at once.
 set(CLBLAS_INCLUDE_HINTS)
 set(CLBLAS_LIBRARIES_HINTS)
 foreach(_clblas_root ${CLBLAS_ROOT})
    list(APPEND CLBLAS_INCLUDE_HINTS "${_clblas_root}/include")
    list(APPEND CLBLAS_LIBRARIES_HINTS "${_clblas_root}/lib64")
 endforeach()

 find_path(CLBLAS_INCLUDE_DIR clBLAS.h HINTS ${CLBLAS_INCLUDE_HINTS})
 find_library(CLBLAS_LIBRARIES NAMES clBLAS HINTS ${CLBLAS_LIBRARIES_HINTS})

 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(CLBLAS  DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIR)

 # Hide the cache entries created above (there is no cache variable named CLBLAS)
 mark_as_advanced(CLBLAS_INCLUDE_DIR CLBLAS_LIBRARIES)
--- a/cmake/FindMKL.cmake
+++ b/cmake/FindMKL.cmake
@@ -1,19 +0,0 @@
 # Locates Intel MKL (header mkl_blas.h, library mkl_core) and an OpenMP runtime.
 # Intel installations under /opt/intel are used as search hints.
 #
 # Result variables:
 #   MKL_FOUND        - set by find_package_handle_standard_args
 #   MKL_INCLUDE_DIR  - directory containing mkl_blas.h
 #   MKL_LIBRARIES    - full link line (MKL libraries, OpenMP runtime, pthread)
 file(GLOB SYSTEM_STUDIO_ROOT /opt/intel/ /opt/intel/composerxe* /opt/intel/system_studio_*)

 # The glob may match several directories: derive the hints for each match
 # (appending the suffix to the unquoted list only affected its last element).
 set(MKL_INCLUDE_HINTS)
 set(MKL_LIBRARY_HINTS)
 set(ICC_LIBRARY_HINTS)
 foreach(_mkl_root ${SYSTEM_STUDIO_ROOT})
    list(APPEND MKL_INCLUDE_HINTS "${_mkl_root}/mkl/include")
    list(APPEND MKL_LIBRARY_HINTS "${_mkl_root}/mkl/lib/intel64")
    list(APPEND ICC_LIBRARY_HINTS "${_mkl_root}/compiler/lib/intel64")
 endforeach()

 find_path(MKL_INCLUDE_DIR mkl_blas.h HINTS ${MKL_INCLUDE_HINTS})
 find_library(MKL_LIBRARIES NAMES mkl_core HINTS ${MKL_LIBRARY_HINTS})
 find_library(ICC_LIBRARIES NAMES iomp5 HINTS ${ICC_LIBRARY_HINTS})

 # Prefer Intel's OpenMP runtime when available, otherwise fall back to gomp
 if(ICC_LIBRARIES)
    set(OMP_LIBRARIES ${ICC_LIBRARIES})
 else()
    set(OMP_LIBRARIES gomp)
 endif()

 if(MKL_LIBRARIES AND OMP_LIBRARIES)
    set(MKL_LIBRARIES -lmkl_mc3 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core ${MKL_LIBRARIES} ${OMP_LIBRARIES} pthread)
 endif()

 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(MKL  DEFAULT_MSG MKL_LIBRARIES MKL_INCLUDE_DIR)

 # Hide the cache entries created above (there is no cache variable named MKL)
 mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARIES ICC_LIBRARIES)
--- a/cmake/FindOpenBlas.cmake
+++ b/cmake/FindOpenBlas.cmake
@@ -1,10 +0,0 @@
 # Locates OpenBLAS (header cblas.h, library openblas).
 # $ENV{OPENBLAS_HOME}/lib is searched in addition to common system locations.
 #
 # Result variables:
 #   OPENBLAS_INCLUDE_DIR  - directory containing cblas.h
 #   OPENBLAS_LIBRARIES    - the openblas library
 find_path(OPENBLAS_INCLUDE_DIR cblas.h)
 find_library(OPENBLAS_LIBRARIES
              NAMES openblas
              PATHS /lib/ /lib64/  /usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64 /opt/OpenBLAS/lib $ENV{OPENBLAS_HOME}/lib)

 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(OpenBlas DEFAULT_MSG OPENBLAS_LIBRARIES OPENBLAS_INCLUDE_DIR)

 # Hide the cache entries created above (there is no cache variable named OpenBlas)
 mark_as_advanced(OPENBLAS_INCLUDE_DIR OPENBLAS_LIBRARIES)
--- a/cmake/FindOpenCL.cmake
+++ b/cmake/FindOpenCL.cmake
@@ -1,30 +0,0 @@
 # Locates libOpenCL. Results: OPENCL_LIBRARIES, OpenCL_FOUND
 # (as reported by find_package_handle_standard_args).
 # Hint #1: Intel OpenCL SDK, picking the sub-directory that matches the pointer size
 if(CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(L_HINTS "$ENV{INTELOCLSDKROOT}/lib/x64")
 else()
    set(L_HINTS "$ENV{INTELOCLSDKROOT}/lib/x86")
 endif()
 # Glob patterns for vendor installations under /opt
 set(ANDROID_CL_GLOB_HINTS /opt/adreno-driver*/lib)
 set(X86_CL_GLOB_HINTS /opt/AMDAPPSDK*/lib/x86_64)
 if(ANDROID)
    # file(GLOB) accepts every pattern at once, so no per-pattern loop is needed
    file(GLOB _TMP ${ANDROID_CL_GLOB_HINTS})
    list(APPEND L_HINTS ${_TMP})
    find_library(OPENCL_LIBRARIES NAMES OpenCL NO_CMAKE_FIND_ROOT_PATH HINTS ${L_HINTS} )
 else()
    file(GLOB _TMP ${X86_CL_GLOB_HINTS})
    list(APPEND L_HINTS ${_TMP})
    # The CUDA toolkit also ships a libOpenCL
    list(APPEND L_HINTS "${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/")
    find_library(OPENCL_LIBRARIES NAMES OpenCL HINTS ${L_HINTS} )
 endif()
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_LIBRARIES)
 # mark_as_advanced() takes cache entry names, i.e. the find_library result variable
 mark_as_advanced(OPENCL_LIBRARIES)
--- a/cmake/android/apk.cmake
+++ b/cmake/android/apk.cmake
@@ -1,138 +0,0 @@
 #*********************************************************#
 #*  File: Apk.cmake                                      *
 #*    Android apk tools
 #*
 #*  Copyright (C) 2002-2013 The PixelLight Team (http://www.pixellight.org/)
 #*
 #*  This file is part of PixelLight.
 #*
 #*  Permission is hereby granted, free of charge, to any person obtaining a copy of this software
 #*  and associated documentation files (the "Software"), to deal in the Software without
 #*  restriction, including without limitation the rights to use, copy, modify, merge, publish,
 #*  distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
 #*  Software is furnished to do so, subject to the following conditions:
 #*
 #*  The above copyright notice and this permission notice shall be included in all copies or
 #*  substantial portions of the Software.
 #*
 #*  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 #*  BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 #*  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 #*  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 #*  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #*********************************************************#
 ##################################################
 ## User-tunable cache entries
 ##################################################
 # API level, used by android_create_apk as "android-<level>"
 set(ANDROID_APK_API_LEVEL
     "10"
     CACHE STRING "Android APK API level")
 # Push the freshly built apk to the device with "adb install"
 set(ANDROID_APK_INSTALL
     "0"
     CACHE BOOL "Install created apk file on the device automatically?")
 # Install and then launch the application on the device
 set(ANDROID_APK_RUN
     "0"
     CACHE BOOL "Run created apk file on the device automatically? (installs it automatically as well, \"ANDROID_APK_INSTALL\"-option is ignored)")
 # Signing credentials handed to jarsigner for release builds
 set(ANDROID_APK_SIGNER_KEYSTORE
     "~/my-release-key.keystore"
     CACHE STRING "Keystore for signing the apk file (only required for release apk)")
 set(ANDROID_APK_SIGNER_ALIAS
     "myalias"
     CACHE STRING "Alias for signing the apk file (only required for release apk)")
 ##################################################
 ## Internal variables
 ##################################################
 # Remember the directory that holds this CMake file
 set(ANDROID_THIS_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
 ##################################################
 ## MACRO: android_create_apk
 ##
 ## Create/copy Android apk related files
 ##
 ## @param name
 ##   Name of the project (e.g. "MyProject"), this will also be the name of the created apk file.
 ##   Must be an existing target: all build steps are attached to it.
 ## @param apk_package_name
 ##   Package name of the application
 ## @param apk_directory
 ##   Directory in which to construct the apk file (e.g. "${CMAKE_BINARY_DIR}/apk")
 ## @param libs_directory
 ##   Directory where the built android libraries will be POST_BUILD, e.g ${CMAKE_SOURCE_DIR}/libs
 ##   (accepted for interface compatibility; the macro body does not read it)
 ## @param android_directory
 ##   Directory providing the "res" folder and the "AndroidManifest.xml" template
 ## @param assets_directory
 ##   Directory where the assets for the application are located
 ##   NOTE(review): the macro body copies "${CMAKE_SOURCE_DIR}/assets" and does not read
 ##   this argument - confirm which location is intended
 ##
 ## @remarks
 ##   Sets ANDROID_NAME, ANDROID_APK_PACKAGE, ANDROID_APK_DEBUGGABLE, MAINLIB and ISAAC
 ##   in the caller's scope (this is a macro, not a function).
 ##   Requires the following tools to be found automatically
 ##   - "android" (part of the Android SDK)
 ##   - "adb" (part of the Android SDK)
 ##   - "ant" (type e.g. "sudo apt-get install ant" on your Linux system to install Ant)
 ##   - "jarsigner" (part of the JDK)
 ##   - "zipalign" (part of the Android SDK)
 ##################################################
 macro(android_create_apk name apk_package_name apk_directory libs_directory android_directory assets_directory)
   set(ANDROID_NAME ${name})
   set(ANDROID_APK_PACKAGE ${apk_package_name})
   # Artifact paths are resolved at generate time through generator expressions;
   # reading the LOCATION target property is rejected by policy CMP0026.
   set(MAINLIB "$<TARGET_FILE:${name}>")
   set(ISAAC "$<TARGET_FILE:isaac>")
   # Create the directory for the libraries
   add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E remove_directory "${apk_directory}/libs")
   add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory "${apk_directory}/libs/armeabi-v7a/")
   add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy  "${MAINLIB}" "${apk_directory}/libs/armeabi-v7a/")
   # ANDROID_APK_DEBUGGABLE is set before configure_file() processes the manifest template
   if(CMAKE_BUILD_TYPE MATCHES Release)
     set(ANDROID_APK_DEBUGGABLE "false")
   else()
     set(ANDROID_APK_DEBUGGABLE "true")
   endif()
   # Resources and manifest
   add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory "${apk_directory}/res")
   add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory  "${android_directory}/res" "${apk_directory}/res/")
   configure_file("${android_directory}/AndroidManifest.xml" "${apk_directory}/AndroidManifest.xml")
   # Let the SDK create/update the Ant project files inside the apk directory
   add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD COMMAND android update project -t android-${ANDROID_APK_API_LEVEL} --name ${ANDROID_NAME} --path "${apk_directory}")
   # Copy assets
   add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E remove_directory "${apk_directory}/assets")
   add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory "${apk_directory}/assets/")
   add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory  "${CMAKE_SOURCE_DIR}/assets" "${apk_directory}/assets/")
   # Build the apk file
   if(CMAKE_BUILD_TYPE MATCHES Release)
     # Let Ant create the unsigned apk file
     add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
       COMMAND ant release
       WORKING_DIRECTORY "${apk_directory}")
     # Sign the apk file
     add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
       COMMAND jarsigner -verbose -keystore ${ANDROID_APK_SIGNER_KEYSTORE} bin/${ANDROID_NAME}-unsigned.apk ${ANDROID_APK_SIGNER_ALIAS}
       WORKING_DIRECTORY "${apk_directory}")
     # Align the apk file
     add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
       COMMAND zipalign -v -f 4 bin/${ANDROID_NAME}-unsigned.apk bin/${ANDROID_NAME}.apk
       WORKING_DIRECTORY "${apk_directory}")
     # Install current version on the device/emulator
     if(ANDROID_APK_INSTALL OR ANDROID_APK_RUN)
       add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
         COMMAND adb install -r bin/${ANDROID_NAME}.apk
         WORKING_DIRECTORY "${apk_directory}")
     endif()
   else()
     # Let Ant create the debug apk file
     add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
       COMMAND ant debug
       WORKING_DIRECTORY "${apk_directory}")
     # Install current version on the device/emulator
     if(ANDROID_APK_INSTALL OR ANDROID_APK_RUN)
       add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
         COMMAND adb install -r bin/${ANDROID_NAME}-debug.apk
         WORKING_DIRECTORY "${apk_directory}")
     endif()
   endif()
   # Start the application
   if(ANDROID_APK_RUN)
     add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
       COMMAND adb shell am start -n ${ANDROID_APK_PACKAGE}/android.app.NativeActivity)
   endif()
 endmacro()
--- a/cmake/android/example.sh
+++ b/cmake/android/example.sh
@@ -1 +0,0 @@
 # Example configure line for an Android build, run from a build directory one level below the source root.
 # The ABI name contains spaces, so it is quoted to reach cmake as a single -D argument.
 cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/android.cmake -DANDROID_NDK=/opt/android-ndk-r10d/ -DANDROID_ABI="armeabi-v7a with NEON" -DANDROID_NATIVE_API_LEVEL=19 -DANDROID_APK_API_LEVEL=19 -DANDROID_APK_RUN=1 ../
--- a/cmake/helpers/CodeToH.cmake
+++ b/cmake/helpers/CodeToH.cmake
@@ -1,61 +0,0 @@
 #Copyright (c) 2014, ArrayFire
 #All rights reserved.
 # CODE_TO_H: turns source files (e.g. OpenCL/CUDA kernels) into C++ array
 # definitions by running the bin2cpp helper on each of them.
 #
 # For every input file one output file "<OUTPUT_DIR>/<name>.<EXTENSION>" is
 # generated, where <name> is the input file name without directory and
 # extension. bin2cpp is asked to name the array <name> (any "." replaced with
 # an underscore); it also emits the element count as <name>_len, and wraps
 # both in NAMESPACE.
 #
 # Arguments:
 #   SOURCES     input files
 #   OUTPUT_DIR  directory receiving the generated files (created on demand)
 #   EXTENSION   extension of the generated files, also forwarded to bin2cpp
 #   NAMESPACE   forwarded to bin2cpp --namespace
 #   EOF         forwarded to bin2cpp --eof
 #   TARGET      name of the custom target (part of ALL) depending on every generated file
 #   VARNAME     optional; variable set in the caller's scope to the list of generated files
 #
 # Usage example:
 #
 # set(KERNELS a.cl b/c.cl)
 # CODE_TO_H(
 #   SOURCES ${KERNELS}
 #   VARNAME OUTPUTS
 #   EXTENSION "hpp"
 #   OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/kernels"
 #   TARGET kernels_to_h
 #   NAMESPACE "ns"
 #   EOF "1"
 # )
 # add_executable(foo ${OUTPUTS})
 include(CMakeParseArguments)
 set(BIN2CPP_PROGRAM "bin2cpp")
 function(CODE_TO_H)
    cmake_parse_arguments(ARGS "" "VARNAME;EXTENSION;OUTPUT_DIR;TARGET;NAMESPACE;EOF" "SOURCES" ${ARGN})
    set(_output_files "")
    foreach(_input_file ${ARGS_SOURCES})
        get_filename_component(_path "${_input_file}" PATH)
        get_filename_component(_name "${_input_file}" NAME)
        get_filename_component(_name_we "${_input_file}" NAME_WE)
        set(var_name ${_name_we})
        set(_namespace "${ARGS_NAMESPACE}")
        # Quoted so that an empty name is still passed as the <input> argument
        string(REPLACE "." "_" var_name "${var_name}")
        set(_output_path "${ARGS_OUTPUT_DIR}")
        set(_output_file "${_output_path}/${_name_we}.${ARGS_EXTENSION}")
        # NOTE(review): the echo step appends an #include line to the output file, but the
        # bin2cpp step that follows writes the same --output path - confirm the line survives.
        add_custom_command(
            OUTPUT ${_output_file}
            DEPENDS ${_input_file} ${BIN2CPP_PROGRAM}
            COMMAND ${CMAKE_COMMAND} -E make_directory "${_output_path}"
            COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\<${_path}/${_name_we}.hpp\\>"  >>"${_output_file}"
            COMMAND ${BIN2CPP_PROGRAM} --file ${_name} --namespace ${_namespace} --output ${_output_file} --name ${var_name} --eof ${ARGS_EOF} --extension ${ARGS_EXTENSION}
            WORKING_DIRECTORY "${_path}"
            COMMENT "Compiling ${_input_file} to C++ source"
        )
        list(APPEND _output_files ${_output_file})
    endforeach()
    add_custom_target(${ARGS_TARGET} ALL DEPENDS ${_output_files})
    # Hand the generated file list back to the caller when a result variable was named
    if(ARGS_VARNAME)
        set(${ARGS_VARNAME} ${_output_files} PARENT_SCOPE)
    endif()
 endfunction()
--- a/cmake/helpers/bin2cpp.cpp
+++ b/cmake/helpers/bin2cpp.cpp
@@ -1,194 +0,0 @@
 // Copyright (c) 2014, ArrayFire
 // All rights reserved.
 // Umar Arshad
 // Copyright 2014
 #include <stdlib.h>
 #include <fstream>
 #include <sstream>
 #include <iostream>
 #include <string>
 #include <vector>
 #include <map>
 #include <memory>
 #include <algorithm>
 using namespace std;
 typedef map<string, string> opt_t;
 // Prints the command line help to std::cout and terminates the process with
 // exit status 0. The option table lists every switch parse_options() accepts.
 static
 void print_usage() {
    cout << R"delimiter(BIN2CPP
 Converts files from a binary file to C++ headers. It is similar to bin2c and
 xxd but adds support for namespaces.
 | --name        | name of the variable (default: var)                               |
 | --file        | input file                                                        |
 | --output      | output file (If no output is specified then it prints to stdout   |
 | --extension   | Output extension; "#pragma once" is emitted unless it is cpp     |
 | --type        | Type of variable (default: char)                                  |
 | --namespace   | A space separated list of namespaces                              |
 | --eof         | Set to 1 to append a terminating 0x0 element                      |
 | --formatted   | Tabs for formatting                                               |
 | --verbose     | Prints the parsed options                                         |
 | --version     | Prints my name                                                    |
 | --help        | Prints usage info                                                 |
 Example
 -------
 Command:
 ./bin2cpp --file blah.txt --namespace blah detail --formatted --name blah_var
 Will produce:
 #pragma once
 #include <cstddef>
 namespace blah {
    namespace detail {
        static const char blah_var[] = {
            0x2f,    0x2f,    0x20,    0x62,    0x6c,    0x61,    0x68,    0x2e,    0x74,    0x78,
            0x74,    0xa,    0x62,    0x6c,    0x61,    0x68,    0x20,    0x62,    0x6c,    0x61,
            0x68,    0x20,    0x62,    0x6c,    0x61,    0x68,    0xa,    };
        static const size_t blah_var_len = 27;
    }
 })delimiter";
        exit(0);
 }
 // Set by the --formatted switch; enables indentation of the generated code.
 static bool formatted;
 // Writes `level` tab characters to std::cout when formatting is enabled,
 // and nothing otherwise (or when `level` is not positive).
 static
 void add_tabs(const int level ){
    if(!formatted || level <= 0) {
        return;
    }
    std::cout << std::string(static_cast<std::size_t>(level), '\t');
 }
 // Turns the raw argument list into an option -> value map.
 // Switches (--verbose, --formatted, --version, --help) act immediately; every
 // other known option collects the arguments that follow it, joined by single
 // spaces, until the next known option appears.
 static
 opt_t
 parse_options(const vector<string>& args) {
    // All value-taking options are known up front and start out empty
    opt_t options;
    for(const char* key : {"--name", "--type", "--file", "--output", "--extension", "--namespace", "--eof"}) {
        options[key] = "";
    }
    string pending;        // option currently collecting values
    bool verbose = false;
    for(const string& arg : args) {
        if(arg == "--verbose") {
            verbose = true;
            continue;
        }
        if(arg == "--formatted") {
            formatted = true;
            continue;
        }
        if(arg == "--version") {
            cout << args[0] << " By Umar Arshad" << endl;
            continue;
        }
        if(arg == "--help") {
            print_usage();
            continue;
        }
        if(options.count(arg) != 0) {
            pending = arg;
            continue;
        }
        if(pending.empty()) {
            // Argument seen before any option (argv[0] included): silently ignored
            continue;
        }
        string& value = options[pending];
        if(value != "") {
            value += " ";
        }
        value += arg;
    }
    if(verbose) {
        for(const auto& entry : options) {
            cout << entry.first << " " << entry.second << endl;
        }
    }
    return options;
 }
 // Entry point: parses the command line, then writes a C++ definition of the
 // input file's bytes (an array named --name plus <name>_len, wrapped in the
 // requested namespaces) to --output, or to stdout when no output is given.
 // Returns EXIT_FAILURE when the input or output file cannot be opened, or
 // when writing the result failed.
 int main(int argc, const char * const * const argv)
 {
    vector<string> args(argv, argv+argc);
    opt_t options = parse_options(args);
    // Open the input first, in binary mode so that no byte is translated, and
    // refuse to generate an (invalid) empty array for an unreadable file.
    ifstream input(options["--file"], std::ios::in | std::ios::binary);
    if(!input) {
        std::cerr << "bin2cpp: cannot open input file '" << options["--file"] << "'" << endl;
        return EXIT_FAILURE;
    }
    //Save default cout buffer. Need this to prevent crash.
    auto bak = cout.rdbuf();
    unique_ptr<ofstream> outfile;
    // Set defaults
    if(options["--name"] == "")     { options["--name"]     = "var"; }
    if(options["--output"] != "")   {
        //redirect stream if output file is specified
        outfile.reset(new ofstream(options["--output"]));
        if(!*outfile) {
            std::cerr << "bin2cpp: cannot open output file '" << options["--output"] << "'" << endl;
            return EXIT_FAILURE;
        }
        cout.rdbuf(outfile->rdbuf());
    }
    // Headers get an include guard; .cpp outputs do not
    if(options["--extension"] != "cpp")
        cout << "#pragma once\n";
    cout << "\n";
    cout << "#include <cstddef>\n"; // defines size_t
    cout << "\n";
    int ns_cnt = 0;
    int level = 0;
    if(options["--namespace"] != "") {
        // One nested namespace per whitespace-separated name
        std::stringstream namespaces(options["--namespace"]);
        string name;
        namespaces >> name;
        do {
            add_tabs(level++);
            cout << "namespace " << name << "\n";
            cout << "{\n";
            ns_cnt++;
            namespaces >> name;
        } while(!namespaces.fail());
    }
    if(options["--type"] == "") {
        options["--type"]     = "char";
    }
    add_tabs(level);
    cout << "\n";
    cout << "static const " << options["--type"] << " " << options["--name"] << "[] = {\n";
    size_t char_cnt = 0;
    add_tabs(++level);
    for(char i; input.get(i);) {
        // Go through unsigned char so that bytes >= 0x80 print as 0xNN
        // instead of a sign-extended 0xffffffNN.
        cout << "0x" << std::hex << static_cast<int>(static_cast<unsigned char>(i)) << ",\t";
        char_cnt++;
        if(!(char_cnt % 10)) {
            cout << endl;
            add_tabs(level);
        }
    }
    if (options["--eof"].c_str()[0] == '1') {
        // Add end of file character
        cout << "0x0";
        char_cnt++;
    }
    cout << "};\n";
    add_tabs(--level);
    cout << "\n";
    cout << "static const std::size_t " << options["--name"] << "_len" << " = " << std::dec << char_cnt << ";\n";
    cout << "\n";
    while(ns_cnt--) {
        add_tabs(--level);
        cout << "}\n";
    }
    cout.flush();
    const bool write_ok = cout.good();
    // Restore the original buffer before `outfile` is destroyed
    cout.rdbuf(bak);
    if(!write_ok) {
        cout.clear();
        std::cerr << "bin2cpp: error while writing the generated code" << endl;
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
 }
--- a/cmake/python/setup.py
+++ b/cmake/python/setup.py
@@ -1,130 +0,0 @@
 #Thanks to Andreas Knoeckler for providing stand-alone boost.python
 #through PyOpenCL and PyCUDA
 import os, sys
 from distutils.ccompiler import show_compilers,new_compiler
 from distutils.command.build_ext import build_ext
 from distutils.command.build_py import build_py
 from distutils.core import setup, Extension
 from distutils.sysconfig import get_python_inc
 from distutils import sysconfig
 from imp import find_module
 from glob import glob
 from os.path import dirname
 # Per-compiler overrides, keyed by the distutils compiler type of the active compiler.
 platform_cflags = {}
 platform_ldflags = {}
 platform_libs = {}
 class build_ext_subclass(build_ext):
    """build_ext variant that applies the per-compiler tables above to every extension."""
    def build_extensions(self):
        """Patch compile args, link args and libraries of each extension, then build."""
        c = self.compiler.compiler_type
        if c in platform_cflags:
            for e in self.extensions:
                e.extra_compile_args = platform_cflags[c]
        if c in platform_ldflags:
            for e in self.extensions:
                e.extra_link_args = platform_ldflags[c]
        if c in platform_libs:
            for e in self.extensions:
                try:
                    e.libraries += platform_libs[c]
                except (AttributeError, TypeError):
                    # The extension has no list to extend (attribute missing or None)
                    e.libraries = platform_libs[c]
        build_ext.build_extensions(self)
 def main():
    """Assemble the source list and build flags, then run the distutils setup of the isaac package."""
    def recursive_glob(rootdir='.', suffix=''):
        """Return every file below `rootdir` whose name ends with `suffix`."""
        return [os.path.join(looproot, filename)
                for looproot, _, filenames in os.walk(rootdir)
                for filename in filenames if filename.endswith(suffix)]
    def remove_prefixes(optlist, bad_prefixes):
        """Remove, in place, the first flag starting with each of `bad_prefixes`; return `optlist`."""
        for bad_prefix in bad_prefixes:
            for i, flag in enumerate(optlist):
                if flag.startswith(bad_prefix):
                    optlist.pop(i)
                    break
        return optlist
    #Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
    #(missing configuration variables are treated as empty instead of raising KeyError)
    cvars = sysconfig.get_config_vars()
    cvars['OPT'] = str.join(' ', remove_prefixes(cvars.get('OPT', '').split(), ['-g', '-Wstrict-prototypes']))
    cvars["CFLAGS"] = cvars.get("BASECFLAGS", '') + ' ' + cvars['OPT']
    cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars.get("LDFLAGS", '')
    #Check Android
    for_android = '-mandroid' in cvars.get('PY_CFLAGS', '')
    #Dynamic load for backend switching
    libraries = ['dl']
    library_dirs = []
    #Include directories
    #numpy reports its own header location, wherever the installed version keeps it
    import numpy
    numpy_include = numpy.get_include()
    #NOTE(review): the ${...} markers look like placeholders filled in when this template is configured - confirm
    include ='${INCLUDE_DIRECTORIES_STR}'.split() + ['external/boost/', 'external/boost/boost/', numpy_include]
    #Android
    if for_android:
      ANDROID_ROOT = os.environ['ANDROIDNDK'] + '/sources/cxx-stl/gnu-libstdc++/' + os.environ['TOOLCHAIN_VERSION']
      library_dirs += [ANDROID_ROOT + '/libs/armeabi']
      include += [ANDROID_ROOT + '/include/', ANDROID_ROOT + '/libs/armeabi/include/']
      libraries += ['gnustl_shared']
    #Source files
    src =  '${LIBISAAC_SRC_STR}'.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
    #Bundled boost components, minus their win32- and pthread-specific sources
    boostsrc = 'external/boost/libs/'
    for s in ['numpy','python','smart_ptr','system','thread']:
        src = src + [x for x in recursive_glob(boostsrc + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
    extensions = []
    #isaac
    extensions += [Extension(
                    '_isaac',src,
                    extra_compile_args= ['-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs',  '-Wno-sign-compare', '-Wno-attributes', '-DBOOST_PYTHON_SOURCE '],
                    extra_link_args=['-Wl,-soname=_isaac.so'],
                    undef_macros=[],
                    include_dirs=include,
                    library_dirs=library_dirs,
                    libraries=libraries)]
    #External
    extensions += [Extension('external.sklearn._tree',
                             ['external/sklearn/_tree.c'],
                             include_dirs = [numpy_include])]
    #Setup
    setup(
                name='isaac',
                version='1.0',
                description="Input-specific architecture-aware computations",
                author='Philippe Tillet',
                author_email='ptillet@g.harvard.edu',
                license='MPL 2.0',
                packages=['isaac', 'isaac.external', 'isaac.external.sklearn'],
                ext_package="isaac",
                ext_modules=extensions,
                cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
                classifiers=[
                    'Environment :: Console',
                    'Development Status :: 1 - Experimental',
                    'Intended Audience :: Developers',
                    'Intended Audience :: Other Audience',
                    'Intended Audience :: Science/Research',
                    'License :: OSI Approved :: MIT License',
                    'Natural Language :: English',
                    'Programming Language :: C++',
                    'Programming Language :: Python',
                    'Programming Language :: Python :: 3',
                    'Topic :: Scientific/Engineering',
                    'Topic :: Scientific/Engineering :: Mathematics',
                    'Topic :: Scientific/Engineering :: Physics',
                    'Topic :: Scientific/Engineering :: Machine Learning',
                ]
    )
 if __name__ == "__main__":
    main()
--- a/cmake/toolchain/android.cmake
+++ b/cmake/toolchain/android.cmake
--- a/cmake/toolchain/cross-win32-mingw32.cmake
+++ b/cmake/toolchain/cross-win32-mingw32.cmake
@@ -1,12 +0,0 @@
 # Target platform: 32-bit Windows, cross-compiled with MinGW-w64
 set(CMAKE_SYSTEM_NAME Windows)
 # Cross toolchain executables (i686)
 set(CMAKE_C_COMPILER   /usr/bin/i686-w64-mingw32-gcc)
 set(CMAKE_CXX_COMPILER /usr/bin/i686-w64-mingw32-g++)
 set(CMAKE_RC_COMPILER  /usr/bin/i686-w64-mingw32-windres)
 # Roots of the target environment: bundled cross dependencies next to this file, then the MinGW sysroot
 set(CMAKE_FIND_ROOT_PATH
     "${CMAKE_CURRENT_LIST_DIR}/crossdeps"
     /usr/i686-w64-mingw32)
 # Programs are looked up on the host; headers and libraries only inside the roots above
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
--- a/cmake/toolchain/cross-win64-mingw32.cmake
+++ b/cmake/toolchain/cross-win64-mingw32.cmake
@@ -1,12 +0,0 @@
 # Target platform: 64-bit Windows, cross-compiled with MinGW-w64
 set(CMAKE_SYSTEM_NAME Windows)
 # Cross toolchain executables (x86_64)
 set(CMAKE_C_COMPILER   /usr/bin/x86_64-w64-mingw32-gcc)
 set(CMAKE_CXX_COMPILER /usr/bin/x86_64-w64-mingw32-g++)
 set(CMAKE_RC_COMPILER  /usr/bin/x86_64-w64-mingw32-windres)
 # Roots of the target environment: bundled cross dependencies next to this file, then the MinGW sysroot
 set(CMAKE_FIND_ROOT_PATH
     "${CMAKE_CURRENT_LIST_DIR}/crossdeps"
     /usr/x86_64-w64-mingw32)
 # Programs are looked up on the host; headers and libraries only inside the roots above
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
--- a/documentation/add-license.sh
+++ b/documentation/add-license.sh
@@ -2,6 +2,6 @@ for i in $(find ../lib/ ../include/isaac/ ../python/src/bind -name '*.cpp' -or -
 do
  if ! grep -q Copyright $i
  then
-    cat license-header.txt $i >$i.new && mv $i.new $i
+    cat ../LICENSE $i >$i.new && mv $i.new $i
  fi
 done
--- a/documentation/bench/CONV.pdf
+++ b/documentation/bench/CONV.pdf
--- a/documentation/bench/CONV.png
+++ b/documentation/bench/CONV.png
--- a/documentation/bench/GEMM.pdf
+++ b/documentation/bench/GEMM.pdf
--- a/documentation/bench/GEMM.png
+++ b/documentation/bench/GEMM.png
--- a/documentation/bench/bench-clBLAS.png
+++ b/documentation/bench/bench-clBLAS.png
--- a/documentation/bench/bench-cuBLAS.png
+++ b/documentation/bench/bench-cuBLAS.png
--- a/documentation/bench/plot.py
+++ b/documentation/bench/plot.py
@@ -1,69 +0,0 @@
 import matplotlib.pyplot as plt
 import numpy as np
 def add_line(ax, xpos, ypos, height=.1):
    """Draw an unclipped black vertical segment on `ax`, in axes coordinates.

    The segment sits at x = `xpos` and runs from `ypos + height` to `ypos`.
    """
    xs = [xpos, xpos]
    ys = [ypos + height, ypos]
    segment = plt.Line2D(xs, ys, transform=ax.transAxes, color='black')
    # Clipping is disabled so the segment may extend outside the axes box
    segment.set_clip_on(False)
    ax.add_line(segment)
 # Benchmark groups: (group caption, name of the dimension that varies inside the group)
 bench = [('DeepBench-Forward\nM=K=1760', 'N'),
         ('DeepBench-Backward\nM=K=2560', 'N'),
         ('Covariance\nK=60000', 'M=N'),
         ('Blocked SVD\nK=32', 'M=N')]
 # Tick labels of the bars, one inner list per group of `bench`
 labels = [[16, 32, 64, 128, 7000],
         [16, 32, 64, 128, 7000],
         [32, 256],
         [896, 3456, 4096]]
 # Per device: vendor library name, its bar colour, and the measurements plotted on
 # the TFLOPS axis for the vendor library ('libperf') and for ISAAC ('scperf'),
 # listed in the flattened order of `labels`
 configs = {
  'Pascal Titan X': {'lib': 'cuBLAS',
                     'libperf': [1.65, 1.88, 2.58, 4.83, 11.5,
                                 0.72, 1.72, 2.39, 2.86, 7.77,
                                 0.80, 3.61,
                                 1.37, 2.50, 2.57],
                     'libcol': 'green',
                     'scperf': [1.15, 2.43, 3.83, 5.53, 11.5,
                              1.78, 3.06, 4.37, 5.52, 8.67,
                              1.44, 6.43,
                              1.14, 4.53, 4.91]},
  'R9 Fury': {'lib': 'clBLAS',
              'libperf': [0.22, 0.65, 1.35, 1.92, 3.35,
                          0.28, 0.64, 1.36, 1.91, 3.32,
                          0.02, 0.87,
                          0.43, 0.98, 1.95],
              'libcol': '#d30034',
              'scperf':  [0.67, 0.94, 1.18, 2.12, 4.66,
                          0.63, 1.15, 1.43, 1.82, 4.22,
                          0.19, 2.82,
                          0.35, 1.82, 1.80]}
 }
 # One figure per device; dict.items() works on both Python 2 and Python 3
 for device, conf in configs.items():
    width = 0.5
    sep = 1.3
    # Bar positions: evenly spaced, then shifted by `sep` at the start of every group
    xx = sep*np.arange(len(conf['scperf'])) + width
    groups = [0] + [len(_) for _ in labels]
    for i in np.cumsum(groups)[:-1]:
        xx[i:] += sep
    xmax = xx[-1] + width + sep
    figure, ax = plt.subplots(figsize=(12,8))
    sc = ax.bar(xx - width, conf['scperf'], width, color='purple')
    cu = ax.bar(xx, conf['libperf'], width, color=conf['libcol'])
    # x positions of the separators between groups, plus both outer borders
    linex = [(xx[i] - sep) for i in np.cumsum(groups)[1:-1]]
    linex = [0] + linex + [xmax]
    for i in range(len(linex)-1):
        group, sublabel = bench[i]
        add_line(ax, linex[i]/xmax, 0, -10)
        ax.text(.5*(linex[i] + linex[i+1])/xmax, -.12, group, ha='center', transform=ax.transAxes, fontsize = 10, color='darkblue')
        ax.text(.5*(linex[i] + linex[i+1])/xmax, -.07, sublabel, ha='center', transform=ax.transAxes, fontsize = 10)
    ax.set_xlim((0,xmax))
    ax.set_xticks(xx)
    ax.set_xticklabels([x for _ in labels for x in _ ], rotation=30, fontsize=10)
    ax.set_ylabel('TFLOPS')
    ax.legend((sc, cu), ('ISAAC', conf['lib']))
    ax.set_title('sGEMM - {}'.format(device))
    plt.savefig('bench-{}.png'.format(conf['lib']))
    plt.show()
--- a/documentation/license-header.txt
+++ b/documentation/license-header.txt
@@ -1,21 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,4 +1,6 @@
-foreach(PROG indexing)
+foreach(PROG bench)
-     add_executable(example-${PROG} ${PROG}.cpp)
+     add_executable(${PROG} ${PROG}.cpp)
-     target_link_libraries(example-${PROG} isaac)
+     set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG})
     include_directories(/usr/local/cuda/include/)
     target_link_libraries(${PROG} PRIVATE isaac)
 endforeach(PROG)
--- a/examples/bench.cpp
+++ b/examples/bench.cpp
@@ -0,0 +1,181 @@
 #include <tuple>
 #include "isaac/driver/backend.h"
 #include "isaac/driver/cublas.h"
 #include "isaac/driver/context.h"
 #include "isaac/driver/buffer.h"
 #include "isaac/driver/stream.h"
 #include "isaac/tools/bench.hpp"
 #include "isaac/api.h"
 namespace sc = isaac;
 namespace drv = sc::driver;
 using sc::param_t;
 using std::make_tuple;
 // Geometric mean of `data`, evaluated as exp(mean(log(x))) so that the
 // running product never has to be formed explicitly.
 // An empty input yields NaN (0/0 inside the exponential).
 double geometric_mean(std::vector<double> const&data){
  double log_total = 0;
  for(double value: data)
    log_total += std::log(value);
  return std::exp(log_total/data.size());
 }
 // Prints the table header in bold italics: every entry of `sections`
 // followed by a tab, then the two fixed library columns.
 // NOTE(review): the second library column is always labelled "cuDNN", even
 // when this header is used for the GEMM table, where the reference is cuBLAS.
 void print_results_header(std::vector<std::string> sections){
    std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
    for(std::string const & title: sections)
      std::cout << title << "\t";
    std::cout << "ISAAC\tcuDNN" << color_stream(RESET) << std::endl;
 }
 // Prints one row of the results table.
 //   times  : raw measurements, one per benchmarked library
 //   prefix : leading columns describing the problem (each followed by a tab)
 //   fn     : converts a raw measurement into the performance figure to print
 // A figure is highlighted when it is at least 5% above the second-best
 // figure of the row, i.e. when there is a clear winner.
 void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<double(double)> fn){
    std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
    std::vector<double> perf;
    perf.reserve(times.size());
    std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
    // A runner-up only exists with two or more measurements; reading element
    // [1] of the sorted copy with a single measurement is out of bounds.
    const bool has_runner_up = perf.size() >= 2;
    double runner_up = 0;
    if(has_runner_up){
      auto ranked = perf;
      std::sort(ranked.begin(), ranked.end(), std::greater<double>());
      runner_up = ranked[1];
    }
    for(auto x: perf){
      if(has_runner_up && x/runner_up >= 1.05)
        std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
      else
        std::cout << x;
      std::cout << "\t";
    }
    std::cout << std::endl;
 }
 // Benchmark driver: times ISAAC against cuDNN on a list of convolution
 // shapes, then ISAAC against cuBLAS on a list of GEMM shapes, printing one
 // row per shape and the geometric mean of the speedups per section.
 int main(){
  std::cout << std::fixed << std::setprecision(2);
  auto ctx = drv::backend::contexts::get_default();
  drv::Stream stream(ctx);
  sc::DType dtype = sc::FLOAT_TYPE;
  int32_t dtsize = sc::size_of(dtype);
  drv::Device const & device = drv::backend::contexts::get_default().device();
  //CONV
  {
    // Tuple layout: (W, H, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w)
    typedef std::tuple<param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_tuple;
    std::vector<conv_tuple> shapes;
    //Cluster 1
    for(size_t N: std::vector<size_t>{4, 8, 16, 32})
      shapes.push_back(std::make_tuple(700, 161, 1, N, 32, 5, 20, 0, 0, 2, 2));
    //Cluster 2
    for(size_t N: std::vector<size_t>{4, 8, 16, 32})
      shapes.push_back(std::make_tuple(341, 79, 32, N, 32, 5, 10, 0, 0, 2, 2));
    //Cluster 3
    shapes.push_back(std::make_tuple(480, 48, 1, 16, 16, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(240, 24, 16, 16, 32, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(120, 12, 32, 16, 64, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(60, 6, 64, 16, 128, 3, 3, 1, 1, 1, 1));
    //Cluster 4
    shapes.push_back(std::make_tuple(108, 108, 3, 8, 64, 3, 3, 1, 1, 2, 2));
    shapes.push_back(std::make_tuple(54, 54, 64, 8, 64, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(27, 27, 128, 8, 128, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(14, 14, 128, 8, 256, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(7, 7, 256, 8, 512, 3, 3, 1, 1, 1, 1));
    //Cluster 5-6
    for(size_t N: std::vector<size_t>{8, 16}){
      shapes.push_back(std::make_tuple(224, 224, 3, N, 64, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(112, 112, 64, N, 128, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(56, 56, 128, N, 256, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(28, 28, 256, N, 512, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(14, 14, 512, N, 512, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(7, 7, 512, N, 512, 3, 3, 1, 1, 1, 1));
    }
    //Cluster 7
    shapes.push_back(std::make_tuple(224, 224, 3, 16, 64, 7, 7, 3, 3, 2, 2));
    shapes.push_back(std::make_tuple(28, 28, 192, 16, 32, 5, 5, 2, 2, 1, 1));
    shapes.push_back(std::make_tuple(28, 28, 192, 16, 64, 1, 1, 0, 0, 1, 1));
    shapes.push_back(std::make_tuple(14, 14, 512, 16, 48, 5, 5, 2, 2, 1, 1));
    shapes.push_back(std::make_tuple(14, 14, 512, 16, 192, 1, 1, 0, 0, 1, 1));
    shapes.push_back(std::make_tuple(7, 7, 832, 16, 256, 1, 1, 0, 0, 1, 1));
    shapes.push_back(std::make_tuple(7, 7, 832, 16, 128, 5, 5, 2, 2, 1, 1));
    param_t W, H, P, Q, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w;
    std::cout << "======================================================================" << std::endl;
    std::cout << "FCONV" << std::endl;
    std::cout << "======================================================================" << std::endl;
    print_results_header({"N", "K", "P", "Q", "C", "R", "S"});
    std::vector<double> speedup;
    for(auto shape: shapes){
      std::tie(W, H, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w) = shape;
      // Output extent of a strided convolution: floor((in + 2*pad - filter)/stride) + 1.
      // The "+1" must stay outside the division; folding it into the numerator
      // loses one output row/column whenever the stride does not divide it
      // (e.g. 700x161 with a 5x20 filter and stride 2 must give 341x79, which
      // is exactly the input size of the next cluster).
      P = (H - R + 2*pad_h)/stride_h + 1;
      Q = (W - S + 2*pad_w)/stride_w + 1;
      sc::scalar alpha(1., dtype);
      sc::scalar beta(0., dtype);
      drv::Buffer O(ctx, N*K*P*Q*dtsize);
      drv::Buffer I(ctx, C*H*W*N*dtsize);
      drv::Buffer F(ctx, K*C*R*S*dtsize);
      // times[0] is ISAAC, times[1] is cuDNN (same order as the header columns)
      std::vector<double> times;
      times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
      times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize();  }, device));
      speedup.push_back(times[1]/times[0]);
      print_results(times, {str(N), str(K), str(P), str(Q), str(C), str(R), str(S)}, [&](double tsec){ return sc::templates::Conv::tflops(P,Q,K,N,C,R,S,tsec);});
    }
    std::cout << "======================================================================" << std::endl;
    std::cout << "Speedup: " << geometric_mean(speedup) << std::endl;
    std::cout << std::endl;
  }
  //GEMM
  {
    // Tuple layout: (AT, BT, M, N, K)
    typedef std::tuple<sc::IsaacOperation_t, sc::IsaacOperation_t, param_t, param_t, param_t> gemm_tuple;
    std::vector<gemm_tuple> shapes;
    // LinPack
    for(param_t N: std::vector<param_t>{512, 1024, 2048})
      shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
    // DeepBench [Forward]
    for(param_t M: std::vector<param_t>{1760})
      for(param_t N: std::vector<param_t>{8, 16, 32, 64, 128})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_N, M, N, M));
    // DeepBench [Backward]
    for(param_t M: std::vector<param_t>{1760})
      for(param_t N: std::vector<param_t>{8, 16, 32, 64, 128})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_T, sc::ISAAC_OP_N, M, N, M));
    // PCA/ICA
    for(param_t N: std::vector<param_t>{16, 64, 256})
      for(param_t K: std::vector<param_t>{64000})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
    // LaPACK
    for(param_t N: std::vector<param_t>{1024, 2048, 4096})
      for(param_t K: std::vector<param_t>{32})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
    sc::IsaacOperation_t AT, BT;
    param_t M, N, K;
    std::cout << "======================================================================" << std::endl;
    std::cout << "GEMM:" << std::endl;
    std::cout << "======================================================================" << std::endl;
    print_results_header({"AT", "BT", "M", "N", "K"});
    std::vector<double> speedup;
    for(auto shape: shapes){
      std::tie(AT, BT, M, N, K) = shape;
      sc::scalar alpha(1., dtype);
      sc::scalar beta(0., dtype);
      // Leading dimensions follow from whether each operand is transposed
      size_t ldc = M;
      size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
      size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
      // The cuBLAS wrapper takes the transposition as a character
      char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
      char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
      drv::Buffer C(ctx, M*N*dtsize);
      drv::Buffer A(ctx, M*K*dtsize);
      drv::Buffer B(ctx, K*N*dtsize);
      // times[0] is ISAAC, times[1] is cuBLAS
      std::vector<double> times;
      times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, device));
      times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, ctx, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize();  }, device));
      speedup.push_back(times[1]/times[0]);
      print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, [&](double tsec){ return sc::templates::GEMM::tflops(M, N, K, tsec);});
    }
    std::cout << "======================================================================" << std::endl;
    std::cout << "Speedup: " << geometric_mean(speedup) << std::endl;
  }
 }
--- a/examples/dag.cpp
+++ b/examples/dag.cpp
@@ -1,60 +0,0 @@
 #include "isaac/array.h"
 #include "isaac/symbolic/scheduler/dag.h"
 namespace sc = isaac;
 // Builds a DAG of sub-products for C = dot(A, B) by recursively halving the
 // largest of the three problem dimensions (M, N or K) until a fixed
 // recursion depth is reached.
 class carma_generator
 {
  // Recursive worker. `depth` is the current recursion level; once it reaches
  // split_.size() the product is appended to the DAG as a leaf node.
  void apply_impl(sc::array_base const & A, sc::array_base const & B, sc::view C, size_t depth)
  {
    if(depth>=split_.size()){
      dag_.append(sc::assign(C, sc::dot(A, B)), "C = dot(A, B)");
    }
    else
    {
      sc::int_t M = C.shape()[0], N = C.shape()[1], K = A.shape()[1];
      size_t new_depth = depth + 1;
      //Split along M
      if(M >= N && M >= K){
        apply_impl(A({0, M/2}, {sc::all}), B, C({0, M/2}, sc::all), new_depth);
        apply_impl(A({M/2, sc::end}, {sc::all}), B, C({M/2, sc::end}, sc::all), new_depth);
      }
      //Split along N
      else if(N >= M && N >= K){
        apply_impl(A, B(sc::all, {0, N/2}), C(sc::all, {0, N/2}), new_depth);
        apply_impl(A, B(sc::all, {N/2, sc::end}), C(sc::all, {N/2, sc::end}), new_depth);
      }
      //Split along K
      else{
        // Each half of the K range produces a full-size partial result, so two
        // temporaries shaped like C are requested from the DAG and summed.
        // NOTE(review): presumably the DAG takes ownership of the arrays
        // allocated with `new` here -- confirm in create_temporary().
        sc::array_base & C1 = dag_.create_temporary(new sc::array(C.shape(), C.dtype(), C.context()));
        sc::array_base & C2 = dag_.create_temporary(new sc::array(C.shape(), C.dtype(), C.context()));
        apply_impl(A(sc::all, {0, K/2}), B({0, K/2}, sc::all), C1, new_depth);
        apply_impl(A(sc::all, {K/2, sc::end}), B({K/2, sc::end}, sc::all), C2, new_depth);
        dag_.append(sc::assign(C, C1 + C2), "C = C1 + C2");
      }
    }
  }
 public:
  // `depth` is the number of recursive splits; it is stored as the size of
  // split_ (only the size of that vector is read in this class).
  carma_generator(size_t depth): split_(depth)
  { }
  // Generates the DAG for C = dot(A, B) and exports it to "test.dot".
  void apply(sc::array_base const & A, sc::array_base const & B, sc::array_base & C)
  {
    apply_impl(A, B, sc::view(C), 0);
    dag_.export_graphviz("test.dot");
  }
 private:
  sc::symbolic::scheduler::dag dag_;   // accumulates the generated nodes
  std::vector<sc::int_t> split_;       // its size is the maximum recursion depth
 };
 // Generates the CARMA DAG of a 131 x 1402 x 5023 product, split three times.
 int main()
 {
  sc::int_t M = 131;
  sc::int_t N = 1402;
  sc::int_t K = 5023;
  sc::array C(M, N);
  sc::array A(M, K);
  sc::array B(K, N);
  carma_generator generator(3);
  generator.apply(A, B, C);
  return 0;
 }
--- a/examples/indexing.cpp
+++ b/examples/indexing.cpp
@@ -1,43 +0,0 @@
 #include "isaac/array.h"
 namespace sc = isaac;
 // Indexing tutorial: builds a handful of small arrays and prints the result
 // of the expression 1*s*x + x. The remaining examples are kept commented out.
 int main()
 {
 //    static const char * sline = "--------------------";
    static const char * dline = "====================";
    std::cout << dline << std::endl
              << "Tutorial: Indexing " << std::endl
              << dline << std::endl;
    sc::int_t M = 5, N = 12;
    // data = 0, 1, 2, ..., M*N - 1
    std::vector<float> data(M*N);
    float next_value = 0;
    for(float & entry: data)
      entry = next_value++;
    sc::array A(M, N, data);
    sc::array s({1,1}, std::vector<float>{5});
    sc::array x({1,3},std::vector<float>{1,2,3});
    sc::array y({3,3},std::vector<float>{1,2,3,4,5,6,7,8,9});
    sc::array B({4,3},std::vector<float>{0,1,2,3,4,5,6,7,8,9,10,11});
 //    std::cout << sc::sum(y, 1)*sc::sum(x) << std::endl;
 //    std::cout << sc::dot(B.T, B + B) << std::endl;
    std::cout << 1*s*x + x << std::endl;
 //    std::cout << sc::sum(B) << std::endl;
 //    std::cout << sc::reshape(x, {3,1}) + sc::sum(x)*sc::sum(sc::dot(B.T,B) + x + y, 1) + sc::sum(B)*sc::sum(B, 0)<< std::endl;
 //    std::cout << sline << std::endl;
 //    std::cout << "A[3, 2:end]:" << A(3, {2,sc::end}) << std::endl;
 //    std::cout << sline << std::endl;
 //    std::cout << "A[2:end, 4]:" << A({2,sc::end}, 4) << std::endl;
 //    std::cout << sline << std::endl;
 //    std::cout << "diag(A,  1): " << sc::diag(A, 1) << std::endl;
 //    std::cout << sline << std::endl;
 //    std::cout << "diag(A, -7): " << sc::diag(A, -7) << std::endl;
 }
--- a/examples/ptx-conv.cpp
+++ b/examples/ptx-conv.cpp
@@ -0,0 +1,186 @@
 #include <sstream>
 #include <chrono>
 #include <exception>
 #include <fstream>
 #include <iomanip>
 #include "isaac/driver/backend.h"
 #include "isaac/driver/module.h"
 #include "isaac/driver/error.h"
 #include "isaac/driver/kernel.h"
 #include "isaac/driver/cublas.h"
 #include "isaac/driver/stream.h"
 #include "isaac/driver/buffer.h"
 #include "isaac/templates/error.hpp"
 #include <string>
 #include <iostream>
 #include <cassert>
 #include <cstdlib>
 #include "isaac/tools/bench.hpp"
 #include "isaac/templates/conv.h"
 namespace sc = isaac;
 namespace drv = isaac::driver;
 // Linear offset of element (x, y, z, w) in a row-major 4D array whose
 // extents are (s0, s1, s2, s3). The outermost extent is not needed.
 inline int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w,
                   int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3)
 {
  // Horner evaluation: fold one dimension in at a time, outermost first
  int32_t offset = x;
  offset = offset*s1 + y;
  offset = offset*s2 + z;
  offset = offset*s3 + w;
  return offset;
 }
 // CPU reference forward convolution.
 // Indexing used below: I is (N, C, H, W), F is (K, C, R, S), O is (N, K, P, Q).
 // Input positions that fall outside [0,H) x [0,W) contribute nothing.
 void cpp_conv_nchw(int32_t C, int32_t N, int32_t K,
              int32_t H, int32_t W,
              int32_t R, int32_t S,
              int32_t pad_h, int32_t pad_w,
              int32_t stride_h, int32_t stride_w,
              int32_t P, int32_t Q,
              float* O, float* I, float* F)
 {
  for(int32_t k = 0; k < K; ++k)
    for(int32_t p = 0 ; p < P; ++p)
    {
      // first input row covered by output row p
      const int32_t h_first = p*stride_h - pad_h;
      for(int32_t q = 0; q < Q; ++q)
      {
        // first input column covered by output column q
        const int32_t w_first = q*stride_w - pad_w;
        for(int32_t n = 0; n < N; ++n)
        {
          float sum = 0;
          for(int32_t c = 0; c < C; ++c)
            for(int32_t r = 0; r < R; ++r)
            {
              const int32_t h = h_first + r;
              if(h < 0 || h >= H)
                continue;
              for(int32_t s = 0; s < S; ++s)
              {
                const int32_t w = w_first + s;
                if(w < 0 || w >= W)
                  continue;
                sum += F[idx(k, c, r, s, K, C, R, S)]*I[idx(n, c, h, w, N, C, H, W)];
              }
            }
          O[idx(n, k, p, q, N, K, P, Q)] = sum;
        }
      }
    }
 }
 // CPU reference forward convolution.
 // Indexing used below: I is (C, H, W, N), F is (C, R, S, K), O is (K, P, Q, N).
 // Input positions that fall outside [0,H) x [0,W) contribute nothing.
 void cpp_conv_chwn(int32_t C, int32_t N, int32_t K,
              int32_t H, int32_t W,
              int32_t R, int32_t S,
              int32_t pad_h, int32_t pad_w,
              int32_t stride_h, int32_t stride_w,
              int32_t P, int32_t Q,
              float* O, float* I, float* F)
 {
  for(int32_t k = 0; k < K ; ++k)
    for(int32_t p = 0 ; p < P; ++p)
    {
      // first input row covered by output row p
      const int32_t h_first = p*stride_h - pad_h;
      for(int32_t q = 0; q < Q; ++q)
      {
        // first input column covered by output column q
        const int32_t w_first = q*stride_w - pad_w;
        for(int32_t n = 0; n < N; ++n)
        {
          float sum = 0;
          for(int32_t c = 0; c < C; ++c)
            for(int32_t r = 0; r < R; ++r)
            {
              const int32_t h = h_first + r;
              if(h < 0 || h >= H)
                continue;
              for(int32_t s = 0; s < S; ++s)
              {
                const int32_t w = w_first + s;
                if(w < 0 || w >= W)
                  continue;
                sum += F[idx(c, r, s, k, C, R, S, K)]*I[idx(c, h, w, n, C, H, W, N)];
              }
            }
          O[idx(k, p, q, n, K, P, Q, N)] = sum;
        }
      }
    }
 }
 // Throughput of a convolution that took `time`: 2*P*Q*K*N*C*R*S operations
 // (one multiply and one add per term) divided by time*1e3.
 // NOTE(review): the result is in TFLOPS only if `time` is in nanoseconds --
 // confirm against the unit returned by bench().
 double get_tflops(uint64_t P, uint64_t Q, uint64_t K, uint64_t N, uint64_t C, uint64_t R, uint64_t S, double time){
  const uint64_t operations = 2*P*Q*K*N*C*R*S;
  const double scaled_time = time*1e3;
  return operations/scaled_time;
 }
 // When true, main() computes a CPU reference with cpp_conv_chwn and compares
 // every generated kernel's output against it, exiting on the first mismatch.
 bool test = false;
 // Exhaustive tuning of the ISAAC convolution template for one fixed problem:
 // every parameter combination is compiled, timed and reported together with
 // the best throughput seen so far; cuDNN is timed last as the reference.
 int main(){
  auto ctx = drv::backend::contexts::get_default();
  int32_t dtsize = 4;
  //Arguments
  int32_t C = 1, N = 4, K = 32;
  int32_t H = 68, W = 260;
  int32_t R = 5, S = 5;
  int32_t pad_h = 0, pad_w = 0;
  int32_t stride_h = 1, stride_w = 1;
  // Output extent: floor((in + 2*pad - filter)/stride) + 1. The "+1" is kept
  // outside the division so the result stays correct for strides > 1.
  int32_t P = (H - R + 2*pad_h)/stride_h + 1, Q = (W - S + 2*pad_w)/stride_w + 1;
  std::vector<float> iO(K*P*Q*N);
  std::vector<float> iI(C*H*W*N);
  std::vector<float> iF(C*R*S*K);
  drv::Buffer O(ctx, iO.size()*dtsize);
  drv::Buffer I(ctx, iI.size()*dtsize);
  for(size_t i = 0; i < iI.size(); ++i) iI[i] = (float)rand()/RAND_MAX;
  drv::Buffer F(ctx, iF.size()*dtsize);
  for(size_t i = 0; i < iF.size(); ++i) iF[i] = (float)rand()/RAND_MAX;
  drv::Stream queue(ctx);
  queue.write(O, true, 0, iO.size()*dtsize, iO.data());
  queue.write(I, true, 0, iI.size()*dtsize, iI.data());
  queue.write(F, true, 0, iF.size()*dtsize, iF.data());
  sc::scalar alpha(1., sc::FLOAT_TYPE);
  // NOTE(review): with beta = 1 every launch accumulates into O, whereas the
  // CPU reference overwrites its output; if bench() launches the kernel more
  // than once the check under `test` cannot match -- confirm whether beta
  // should be 0 here.
  sc::scalar beta(1., sc::FLOAT_TYPE);
  if(test)
    cpp_conv_chwn(C, N, K, H, W, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, iO.data(), iI.data(), iF.data());
  std::vector<float> rO(iO.size());
  std::vector<int> rv = {2,4};
  std::vector<int> rl = {1,2,4};
  std::vector<int> rs = {1,2,4,8};
  float best = 0;
  for(size_t vec: rv)
  // The candidate list for bp must not be empty: an empty range here makes
  // the whole loop nest below run zero times, so no kernel is ever tuned.
  // NOTE(review): values chosen to mirror bq -- adjust if bp needs a different range.
  for(size_t bp: std::vector<int>{1,2,4})
  for(size_t bq: std::vector<int>{1,2,4})
  for(size_t bn: rl)
  for(size_t bk: rl)
  for(size_t bf_n: rl)
  for(size_t ps: std::vector<int>{1,2,4})
  for(size_t qs: std::vector<int>{1,2,4})
  for(size_t ns: rs)
  for(size_t ks: rs)
  for(size_t crs_l: rl)
  for(size_t crs_s: std::vector<int>{1})
  for(size_t cs: std::vector<int>{1})
  for(size_t bc: std::vector<int>{1})
  for(size_t gridc: std::vector<int>{1})
  {
  // Compile
  isaac::templates::Conv conv(sc::FLOAT_TYPE, C, H, W, N, K, P, Q, R, S, pad_h, pad_w, stride_h, stride_w, vec, bp, bq, bn, bk, bf_n, ps, qs, ns, ks, crs_l, crs_s, cs, bc, gridc);
  std::string src;
  try{
    src = conv.dump(ctx.device(), "fconv");
  }catch(isaac::templates::invalid_parameters const &){
    // this combination is rejected by the template: skip it
    continue;
  }
  drv::Module program(ctx, src, true);
  drv::Kernel kernel(program, "fconv");
  //Launch
  float time;
  try{
    time = bench([&](){ conv.enqueue(kernel, queue, alpha, I, F, beta, O); },
      [&](){ queue.synchronize(); }, ctx.device());
  }catch(drv::exception::cuda::launch_out_of_resources const &){
    // the kernel needs more resources than the device offers: skip it
    continue;
  }
  //Report
  float tflops = get_tflops(P,Q,K,N,C,R,S,time);
  best = std::max(tflops, best);
  std::cout << "//" << vec << " " << bp << " " << bq << " " << bn << " " << bk << " " << bf_n << " " << ps << " " << qs << " " << ns << " " << ks << " " << crs_l << " " << crs_s << " " << cs << " " << bc << " " << gridc << ": " << std::setprecision(3) << tflops << "  [ " << best << " ] " << std::endl;
  //Test
  if(test){
    queue.read(O, true, 0, rO.size()*dtsize, rO.data());
    for(size_t i = 0 ; i < rO.size(); ++i)
      if(fabs((iO[i] - rO[i])/rO[i]) > 1e-4 || std::isnan(rO[i])) {  std::cout << "// Failure at idx " << i << ": " << iO[i] << " != " << rO[i] << std::endl; exit(1); }
  }
  }
  //cuDNN
  float time = bench([&](){sc::driver::cudnnConv(sc::FLOAT_TYPE, ctx, queue, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); },
                      [&](){ queue.synchronize();  }, ctx.device());
  float tflops = get_tflops(P,Q,K,N,C,R,S,time);
  std::cout << "TFLOPs: " << tflops << std::endl;
 }
--- a/examples/ptx-gemm.cpp
+++ b/examples/ptx-gemm.cpp
@@ -0,0 +1,84 @@
 #include <sstream>
 #include <chrono>
 #include <exception>
 #include <iomanip>
 #include <string>
 #include <iostream>
 #include <cassert>
 #include "isaac/driver/backend.h"
 #include "isaac/driver/error.h"
 #include "isaac/driver/module.h"
 #include "isaac/driver/kernel.h"
 #include "isaac/driver/stream.h"
 #include "isaac/driver/buffer.h"
 #include "isaac/driver/cublas.h"
 #include "isaac/half.hpp"
 #include "isaac/tools/bench.hpp"
 #include "isaac/tools/collections.hpp"
 #include "isaac/templates/gemm.h"
 #include "isaac/templates/error.hpp"
 namespace sc = isaac;
 namespace drv = isaac::driver;
 void do_bench(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){
  auto ctx = drv::backend::contexts::get_default();
  size_t dtsize = sc::size_of(dtype);
  //Buffers
  int32_t AS0 = M, AS1 = K;
  int32_t BS0 = K, BS1 = N;
  if(AT=='T') std::swap(AS0, AS1);
  if(BT=='T') std::swap(BS0, BS1);
  int32_t ldc = M, lda = AS0, ldb = BS0;
  int32_t offc = 0, offa = 0, offb = 0;
  drv::Buffer C(ctx, M*N*dtsize);
  drv::Buffer A(ctx, M*K*dtsize);
  drv::Buffer B(ctx, K*N*dtsize);
  drv::Stream queue(ctx);
  sc::scalar alpha(1., dtype), beta(0., dtype);
  // cuBlas
  double time = bench([&](){ sc::driver::cublasGemm(dtype, ctx, queue, AT, BT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);}
               , [&](){ queue.synchronize(); }, ctx.device());
  std::cout <<  2*1e-3*M*N*K/time << std::endl;
  //Exhaustive search
  std::vector<int> r1 = {1};
  std::vector<int> rv = {4};
  std::vector<int> rr = {1, 2, 4};
  std::vector<int> rl = {2, 4, 8, 16, 32};
  std::vector<int> rs = {1, 2, 4, 8, 16};
  double best = 0;
  for(auto x: sc::cpp::cartesian({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, r1, r1}))
  {
    isaac::templates::GEMM gemm(dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
    //Compile
    std::string src;
    try{
      src = gemm.dump(ctx.device(), "gemm");
    }catch(isaac::templates::invalid_parameters){
      continue;
    }
    drv::Module program(ctx, src, true);
    drv::Kernel kernel(program, "gemm");
    //Launch
    double time;
    try{
      time = bench([&](){ gemm.enqueue(kernel, queue, alpha, A, B, beta, C); }, [&](){ queue.synchronize(); }, ctx.device());
    }catch(drv::exception::cuda::launch_out_of_resources){
      continue;
    }
    //Report
    double tflops = 2*1e-3*M*N*K/time;
    best = std::max(tflops, best);
    std::cout << "//" << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4] << " " << x[5] << " " << x[6] << " " << x[7] << " " << x[8] << " " << x[9] << " " << x[10] << " " << x[11] << " " << x[12] << " " << x[13] << " " << std::setprecision(3) << tflops << "  [ " << best << " ] " << std::endl;
  }
 }
 // Runs the tuning benchmark on a square 2048^3 single-precision GEMM with
 // A not transposed and B transposed.
 int main(){
  const int32_t size = 2048;
  do_bench(size, size, size, sc::ISAAC_OP_N, sc::ISAAC_OP_T, sc::FLOAT_TYPE);
  return 0;
 }
--- a/include/external/clBLAS-complex.h
+++ b/include/external/clBLAS-complex.h
@@ -1,53 +0,0 @@
 /* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/
 #ifndef CLBLAS_COMPLEX_H_
 /* The guard macro must be spelled exactly as tested above; a space instead
  * of the underscore defines a different macro and disables the guard. */
 #define CLBLAS_COMPLEX_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Complex numbers are stored in 2-component OpenCL vectors:
  * component 0 holds the real part, component 1 the imaginary part. */
 typedef cl_float2 FloatComplex;
 typedef cl_double2 DoubleComplex;
 /* Builds a single-precision complex number from its real and imaginary parts. */
 static __inline FloatComplex
 floatComplex(float real, float imag)
 {
    FloatComplex z;
    z.s[0] = real;
    z.s[1] = imag;
    return z;
 }
 /* Builds a double-precision complex number from its real and imaginary parts. */
 static __inline DoubleComplex
 doubleComplex(double real, double imag)
 {
    DoubleComplex z;
    z.s[0] = real;
    z.s[1] = imag;
    return z;
 }
 /* Accessors for the real and imaginary parts of either complex type. */
 #define CREAL(v) ((v).s[0])
 #define CIMAG(v) ((v).s[1])
 #ifdef __cplusplus
 }      /* extern "C" { */
 #endif
 #endif /* CLBLAS_COMPLEX_H_ */
--- a/include/external/clBLAS.h
+++ b/include/external/clBLAS.h
--- a/include/external/clBLAS.version.h
+++ b/include/external/clBLAS.version.h
@@ -1,22 +0,0 @@
 /* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/
 /* the configured version and settings for clblas
 */
 #define clblasVersionMajor 2
 #define clblasVersionMinor 6
 #define clblasVersionPatch 0
--- a/include/external/cuda/builtin_types.h
+++ b/include/external/cuda/builtin_types.h
@@ -1,64 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #include "device_types.h"
 #if !defined(__CUDACC_RTC__)
 #define EXCLUDE_FROM_RTC
 #include "driver_types.h"
 #undef EXCLUDE_FROM_RTC
 #endif /* !__CUDACC_RTC__ */
 #include "surface_types.h"
 #include "texture_types.h"
 #include "vector_types.h"
--- a/include/external/cuda/channel_descriptor.h
+++ b/include/external/cuda/channel_descriptor.h
@@ -1,412 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__CHANNEL_DESCRIPTOR_H__)
 #define __CHANNEL_DESCRIPTOR_H__
 #if defined(__cplusplus)
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #include "driver_types.h"
 #include "cuda_runtime_api.h"
 #include "host_defines.h"
 #include "vector_types.h"
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /**
 * \addtogroup CUDART_HIGHLEVEL
 *
 * @{
 */
 /**
 * \brief \hl Returns a channel descriptor using the specified format
 *
 * Returns a channel descriptor with format \p f and number of bits of each
 * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
 * defined as:
 * \code
  struct cudaChannelFormatDesc {
    int x, y, z, w;
    enum cudaChannelFormatKind f;
  };
 * \endcode
 *
 * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
 * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
 *
 * \return
 * Channel descriptor with format \p f
 *
 * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
 */
 template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
 {
   /* Primary template, selected for any T that has no explicit
    * specialization: every component is zero bits wide and the
    * format kind is cudaChannelFormatKindNone. */
   const int noBits = 0;
   return cudaCreateChannelDesc(noBits, noBits, noBits, noBits, cudaChannelFormatKindNone);
 }
 /* Float-kind descriptors whose component width is sizeof(unsigned short) * 8. */

 /* One component (x). */
 static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
 }
 /* One component (x); produces the same descriptor as cudaCreateChannelDescHalf. */
 static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
 }
 /* Two components (x, y). */
 static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
 }
 /* Four components (x, y, z, w). */
 static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
 }
 /* Plain char: signedness follows the compiler's char-signedness macros. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
 {
   const int bits = static_cast<int>(sizeof(char)) * 8;
 #if !defined(_CHAR_UNSIGNED) && !defined(__CHAR_UNSIGNED__)
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 #else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 #endif /* !_CHAR_UNSIGNED && !__CHAR_UNSIGNED__ */
 }
 /* signed char: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
 {
   const int bits = static_cast<int>(sizeof(signed char)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* unsigned char: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* char1: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
 {
   const int bits = static_cast<int>(sizeof(signed char)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* uchar1: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* char2: two signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
 {
   const int bits = static_cast<int>(sizeof(signed char)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
 }
 /* uchar2: two unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* char4: four signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
 {
   const int bits = static_cast<int>(sizeof(signed char)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
 }
 /* uchar4: four unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
 }
 /* short: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
 {
   const int bits = static_cast<int>(sizeof(short)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* unsigned short: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* short1: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
 {
   const int bits = static_cast<int>(sizeof(short)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* ushort1: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* short2: two signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
 {
   const int bits = static_cast<int>(sizeof(short)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
 }
 /* ushort2: two unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* short4: four signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
 {
   const int bits = static_cast<int>(sizeof(short)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
 }
 /* ushort4: four unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
 }
 /* int: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
 {
   const int bits = static_cast<int>(sizeof(int)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* unsigned int: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* int1: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
 {
   const int bits = static_cast<int>(sizeof(int)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* uint1: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* int2: two signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
 {
   const int bits = static_cast<int>(sizeof(int)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
 }
 /* uint2: two unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* int4: four signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
 {
   const int bits = static_cast<int>(sizeof(int)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
 }
 /* uint4: four unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
 }
 /* The long-based specializations are only compiled when __LP64__ is not
  * defined (presumably because long is then 32 bits wide -- confirm). */
 #if !defined(__LP64__)
 /* long: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
 {
   const int bits = static_cast<int>(sizeof(long)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* unsigned long: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* long1: one signed component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
 {
   const int bits = static_cast<int>(sizeof(long)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
 }
 /* ulong1: one unsigned component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* long2: two signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
 {
   const int bits = static_cast<int>(sizeof(long)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
 }
 /* ulong2: two unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
 }
 /* long4: four signed components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
 {
   const int bits = static_cast<int>(sizeof(long)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
 }
 /* ulong4: four unsigned components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
 {
   const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
 }
 #endif /* !__LP64__ */
 /* float: one floating-point component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
 {
   const int bits = static_cast<int>(sizeof(float)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
 }
 /* float1: one floating-point component. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
 {
   const int bits = static_cast<int>(sizeof(float)) * 8;
   return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
 }
 /* float2: two floating-point components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
 {
   const int bits = static_cast<int>(sizeof(float)) * 8;
   return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
 }
 /* float4: four floating-point components. */
 template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
 {
   const int bits = static_cast<int>(sizeof(float)) * 8;
   return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
 }
 #endif /* __cplusplus */
 /** @} */
 /** @} */ /* END CUDART_TEXTURE_HL */
 #endif /* !__CHANNEL_DESCRIPTOR_H__ */
--- a/include/external/cuda/cuComplex.h
+++ b/include/external/cuda/cuComplex.h
@@ -1,338 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(CU_COMPLEX_H_)
 #define CU_COMPLEX_H_
 /* When including a C header file in C++ code, extern "C" is required.
 * However, the standard QNX headers already contain their own #ifdef'd
 * extern "C" blocks when compiled as C++, and extern "C" cannot be nested.
 * Hence the <math.h> include is kept outside of the extern "C" block below.
 */
 #include <math.h>       /* import fabsf, sqrt */
 #if defined(__cplusplus)
 extern "C" {
 #endif /* __cplusplus */
 #include "vector_types.h"
 typedef float2 cuFloatComplex;
 /* Returns the real part of x, which is held in the .x member. */
 __host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
 {
     const float re = x.x;
     return re;
 }
 /* Returns the imaginary part of x, which is held in the .y member. */
 __host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
 {
     const float im = x.y;
     return im;
 }
 /* Builds a single-precision complex value with real part r and imaginary part i. */
 __host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex (float r, float i)
 {
     cuFloatComplex z;
     z.y = i;   /* imaginary part */
     z.x = r;   /* real part */
     return z;
 }
 /* Returns the complex conjugate of x: same real part, negated imaginary part. */
 __host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
 {
     const float re = cuCrealf(x);
     const float im = cuCimagf(x);
     return make_cuFloatComplex(re, -im);
 }
 /* Returns x + y, adding real and imaginary parts component-wise. */
 __host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
                                                              cuFloatComplex y)
 {
     const float xr = cuCrealf(x), xi = cuCimagf(x);
     const float yr = cuCrealf(y), yi = cuCimagf(y);
     return make_cuFloatComplex(xr + yr, xi + yi);
 }
 /* Returns x - y, subtracting real and imaginary parts component-wise. */
 __host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
                                                              cuFloatComplex y)
 {
     const float xr = cuCrealf(x), xi = cuCimagf(x);
     const float yr = cuCrealf(y), yi = cuCimagf(y);
     return make_cuFloatComplex(xr - yr, xi - yi);
 }
 /* Returns the product x * y.
 *
 * The straightforward formula is used without any scaling, so an
 * intermediate product may overflow even when the final result would be
 * representable. Other implementations commonly leave this unguarded too
 * (presumably for speed), and this one does the same to stay competitive.
 */
 __host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
                                                              cuFloatComplex y)
 {
     const float xr = cuCrealf(x), xi = cuCimagf(x);
     const float yr = cuCrealf(y), yi = cuCimagf(y);
     /* (xr + i*xi) * (yr + i*yi) */
     return make_cuFloatComplex((xr * yr) - (xi * yi),
                                (xr * yi) + (xi * yr));
 }
 /* Returns the quotient x / y.
 *
 * Both operands are first scaled by 1 / (|Re y| + |Im y|) to guard against
 * intermediate underflow and overflow. Guarded division like this is the
 * usual default in complex libraries; some additionally offer a faster,
 * unguarded variant.
 */
 __host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
                                                              cuFloatComplex y)
 {
     const float xr = cuCrealf(x), xi = cuCimagf(x);
     const float yr = cuCrealf(y), yi = cuCimagf(y);
     /* Scale factor derived from the magnitude of the divisor. */
     const float scale = fabsf(yr) + fabsf(yi);
     const float invScale = 1.0f / scale;
     /* Scaled numerator (n*) and denominator (d*) components. */
     const float nr = xr * invScale;
     const float ni = xi * invScale;
     const float dr = yr * invScale;
     const float di = yi * invScale;
     /* Squared magnitude of the scaled denominator and its reciprocal. */
     const float norm = (dr * dr) + (di * di);
     const float invNorm = 1.0f / norm;
     return make_cuFloatComplex(((nr * dr) + (ni * di)) * invNorm,
                                ((ni * dr) - (nr * di)) * invNorm);
 }
 /*
 * Returns the magnitude |x|.
 *
 * hypotf() would be the natural choice, but it is not available on all
 * platforms. This hand-written version divides the smaller component by the
 * larger one before squaring, which guards against intermediate underflow
 * and overflow; without that, half of the exponent range would be lost.
 * Of the possible guarded formulations, the simplest and fastest one was
 * chosen; it may be inaccurate if sqrt and division are not IEEE compliant.
 */
 __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
 {
     const float re = fabsf(cuCrealf(x));
     const float im = fabsf(cuCimagf(x));
     /* hi is the larger magnitude and lo the smaller; on ties (or when the
      * comparison is false for any other reason) hi takes the imaginary part. */
     const float hi = (re > im) ? re : im;
     const float lo = (re > im) ? im : re;
     float mag = lo / hi;
     mag = 1.0f + mag * mag;
     mag = hi * sqrtf(mag);
     /* Fall back to hi + lo when hi is zero or either value exceeds
      * 3.402823466e38f (the largest finite float), where the scaled formula
      * above is not meaningful. */
     if ((hi == 0.0f) || (hi > 3.402823466e38f) || (lo > 3.402823466e38f)) {
         mag = hi + lo;
     }
     return mag;
 }
 /* Double precision */
 typedef double2 cuDoubleComplex;
 /* Returns the real part of x, which is held in the .x member. */
 __host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
 {
     const double re = x.x;
     return re;
 }
 /* Returns the imaginary part of x, which is held in the .y member. */
 __host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
 {
     const double im = x.y;
     return im;
 }
 /* Builds a double-precision complex value with real part r and imaginary part i. */
 __host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex (double r, double i)
 {
     cuDoubleComplex z;
     z.y = i;   /* imaginary part */
     z.x = r;   /* real part */
     return z;
 }
 /* Returns the complex conjugate of x: same real part, negated imaginary part. */
 __host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
 {
     const double re = cuCreal(x);
     const double im = cuCimag(x);
     return make_cuDoubleComplex(re, -im);
 }
 /* Returns x + y, adding real and imaginary parts component-wise. */
 __host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
                                                             cuDoubleComplex y)
 {
     const double xr = cuCreal(x), xi = cuCimag(x);
     const double yr = cuCreal(y), yi = cuCimag(y);
     return make_cuDoubleComplex(xr + yr, xi + yi);
 }
 /* Returns x - y, subtracting real and imaginary parts component-wise. */
 __host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
                                                             cuDoubleComplex y)
 {
     const double xr = cuCreal(x), xi = cuCimag(x);
     const double yr = cuCreal(y), yi = cuCimag(y);
     return make_cuDoubleComplex(xr - yr, xi - yi);
 }
 /* Returns the product x * y.
 *
 * The straightforward formula is used without any scaling, so an
 * intermediate product may overflow even when the final result would be
 * representable. Other implementations commonly leave this unguarded too
 * (presumably for speed), and this one does the same to stay competitive.
 */
 __host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
                                                             cuDoubleComplex y)
 {
     const double xr = cuCreal(x), xi = cuCimag(x);
     const double yr = cuCreal(y), yi = cuCimag(y);
     /* (xr + i*xi) * (yr + i*yi) */
     return make_cuDoubleComplex((xr * yr) - (xi * yi),
                                 (xr * yi) + (xi * yr));
 }
 /* Returns the quotient x / y.
 *
 * Both operands are first scaled by 1 / (|Re y| + |Im y|) to guard against
 * intermediate underflow and overflow. Guarded division like this is the
 * usual default in complex libraries; some additionally offer a faster,
 * unguarded variant.
 */
 __host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
                                                             cuDoubleComplex y)
 {
     const double xr = cuCreal(x), xi = cuCimag(x);
     const double yr = cuCreal(y), yi = cuCimag(y);
     /* Scale factor derived from the magnitude of the divisor. */
     const double scale = (fabs(yr)) + (fabs(yi));
     const double invScale = 1.0 / scale;
     /* Scaled numerator (n*) and denominator (d*) components. */
     const double nr = xr * invScale;
     const double ni = xi * invScale;
     const double dr = yr * invScale;
     const double di = yi * invScale;
     /* Squared magnitude of the scaled denominator and its reciprocal. */
     const double norm = (dr * dr) + (di * di);
     const double invNorm = 1.0 / norm;
     return make_cuDoubleComplex(((nr * dr) + (ni * di)) * invNorm,
                                 ((ni * dr) - (nr * di)) * invNorm);
 }
 /* Returns the magnitude |x|.
 *
 * The smaller component is divided by the larger one before squaring, which
 * guards against intermediate underflow and overflow; without that, half of
 * the exponent range would be lost. Of the possible guarded formulations,
 * the simplest and fastest one was chosen; it may be inaccurate if sqrt and
 * division are not IEEE compliant.
 */
 __host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
 {
     const double re = fabs(cuCreal(x));
     const double im = fabs(cuCimag(x));
     /* hi is the larger magnitude and lo the smaller; on ties (or when the
      * comparison is false for any other reason) hi takes the imaginary part. */
     const double hi = (re > im) ? re : im;
     const double lo = (re > im) ? im : re;
     double mag = lo / hi;
     mag = 1.0 + mag * mag;
     mag = hi * sqrt(mag);
     /* Fall back to hi + lo when hi is zero or either value exceeds
      * 1.79769313486231570e+308 (the largest finite double), where the
      * scaled formula above is not meaningful. */
     if ((hi == 0.0) ||
         (hi > 1.79769313486231570e+308) || (lo > 1.79769313486231570e+308)) {
         mag = hi + lo;
     }
     return mag;
 }
 #if defined(__cplusplus)
 }
 #endif /* __cplusplus */
 /* aliases */
 typedef cuFloatComplex cuComplex;
 /* Builds a cuComplex (alias of cuFloatComplex) with real part x and
  * imaginary part y by forwarding to make_cuFloatComplex. */
 __host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
                                                                 float y)
 {
     const cuComplex z = make_cuFloatComplex(x, y);
     return z;
 }
 /* float-to-double promotion */
 /* Widens a single-precision complex value to double precision by casting
  * each component to double. */
 __host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
                                                       (cuFloatComplex c)
 {
     const double re = (double)cuCrealf(c);
     const double im = (double)cuCimagf(c);
     return make_cuDoubleComplex(re, im);
 }
 /* Narrows a double-precision complex value to single precision by casting
  * each component to float. */
 __host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
                                                       (cuDoubleComplex c)
 {
     const float re = (float)cuCreal(c);
     const float im = (float)cuCimag(c);
     return make_cuFloatComplex(re, im);
 }
 /* Returns x * y + d for single-precision complex operands.
  * Each result component is accumulated in two multiply-add steps: first the
  * product involving Re(x) is added to the matching component of d, then the
  * product involving Im(x) is folded in. */
 __host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
 {
     const float xr = cuCrealf(x), xi = cuCimagf(x);
     const float yr = cuCrealf(y), yi = cuCimagf(y);
     const float dr = cuCrealf(d), di = cuCimagf(d);
     /* Step 1: contributions of Re(x). */
     float accRe = (xr * yr) + dr;
     float accIm = (xr * yi) + di;
     /* Step 2: contributions of Im(x). */
     accRe = -(xi * yi) + accRe;
     accIm =  (xi * yr) + accIm;
     return make_cuComplex(accRe, accIm);
 }
 /* Returns x * y + d for double-precision complex operands.
  * Each result component is accumulated in two multiply-add steps: first the
  * product involving Re(x) is added to the matching component of d, then the
  * product involving Im(x) is folded in. */
 __host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
 {
     const double xr = cuCreal(x), xi = cuCimag(x);
     const double yr = cuCreal(y), yi = cuCimag(y);
     const double dr = cuCreal(d), di = cuCimag(d);
     /* Step 1: contributions of Re(x). */
     double accRe = (xr * yr) + dr;
     double accIm = (xr * yi) + di;
     /* Step 2: contributions of Im(x). */
     accRe = -(xi * yi) + accRe;
     accIm =  (xi * yr) + accIm;
     return make_cuDoubleComplex(accRe, accIm);
 }
 #endif /* !defined(CU_COMPLEX_H_) */
--- a/include/external/cuda/cublas.h
+++ b/include/external/cuda/cublas.h
@@ -1,565 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 /*
 * This is the public header file for the CUBLAS library, defining the API
 *
 * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) 
 * on top of the CUDA runtime. 
 */
 /* Include guard; the matching #endif closes the header at the very end. */
 #if !defined(CUBLAS_H_)
 #define CUBLAS_H_
 #include <cuda_runtime.h>
 /*
  * CUBLASWINAPI is the calling-convention decoration placed on every
  * prototype in this header: __stdcall when _WIN32 is defined, empty
  * otherwise. A definition supplied by the includer is left untouched.
  */
 #ifndef CUBLASWINAPI
 #ifdef _WIN32
 #define CUBLASWINAPI __stdcall
 #else
 #define CUBLASWINAPI 
 #endif
 #endif
 /*
  * CUBLASAPI is reset unconditionally (any earlier definition is dropped):
  * __host__ when __CUDACC__ is defined, empty otherwise. It is not used by
  * the prototypes in this header.
  * NOTE(review): presumably consumed by cublas_api.h, included right after -
  * confirm.
  */
 #undef CUBLASAPI
 #ifdef __CUDACC__
 #define CUBLASAPI __host__
 #else
 #define CUBLASAPI
 #endif
 #include "cublas_api.h"
 /* Give everything declared below C linkage when included from C++. */
 #if defined(__cplusplus)
 extern "C" {
 #endif
 /* CUBLAS data types */
 /*
  * cublasStatus is a macro alias for cublasStatus_t.
  * NOTE(review): cublasStatus_t is not declared in this header; presumably it
  * comes from cublas_api.h, included above - confirm.
  */
 #define cublasStatus cublasStatus_t
 /*
  * Library-level helper entry points. Each returns a cublasStatus, and any
  * result is written through a pointer argument (version, devicePtr).
  */
 cublasStatus CUBLASWINAPI cublasInit (void);
 cublasStatus CUBLASWINAPI cublasShutdown (void);
 cublasStatus CUBLASWINAPI cublasGetError (void);
 cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
 /* NOTE(review): presumably allocates n elements of elemSize bytes each - confirm against the cuBLAS docs. */
 cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
 cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
 cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
 /* ---------------- CUBLAS BLAS1 functions ---------------- */
 /*
  * Reading the prototypes in this section:
  *  - The letter after "cublas" selects the element type used in the
  *    signature: S = float, D = double, C = cuComplex, Z = cuDoubleComplex.
  *  - Two-letter prefixes (Sc, Dz, Cs, Zd) mix a complex vector with a real
  *    return value or real scalar arguments.
  *  - The I?amax / I?amin routines return an int.
  *  - Vector arguments are a pointer plus an int increment (incx / incy);
  *    n is an int count. Input-only vectors are const-qualified.
  *  - Results are delivered by return value (nrm2, dot, asum, amax, amin) or
  *    through the non-const pointer arguments; these routines do not return
  *    a cublasStatus.
  * NOTE(review): pointer arguments are presumably device pointers, as in the
  * legacy cuBLAS API - confirm against the cuBLAS documentation.
  */
 /* NRM2 */
 float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
 double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
 float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
 double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* DOT */
 float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, 
                               int incy);
 double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, 
                               int incy);
 cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, 
                               int incy);
 cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, 
                               int incy);
 cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
                               int incy);
 cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
                               int incy);
 /*------------------------------------------------------------------------*/
 /* SCAL */
 void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
 void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
 void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
 void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* AXPY */
 void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, 
                               float *y, int incy);
 void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, 
                               int incx, double *y, int incy);
 void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, 
                               int incx, cuComplex *y, int incy);
 void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
                               int incx, cuDoubleComplex *y, int incy);
 /*------------------------------------------------------------------------*/
 /* COPY */
 void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, 
                               int incy);
 void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, 
                               int incy);
 void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
                               int incy);
 void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
                               int incy);
 /*------------------------------------------------------------------------*/
 /* SWAP */
 void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
 void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
 void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
 void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);           
 /*------------------------------------------------------------------------*/
 /* AMAX */
 int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
 int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
 int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
 int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* AMIN */
 int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
 int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
 int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
 int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* ASUM */
 float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
 double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
 float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
 double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* ROT */
 void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, 
                              float sc, float ss);
 void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, 
                              double sc, double ss);
 void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, 
                              int incy, float c, cuComplex s);
 void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, 
                              cuDoubleComplex *y, int incy, double sc, 
                              cuDoubleComplex cs);
 void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
                               int incy, float c, float s);
 void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, 
                               cuDoubleComplex *y, int incy, double c, double s);
 /*------------------------------------------------------------------------*/
 /* ROTG */
 /* Scalar-only routines: no n / increment arguments; in the complex variants cb is passed by value. */
 void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
 void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
 void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
                               cuComplex *cs);                                     
 void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
                               cuDoubleComplex *cs);                                                               
 /*------------------------------------------------------------------------*/
 /* ROTM */
 /* Real types only; sparam is a read-only parameter array here (its length is not stated in this header). */
 void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, 
                              const float* sparam);
 void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, 
                              const double* sparam);
 /*------------------------------------------------------------------------*/
 /* ROTMG */
 /* Real types only; sparam is non-const here, i.e. it can be written by the routine. */
 void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, 
                                const float *sy1, float* sparam);
 void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, 
                                const double *sy1, double* sparam);
 /* --------------- CUBLAS BLAS2 functions  ---------------- */
 /*
  * Reading the prototypes in this section:
  *  - The letter after "cublas" selects the element type used in the
  *    signature: S = float, D = double, C = cuComplex, Z = cuDoubleComplex.
  *  - Every routine returns void; results are written through the single
  *    non-const pointer argument (y, x, A or AP depending on the routine).
  *  - Mode selectors (trans, uplo, diag) are plain char values. Their
  *    accepted characters are not visible in this header - see the cuBLAS /
  *    BLAS documentation.
  *  - A matrix is passed as a pointer A plus an int leading dimension lda.
  *    Routines that take AP instead have no lda parameter.
  *  - Vectors are a pointer plus an int increment (incx / incy).
  * NOTE(review): pointer arguments are presumably device pointers, as in the
  * legacy cuBLAS API - confirm against the cuBLAS documentation.
  */
 /* GEMV */
 void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
                               const float *A, int lda, const float *x, int incx,
                               float beta, float *y, int incy);
 void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
                               const double *A, int lda, const double *x, int incx,
                               double beta, double *y, int incy);
 void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
                               const cuComplex *A, int lda, const cuComplex *x, int incx,
                               cuComplex beta, cuComplex *y, int incy);
 void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
                               cuDoubleComplex beta, cuDoubleComplex *y, int incy);
 /*------------------------------------------------------------------------*/
 /* GBMV */
 void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, 
                               float alpha, const float *A, int lda, 
                               const float *x, int incx, float beta, float *y, 
                               int incy);
 void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, 
                               double alpha, const double *A, int lda, 
                               const double *x, int incx, double beta, double *y, 
                               int incy);
 void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, 
                               cuComplex alpha, const cuComplex *A, int lda, 
                               const cuComplex *x, int incx, cuComplex beta, cuComplex *y, 
                               int incy);
 void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, 
                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, 
                               const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, 
                               int incy);                  
 /*------------------------------------------------------------------------*/
 /* TRMV */
 void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, 
                               const float *A, int lda, float *x, int incx);
 void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, 
                               const double *A, int lda, double *x, int incx);
 void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, 
                               const cuComplex *A, int lda, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, 
                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* TBMV */
 void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, 
                               const float *A, int lda, float *x, int incx);
 void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, 
                               const double *A, int lda, double *x, int incx);
 void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, 
                               const cuComplex *A, int lda, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, 
                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* TPMV */                                                    
 void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
 void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
 void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/
 /* TRSV */
 void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
 void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
 void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, 
                              cuDoubleComplex *x, int incx);       
 /*------------------------------------------------------------------------*/
 /* TPSV */
 void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, 
                              float *x, int incx);
 void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
 void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
 void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, 
                              cuDoubleComplex *x, int incx);
 /*------------------------------------------------------------------------*/                                         
 /* TBSV */                                         
 void CUBLASWINAPI cublasStbsv(char uplo, char trans, 
                              char diag, int n, int k, const float *A, 
                              int lda, float *x, int incx);
 void CUBLASWINAPI cublasDtbsv(char uplo, char trans, 
                              char diag, int n, int k, const double *A, 
                              int lda, double *x, int incx);
 void CUBLASWINAPI cublasCtbsv(char uplo, char trans, 
                              char diag, int n, int k, const cuComplex *A, 
                              int lda, cuComplex *x, int incx);      
 void CUBLASWINAPI cublasZtbsv(char uplo, char trans, 
                              char diag, int n, int k, const cuDoubleComplex *A, 
                              int lda, cuDoubleComplex *x, int incx);  
 /*------------------------------------------------------------------------*/                                         
 /* SYMV/HEMV */
 /* From here on, paired headings such as SYMV/HEMV list the S/D routine name first and the C/Z routine name second. */
 void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
                               int lda, const float *x, int incx, float beta, 
                               float *y, int incy);
 void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
                               int lda, const double *x, int incx, double beta, 
                               double *y, int incy);
 void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
                               int lda, const cuComplex *x, int incx, cuComplex beta, 
                               cuComplex *y, int incy);
 void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
                               int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, 
                               cuDoubleComplex *y, int incy);
 /*------------------------------------------------------------------------*/       
 /* SBMV/HBMV */
 void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, 
                               const float *A, int lda, const float *x, int incx, 
                               float beta, float *y, int incy);
 void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, 
                               const double *A, int lda, const double *x, int incx, 
                               double beta, double *y, int incy);
 void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, 
                               const cuComplex *A, int lda, const cuComplex *x, int incx, 
                               cuComplex beta, cuComplex *y, int incy);
 void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, 
                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, 
                               cuDoubleComplex beta, cuDoubleComplex *y, int incy);
 /*------------------------------------------------------------------------*/       
 /* SPMV/HPMV */
 void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
                              const float *AP, const float *x,
                              int incx, float beta, float *y, int incy);
 void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
                              const double *AP, const double *x,
                              int incx, double beta, double *y, int incy);
 void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
                              const cuComplex *AP, const cuComplex *x,
                              int incx, cuComplex beta, cuComplex *y, int incy);
 void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
                              const cuDoubleComplex *AP, const cuDoubleComplex *x,
                              int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
 /*------------------------------------------------------------------------*/       
 /* GER */
 void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
                              const float *y, int incy, float *A, int lda);
 void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
                              const double *y, int incy, double *A, int lda);
 void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
                               int incx, const cuComplex *y, int incy,
                               cuComplex *A, int lda);
 void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
                               int incx, const cuComplex *y, int incy,
                               cuComplex *A, int lda);
 void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
                               int incx, const cuDoubleComplex *y, int incy,
                               cuDoubleComplex *A, int lda);
 void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
                               int incx, const cuDoubleComplex *y, int incy,
                               cuDoubleComplex *A, int lda);
 /*------------------------------------------------------------------------*/       
 /* SYR/HER */
 /* Note: the complex HER variants take a real alpha (float / double), unlike HER2 below. */
 void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
                              int incx, float *A, int lda);
 void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
                              int incx, double *A, int lda);
 void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, 
                              const cuComplex *x, int incx, cuComplex *A, int lda);
 void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, 
                              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
 /*------------------------------------------------------------------------*/       
 /* SPR/HPR */
 /* Note: the complex HPR variants take a real alpha (float / double), unlike HPR2 below. */
 void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
                              int incx, float *AP);
 void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
                              int incx, double *AP);
 void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
                              int incx, cuComplex *AP);
 void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
                              int incx, cuDoubleComplex *AP);
 /*------------------------------------------------------------------------*/       
 /* SYR2/HER2 */
 void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, 
                               int incx, const float *y, int incy, float *A, 
                               int lda);
 void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, 
                               int incx, const double *y, int incy, double *A, 
                               int lda);
 void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, 
                               int incx, const cuComplex *y, int incy, cuComplex *A, 
                               int lda);
 void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
                               int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, 
                               int lda);
 /*------------------------------------------------------------------------*/       
 /* SPR2/HPR2 */
 void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, 
                               int incx, const float *y, int incy, float *AP);
 void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
                               const double *x, int incx, const double *y,
                               int incy, double *AP);
 void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
                               const cuComplex *x, int incx, const cuComplex *y,
                               int incy, cuComplex *AP);
 void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
                               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
                               int incy, cuDoubleComplex *AP);
 /* ------------------------BLAS3 Functions ------------------------------- */
 /*
  * Reading the prototypes in this section:
  *  - The letter after "cublas" selects the element type used in the
  *    signature: S = float, D = double, C = cuComplex, Z = cuDoubleComplex.
  *  - Every routine returns void; the result goes to the only non-const
  *    matrix pointer (C for most routines, B for TRSM / TRMM).
  *  - Mode selectors (transa, transb, trans, side, uplo, diag) are plain
  *    char values. Their accepted characters are not visible in this header -
  *    see the cuBLAS / BLAS documentation.
  *  - Each matrix pointer is paired with its own int leading dimension
  *    (lda, ldb, ldc); sizes m, n, k are ints.
  * NOTE(review): pointer arguments are presumably device pointers, as in the
  * legacy cuBLAS API - confirm against the cuBLAS documentation.
  */
 /* GEMM */
 void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, 
                               float alpha, const float *A, int lda, 
                               const float *B, int ldb, float beta, float *C, 
                               int ldc);
 void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
                               double alpha, const double *A, int lda, 
                               const double *B, int ldb, double beta, double *C, 
                               int ldc);              
 void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, 
                               cuComplex alpha, const cuComplex *A, int lda,
                               const cuComplex *B, int ldb, cuComplex beta,
                               cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
                               int k, cuDoubleComplex alpha,
                               const cuDoubleComplex *A, int lda,
                               const cuDoubleComplex *B, int ldb,
                               cuDoubleComplex beta, cuDoubleComplex *C,
                               int ldc);                   
 /* -------------------------------------------------------*/
 /* SYRK */
 void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, 
                               const float *A, int lda, float beta, float *C, 
                               int ldc);
 void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
                               double alpha, const double *A, int lda,
                               double beta, double *C, int ldc);
 void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
                               cuComplex alpha, const cuComplex *A, int lda,
                               cuComplex beta, cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
                               cuDoubleComplex alpha,
                               const cuDoubleComplex *A, int lda,
                               cuDoubleComplex beta,
                               cuDoubleComplex *C, int ldc);
 /* ------------------------------------------------------- */
 /* HERK */
 /* Complex types only; note that both alpha and beta are real (float / double) here. */
 void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
                               float alpha, const cuComplex *A, int lda,
                               float beta, cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
                               double alpha,
                               const cuDoubleComplex *A, int lda,
                               double beta,
                               cuDoubleComplex *C, int ldc);
 /* ------------------------------------------------------- */
 /* SYR2K */
 void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, 
                                const float *A, int lda, const float *B, int ldb, 
                                float beta, float *C, int ldc);
 void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
                                double alpha, const double *A, int lda,
                                const double *B, int ldb, double beta,
                                double *C, int ldc);
 void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
                                cuComplex alpha, const cuComplex *A, int lda,
                                const cuComplex *B, int ldb, cuComplex beta,
                                cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
                                const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
                                cuDoubleComplex *C, int ldc);                             
 /* ------------------------------------------------------- */
 /* HER2K */
 /* Complex types only; alpha is complex but beta is real (float / double) here. */
 void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
                                cuComplex alpha, const cuComplex *A, int lda,
                                const cuComplex *B, int ldb, float beta,
                                cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
                                const cuDoubleComplex *B, int ldb, double beta,
                                cuDoubleComplex *C, int ldc); 
 /*------------------------------------------------------------------------*/       
 /* SYMM*/
 void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, 
                               const float *A, int lda, const float *B, int ldb,
                               float beta, float *C, int ldc);
 void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, 
                               const double *A, int lda, const double *B, int ldb,
                               double beta, double *C, int ldc);
 void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, 
                               const cuComplex *A, int lda, const cuComplex *B, int ldb,
                               cuComplex beta, cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, 
                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
                               cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
 /*------------------------------------------------------------------------*/       
 /* HEMM*/
 void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
                               cuComplex alpha, const cuComplex *A, int lda,
                               const cuComplex *B, int ldb, cuComplex beta,
                               cuComplex *C, int ldc);
 void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
                               const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
                               cuDoubleComplex *C, int ldc);  
 /*------------------------------------------------------------------------*/       
 /* TRSM*/
 /* B is the only non-const matrix: there is no separate output argument. */
 void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
                               int m, int n, float alpha, const float *A, int lda,
                               float *B, int ldb);
 void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
                               char diag, int m, int n, double alpha,
                               const double *A, int lda, double *B,
                               int ldb);
 void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
                               int m, int n, cuComplex alpha, const cuComplex *A,
                               int lda, cuComplex *B, int ldb);
 void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
                               char diag, int m, int n, cuDoubleComplex alpha,
                               const cuDoubleComplex *A, int lda,
                               cuDoubleComplex *B, int ldb);                                                        
 /*------------------------------------------------------------------------*/       
 /* TRMM*/
 /* B is the only non-const matrix: there is no separate output argument. */
 void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
                               int m, int n, float alpha, const float *A, int lda,
                               float *B, int ldb);
 void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
                               char diag, int m, int n, double alpha,
                               const double *A, int lda, double *B,
                               int ldb);
 void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
                               int m, int n, cuComplex alpha, const cuComplex *A,
                               int lda, cuComplex *B, int ldb);
 void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
                               char diag, int m, int n, cuDoubleComplex alpha,
                               const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
                               int ldb);
 #if defined(__cplusplus)
 }
 #endif /* __cplusplus */
 #endif /* !defined(CUBLAS_H_) */
--- a/include/external/cuda/cublas_api.h
+++ b/include/external/cuda/cublas_api.h
--- a/include/external/cuda/cuda_device_runtime_api.h
+++ b/include/external/cuda/cuda_device_runtime_api.h
@@ -1,228 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
 #define __CUDA_DEVICE_RUNTIME_API_H__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #if defined(__CUDABE__)
 /*
  * Fallback definitions compiled only when __CUDA_ARCH__ >= 350 and
  * __CUDADEVRT_INTERNAL__ is not defined. Each function below is marked
  * nv_weak, ignores its arguments and returns cudaErrorUnknown.
  * NOTE(review): presumably these weak symbols are superseded by the real
  * device runtime when it is linked in -- confirm against the CUDA toolchain.
  */
 #if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
 /* Forward declaration; only a pointer to this struct is used below. */
 struct cudaFuncAttributes;
 /* Weak stub: always returns cudaErrorUnknown. */
 __device__ __attribute__((nv_weak)) cudaError_t cudaMalloc(void **p, size_t s) 
 { 
  return cudaErrorUnknown;
 }
 /* Weak stub: always returns cudaErrorUnknown. */
 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) 
 { 
  return cudaErrorUnknown;
 }
 /* Weak stub: always returns cudaErrorUnknown; *value is not written. */
 __device__ __attribute__((nv_weak)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
 {
  return cudaErrorUnknown;
 }
 /* Weak stub: always returns cudaErrorUnknown; *device is not written. */
 __device__ __attribute__((nv_weak)) cudaError_t cudaGetDevice(int *device)
 {
  return cudaErrorUnknown;
 }
 /* Weak stub: always returns cudaErrorUnknown; *numBlocks is not written. */
 __device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
 {
  return cudaErrorUnknown;
 }
 /* Weak stub: always returns cudaErrorUnknown; *numBlocks is not written. */
 __device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
 {
  return cudaErrorUnknown;
 }
 #endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
 #else /* defined(__CUDABE__) */
 #if defined(__cplusplus) && defined(__CUDACC__)         // Visible to nvcc front-end only
 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)   // Visible to SM>=3.5 and "__host__ __device__" only
 #include "driver_types.h"
 #include "host_defines.h"
 extern "C"
 {
 /*
  * Device-callable runtime entry points. All of them are declarations only;
  * no definitions appear in this header section.
  * NOTE(review): several functions come in pairs, NAME and NAME_ptsz. The
  * _ptsz names are presumably the per-thread-default-stream entry points (see
  * the CUDA_API_PER_THREAD_DEFAULT_STREAM handling of cudaLaunchDevice further
  * down in this header) -- confirm for the memcpy/memset/event variants.
  */
 /* Device attribute, limit and configuration queries. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
 /* Error retrieval and error-code-to-text helpers. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
 extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
 extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
 /* Device enumeration. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
 /* Stream and event management. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
 /* Function attributes and memory allocation. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
 /* Asynchronous copies (1D, 2D, 3D); every variant takes the stream to use. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
 /* Asynchronous memsets (1D, 2D, 3D); every variant takes the stream to use. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
 /* Runtime version query; the result is delivered through runtimeVersion. */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
 /**
 * \ingroup CUDART_EXECUTION
 * \brief Obtains a parameter buffer
 *
 * Obtains a parameter buffer which can be filled with parameters for a kernel launch.
 * Parameters passed to ::cudaLaunchDevice must be allocated via this function.
 *
 * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
 * CUDA user code should use <<< >>> to launch kernels.
 *
 * \param alignment - Specifies alignment requirement of the parameter buffer
 * \param size      - Specifies size requirement in bytes
 *
 * \return
 * Returns pointer to the allocated parameterBuffer
 * \notefnerr
 *
 * \sa cudaLaunchDevice
 */
 extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
 /*
  * NOTE(review): cudaGetParameterBufferV2 carries no description of its own in
  * this header. Judging by its name and parameters it looks like a variant of
  * ::cudaGetParameterBuffer that is given the kernel and launch configuration
  * instead of an alignment/size pair -- confirm against the CUDA documentation.
  */
 extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
 /**
 * \ingroup CUDART_EXECUTION
 * \brief Launches a specified kernel
 *
 * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
 * by calling ::cudaGetParameterBuffer().
 *
 * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
 * CUDA user code should use <<< >>> to launch the kernels.
 *
 * The parameter list below is the one shared by ::cudaLaunchDevice_ptsz and ::cudaLaunchDevice.
 *
 * \param func            - Pointer to the kernel to be launched
 * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
 * \param gridDimension   - Specifies grid dimensions
 * \param blockDimension  - Specifies block dimensions
 * \param sharedMemSize   - Specifies size of shared memory
 * \param stream          - Specifies the stream to be used
 *
 * \return
 * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
 * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
 * \notefnerr
 * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
 * Guide for the detailed descriptions of launch configuration and parameter layout respectively.
 *
 * \sa cudaGetParameterBuffer
 */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
 /*
  * V2 launch entry point: takes only the parameter buffer and the stream.
  * NOTE(review): presumably meant to be paired with a buffer obtained from
  * cudaGetParameterBufferV2, which already receives the kernel and launch
  * configuration -- confirm.
  */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
 /*
  * cudaLaunchDevice / cudaLaunchDeviceV2 are provided in one of two forms:
  * inline forwarders to the _ptsz entry points, or plain extern declarations.
  */
 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
    // When compiling for the device and per thread default stream is enabled, add
    // a static inline redirect to the per thread stream entry points.
    // Forwards every argument unchanged to cudaLaunchDevice_ptsz and returns its result.
    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
    cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
    {
        return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
    }
    // Forwards both arguments unchanged to cudaLaunchDeviceV2_ptsz and returns its result.
    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
    cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
    {
        return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
    }
 #else
    // Otherwise the two names are only declared here, with the same signatures
    // as the inline forwarders above; no definition is given in this header.
    extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
    extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
 #endif
 /*
  * Occupancy queries; the result is delivered through numBlocks. The WithFlags
  * form takes one extra unsigned flags argument and is otherwise declared
  * identically. Under __CUDABE__ the same two names are given nv_weak stub
  * definitions earlier in this header.
  */
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
 extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
 }
 /*
  * Forward declarations of templated overloads of four of the entry points
  * declared in the extern "C" block above. They take a typed T** / T* / T
  * argument where the C declarations take void** / const void*. Only the
  * declarations appear here; the definitions are not in this part of the header.
  */
 template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
 template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
 template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
 template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
 #endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
 #endif // defined(__cplusplus) && defined(__CUDACC__)
 #endif /* defined(__CUDABE__) */
 #endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
--- a/include/external/cuda/cuda_fp16.h
+++ b/include/external/cuda/cuda_fp16.h
--- a/include/external/cuda/cuda_runtime.h
+++ b/include/external/cuda/cuda_runtime.h
--- a/include/external/cuda/cuda_runtime_api.h
+++ b/include/external/cuda/cuda_runtime_api.h
--- a/include/external/cuda/device_types.h
+++ b/include/external/cuda/device_types.h
@@ -1,69 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__DEVICE_TYPES_H__)
 #define __DEVICE_TYPES_H__
 #include "host_defines.h"
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /*
  * Rounding-mode selector. No explicit values are assigned, so the enumerators
  * take the consecutive values 0..3 in declaration order.
  * NOTE(review): the per-enumerator meanings below are read off the names --
  * confirm against the CUDA documentation.
  */
 enum __device_builtin__ cudaRoundMode
 {
    cudaRoundNearest, /* 0: presumably round to nearest */
    cudaRoundZero,    /* 1: presumably round toward zero */
    cudaRoundPosInf,  /* 2: presumably round toward +infinity */
    cudaRoundMinInf   /* 3: presumably round toward -infinity */
 };
 #endif /* !__DEVICE_TYPES_H__ */
--- a/include/external/cuda/driver_functions.h
+++ b/include/external/cuda/driver_functions.h
@@ -1,145 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__DRIVER_FUNCTIONS_H__)
 #define __DRIVER_FUNCTIONS_H__
 #include "builtin_types.h"
 #include "host_defines.h"
 #include "driver_types.h"
 /**
 * \addtogroup CUDART_MEMORY
 *
 * @{
 */
 /**
 * \brief Returns a cudaPitchedPtr based on input parameters
 *
 * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
 * \p p, \p xsz, and \p ysz.
 *
 * \param d   - Pointer to allocated memory
 * \param p   - Pitch of allocated memory in bytes
 * \param xsz - Logical width of allocation in elements
 * \param ysz - Logical height of allocation in elements
 *
 * \return
 * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
 *
 * \sa make_cudaExtent, make_cudaPos
 */
 static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) 
 {
  struct cudaPitchedPtr s;
  s.ptr   = d;
  s.pitch = p;
  s.xsize = xsz;
  s.ysize = ysz;
  return s;
 }
 /**
 * \brief Returns a cudaPos based on input parameters
 *
 * Returns a ::cudaPos based on the specified input parameters \p x,
 * \p y, and \p z.
 *
 * \param x - X position
 * \param y - Y position
 * \param z - Z position
 *
 * \return
 * ::cudaPos specified by \p x, \p y, and \p z
 *
 * \sa make_cudaExtent, make_cudaPitchedPtr
 */
 static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) 
 {
  struct cudaPos p;
  p.x = x;
  p.y = y;
  p.z = z;
  return p;
 }
 /**
 * \brief Returns a cudaExtent based on input parameters
 *
 * Returns a ::cudaExtent based on the specified input parameters \p w,
 * \p h, and \p d.
 *
 * \param w - Width in bytes
 * \param h - Height in elements
 * \param d - Depth in elements
 *
 * \return
 * ::cudaExtent specified by \p w, \p h, and \p d
 *
 * \sa make_cudaPitchedPtr, make_cudaPos
 */
 static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) 
 {
  struct cudaExtent e;
  e.width  = w;
  e.height = h;
  e.depth  = d;
  return e;
 }
 /** @} */ /* END CUDART_MEMORY */
 #endif /* !__DRIVER_FUNCTIONS_H__ */
--- a/include/external/cuda/driver_types.h
+++ b/include/external/cuda/driver_types.h
--- a/include/external/cuda/host_config.h
+++ b/include/external/cuda/host_config.h
@@ -1,201 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__HOST_CONFIG_H__)
 #define __HOST_CONFIG_H__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #if defined(__CUDACC__)
 #if defined(__CUDACC_RTC__)
 #define _CRTIMP
 #define __THROW
 #else /* __CUDACC_RTC__ */
 /* check for host compilers that are compatible with nvcc */
 /* Baseline gate: stop unless the host compiler defines __GNUC__ or _WIN32. */
 #if !defined(__GNUC__) && !defined(_WIN32)
 #error --- !!! UNSUPPORTED COMPILER !!! ---
 #endif /* !__GNUC__ && !_WIN32 */
 /* Intel compiler: stop unless __ICC is exactly 1500 and both __GNUC__ and __LP64__ are defined. */
 #if defined(__ICC)
 #if __ICC != 1500 || !defined(__GNUC__) || !defined(__LP64__)
 #error -- unsupported ICC configuration! Only ICC 15.0 on Linux x86_64 is supported!
 #endif /* __ICC != 1500 || !__GNUC__ || !__LP64__ */
 #endif /* __ICC */
 /* PGI compiler: stop unless the version is exactly 15.4 and both __GNUC__ and __LP64__ are defined. */
 #if defined(__PGIC__)
 #if __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !defined(__GNUC__) || !defined(__LP64__)
 #error -- unsupported pgc++ configuration! Only pgc++ 15.4 on Linux x86_64 is supported!
 #endif /* __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !__GNUC__ || !__LP64__ */
 #endif /* __PGIC__ */
 /* PowerPC hosts: only 64-bit little-endian targets are accepted, and xlC must be a 13.1 release. */
 #if defined(__powerpc__)
 #if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
 #error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
 #endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
 /*
  * Reject any xlC whose __ibmxl_vrm__ lies outside [0x0d010000, 0x0d020000),
  * i.e. anything other than what the message below calls xlC 13.1. The two
  * bounds must be joined with ||: a value can never be below the lower bound
  * and at or above the upper bound at the same time, so joining them with &&
  * makes the #error unreachable.
  */
 #if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
 #error -- unsupported xlC version! only xlC 13.1 is supported
 #endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
 #endif /* __powerpc__ */
 /*
  * GNU-compatible host compilers: stop on any __GNUC__/__GNUC_MINOR__ newer
  * than 4.9, and on Mac OS X stop unless __clang__ is defined.
  */
 #if defined(__GNUC__)
 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9)
 #error -- unsupported GNU version! gcc versions later than 4.9 are not supported!
 #endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9) */
 #if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
 #error -- clang and clang++ are the only supported host compilers on Mac OS X!
 #endif /* __APPLE__ && __MACH__ && !__clang__ */
 #endif /* __GNUC__ */
 /*
  * Windows hosts: stop unless _MSC_VER is within 1600..1800 inclusive.
  * NOTE(review): an undefined _MSC_VER evaluates as 0 in #if, so a _WIN32
  * compiler that does not define it also hits this #error.
  */
 #if defined(_WIN32)
 #if _MSC_VER < 1600 || _MSC_VER > 1800
 #error -- unsupported Microsoft Visual Studio version! Only the versions 2010, 2012, and 2013 are supported!
 #endif /* _MSC_VER < 1600 || _MSC_VER > 1800 */
 #endif /* _WIN32 */
 /* configure host compiler */
 /*
  * Per-host definitions of _CRTIMP and __THROW. Exactly one branch of the chain
  * is taken; the platform-specific branches (__APPLE__, __ANDROID__, __QNX__)
  * are tested before the generic __GNUC__ one.
  * NOTE(review): presumably that order exists because those platforms also
  * define __GNUC__ but must not take the <features.h> path -- confirm.
  */
 #if defined(__APPLE__)
 /* Apple: both macros expand to nothing. */
 #define _CRTIMP
 #define __THROW
 #if defined(__BLOCKS__) /* nvcc does not support closures */
 #undef __BLOCKS__
 #endif /* __BLOCKS__ */
 #elif defined(__ANDROID__)
 /* Android: both macros expand to nothing. */
 #define _CRTIMP
 #define __THROW
 #elif defined(__QNX__)
 /* QNX: both macros expand to nothing. */
 #define _CRTIMP
 #define __THROW
 #elif defined(__GNUC__)
 /* Other GNU-compatible hosts: _CRTIMP is empty, __THROW comes from <features.h>. */
 #define _CRTIMP
 #include <features.h> /* for __THROW */
 #elif defined(_WIN32)
 /* Windows: for _MSC_VER >= 1500, force _USE_DECLSPECS_FOR_SAL to 1 regardless of any earlier definition. */
 #if _MSC_VER >= 1500
 #undef _USE_DECLSPECS_FOR_SAL
 #define _USE_DECLSPECS_FOR_SAL \
        1
 #endif /* _MSC_VER >= 1500 */
 /* The three macros below are defined only if the including code has not already defined them. */
 #if !defined(_CRT_NONSTDC_NO_WARNINGS)
 #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
 #endif /* !_CRT_NONSTDC_NO_WARNINGS */
 #if !defined(_CRT_SECURE_NO_WARNINGS)
 #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
 #endif /* !_CRT_SECURE_NO_WARNINGS */
 #if !defined(NOMINMAX)
 #define NOMINMAX /* min and max are part of cuda runtime */
 #endif /* !NOMINMAX */
 /* _CRTIMP comes from <crtdefs.h>; __THROW expands to nothing. */
 #include <crtdefs.h> /* for _CRTIMP */
 #define __THROW
 #endif /* __APPLE__ */
 #endif /* __CUDACC_RTC__ */
 #endif /* __CUDACC__ */
 #endif /* !__HOST_CONFIG_H__ */
--- a/include/external/cuda/host_defines.h
+++ b/include/external/cuda/host_defines.h
@@ -1,241 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__HOST_DEFINES_H__)
 #define __HOST_DEFINES_H__
 /* Maps the CUDA-style decoration macros (__no_return__, __forceinline__,
    __align__, __thread__, __import__, __export__, __annotate__, __location__,
    CUDARTAPI, ...) onto the host compiler's native syntax. Three cases:
      1. GNU-style compilers, libdevice builds and NVRTC: __attribute__((...)).
      2. MSVC: __declspec(...) and MSVC keywords.
      3. Anything else: the user must predefine __align__ and CUDARTAPI,
         otherwise compilation stops with #error. */
 /* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
 #if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
 #if defined(__CUDACC_RTC__)
 #define __volatile__ volatile
 #endif /* __CUDACC_RTC__ */
 #define __no_return__ \
        __attribute__((noreturn))
 #if defined(__CUDACC__) || defined(__CUDA_ARCH__)
 /* gcc allows users to define attributes with underscores, 
    e.g., __attribute__((__noinline__)).
    Consider a non-CUDA source file (e.g. .cpp) that has the 
    above attribute specification, and includes this header file. In that case,
    defining __noinline__ as below  would cause a gcc compilation error.
    Hence, only define __noinline__ when the code is being processed
    by a  CUDA compiler component.
 */   
 #define __noinline__ \
        __attribute__((noinline))
 #endif /* __CUDACC__  || __CUDA_ARCH__ */       
 #define __forceinline__ \
        __inline__ __attribute__((always_inline))
 #define __align__(n) \
        __attribute__((aligned(n)))
 #define __thread__ \
        __thread
 /* Import/export decorations and the __cdecl keyword expand to nothing in
    this branch. */
 #define __import__
 #define __export__
 #define __cdecl
 /* __location__(a) forwards to __annotate__(a), i.e. __attribute__((a)). */
 #define __annotate__(a) \
        __attribute__((a))
 #define __location__(a) \
        __annotate__(a)
 #define CUDARTAPI
 #elif defined(_MSC_VER)
 /* __restrict__ maps to __restrict from _MSC_VER 1400 on and to nothing on
    older versions. */
 #if _MSC_VER >= 1400
 #define __restrict__ \
        __restrict
 #else /* _MSC_VER >= 1400 */
 #define __restrict__
 #endif /* _MSC_VER >= 1400 */
 #define __inline__ \
        __inline
 #define __no_return__ \
        __declspec(noreturn)
 #define __noinline__ \
        __declspec(noinline)
 #define __forceinline__ \
        __forceinline
 #define __align__(n) \
        __declspec(align(n))
 #define __thread__ \
        __declspec(thread)
 #define __import__ \
        __declspec(dllimport)
 #define __export__ \
        __declspec(dllexport)
 /* Unlike the GNU branch, __location__(a) wraps the name in double
    underscores before annotating: __location__(host) becomes
    __declspec(__host__). */
 #define __annotate__(a) \
        __declspec(a)
 #define __location__(a) \
        __annotate__(__##a##__)
 #define CUDARTAPI \
        __stdcall
 #else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
 /* Unknown compiler: __inline__ becomes a no-op; __align__ and CUDARTAPI must
    already be defined by the including code.
    NOTE(review): __annotate__ and __location__ are not defined in this
    branch, although macros defined later in this header expand to
    __location__(...) -- confirm they are expected to be supplied externally. */
 #define __inline__
 #if !defined(__align__)
 #error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
 #endif /* !__align__ */
 #if !defined(CUDARTAPI)
 #error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
 #endif /* !CUDARTAPI */
 #endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
 /* __specialization_static expands to `static` for:
      - GNU-compatible compilers reporting a major version below 4, or
        version 4.x with x below 3 when __clang__ is not defined;
      - MSVC with _MSC_VER below 1900;
      - compilers that define neither __GNUC__ nor _MSC_VER.
    For every other compiler it expands to nothing. */
 #if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
    (defined(_MSC_VER) && _MSC_VER < 1900) || \
    (!defined(__GNUC__) && !defined(_MSC_VER))
 #define __specialization_static \
        static
 #else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
         (_MSC_VER && _MSC_VER < 1900) ||
         (!__GNUC__ && !_MSC_VER) */
 #define __specialization_static
 #endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
         (_MSC_VER && _MSC_VER < 1900) ||
         (!__GNUC__ && !_MSC_VER) */
 /* When neither __CUDACC__ nor __CUDABE__ is defined, __annotate__ is
    redefined to expand to nothing, so every macro built on it (including
    __location__) disappears from the translation unit. Otherwise the variadic
    __launch_bounds__(...) macro is defined in terms of __annotate__.
    NOTE(review): __launch_bounds__ is left undefined in the first case. */
 #if !defined(__CUDACC__) && !defined(__CUDABE__)
 #undef __annotate__
 #define __annotate__(a)
 #else /* !__CUDACC__ && !__CUDABE__ */
 #define __launch_bounds__(...) \
        __annotate__(launch_bounds(__VA_ARGS__))
 #endif /* !__CUDACC__ && !__CUDABE__ */
 /* __builtin_align__(a) forwards to __align__(a) when any of __CUDACC__,
    __CUDABE__, __GNUC__ or _WIN64 is defined; in every other configuration it
    expands to nothing, i.e. no alignment is requested. */
 #if defined(__CUDACC__) || defined(__CUDABE__) || \
    defined(__GNUC__) || defined(_WIN64)
 #define __builtin_align__(a) \
        __align__(a)
 #else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
 #define __builtin_align__(a)
 #endif /* __CUDACC__ || __CUDABE__ || __GNUC__  || _WIN64 */
 /* CUDA qualifier macros. Each one is defined unconditionally as
    __location__(<name>); what that finally expands to depends on how
    __location__ and __annotate__ were defined earlier in this header. */
 #define __host__ \
        __location__(host)
 #define __device__ \
        __location__(device)
 #define __global__ \
        __location__(global)
 #define __shared__ \
        __location__(shared)
 #define __constant__ \
        __location__(constant)
 #define __managed__ \
        __location__(managed)
 /* Marker macros attached to built-in CUDA types and runtime entry points.
    They expand to nothing when __CUDACC__ is not defined, or when __CUDABE__
    is defined without __CUDACC_INTEGRATED__. In all other configurations they
    expand to the matching __location__(...) annotation. */
 #if (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !defined(__CUDACC__)
 #define __device_builtin__
 #define __device_builtin_texture_type__
 #define __device_builtin_surface_type__
 #define __cudart_builtin__
 #else /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__))  || !__CUDACC__ */
 #define __device_builtin__ \
        __location__(device_builtin)
 #define __device_builtin_texture_type__ \
        __location__(device_builtin_texture_type)
 #define __device_builtin_surface_type__ \
        __location__(device_builtin_surface_type)
 #define __cudart_builtin__ \
        __location__(cudart_builtin)
 #endif /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__))  || !__CUDACC__ */
 /* Clang-only shim, active when both __CUDACC__ and __clang__ are defined.
    It requires __has_feature and, for C++ translation units, replaces the
    _Atomic(X) spelling with the class template __nv_clang_atomic_t<X>.
    Presumably this lets headers that use _Atomic(X) be parsed in this mode --
    TODO confirm. */
 #if defined(__CUDACC__) && defined(__clang__)
 #if !defined(__has_feature)
 #error --- !!! The Clang version does not support __has_feature !!! ---
 #endif /* !__has_feature */
 /* NOTE(review): the __CUDACC__ test below repeats the enclosing condition. */
 #if defined(__cplusplus) && defined(__CUDACC__)
 /* Use noexcept when the compiler reports support for it; otherwise fall back
    to throw() and drop the conditional noexcept(x) form entirely. */
 #if (__has_feature(cxx_noexcept))
 #define NV_CLANG_ATOMIC_NOEXCEPT noexcept
 #define NV_CLANG_ATOMIC_NOEXCEPT_(x) noexcept(x)
 #else /* !__has_feature(cxx_noexcept) */
 #define NV_CLANG_ATOMIC_NOEXCEPT throw()
 #define NV_CLANG_ATOMIC_NOEXCEPT_(x)
 #endif /* __has_feature(cxx_noexcept) */
 /* Stand-in for _Atomic(T): declares a default constructor, a converting
    constructor from T, and conversions back to T (plain and volatile). The
    members are only declared here; no definitions appear in this header. */
 template <typename T> struct __nv_clang_atomic_t {
  __nv_clang_atomic_t() NV_CLANG_ATOMIC_NOEXCEPT;
  __nv_clang_atomic_t(const T &x) NV_CLANG_ATOMIC_NOEXCEPT; 
  operator T() volatile NV_CLANG_ATOMIC_NOEXCEPT;
  operator T() NV_CLANG_ATOMIC_NOEXCEPT;
 };
 #define _Atomic(X) __nv_clang_atomic_t<X>
 #endif /* defined(__cplusplus) && defined(__CUDACC__) */
 #endif /* __CUDACC__ && __clang__ */
 #endif /* !__HOST_DEFINES_H__ */
--- a/include/external/cuda/surface_types.h
+++ b/include/external/cuda/surface_types.h
@@ -1,119 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__SURFACE_TYPES_H__)
 #define __SURFACE_TYPES_H__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #include "driver_types.h"
 /**
 * \addtogroup CUDART_TYPES
 *
 * @{
 */
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* Integer codes naming the supported surface kinds (1D, 2D, 3D, cubemap and
    their layered variants). The values are not contiguous.
    NOTE(review): how these codes are consumed is not visible in this header. */
 #define cudaSurfaceType1D              0x01
 #define cudaSurfaceType2D              0x02
 #define cudaSurfaceType3D              0x03
 #define cudaSurfaceTypeCubemap         0x0C
 #define cudaSurfaceType1DLayered       0xF1
 #define cudaSurfaceType2DLayered       0xF2
 #define cudaSurfaceTypeCubemapLayered  0xFC
 /**
 * CUDA Surface boundary modes
 *
 * Every enumerator is given an explicit value, so the numeric codes do not
 * depend on the order in which the enumerators are listed.
 */
 enum __device_builtin__ cudaSurfaceBoundaryMode
 {
    cudaBoundaryModeZero  = 0,    /**< Zero boundary mode */
    cudaBoundaryModeClamp = 1,    /**< Clamp boundary mode */
    cudaBoundaryModeTrap  = 2     /**< Trap boundary mode */
 };
 /**
 * CUDA Surface format modes
 *
 * Every enumerator is given an explicit value, so the numeric codes do not
 * depend on the order in which the enumerators are listed.
 */
 enum __device_builtin__  cudaSurfaceFormatMode
 {
    cudaFormatModeForced = 0,     /**< Forced format mode */
    cudaFormatModeAuto = 1        /**< Auto format mode */
 };
 /**
 * CUDA Surface reference
 *
 * Holds a single member, the channel format descriptor. The type
 * cudaChannelFormatDesc is not declared in this file; presumably it comes
 * from the included driver_types.h.
 */
 struct __device_builtin__ surfaceReference
 {
    /**
     * Channel descriptor for surface reference
     */
    struct cudaChannelFormatDesc channelDesc;
 };
 /**
 * An opaque value that represents a CUDA Surface object
 *
 * Represented as an unsigned long long; callers should treat the value as a
 * handle and not interpret its bits.
 */
 typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
 /** @} */ /* END CUDART_TYPES */
 #endif /* !__SURFACE_TYPES_H__ */
--- a/include/external/cuda/texture_types.h
+++ b/include/external/cuda/texture_types.h
@@ -1,213 +0,0 @@
 /*
 * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__TEXTURE_TYPES_H__)
 #define __TEXTURE_TYPES_H__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #include "driver_types.h"
 /**
 * \addtogroup CUDART_TYPES
 *
 * @{
 */
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* Integer codes naming the supported texture kinds (1D, 2D, 3D, cubemap and
    their layered variants). The values are not contiguous.
    NOTE(review): how these codes are consumed is not visible in this header. */
 #define cudaTextureType1D              0x01
 #define cudaTextureType2D              0x02
 #define cudaTextureType3D              0x03
 #define cudaTextureTypeCubemap         0x0C
 #define cudaTextureType1DLayered       0xF1
 #define cudaTextureType2DLayered       0xF2
 #define cudaTextureTypeCubemapLayered  0xFC
 /**
 * CUDA texture address modes
 *
 * Every enumerator is given an explicit value, so the numeric codes do not
 * depend on the order in which the enumerators are listed.
 */
 enum __device_builtin__ cudaTextureAddressMode
 {
    cudaAddressModeWrap   = 0,    /**< Wrapping address mode */
    cudaAddressModeClamp  = 1,    /**< Clamp to edge address mode */
    cudaAddressModeMirror = 2,    /**< Mirror address mode */
    cudaAddressModeBorder = 3     /**< Border address mode */
 };
 /**
 * CUDA texture filter modes
 *
 * Used in this header both for the regular filter mode and for the mipmap
 * filter mode of textureReference and cudaTextureDesc.
 */
 enum __device_builtin__ cudaTextureFilterMode
 {
    cudaFilterModePoint  = 0,     /**< Point filter mode */
    cudaFilterModeLinear = 1      /**< Linear filter mode */
 };
 /**
 * CUDA texture read modes
 *
 * Within this header the read mode appears as a member of cudaTextureDesc
 * only; textureReference has no such member.
 */
 enum __device_builtin__ cudaTextureReadMode
 {
    cudaReadModeElementType     = 0,  /**< Read texture as specified element type */
    cudaReadModeNormalizedFloat = 1   /**< Read texture as normalized float */
 };
 /**
 * CUDA texture reference
 *
 * Plain aggregate of sampling parameters. The order and types of the members
 * define the structure's binary layout, so they should not be rearranged.
 */
 struct __device_builtin__ textureReference
 {
    /**
     * Indicates whether texture reads are normalized or not
     */
    int                          normalized;
    /**
     * Texture filter mode
     */
    enum cudaTextureFilterMode   filterMode;
    /**
     * Texture address mode for up to 3 dimensions
     */
    enum cudaTextureAddressMode  addressMode[3];
    /**
     * Channel descriptor for the texture reference
     */
    struct cudaChannelFormatDesc channelDesc;
    /**
     * Perform sRGB->linear conversion during texture read
     */
    int                          sRGB;
    /**
     * Limit to the anisotropy ratio
     */
    unsigned int                 maxAnisotropy;
    /**
     * Mipmap filter mode
     */
    enum cudaTextureFilterMode   mipmapFilterMode;
    /**
     * Offset applied to the supplied mipmap level
     */
    float                        mipmapLevelBias;
    /**
     * Lower end of the mipmap level range to clamp access to
     */
    float                        minMipmapLevelClamp;
    /**
     * Upper end of the mipmap level range to clamp access to
     */
    float                        maxMipmapLevelClamp;
    /**
     * Reserved storage (15 ints). Its purpose is not documented in this
     * header; presumably padding kept for future fields -- do not rely on
     * its contents.
     */
    int                          __cudaReserved[15];
 };
 /**
 * CUDA texture descriptor
 *
 * Plain aggregate of sampling parameters. Compared with textureReference in
 * this header it adds a read mode, names the normalization flag
 * normalizedCoords, and carries no channel descriptor or reserved storage.
 * The order and types of the members define the structure's binary layout.
 */
 struct __device_builtin__ cudaTextureDesc
 {
    /**
     * Texture address mode for up to 3 dimensions
     */
    enum cudaTextureAddressMode addressMode[3];
    /**
     * Texture filter mode
     */
    enum cudaTextureFilterMode  filterMode;
    /**
     * Texture read mode
     */
    enum cudaTextureReadMode    readMode;
    /**
     * Perform sRGB->linear conversion during texture read
     */
    int                         sRGB;
    /**
     * Indicates whether texture reads are normalized or not
     */
    int                         normalizedCoords;
    /**
     * Limit to the anisotropy ratio
     */
    unsigned int                maxAnisotropy;
    /**
     * Mipmap filter mode
     */
    enum cudaTextureFilterMode  mipmapFilterMode;
    /**
     * Offset applied to the supplied mipmap level
     */
    float                       mipmapLevelBias;
    /**
     * Lower end of the mipmap level range to clamp access to
     */
    float                       minMipmapLevelClamp;
    /**
     * Upper end of the mipmap level range to clamp access to
     */
    float                       maxMipmapLevelClamp;
 };
 /**
 * An opaque value that represents a CUDA texture object
 *
 * Represented as an unsigned long long; callers should treat the value as a
 * handle and not interpret its bits.
 */
 typedef __device_builtin__ unsigned long long cudaTextureObject_t;
 /** @} */ /* END CUDART_TYPES */
 #endif /* !__TEXTURE_TYPES_H__ */
--- a/include/external/cuda/vector_functions.h
+++ b/include/external/cuda/vector_functions.h
@@ -1,177 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__VECTOR_FUNCTIONS_H__)
 #define __VECTOR_FUNCTIONS_H__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #include "builtin_types.h"
 #include "host_defines.h"
 #include "vector_types.h"
 /* Declaration prefix shared by every make_<type><n>() prototype in this
    header. With __CUDACC_RTC__ defined the functions are declared as plain
    __host__ __device__; otherwise they are additionally `static __inline__`.
    The macro is temporary: it is #undef'd after the prototypes. */
 #if defined(__CUDACC_RTC__)
 #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
 #else /* !__CUDACC_RTC__ */
 #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
 #endif /* __CUDACC_RTC__ */
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* Prototypes of the make_<type><n>() helpers: one per vector type, taking one
    scalar argument per component (x, y, z, w) and returning the vector type by
    value. Only declarations appear here. */

 /* signed / unsigned char vectors */
 __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
 __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
 __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
 __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
 __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
 __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
 __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
 __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y,
                                              unsigned char z, unsigned char w);

 /* signed / unsigned short vectors */
 __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
 __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
 __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
 __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
 __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z);
 __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
 __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
 __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y,
                                                unsigned short z, unsigned short w);

 /* signed / unsigned int vectors */
 __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
 __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
 __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
 __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
 __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
 __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
 __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
 __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y,
                                            unsigned int z, unsigned int w);

 /* signed / unsigned long vectors */
 __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
 __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
 __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
 __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
 __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
 __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y,
                                              unsigned long int z);
 __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
 __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y,
                                              unsigned long int z, unsigned long int w);

 /* float vectors */
 __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
 __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
 __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
 __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);

 /* signed / unsigned long long vectors */
 __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
 __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
 __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
 __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x,
                                                      unsigned long long int y);
 __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y,
                                                    long long int z);
 __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x,
                                                      unsigned long long int y,
                                                      unsigned long long int z);
 __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y,
                                                    long long int z, long long int w);
 __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x,
                                                      unsigned long long int y,
                                                      unsigned long long int z,
                                                      unsigned long long int w);

 /* double vectors */
 __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
 __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
 __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
 __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
 /* The declaration prefix is only needed for the prototypes above; drop it so
    it does not leak into code that includes this header. */
 #undef __VECTOR_FUNCTIONS_DECL__
 /* vector_functions.hpp is included for every build except NVRTC
    (__CUDACC_RTC__); presumably it holds the inline definitions of the
    functions declared above -- confirm against that file. */
 #if !defined(__CUDACC_RTC__)
 #include "vector_functions.hpp"
 #endif /* !__CUDACC_RTC__ */
 #endif /* !__VECTOR_FUNCTIONS_H__ */
--- a/include/external/cuda/vector_functions.hpp
+++ b/include/external/cuda/vector_functions.hpp
@@ -1,318 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__VECTOR_FUNCTIONS_HPP__)
 #define __VECTOR_FUNCTIONS_HPP__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #include "builtin_types.h"
 #include "host_defines.h"
 #include "vector_types.h"
 /* Qualifiers placed in front of every make_* definition in this header:
  * with __CUDACC_RTC__ defined they are plain __host__ __device__ functions,
  * otherwise they are additionally declared static __inline__. */
 #if defined(__CUDACC_RTC__)
 #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
 #else /* !__CUDACC_RTC__ */
 #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
 #endif /* __CUDACC_RTC__ */
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* Wraps x in a char1. */
 __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
 {
   char1 v;
   v.x = x;
   return v;
 }
 /* Wraps x in a uchar1. */
 __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
 {
   uchar1 v;
   v.x = x;
   return v;
 }
 /* Builds a char2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
 {
   char2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a uchar2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
 {
   uchar2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a char3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
 {
   char3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a uchar3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
 {
   uchar3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a char4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
 {
   char4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Builds a uchar4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
 {
   uchar4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Wraps x in a short1. */
 __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
 {
   short1 v;
   v.x = x;
   return v;
 }
 /* Wraps x in a ushort1. */
 __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
 {
   ushort1 v;
   v.x = x;
   return v;
 }
 /* Builds a short2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
 {
   short2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a ushort2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
 {
   ushort2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a short3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z)
 {
   short3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a ushort3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
 {
   ushort3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a short4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
 {
   short4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Builds a ushort4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
 {
   ushort4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Wraps x in an int1. */
 __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
 {
   int1 v;
   v.x = x;
   return v;
 }
 /* Wraps x in a uint1. */
 __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
 {
   uint1 v;
   v.x = x;
   return v;
 }
 /* Builds an int2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
 {
   int2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a uint2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
 {
   uint2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds an int3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
 {
   int3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a uint3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
 {
   uint3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds an int4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
 {
   int4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Builds a uint4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
 {
   uint4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Wraps x in a long1. */
 __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
 {
   long1 v;
   v.x = x;
   return v;
 }
 /* Wraps x in a ulong1. */
 __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
 {
   ulong1 v;
   v.x = x;
   return v;
 }
 /* Builds a long2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
 {
   long2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a ulong2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
 {
   ulong2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a long3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
 {
   long3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a ulong3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
 {
   ulong3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a long4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
 {
   long4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Builds a ulong4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
 {
   ulong4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Wraps x in a float1. */
 __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
 {
   float1 v;
   v.x = x;
   return v;
 }
 /* Builds a float2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
 {
   float2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a float3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
 {
   float3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a float4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
 {
   float4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Wraps x in a longlong1. */
 __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
 {
   longlong1 v;
   v.x = x;
   return v;
 }
 /* Wraps x in a ulonglong1. */
 __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
 {
   ulonglong1 v;
   v.x = x;
   return v;
 }
 /* Builds a longlong2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
 {
   longlong2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a ulonglong2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
 {
   ulonglong2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a longlong3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
 {
   longlong3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a ulonglong3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
 {
   ulonglong3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a longlong4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
 {
   longlong4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Builds a ulonglong4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
 {
   ulonglong4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 /* Wraps x in a double1. */
 __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
 {
   double1 v;
   v.x = x;
   return v;
 }
 /* Builds a double2 from the components x and y. */
 __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
 {
   double2 v;
   v.x = x;
   v.y = y;
   return v;
 }
 /* Builds a double3 from the components x, y and z. */
 __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
 {
   double3 v;
   v.x = x;
   v.y = y;
   v.z = z;
   return v;
 }
 /* Builds a double4 from the components x, y, z and w. */
 __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
 {
   double4 v;
   v.x = x;
   v.y = y;
   v.z = z;
   v.w = w;
   return v;
 }
 #undef __VECTOR_FUNCTIONS_DECL__
 #endif /* !__VECTOR_FUNCTIONS_HPP__ */
--- a/include/external/cuda/vector_types.h
+++ b/include/external/cuda/vector_types.h
@@ -1,431 +0,0 @@
 /*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
 #if !defined(__VECTOR_TYPES_H__)
 #define __VECTOR_TYPES_H__
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 #if !defined(__CUDA_LIBDEVICE__) && !defined(__CUDACC_RTC__)
 #define EXCLUDE_FROM_RTC
 #include "builtin_types.h"
 #undef EXCLUDE_FROM_RTC
 #endif /* !__CUDA_LIBDEVICE__ && !__CUDACC_RTC__ */
 #include "host_defines.h"
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* __cuda_builtin_vector_align8(tag, members) declares `struct tag` containing
  * `members`, in one of two shapes:
  *  - When none of __CUDACC__, __CUDACC_RTC__, __CUDABE__ is defined and the
  *    target is _WIN32 without _WIN64: the members sit in an anonymous struct
  *    inside an anonymous union, next to a struct holding long long int
  *    bit-fields (presumably to obtain 8-byte alignment without __align__ --
  *    TODO confirm). Warnings 4201 and 4408 are disabled from here until the
  *    `#pragma warning(pop)` further down in this header.
  *  - In every other configuration: a plain struct declared with __align__(8).
  */
 #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDABE__) && \
    defined(_WIN32) && !defined(_WIN64)
 #pragma warning(push)
 #pragma warning(disable: 4201 4408)
 #define __cuda_builtin_vector_align8(tag, members) \
 struct __device_builtin__ tag                      \
 {                                                  \
    union                                          \
    {                                              \
        struct { members };                        \
        struct { long long int :1,:0; };           \
    };                                             \
 }
 #else /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
 #define __cuda_builtin_vector_align8(tag, members) \
 struct __device_builtin__ __align__(8) tag         \
 {                                                  \
    members                                        \
 }
 #endif /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
 /* One signed char component. */
 struct __device_builtin__ char1 {
   signed char x;
 };
 /* One unsigned char component. */
 struct __device_builtin__ uchar1 {
   unsigned char x;
 };
 /* Two signed char components, declared with __align__(2). */
 struct __device_builtin__ __align__(2) char2 {
   signed char x;
   signed char y;
 };
 /* Two unsigned char components, declared with __align__(2). */
 struct __device_builtin__ __align__(2) uchar2 {
   unsigned char x;
   unsigned char y;
 };
 /* Three signed char components. */
 struct __device_builtin__ char3 {
   signed char x;
   signed char y;
   signed char z;
 };
 /* Three unsigned char components. */
 struct __device_builtin__ uchar3 {
   unsigned char x;
   unsigned char y;
   unsigned char z;
 };
 /* Four signed char components, declared with __align__(4). */
 struct __device_builtin__ __align__(4) char4 {
   signed char x;
   signed char y;
   signed char z;
   signed char w;
 };
 /* Four unsigned char components, declared with __align__(4). */
 struct __device_builtin__ __align__(4) uchar4 {
   unsigned char x;
   unsigned char y;
   unsigned char z;
   unsigned char w;
 };
 /* One short component. */
 struct __device_builtin__ short1 {
   short x;
 };
 /* One unsigned short component. */
 struct __device_builtin__ ushort1 {
   unsigned short x;
 };
 /* Two short components, declared with __align__(4). */
 struct __device_builtin__ __align__(4) short2 {
   short x;
   short y;
 };
 /* Two unsigned short components, declared with __align__(4). */
 struct __device_builtin__ __align__(4) ushort2 {
   unsigned short x;
   unsigned short y;
 };
 /* Three short components. */
 struct __device_builtin__ short3 {
   short x;
   short y;
   short z;
 };
 /* Three unsigned short components. */
 struct __device_builtin__ ushort3 {
   unsigned short x;
   unsigned short y;
   unsigned short z;
 };
 /* short4 and ushort4 (members x, y, z, w) are declared through
  * __cuda_builtin_vector_align8 rather than as literal struct definitions. */
 __cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
 __cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
 /* One int component. */
 struct __device_builtin__ int1 {
   int x;
 };
 /* One unsigned int component. */
 struct __device_builtin__ uint1 {
   unsigned int x;
 };
 /* int2 and uint2 (members x, y) are declared through
  * __cuda_builtin_vector_align8 rather than as literal struct definitions. */
 __cuda_builtin_vector_align8(int2, int x; int y;);
 __cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
 /* Three int components. */
 struct __device_builtin__ int3 {
   int x;
   int y;
   int z;
 };
 /* Three unsigned int components. */
 struct __device_builtin__ uint3 {
   unsigned int x;
   unsigned int y;
   unsigned int z;
 };
 /* Four int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) int4 {
   int x;
   int y;
   int z;
   int w;
 };
 /* Four unsigned int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) uint4 {
   unsigned int x;
   unsigned int y;
   unsigned int z;
   unsigned int w;
 };
 /* One long int component. */
 struct __device_builtin__ long1 {
   long int x;
 };
 /* One unsigned long component. */
 struct __device_builtin__ ulong1 {
   unsigned long x;
 };
 /* long2 / ulong2 (members x, y): with __CUDACC_RTC__ or _WIN32 defined they
  * are declared through __cuda_builtin_vector_align8; otherwise they are plain
  * structs whose __align__ argument is twice sizeof the component type. */
 #if defined(__CUDACC_RTC__) || defined(_WIN32)
 __cuda_builtin_vector_align8(long2, long int x; long int y;);
 __cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
 #else /* __CUDACC_RTC__ || _WIN32 */
 struct __device_builtin__ __align__(2*sizeof(long int)) long2
 {
    long int x, y;
 };
 struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
 {
    unsigned long int x, y;
 };
 #endif /* __CUDACC_RTC__ || _WIN32 */
 /* Three long int components. */
 struct __device_builtin__ long3 {
   long int x;
   long int y;
   long int z;
 };
 /* Three unsigned long int components. */
 struct __device_builtin__ ulong3 {
   unsigned long int x;
   unsigned long int y;
   unsigned long int z;
 };
 /* Four long int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) long4 {
   long int x;
   long int y;
   long int z;
   long int w;
 };
 /* Four unsigned long int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) ulong4 {
   unsigned long int x;
   unsigned long int y;
   unsigned long int z;
   unsigned long int w;
 };
 /* One float component. */
 struct __device_builtin__ float1 {
   float x;
 };
 /* float2 (members x, y).
  * Special case: when neither __CUDACC__ nor __CUDABE__ is defined, __arm__ and
  * __ARM_PCS_VFP are defined, and the compiler reports GCC 4.6, the struct is
  * spelled out with __attribute__((aligned(8))) and carries an extra
  * zero-length array member. Its name suggests a workaround for a compiler
  * ICE -- NOTE(review): confirm against the toolchain notes. The "-pedantic"
  * diagnostic is ignored around the definition, and the member name is
  * poisoned afterwards so no later code can refer to it.
  * Every other configuration declares float2 through
  * __cuda_builtin_vector_align8. */
 #if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(__arm__) && \
    defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-pedantic"
 struct __device_builtin__ __attribute__((aligned(8))) float2
 {
    float x; float y; float __cuda_gnu_arm_ice_workaround[0];
 };
 #pragma GCC poison __cuda_gnu_arm_ice_workaround
 #pragma GCC diagnostic pop
 #else /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
         __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
 __cuda_builtin_vector_align8(float2, float x; float y;);
 #endif /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
          __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
 /* Three float components. */
 struct __device_builtin__ float3 {
   float x;
   float y;
   float z;
 };
 /* Four float components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) float4 {
   float x;
   float y;
   float z;
   float w;
 };
 /* One long long int component. */
 struct __device_builtin__ longlong1 {
   long long int x;
 };
 /* One unsigned long long int component. */
 struct __device_builtin__ ulonglong1 {
   unsigned long long int x;
 };
 /* Two long long int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) longlong2 {
   long long int x;
   long long int y;
 };
 /* Two unsigned long long int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) ulonglong2 {
   unsigned long long int x;
   unsigned long long int y;
 };
 /* Three long long int components. */
 struct __device_builtin__ longlong3 {
   long long int x;
   long long int y;
   long long int z;
 };
 /* Three unsigned long long int components. */
 struct __device_builtin__ ulonglong3 {
   unsigned long long int x;
   unsigned long long int y;
   unsigned long long int z;
 };
 /* Four long long int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) longlong4 {
   long long int x;
   long long int y;
   long long int z;
   long long int w;
 };
 /* Four unsigned long long int components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) ulonglong4 {
   unsigned long long int x;
   unsigned long long int y;
   unsigned long long int z;
   unsigned long long int w;
 };
 /* One double component. */
 struct __device_builtin__ double1 {
   double x;
 };
 /* Two double components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) double2 {
   double x;
   double y;
 };
 /* Three double components. */
 struct __device_builtin__ double3 {
   double x;
   double y;
   double z;
 };
 /* Four double components, declared with __builtin_align__(16). */
 struct __device_builtin__ __builtin_align__(16) double4 {
   double x;
   double y;
   double z;
   double w;
 };
 /* Restores the warning state saved by the `#pragma warning(push)` issued when
  * __cuda_builtin_vector_align8 was defined.
  * NOTE(review): this guard does not test __CUDACC_RTC__ whereas the guard
  * around the push does, so with __CUDACC_RTC__ defined on 32-bit Windows the
  * pop would run without a matching push -- confirm whether that
  * configuration can occur. */
 #if !defined(__CUDACC__) && !defined(__CUDABE__) && \
    defined(_WIN32) && !defined(_WIN64)
 #pragma warning(pop)
 #endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* Give every vector struct tag a typedef of the same name, so each type can be
  * written without the `struct` keyword (e.g. `char1` instead of
  * `struct char1`). */
 typedef __device_builtin__ struct char1 char1;
 typedef __device_builtin__ struct uchar1 uchar1;
 typedef __device_builtin__ struct char2 char2;
 typedef __device_builtin__ struct uchar2 uchar2;
 typedef __device_builtin__ struct char3 char3;
 typedef __device_builtin__ struct uchar3 uchar3;
 typedef __device_builtin__ struct char4 char4;
 typedef __device_builtin__ struct uchar4 uchar4;
 typedef __device_builtin__ struct short1 short1;
 typedef __device_builtin__ struct ushort1 ushort1;
 typedef __device_builtin__ struct short2 short2;
 typedef __device_builtin__ struct ushort2 ushort2;
 typedef __device_builtin__ struct short3 short3;
 typedef __device_builtin__ struct ushort3 ushort3;
 typedef __device_builtin__ struct short4 short4;
 typedef __device_builtin__ struct ushort4 ushort4;
 typedef __device_builtin__ struct int1 int1;
 typedef __device_builtin__ struct uint1 uint1;
 typedef __device_builtin__ struct int2 int2;
 typedef __device_builtin__ struct uint2 uint2;
 typedef __device_builtin__ struct int3 int3;
 typedef __device_builtin__ struct uint3 uint3;
 typedef __device_builtin__ struct int4 int4;
 typedef __device_builtin__ struct uint4 uint4;
 typedef __device_builtin__ struct long1 long1;
 typedef __device_builtin__ struct ulong1 ulong1;
 typedef __device_builtin__ struct long2 long2;
 typedef __device_builtin__ struct ulong2 ulong2;
 typedef __device_builtin__ struct long3 long3;
 typedef __device_builtin__ struct ulong3 ulong3;
 typedef __device_builtin__ struct long4 long4;
 typedef __device_builtin__ struct ulong4 ulong4;
 typedef __device_builtin__ struct float1 float1;
 typedef __device_builtin__ struct float2 float2;
 typedef __device_builtin__ struct float3 float3;
 typedef __device_builtin__ struct float4 float4;
 typedef __device_builtin__ struct longlong1 longlong1;
 typedef __device_builtin__ struct ulonglong1 ulonglong1;
 typedef __device_builtin__ struct longlong2 longlong2;
 typedef __device_builtin__ struct ulonglong2 ulonglong2;
 typedef __device_builtin__ struct longlong3 longlong3;
 typedef __device_builtin__ struct ulonglong3 ulonglong3;
 typedef __device_builtin__ struct longlong4 longlong4;
 typedef __device_builtin__ struct ulonglong4 ulonglong4;
 typedef __device_builtin__ struct double1 double1;
 typedef __device_builtin__ struct double2 double2;
 typedef __device_builtin__ struct double3 double3;
 typedef __device_builtin__ struct double4 double4;
 /*******************************************************************************
 *                                                                              *
 *                                                                              *
 *                                                                              *
 *******************************************************************************/
 /* Triple of unsigned int members (x, y, z). When compiled as C++ it also gets
  * a constructor whose three arguments each default to 1, a converting
  * constructor from uint3, and a conversion operator back to uint3. */
 struct __device_builtin__ dim3 {
   unsigned int x;
   unsigned int y;
   unsigned int z;
 #if defined(__cplusplus)
   __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1)
     : x(vx), y(vy), z(vz)
   {
   }
   __host__ __device__ dim3(uint3 v)
     : x(v.x), y(v.y), z(v.z)
   {
   }
   __host__ __device__ operator uint3(void)
   {
     uint3 out;
     out.x = x;
     out.y = y;
     out.z = z;
     return out;
   }
 #endif /* __cplusplus */
 };
 typedef __device_builtin__ struct dim3 dim3;
 #undef  __cuda_builtin_vector_align8
 #endif /* !__VECTOR_TYPES_H__ */
--- a/include/isaac/api.h
+++ b/include/isaac/api.h
@@ -0,0 +1,87 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "isaac/runtime/predict.h"
 #include "isaac/driver/backend.h"
 #include "isaac/driver/cublas.h"
 #include "isaac/driver/context.h"
 #include "isaac/driver/kernel.h"
 #include "isaac/driver/buffer.h"
 #include "isaac/driver/stream.h"
 #include "isaac/tools/collections.hpp"
 #include "isaac/templates/conv.h"
 #include "isaac/templates/gemm.h"
 namespace isaac{
 /* Enqueues a GEMM on `stream`, generating and building its kernel on first use.
  *
  * On a cache miss the profile registered in runtime::database for
  * device.architecture() predicts a templates::GEMM generator for the given
  * configuration; its source is dumped under the kernel name "gemm", built into
  * a driver::Module on stream.context(), and the (generator, kernel) pair is
  * stored. The cache key is the tuple
  * (stream, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc).
  * The cached generator then enqueues the kernel with alpha, A, B, beta and C.
  *
  * The cache and the compile callback are function-local statics, so they are
  * created once and shared by every call. The callback takes no arguments, so
  * the arguments of the call in flight are published through `current`
  * immediately before the lookup.
  *
  * NOTE(review): nothing here synchronises access to the statics -- confirm
  * that callers serialise calls. The function is also defined in a header
  * without `inline`; confirm the header is included by one translation unit only.
  */
 void GEMM(driver::Device const & device, driver::Stream & stream,
          DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
          size_t offa, size_t lda, size_t offb, size_t ldb, size_t offc, size_t ldc,
          scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C)
 {
   typedef std::tuple<driver::Stream, DType, IsaacOperation_t, IsaacOperation_t,
                     param_t, param_t, param_t, size_t, size_t, size_t, size_t, size_t, size_t> key_type;
   typedef std::pair<std::shared_ptr<templates::GEMM>, std::shared_ptr<driver::Kernel>> value_type;
   //Pointers to the arguments of the call currently executing.
   //A static lambda capturing by reference would keep referring to the
   //arguments of the very first call, which are gone once that call returns.
   struct call_args
   {
     driver::Device const * device;
     driver::Stream * stream;
     DType * dtype;
     IsaacOperation_t * AT;
     IsaacOperation_t * BT;
     param_t * M;
     param_t * N;
     param_t * K;
     size_t * offa;
     size_t * lda;
     size_t * offb;
     size_t * ldb;
     size_t * offc;
     size_t * ldc;
   };
   static call_args current;
   call_args const now = {&device, &stream, &dtype, &AT, &BT, &M, &N, &K,
                          &offa, &lda, &offb, &ldb, &offc, &ldc};
   current = now;
   //No captures: `current` has static storage duration and is refreshed above on every call
   static std::function<value_type()> compile = [](){
     call_args const & a = current;
     //Fetch profile
     runtime::GEMMProfile* profile = (runtime::GEMMProfile*)runtime::database.at({a.device->architecture(), runtime::GEMM}).get();
     templates::GEMM generator = profile->predict(*a.device, *a.dtype, *a.AT, *a.BT, *a.M, *a.N, *a.K,
                                                  *a.offa, *a.lda, *a.offb, *a.ldb, *a.offc, *a.ldc);
     //Execute
     std::string src = generator.dump(*a.device, "gemm");
     driver::Module module(a.stream->context(), src);
     return value_type(std::make_shared<templates::GEMM>(generator), std::make_shared<driver::Kernel>(module, "gemm"));
   };
   static cpp::CachedMap<key_type, value_type> cache(compile);
   //Retrieve profile/kernel and execute
   value_type const & value = cache.get(key_type(stream, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc));
   value.first->enqueue(*value.second, stream, alpha, A, B, beta, C);
 }
 /* Enqueues a convolution on `stream`, generating and building its kernel on
  * first use.
  *
  * On a cache miss the profile registered in runtime::database for
  * device.architecture() predicts a templates::Conv generator for the given
  * configuration; its source is dumped under the kernel name "fconv", built
  * into a driver::Module on stream.context(), and the (generator, kernel) pair
  * is stored. The cached generator then enqueues the kernel with alpha, I, F,
  * beta and O.
  *
  * The cache key is the tuple (stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h,
  * pad_w, stride_h, stride_w). H and W are part of the key because they are
  * passed to predict() and are not determined by the remaining fields once a
  * stride is larger than 1.
  *
  * The cache and the compile callback are function-local statics, so they are
  * created once and shared by every call. The callback takes no arguments, so
  * the arguments of the call in flight are published through `current`
  * immediately before the lookup.
  *
  * NOTE(review): nothing here synchronises access to the statics -- confirm
  * that callers serialise calls. The function is also defined in a header
  * without `inline`; confirm the header is included by one translation unit only.
  */
 void CONV(driver::Device const & device, driver::Stream & stream,
          DType dtype, param_t N, param_t K, param_t P, param_t Q, param_t C, param_t R, param_t S,
          param_t H, param_t W, param_t pad_h, param_t pad_w, param_t stride_h, param_t stride_w,
          scalar const & alpha, driver::Buffer const & I, driver::Buffer const & F, scalar const & beta, driver::Buffer& O)
 {
   typedef std::tuple<driver::Stream, DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t,
                     param_t, param_t, param_t, param_t, param_t, param_t> key_type;
   typedef std::pair<std::shared_ptr<templates::Conv>, std::shared_ptr<driver::Kernel>> value_type;
   //Pointers to the arguments of the call currently executing.
   //A static lambda capturing by reference would keep referring to the
   //arguments of the very first call, which are gone once that call returns.
   struct call_args
   {
     driver::Device const * device;
     driver::Stream * stream;
     DType * dtype;
     param_t * N;
     param_t * K;
     param_t * P;
     param_t * Q;
     param_t * C;
     param_t * R;
     param_t * S;
     param_t * H;
     param_t * W;
     param_t * pad_h;
     param_t * pad_w;
     param_t * stride_h;
     param_t * stride_w;
   };
   static call_args current;
   call_args const now = {&device, &stream, &dtype, &N, &K, &P, &Q, &C, &R, &S,
                          &H, &W, &pad_h, &pad_w, &stride_h, &stride_w};
   current = now;
   //No captures: `current` has static storage duration and is refreshed above on every call
   static std::function<value_type()> compile = [](){
     call_args const & a = current;
     //Fetch profile
     runtime::ConvProfile* profile = (runtime::ConvProfile*)runtime::database.at({a.device->architecture(), runtime::CONV}).get();
     templates::Conv generator = profile->predict(*a.device, *a.dtype, *a.C, *a.H, *a.W, *a.N, *a.K, *a.P, *a.Q, *a.R, *a.S,
                                                  *a.pad_h, *a.pad_w, *a.stride_h, *a.stride_w);
     //Execute
     std::string src = generator.dump(*a.device, "fconv");
     driver::Module module(a.stream->context(), src);
     return value_type(std::make_shared<templates::Conv>(generator), std::make_shared<driver::Kernel>(module, "fconv"));
   };
   static cpp::CachedMap<key_type, value_type> cache(compile);
   //Retrieve profile/kernel and execute
   value_type const & value = cache.get(key_type(stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w));
   value.first->enqueue(*value.second, stream, alpha, I, F, beta, O);
 }
 }
--- a/include/isaac/array.h
+++ b/include/isaac/array.h
@@ -1,337 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_ARRAY_H_
 #define ISAAC_ARRAY_H_
 #include <iostream>
 #include "isaac/defines.h"
 #include "isaac/driver/backend.h"
 #include "isaac/jit/syntax/expression/expression.h"
 #include "isaac/runtime/handler.h"
 #include "isaac/types.h"
 #include "isaac/tools/cpp/tuple.hpp"
 namespace isaac
 {
 class scalar;
 class view;
 // Common base class of array, view and scalar.
 // Stores the element type, shape, start offset, strides, driver context and
 // device buffer, and declares the assignment, arithmetic and indexing operators
 // shared by the derived classes. The destructor is pure virtual, so array_base
 // itself cannot be instantiated.
 class ISAACAPI array_base
 {
  // Private helper; NOTE(review): presumably the number of elements to allocate -- confirm in the .cpp.
  int_t dsize();
 public:
  //1D Constructors
  explicit array_base(int_t size1, numeric_type dtype = FLOAT_TYPE, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc);
  template<typename DT>
  array_base(std::vector<DT> const & data, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(array_base & v, slice const & s1);
  //2D Constructors
  array_base(int_t size1, int_t size2, numeric_type dtype = FLOAT_TYPE, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(int_t size1, int_t size2, numeric_type dtype, driver::Buffer data, int_t start, int_t ld);
  template<typename DT>
  array_base(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(array_base & M, slice const & s1, slice const & s2);
  //3D Constructors
  array_base(int_t size1, int_t size2, int_t size3, numeric_type dtype = FLOAT_TYPE, driver::Context const & context = driver::backend::contexts::get_default());
  //General constructor
  template<typename DT>
  array_base(tuple const & shape, std::vector<DT> const & data, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(tuple const & shape, numeric_type dtype, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(tuple const & shape, numeric_type dtype, int_t start, tuple const & stride, driver::Context const & context = driver::backend::contexts::get_default());
  array_base(tuple const & shape, numeric_type dtype, int_t start, tuple const & stride, driver::Buffer const & data);
  explicit array_base(runtime::execution_handler const &);
  //Make the class virtual
  virtual ~array_base() = 0;
  //Getters
  numeric_type dtype() const;
  tuple const & shape() const;
  size_t dim() const;
  int_t start() const;
  tuple const & stride() const;
  driver::Context const & context() const;
  driver::Buffer const & data() const;
  driver::Buffer & data();
  //Setters
  array_base& resize(int_t size1, int_t size2=1);
  //Numeric operators
  // Assignment from another array, an expression tree, an execution handler,
  // a host vector or a host scalar value.
  array_base& operator=(array_base const &);
  array_base& operator=(expression_tree const &);
  array_base& operator=(runtime::execution_handler const &);
  template<class T>
  array_base & operator=(std::vector<T> const & rhs);
  array_base & operator=(value_scalar const & rhs);
  // Unary operators return an expression tree rather than a new array.
  expression_tree operator-();
  expression_tree operator!();
  // Compound assignment with a scalar value, an array or an expression tree.
  array_base& operator+=(value_scalar const &);
  array_base& operator+=(array_base const &);
  array_base& operator+=(expression_tree const &);
  array_base& operator-=(value_scalar const &);
  array_base& operator-=(array_base const &);
  array_base& operator-=(expression_tree const &);
  array_base& operator*=(value_scalar const &);
  array_base& operator*=(array_base const &);
  array_base& operator*=(expression_tree const &);
  array_base& operator/=(value_scalar const &);
  array_base& operator/=(array_base const &);
  array_base& operator/=(expression_tree const &);
  //Indexing (1D)
  const scalar operator[](int_t) const;
  scalar operator[](int_t);
  view operator[](slice const &);
  //Indexing (2D)
  view operator()(int_t, int_t);
  view operator()(slice const &, int_t);
  view operator()(int_t, slice const &);
  view operator()(slice const &, slice const &);
  const view operator()(int_t, int_t) const;
  const view operator()(slice const &, int_t) const;
  const view operator()(int_t, slice const &) const;
  const view operator()(slice const &, slice const &) const;
 protected:
  // State exposed through the getters above.
  numeric_type dtype_;
  tuple shape_;
  int_t start_;
  tuple stride_;
  driver::Context context_;
  driver::Buffer data_;
 public:
  // NOTE(review): presumably the transposed view of this array as an expression -- confirm in the .cpp.
  const expression_tree T;
 };
 // Concrete array type. Inherits every array_base constructor and assignment
 // operator, and adds construction from another array_base, another array or an
 // expression tree.
 class ISAACAPI array : public array_base
 {
 public:
  // Re-export all array_base constructors.
  using array_base::array_base;
  //Copy Constructor
  array(array_base const &);
  array(array const &);
  // Construction from an expression tree.
  array(expression_tree const & proxy);
  // Re-export the array_base assignment operators.
  using array_base::operator=;
 };
 // array_base subclass constructed from an existing array_base (optionally
 // restricted by one or two slices) or from an existing buffer with a start
 // offset and increment.
 // NOTE(review): presumably a non-owning window onto the source data -- confirm in the .cpp.
 class ISAACAPI view : public array_base
 {
 public:
  view(array_base & data);
  view(array_base& data, slice const & s1);
  view(array_base& data, slice const & s1, slice const & s2);
  view(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc);
  // Re-export the array_base assignment operators.
  using array_base::operator=;
 };
 // array_base subclass for a single value. It can be built from a buffer and
 // offset, from a host value_scalar, from a dtype alone or from an expression
 // tree, and converts implicitly to the built-in arithmetic types listed below.
 class ISAACAPI scalar : public array_base
 {
  // The value_scalar converting constructors need access to inject()/cast().
  friend value_scalar::value_scalar(const scalar &);
  friend value_scalar::value_scalar(const expression_tree &);
 private:
  void inject(values_holder&) const;
  template<class T> T cast() const;
 public:
  explicit scalar(numeric_type dtype, const driver::Buffer &data, int_t offset);
  explicit scalar(value_scalar value, driver::Context const & context = driver::backend::contexts::get_default());
  explicit scalar(numeric_type dtype, driver::Context const & context = driver::backend::contexts::get_default());
  scalar(expression_tree const & proxy);
  scalar& operator=(value_scalar const &);
 //  scalar& operator=(scalar const & s);
  using array_base::operator =;
 // Declares one conversion operator per built-in arithmetic type.
 #define INSTANTIATE(type) operator type() const;
  INSTANTIATE(char)
  INSTANTIATE(unsigned char)
  INSTANTIATE(short)
  INSTANTIATE(unsigned short)
  INSTANTIATE(int)
  INSTANTIATE(unsigned int)
  INSTANTIATE(long)
  INSTANTIATE(unsigned long)
  INSTANTIATE(long long)
  INSTANTIATE(unsigned long long)
  INSTANTIATE(float)
  INSTANTIATE(double)
 #undef INSTANTIATE
 };
 //copy
 // Overloads transferring data between host memory (raw pointer or std::vector)
 // and an array_base. Each direction exists with an explicit command queue and
 // without one; `blocking` defaults to true in every overload.
 // NOTE(review): the queue-less overloads presumably use a default queue -- confirm in the .cpp.
 ISAACAPI void copy(void const * data, array_base & gx, driver::CommandQueue & queue, bool blocking = true);
 ISAACAPI void copy(array_base const & gx, void* data, driver::CommandQueue & queue, bool blocking = true);
 ISAACAPI void copy(void const *data, array_base &gx, bool blocking = true);
 ISAACAPI void copy(array_base const & gx, void* data, bool blocking = true);
 template<class T> ISAACAPI void copy(std::vector<T> const & cA, array_base& gA, driver::CommandQueue & queue, bool blocking = true);
 template<class T> ISAACAPI void copy(array_base const & gA, std::vector<T> & cA, driver::CommandQueue & queue, bool blocking = true);
 template<class T> ISAACAPI void copy(std::vector<T> const & cA, array_base & gA, bool blocking = true);
 template<class T> ISAACAPI void copy(array_base const & gA, std::vector<T> & cA, bool blocking = true);
 //Operators
 //Binary operators
 // Declares the eight overloads of the binary operation OPNAME: every
 // combination of array_base, expression_tree and value_scalar operands except
 // (value_scalar, value_scalar). Each overload returns an expression_tree.
 // The last line of the macro body deliberately has no line continuation, so the
 // definition ends there regardless of what follows it.
 #define ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(OPNAME) \
 ISAACAPI expression_tree OPNAME (array_base const & x, expression_tree const & y);\
 ISAACAPI expression_tree OPNAME (array_base const & x, value_scalar const & y);\
 ISAACAPI expression_tree OPNAME (array_base const & x, array_base const & y);\
 \
 ISAACAPI expression_tree OPNAME (expression_tree const & x, expression_tree const & y);\
 ISAACAPI expression_tree OPNAME (expression_tree const & x, value_scalar const & y);\
 ISAACAPI expression_tree OPNAME (expression_tree const & x, array_base const & y);\
 \
 ISAACAPI expression_tree OPNAME (value_scalar const & y, expression_tree const & x);\
 ISAACAPI expression_tree OPNAME (value_scalar const & y, array_base const & x);
 // Arithmetic operators
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator +)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator -)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator *)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator /)
 // Comparison operators
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator >)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator >=)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator <)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator <=)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator ==)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator !=)
 // Named binary functions
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(maximum)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(minimum)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(pow)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(dot)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(outer)
 ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(assign)
 #undef ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR
 #define ISAAC_DECLARE_ROT(LTYPE, RTYPE, CTYPE, STYPE) \
  expression_tree rot(LTYPE const & x, RTYPE const & y, CTYPE const & c, STYPE const & s);
 ISAAC_DECLARE_ROT(array_base, array_base, scalar, scalar)
 ISAAC_DECLARE_ROT(expression_tree, array_base, scalar, scalar)
 ISAAC_DECLARE_ROT(array_base, expression_tree, scalar, scalar)
 ISAAC_DECLARE_ROT(expression_tree, expression_tree, scalar, scalar)
 ISAAC_DECLARE_ROT(array_base, array_base, value_scalar, value_scalar)
 ISAAC_DECLARE_ROT(expression_tree, array_base, value_scalar, value_scalar)
 ISAAC_DECLARE_ROT(array_base, expression_tree, value_scalar, value_scalar)
 ISAAC_DECLARE_ROT(expression_tree, expression_tree, value_scalar, value_scalar)
 ISAAC_DECLARE_ROT(array_base, array_base, expression_tree, expression_tree)
 ISAAC_DECLARE_ROT(expression_tree, array_base, expression_tree, expression_tree)
 ISAAC_DECLARE_ROT(array_base, expression_tree, expression_tree, expression_tree)
 ISAAC_DECLARE_ROT(expression_tree, expression_tree, expression_tree, expression_tree)
 //--------------------------------
 //Unary operators
 // Declares OPNAME for an array_base operand and for an expression_tree operand;
 // both overloads return an expression_tree.
 #define ISAAC_DECLARE_UNARY_OPERATOR(OPNAME) \
  ISAACAPI expression_tree OPNAME (array_base const & x);\
  ISAACAPI expression_tree OPNAME (expression_tree const & x);
 ISAAC_DECLARE_UNARY_OPERATOR(abs)
 ISAAC_DECLARE_UNARY_OPERATOR(acos)
 ISAAC_DECLARE_UNARY_OPERATOR(asin)
 ISAAC_DECLARE_UNARY_OPERATOR(atan)
 ISAAC_DECLARE_UNARY_OPERATOR(ceil)
 ISAAC_DECLARE_UNARY_OPERATOR(cos)
 ISAAC_DECLARE_UNARY_OPERATOR(cosh)
 ISAAC_DECLARE_UNARY_OPERATOR(exp)
 ISAAC_DECLARE_UNARY_OPERATOR(floor)
 ISAAC_DECLARE_UNARY_OPERATOR(log)
 ISAAC_DECLARE_UNARY_OPERATOR(log10)
 ISAAC_DECLARE_UNARY_OPERATOR(sin)
 ISAAC_DECLARE_UNARY_OPERATOR(sinh)
 ISAAC_DECLARE_UNARY_OPERATOR(sqrt)
 ISAAC_DECLARE_UNARY_OPERATOR(tan)
 ISAAC_DECLARE_UNARY_OPERATOR(tanh)
 ISAAC_DECLARE_UNARY_OPERATOR(trans)
 #undef ISAAC_DECLARE_UNARY_OPERATOR
 // Conversion of the operand to element type `dtype`, returned as an expression tree.
 ISAACAPI expression_tree cast(array_base const &, numeric_type dtype);
 ISAACAPI expression_tree cast(expression_tree const &, numeric_type dtype);
 //Matrix reduction
 // Declares the reduction OPNAME for an array_base and for an expression_tree
 // operand; `axis` defaults to -1.
 // NOTE(review): -1 presumably means "reduce over all axes" -- confirm in the .cpp.
 // max and min are wrapped in parentheses so that function-like macros of the
 // same name (e.g. from <windows.h>) are not expanded.
 // The helper macro is undefined afterwards, like the other declaration helpers
 // of this header, so it does not leak to includers.
 #define ISAAC_DECLARE_REDUCTION(OPNAME) \
 ISAACAPI expression_tree OPNAME(array_base const & M, int_t axis = -1);\
 ISAACAPI expression_tree OPNAME(expression_tree const & M, int_t axis = -1);
 ISAAC_DECLARE_REDUCTION(sum)
 ISAAC_DECLARE_REDUCTION(argmax)
 ISAAC_DECLARE_REDUCTION((max))
 ISAAC_DECLARE_REDUCTION((min))
 ISAAC_DECLARE_REDUCTION(argmin)
 #undef ISAAC_DECLARE_REDUCTION
 //Shortcuts
 // norm defaults to order 2; norm and mean default to axis -1.
 ISAACAPI expression_tree norm(array_base const &, unsigned int order = 2, int_t axis = -1);
 ISAACAPI expression_tree norm(expression_tree const &, unsigned int order = 2, int_t axis = -1);
 ISAACAPI expression_tree mean(array_base const &, int_t axis = -1);
 ISAACAPI expression_tree mean(expression_tree const &, int_t axis = -1);
 //ISAACAPI expression_tree var(array_base const &, int_t axis = -1);
 //ISAACAPI expression_tree var(expression_tree const &, int_t axis = -1);
 //Fusion
 ISAACAPI expression_tree fuse(expression_tree const & x, expression_tree const & y);
 //Initializers
 ISAACAPI expression_tree eye(int_t, int_t, isaac::numeric_type, driver::Context const & context = driver::backend::contexts::get_default());
 ISAACAPI expression_tree zeros(tuple const & shape, numeric_type dtype, driver::Context const & context = driver::backend::contexts::get_default());
 //Swap
 ISAACAPI void swap(view x, view y);
 //Reshape
 ISAACAPI expression_tree reshape(array_base const &, tuple const &);
 ISAACAPI expression_tree reshape(expression_tree const &, tuple const &);
 ISAACAPI expression_tree ravel(array_base const &);
 ISAACAPI expression_tree ravel(expression_tree const & x);
 //Diag
 // Exported like every other free function of this header, so that it is also
 // visible to users of the shared library on Windows.
 ISAACAPI array diag(array_base & x, int offset = 0);
 //Stream output
 ISAACAPI std::ostream& operator<<(std::ostream &, array_base const &);
 ISAACAPI std::ostream& operator<<(std::ostream &, expression_tree const &);
 }
 #endif
--- a/include/isaac/common/expression_type.h
+++ b/include/isaac/common/expression_type.h
@@ -1,63 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_COMMON_EXPRESSION_TYPE_H
 #define ISAAC_COMMON_EXPRESSION_TYPE_H
 #include <string>
 #include <stdexcept>
 namespace isaac
 {
 // Kinds of expression handled by the library. The enumerator order fixes the
 // numeric values, starting at 0 for INVALID_EXPRESSION_TYPE.
 enum expression_type
 {
  INVALID_EXPRESSION_TYPE,
  ELEMENTWISE_1D,
  ELEMENTWISE_2D,
  REDUCE_1D,
  REDUCE_2D_ROWS,
  REDUCE_2D_COLS,
  GEMM_NN,
  GEMM_TN,
  GEMM_NT,
  GEMM_TT
 };
 // Converts the lower-case textual name of an expression kind into its
 // enumerator. Throws std::invalid_argument when the name is not recognized.
 inline expression_type expression_type_from_string(std::string const & name)
 {
  struct named_kind { char const * label; expression_type kind; };
  named_kind const known[] = {
    {"elementwise_1d", ELEMENTWISE_1D},
    {"elementwise_2d", ELEMENTWISE_2D},
    {"reduce_1d",      REDUCE_1D},
    {"reduce_2d_rows", REDUCE_2D_ROWS},
    {"reduce_2d_cols", REDUCE_2D_COLS},
    {"gemm_nn",        GEMM_NN},
    {"gemm_nt",        GEMM_NT},
    {"gemm_tn",        GEMM_TN},
    {"gemm_tt",        GEMM_TT}
  };
  for(named_kind const & candidate : known)
  {
    if(name == candidate.label)
      return candidate.kind;
  }
  throw std::invalid_argument("Unrecognized expression: " + name);
 }
 }
 #endif
--- a/include/isaac/common/numeric_type.h
+++ b/include/isaac/common/numeric_type.h
@@ -1,144 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_COMMON_NUMERIC_TYPE_H
 #define ISAAC_COMMON_NUMERIC_TYPE_H
 #include <stdexcept>
 #include "isaac/exception/api.h"
 namespace isaac
 {
 // Placeholder for a half-precision type: an intentionally empty class that
 // exists only so that code naming `half` compiles.
 class half
 {
 public:
  half() {}
 };
 // Element types known to the library. INVALID_NUMERIC_TYPE is pinned to 0 and
 // the remaining enumerators follow in declaration order.
 enum numeric_type
 {
  INVALID_NUMERIC_TYPE = 0,
 //  BOOL_TYPE,
  // Integer types
  CHAR_TYPE,
  UCHAR_TYPE,
  SHORT_TYPE,
  USHORT_TYPE,
  INT_TYPE,
  UINT_TYPE,
  LONG_TYPE,
  ULONG_TYPE,
  // Floating-point types
  HALF_TYPE,
  FLOAT_TYPE,
  DOUBLE_TYPE
 };
 // Returns the textual name of a numeric type ("char", "uint", "float", ...).
 // Throws unknown_datatype for any value without a name, including
 // INVALID_NUMERIC_TYPE.
 inline std::string to_string(numeric_type const & type)
 {
  // Integer types
  if(type == CHAR_TYPE)   return "char";
  if(type == UCHAR_TYPE)  return "uchar";
  if(type == SHORT_TYPE)  return "short";
  if(type == USHORT_TYPE) return "ushort";
  if(type == INT_TYPE)    return "int";
  if(type == UINT_TYPE)   return "uint";
  if(type == LONG_TYPE)   return "long";
  if(type == ULONG_TYPE)  return "ulong";
  // Floating-point types
  if(type == HALF_TYPE)   return "half";
  if(type == FLOAT_TYPE)  return "float";
  if(type == DOUBLE_TYPE) return "double";
  throw unknown_datatype(type);
 }
 // Parses a floating-point type name ("float16", "float32" or "float64") into
 // the matching numeric_type. Throws std::invalid_argument for any other name.
 inline numeric_type numeric_type_from_string(std::string const & name)
 {
  struct named_type { char const * label; numeric_type type; };
  named_type const known[] = {
    {"float16", HALF_TYPE},
    {"float32", FLOAT_TYPE},
    {"float64", DOUBLE_TYPE}
  };
  for(named_type const & candidate : known)
  {
    if(name == candidate.label)
      return candidate.type;
  }
  throw std::invalid_argument("Invalid datatype: " + name);
 }
 // Returns the size in bytes of one element of the given numeric type.
 // Throws unknown_datatype for any value not listed below.
 inline unsigned int size_of(numeric_type type)
 {
  if(type == CHAR_TYPE || type == UCHAR_TYPE)
    return 1;
  if(type == SHORT_TYPE || type == USHORT_TYPE || type == HALF_TYPE)
    return 2;
  if(type == INT_TYPE || type == UINT_TYPE || type == FLOAT_TYPE)
    return 4;
  if(type == LONG_TYPE || type == ULONG_TYPE || type == DOUBLE_TYPE)
    return 8;
  throw unknown_datatype(type);
 }
 // Compile-time mapping from C++ types to numeric_type.
 // NOTE(review): this section uses size_t, std::is_unsigned, std::enable_if and
 // std::is_arithmetic, but the header does not include <cstddef> or <type_traits>
 // directly; it appears to rely on a transitive include -- confirm.
 //
 // Maps (size in bytes, signedness) to an integer numeric_type. Only the
 // specializations generated below exist; the primary template is left undefined.
 template<size_t size, bool is_unsigned>
 struct to_int_numeric_type_impl;
 #define ISAAC_INSTANTIATE_INT_TYPE_IMPL(SIZE, IS_UNSIGNED, TYPE) \
    template<> struct to_int_numeric_type_impl<SIZE, IS_UNSIGNED> { static const numeric_type value = TYPE; }
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(1, false, CHAR_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(2, false, SHORT_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(4, false, INT_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(8, false, LONG_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(1, true, UCHAR_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(2, true, USHORT_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(4, true, UINT_TYPE);
 ISAAC_INSTANTIATE_INT_TYPE_IMPL(8, true, ULONG_TYPE);
 #undef ISAAC_INSTANTIATE_INT_TYPE_IMPL
 // Selects the integer numeric_type of T from sizeof(T) and its signedness.
 template<class T>
 struct to_int_numeric_type
 {
    static const numeric_type value = to_int_numeric_type_impl<sizeof(T), std::is_unsigned<T>::value>::value;
 };
 // Primary template: types without an explicit specialization below are
 // classified by size and signedness through to_int_numeric_type.
 template<class T> struct to_numeric_type { static const numeric_type value = to_int_numeric_type<T>::value; };
 // Explicit specializations; these map by type name, independently of sizeof.
 template<> struct to_numeric_type<char> { static const numeric_type value = CHAR_TYPE; };
 template<> struct to_numeric_type<unsigned char> { static const numeric_type value = UCHAR_TYPE ; };
 template<> struct to_numeric_type<short> { static const numeric_type value = SHORT_TYPE ; };
 template<> struct to_numeric_type<unsigned short> { static const numeric_type value = USHORT_TYPE ; };
 template<> struct to_numeric_type<int> { static const numeric_type value = INT_TYPE ; };
 template<> struct to_numeric_type<unsigned int> { static const numeric_type value = UINT_TYPE ; };
 template<> struct to_numeric_type<long> { static const numeric_type value = LONG_TYPE ; };
 template<> struct to_numeric_type<unsigned long> { static const numeric_type value = ULONG_TYPE ; };
 template<> struct to_numeric_type<half> { static const numeric_type value = HALF_TYPE; };
 template<> struct to_numeric_type<float> { static const numeric_type value = FLOAT_TYPE; };
 template<> struct to_numeric_type<double> { static const numeric_type value = DOUBLE_TYPE; };
 // numeric_type_of(x): for arithmetic types the result comes from to_numeric_type;
 // for any other type it is whatever x.dtype() returns.
 template<class T> typename std::enable_if<std::is_arithmetic<T>::value, numeric_type>::type numeric_type_of(T) { return to_numeric_type<T>::value; }
 template<class T> typename std::enable_if<!std::is_arithmetic<T>::value, numeric_type>::type numeric_type_of(T const & x) { return x.dtype(); }
 }
 #endif
--- a/include/isaac/defines.h
+++ b/include/isaac/defines.h
@@ -1,49 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_DEFINES_H
 #define ISAAC_DEFINES_H
 // ISAACAPI marks symbols that belong to the public interface of the library:
 //  - outside Windows/MSVC it expands to default symbol visibility;
 //  - on Windows/MSVC it expands to dllexport when ISAAC_DLL is defined and to
 //    dllimport otherwise.
 #if !defined(_WIN32) && !defined(_MSC_VER)
    #define ISAACAPI   __attribute__((visibility("default")))
 #elif defined(ISAAC_DLL)
    #define ISAACAPI  __declspec(dllexport)
 #else
    #define ISAACAPI  __declspec(dllimport)
 #endif
 // Helpers to silence, and afterwards re-enable, two MSVC warnings around
 // declarations of exported classes:
 //   C4251 - a class member needs a dll-interface
 //   C4275 - a non dll-interface class is used as base of a dll-interface class
 // Each RESTORE_* macro resets its warning to the default state. On other
 // compilers all four macros expand to nothing.
 #if defined(_WIN32) || defined(_MSC_VER)
    #define DISABLE_MSVC_WARNING_C4251 __pragma(warning(disable: 4251))
    #define RESTORE_MSVC_WARNING_C4251 __pragma(warning(default: 4251))
    #define DISABLE_MSVC_WARNING_C4275 __pragma(warning(disable: 4275))
    #define RESTORE_MSVC_WARNING_C4275 __pragma(warning(default: 4275))
 #else
    #define DISABLE_MSVC_WARNING_C4251
    #define RESTORE_MSVC_WARNING_C4251
    #define DISABLE_MSVC_WARNING_C4275
    #define RESTORE_MSVC_WARNING_C4275
 #endif
 #endif
--- a/include/isaac/driver/backend.h
+++ b/include/isaac/driver/backend.h
@@ -27,12 +27,6 @@
 #include <list>
 #include <vector>
 #include "isaac/common/expression_type.h"
 #include "isaac/common/numeric_type.h"
 #include "isaac/driver/dispatch.h"
 #include "isaac/defines.h"
 #include "isaac/types.h"
 namespace isaac
 {
@@ -40,93 +34,78 @@ namespace driver
 {
 class Buffer;
-class CommandQueue;
+class Stream;
 class Context;
 class Platform;
-class Program;
+class Module;
 class Kernel;
 class ProgramCache;
-class ISAACAPI backend
+struct backend
 {
-public:
+
-  class ISAACAPI workspaces
+  class modules
  {
    friend class backend;
  public:
-      static const size_t SIZE = 8000000; //8MB of temporary workspace per queue
+    static void release();
-      static void release();
+    static Module& get(Stream const & stream, std::string const & name, std::string const &src);
      static driver::Buffer & get(CommandQueue const & key);
  private:
-      DISABLE_MSVC_WARNING_C4251
+    static std::map<std::tuple<Stream, std::string>, Module * > cache_;
      static std::map<CommandQueue, Buffer * > cache_;
      RESTORE_MSVC_WARNING_C4251
  };
-  class ISAACAPI programs
+  class kernels
  {
-      friend class backend;
+    friend class backend;
  public:
-      static void release();
+    static void release();
-      static ProgramCache & get(CommandQueue const & queue, expression_type expression, numeric_type dtype);
+    static Kernel & get(Module const & program, std::string const & name);
  private:
-DISABLE_MSVC_WARNING_C4251
+    static std::map<std::tuple<Module, std::string>, Kernel * > cache_;
      static std::map<std::tuple<CommandQueue, expression_type, numeric_type>, ProgramCache * > cache_;
 RESTORE_MSVC_WARNING_C4251
  };
-  class ISAACAPI kernels
+  class contexts
  {
-      friend class backend;
+    friend class backend;
  public:
      static void release();
      static Kernel & get(Program const & program, std::string const & name);
  private:
-DISABLE_MSVC_WARNING_C4251
+    static void init(std::vector<Platform> const &);
-      static std::map<std::tuple<Program, std::string>, Kernel * > cache_;
+    static void release();
-RESTORE_MSVC_WARNING_C4251
+  public:
    static Context const & get_default();
    template<class T>
    static Context const & import(T context)
    {
      for(driver::Context const * x: cache_)
        if((T)*x==context)
          return *x;
      cache_.emplace_back(new Context(context, false));
      return *cache_.back();
    }
    static void get(std::list<Context const *> &);
  private:
    static std::list<Context const *> cache_;
  };
-  class ISAACAPI contexts
+  class streams
  {
-      friend class backend;
+    friend class backend;
  private:
-      static void init(std::vector<Platform> const &);
+    static void init(std::list<Context const *> const &);
-      static void release();
+    static void release();
  public:
-      static Context const & get_default();
+    static void get(Context const &, std::vector<Stream *> &streams);
-      static Context const & import(CUcontext context);
+    static Stream & get(Context const &, unsigned int id = 0);
-      static Context const & import(cl_context context);
+    static Stream & get_default();
      static void get(std::list<Context const *> &);
  private:
-DISABLE_MSVC_WARNING_C4251
+    static std::map< Context, std::vector<Stream*> > cache_;
      static std::list<Context const *> cache_;
 RESTORE_MSVC_WARNING_C4251
  };
  class ISAACAPI queues
  {
      friend class backend;
  private:
      static void init(std::list<Context const *> const &);
      static void release();
  public:
      static void get(Context const &, std::vector<CommandQueue *> &queues);
      static CommandQueue & get(Context const &, unsigned int id = 0);
  private:
 DISABLE_MSVC_WARNING_C4251
      static std::map< Context, std::vector<CommandQueue*> > cache_;
 RESTORE_MSVC_WARNING_C4251
  };
  static void init();
  static void release();
-  static void platforms(std::vector<Platform> &);
+  static std::vector<Platform> platforms();
  static void synchronize(Context const &);
 public:
  static unsigned int default_device;
  static cl_command_queue_properties default_queue_properties;
 };
 }
--- a/include/isaac/driver/buffer.h
+++ b/include/isaac/driver/buffer.h
@@ -23,61 +23,30 @@
 #ifndef ISAAC_DRIVER_BUFFER_H
 #define ISAAC_DRIVER_BUFFER_H
 #include "isaac/types.h"
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/context.h"
 #include "isaac/driver/handle.h"
-#include "isaac/driver/dispatch.h"
+
 namespace isaac
 {
 namespace driver
 {
 class Stream;
 // Buffer
-class ISAACAPI Buffer: public has_handle_comparators<Buffer>
+class Buffer: public Handle<CUdeviceptr>
 {
-public:
+  typedef Handle<CUdeviceptr> base_type;
  typedef Handle<cl_mem, CUdeviceptr> handle_type;
 private:
  friend class CommandQueue;
  friend class Kernel;
  //Wrapper to get CUDA context from Memory
  static CUcontext context(CUdeviceptr h)
  {
      CUcontext res;
      check(dispatch::cuPointerGetAttribute((void*)&res, CU_POINTER_ATTRIBUTE_CONTEXT, h));
      return res;
  }
 public:
-  //Constructors
+  using base_type::base_type;
  Buffer(CUdeviceptr h = 0, bool take_ownership = true);
  Buffer(cl_mem Buffer = 0, bool take_ownership = true);
  Buffer(Context const & context, size_t size);
-  //Accessors
+  void set_zero(Stream const & queue);
-  handle_type&  handle();
+
  handle_type const &  handle() const;
  Context const & context() const;
 private:
-  backend_type backend_;
+  size_t size_;
  Context context_;
  handle_type h_;
 };
 // Wraps an existing native handle in a Buffer: the OpenCL handle `clh` when
 // `backend` is OPENCL, the CUDA handle `cuh` for any other backend.
 // `take_ownership` is forwarded unchanged to the Buffer constructor.
 inline Buffer make_buffer(backend_type backend, cl_mem clh = 0, CUdeviceptr cuh = 0, bool take_ownership = true)
 {
  bool const use_opencl = (backend==OPENCL);
  return use_opencl ? Buffer(clh, take_ownership)
                    : Buffer(cuh, take_ownership);
 }
 }
 }
 #endif
--- a/include/isaac/driver/context.h
+++ b/include/isaac/driver/context.h
@@ -23,10 +23,6 @@
 #ifndef ISAAC_DRIVER_CONTEXT_H
 #define ISAAC_DRIVER_CONTEXT_H
 #include <map>
 #include <memory>
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/device.h"
 #include "isaac/driver/handle.h"
@@ -36,42 +32,25 @@ namespace isaac
 namespace driver
 {
-class ISAACAPI Context: public has_handle_comparators<Context>
+class Context: public Handle<CUcontext>
 {
-  friend class Program;
+  typedef Handle<CUcontext> base_type;
  friend class CommandQueue;
  friend class Buffer;
 public:
  typedef Handle<cl_context, CUcontext> handle_type;
 private:
-  static std::string cache_path();
+  static std::string get_cache_path();
-
+  static CUdevice device(CUcontext);
  static CUdevice device(CUcontext)
  {
      CUdevice res;
      dispatch::cuCtxGetDevice(&res);
      return res;
  }
 public:
  //Constructors
  explicit Context(CUcontext const & context, bool take_ownership = true);
  explicit Context(cl_context const & context, bool take_ownership = true);
  explicit Context(Device const & device);
  //Accessors
  backend_type backend() const;
  Device const & device() const;
-  handle_type const & handle() const;
+  std::string const & cache_path() const;
 private:
 DISABLE_MSVC_WARNING_C4251
  backend_type backend_;
  Device device_;
  std::string cache_path_;
  handle_type h_;
 RESTORE_MSVC_WARNING_C4251
 };
 }
--- a/include/isaac/driver/cublas.h
+++ b/include/isaac/driver/cublas.h
@@ -0,0 +1,114 @@
 /* Copyright 2015-2017 Philippe Tillet
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_DRIVER_CUBLAS_H
 #define ISAAC_DRIVER_CUBLAS_H
 #include <stdexcept>
 #include "isaac/templates/common.hpp"
 #include "isaac/driver/dispatch.h"
 #include "isaac/driver/buffer.h"
 #include "isaac/driver/stream.h"
 #include "isaac/driver/backend.h"
 namespace isaac
 {
 namespace driver
 {
 template<typename... Args> void cublasGemm_impl(half, Args... args){ driver::dispatch::cublasHgemm(args...); }
 template<typename... Args> void cublasGemm_impl(float, Args... args){ driver::dispatch::cublasSgemm_v2(args...); }
 template<typename... Args> void cublasGemm_impl(double, Args... args){ driver::dispatch::cublasDgemm_v2(args...); }
 // Typed GEMM launcher. Fetches the cuBLAS handle for `ctx`, attaches `queue`
 // to it, then calls the cublasGemm_impl overload selected by cuType with
 // alpha, beta and the three buffers cast to cuType pointers.
 // AT / BT: 'N' maps to CUBLAS_OP_N; any other character maps to CUBLAS_OP_T.
 template<class cuType>
 inline void cublasGemm_dispatch(Context const & ctx, Stream& queue, char AT, char BT, int32_t M, int32_t N, int32_t K, void* alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, void* beta, Buffer& C, int32_t ldc){
  cublasHandle_t blas = dispatch::cublasHandle(ctx);
  dispatch::cublasSetStream_v2(blas, (CUstream)queue);
  // Translate the transpose flags up front instead of through a helper lambda.
  const cublasOperation_t opA = (AT=='N') ? CUBLAS_OP_N : CUBLAS_OP_T;
  const cublasOperation_t opB = (BT=='N') ? CUBLAS_OP_N : CUBLAS_OP_T;
  // Buffers convert to raw device pointers before being cast to cuType*.
  CUdeviceptr ptrA = A;
  CUdeviceptr ptrB = B;
  CUdeviceptr ptrC = C;
  cublasGemm_impl(cuType(), blas, opA, opB, M, N, K, (cuType*)alpha, (const cuType*)ptrA, lda, (const cuType*)ptrB, ldb, (cuType*)beta, (cuType*)ptrC, ldc);
 }
 // Runtime-typed GEMM entry point: forwards to cublasGemm_dispatch<T> with T
 // chosen from `dtype` (HALF_TYPE -> half, FLOAT_TYPE -> float,
 // DOUBLE_TYPE -> double). alpha.data() and beta.data() are passed on as
 // untyped pointers and cast to T* by the dispatcher.
 // Throws std::invalid_argument when `dtype` is none of the three values above.
 inline void cublasGemm(DType dtype, Context const & ctx,  Stream& queue, char AT, char BT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc){
  switch(dtype){
    case HALF_TYPE: return cublasGemm_dispatch<half>(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
    case FLOAT_TYPE: return cublasGemm_dispatch<float>(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
    case DOUBLE_TYPE: return cublasGemm_dispatch<double>(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
    // A bare `throw;` with no exception being handled calls std::terminate(),
    // so raise a real, catchable exception instead.
    default: throw std::invalid_argument("cublasGemm: unsupported data type");
  }
 }
 // Maps a DType to the corresponding cudnnDataType_t constant
 // (HALF_TYPE, FLOAT_TYPE, DOUBLE_TYPE -> CUDNN_DATA_HALF / _FLOAT / _DOUBLE).
 // Throws std::invalid_argument for any other DType value.
 inline cudnnDataType_t cudnnDtype(DType dtype){
  switch(dtype){
    case HALF_TYPE: return CUDNN_DATA_HALF;
    case FLOAT_TYPE: return CUDNN_DATA_FLOAT;
    case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE;
    default: break;
  }
  // A bare `throw;` with no exception being handled calls std::terminate(),
  // so raise a real, catchable exception instead.
  throw std::invalid_argument("cudnnDtype: unsupported data type");
 }
 // Runs a 2-D forward convolution through cuDNN on `queue`.
 // Descriptors are set up as NCHW with these dimensions:
 //   input  I: (N, C, H, W)    filter F: (K, C, R, S)    output O: (N, K, P, Q)
 // pad_h/pad_w and stride_h/stride_w fill the convolution descriptor; upscale is
 // fixed to {1, 1} and the mode to CUDNN_CROSS_CORRELATION.
 // alpha.data() and beta.data() are handed to cudnnConvolutionForward as its
 // alpha / beta arguments.
 // NOTE(review): tO, tI, tF and conv are created on every call and no matching
 // destroy call appears in this function (nor among the dispatch declarations
 // visible in this header set) - this looks like a per-call descriptor leak.
 // NOTE(review): O is taken as `Buffer const &` although its device pointer is
 // passed as the output argument of cudnnConvolutionForward.
 // NOTE(review): std::max is used below, but <algorithm> is not among this
 // header's direct includes - presumably it arrives transitively; confirm.
 inline void cudnnConv(DType dtype, Context const & ctx, Stream& queue, int32_t H, int32_t W, int32_t N, int32_t K, int32_t P, int32_t Q, int32_t C, int32_t R, int32_t S,
                      int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){
  // Per-context cuDNN handle, bound to the caller's stream.
  cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
  cudnnDataType_t cutype = cudnnDtype(dtype);
  dispatch::cudnnSetStream(handle, (CUstream)queue);
  cudnnTensorDescriptor_t tO, tI;
  cudnnFilterDescriptor_t tF;
  cudnnConvolutionDescriptor_t conv;
  cudnnConvolutionFwdAlgo_t algo;
  // Create and fill the output, filter and input descriptors.
  dispatch::cudnnCreateTensorDescriptor(&tO);
  dispatch::cudnnCreateTensorDescriptor(&tI);
  dispatch::cudnnCreateFilterDescriptor(&tF);
  dispatch::cudnnSetTensor4dDescriptor(tO, CUDNN_TENSOR_NCHW, cutype, N, K, P, Q);
  dispatch::cudnnSetFilter4dDescriptor(tF, cutype, CUDNN_TENSOR_NCHW, K, C, R, S);
  dispatch::cudnnSetTensor4dDescriptor(tI, CUDNN_TENSOR_NCHW, cutype, N, C, H, W);
  // Convolution descriptor, configured through the N-d API with 2 spatial dims.
  dispatch::cudnnCreateConvolutionDescriptor(&conv);
  int pad[] = {pad_h, pad_w};
  int stride[] = {stride_h, stride_w};
  int upscale[] = {1, 1};
  dispatch::cudnnSetConvolutionNdDescriptor(conv, 2, pad, stride, upscale, CUDNN_CROSS_CORRELATION, cutype);
 // The two disabled calls below are the 2-d descriptor setter and the cuDNN
 // algorithm query; the algorithm is hard-coded on the following line instead.
 //  dispatch::cudnnSetConvolution2dDescriptor(conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION);
 //  dispatch::cudnnGetConvolutionForwardAlgorithm(handle, tI, tF, conv, tO, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 1024*1024, &algo);
  algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  // Ask cuDNN how much scratch space this algorithm needs, then allocate it.
  size_t workspace_size;
  dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size);
  // The Buffer size is clamped to at least 1 so a reported size of 0 never
  // requests an empty Buffer; the unclamped size is what cuDNN receives below.
  Buffer work(ctx, std::max((size_t)1,workspace_size));
  CUdeviceptr twork = work;
  CUdeviceptr pI = I, pF = F, pO = O;
  dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO);
 }
 }
 }
 #endif
--- a/include/isaac/driver/device.h
+++ b/include/isaac/driver/device.h
@@ -23,8 +23,6 @@
 #ifndef ISAAC_DRIVER_DEVICE_H
 #define ISAAC_DRIVER_DEVICE_H
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/platform.h"
 #include "isaac/driver/handle.h"
@@ -35,60 +33,26 @@ namespace driver
 {
 // Device
-class ISAACAPI Device: public has_handle_comparators<Device>
+class Device: public Handle<CUdevice>
 {
 private:
  friend class Context;
  friend class CommandQueue;
 public:
-  typedef Handle<cl_device_id, CUdevice> handle_type;
+  typedef Handle<CUdevice> base_type;
  //Supported types
  enum Type
  {
      GPU = CL_DEVICE_TYPE_GPU,
      CPU = CL_DEVICE_TYPE_CPU,
      ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR,
      UNKNOWN
  };
  //Supported vendors
  enum class Vendor
  {
      AMD,
      INTEL,
      NVIDIA,
      UNKNOWN
  };
  //Supported architectures
  enum class Architecture
  {
-      //Intel
+    //NVidia
-      HASWELL,
+    SM_2_0,
-      BROADWELL,
+    SM_2_1,
-      SKYLAKE,
+    SM_3_0,
-      KABYLAKE,
+    SM_3_5,
    SM_3_7,
    SM_5_0,
    SM_5_2,
    SM_6_0,
    SM_6_1,
-      //NVidia
+    UNKNOWN
      SM_2_0,
      SM_2_1,
      SM_3_0,
      SM_3_5,
      SM_3_7,
      SM_5_0,
      SM_5_2,
      SM_6_0,
      SM_6_1,
      //AMD
      TERASCALE_2,
      TERASCALE_3,
      GCN_1,
      GCN_2,
      GCN_3,
      GCN_4,
      UNKNOWN
  };
 private:
@@ -96,34 +60,32 @@ private:
  template<CUdevice_attribute attr>
  int cuGetInfo() const;
  inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
  inline nvmlDevice_t nvml_device() const;
 public:
-  //Constructors
+  using base_type::base_type;
  explicit Device(CUdevice const & device, bool take_ownership = true);
  explicit Device(cl_device_id const & device, bool take_ownership = true);
  //Accessors
  handle_type const & handle() const;
  Vendor vendor() const;
  Architecture architecture() const;
  backend_type backend() const;
  //Informations
  std::string infos() const;
-  size_t clock_rate() const;
+  size_t address_bits() const;
  unsigned int address_bits() const;
  driver::Platform platform() const;
  std::vector<size_t> max_block_dim() const;
  size_t max_threads_per_block() const;
  size_t max_shared_memory() const;
  size_t warp_size() const;
  std::pair<size_t, size_t> compute_capability() const;
  //Identifier
  std::string name() const;
-  std::string vendor_str() const;
+  std::string pci_bus_id() const;
-  std::vector<size_t> max_work_item_sizes() const;
+  //Clocks
-  Type type() const;
+  size_t current_sm_clock() const;
-  std::string extensions() const;
+  size_t current_mem_clock() const;
-  size_t max_work_group_size() const;
+
-  size_t local_mem_size() const;
+  size_t max_sm_clock() const;
-  size_t warp_wavefront_size() const;
+  size_t max_mem_clock() const;
  bool fp64_support() const;
  std::pair<unsigned int, unsigned int> nv_compute_capability() const;
 private:
  backend_type backend_;
  handle_type h_;
 };
 }
--- a/include/isaac/driver/dispatch.h
+++ b/include/isaac/driver/dispatch.h
@@ -26,15 +26,14 @@
 #include <type_traits>
 #include <dlfcn.h>
 //OpenCL Backend
 #include "isaac/driver/external/CL/cl.h"
 #include "isaac/driver/external/CL/cl_ext.h"
 //CUDA Backend
 #include "isaac/driver/external/CUDA/cuda.h"
 #include "isaac/driver/external/CUDA/nvrtc.h"
 #include "isaac/driver/external/CUDA/cublas.h"
 #include "isaac/driver/external/CUDA/cudnn.h"
 #include "isaac/driver/external/CUDA/nvml.h"
 //Exceptions
 #include "isaac/driver/common.h"
 #include <iostream>
 namespace isaac
@@ -48,211 +47,189 @@ template<class T> void check(T){}
 void check(nvrtcResult err);
 void check(CUresult err);
 void check(cublasStatus_t err);
-void check(cl_int err);
+void check(cudnnStatus_t err);
 void check_destruction(CUresult);
 class dispatch
 {
 private:
-    template <class F>
+  template <class F>
-    struct return_type;
+  struct return_type;
-    template <class R, class... A>
+  template <class R, class... A>
-    struct return_type<R (*)(A...)>
+  struct return_type<R (*)(A...)>
-    { typedef R type; };
+  { typedef R type; };
-    typedef bool (*f_init_t)();
+  typedef bool (*f_init_t)();
  // return_type<R (*)(A...)>::type (declared above) yields the return type R of
  // a function-pointer type.
  //
  // f_impl: shared trampoline for a dynamically loaded entry point.
  //   initializer - called on every invocation; its bool result is ignored here.
  //   lib_h       - library handle passed to dlsym.
  //   FunPtrT     - unnamed parameter, used only to carry the function-pointer type.
  //   cache       - per-function slot holding the resolved symbol address.
  //   name        - symbol name to look up.
  //   args        - forwarded by value to the resolved function.
  // Returns the callee's result after passing it through check().
  // (Each line of this hunk appears twice: once removed, once re-indented.)
-    template<f_init_t initializer, typename FunPtrT, typename... Args>
+  template<f_init_t initializer, typename FunPtrT, typename... Args>
-    static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
+  static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
-    {
+  {
-        initializer();
+    initializer();
    // Look the symbol up only while the cache slot is still null.
    // NOTE(review): the dlsym result is not checked; if the lookup fails the
    // call below goes through a null function pointer.
-        if(cache == nullptr)
+    if(cache == nullptr)
-            cache = dlsym(lib_h, name);
+      cache = dlsym(lib_h, name);
    // Store the void* into the function pointer through a void** view of it,
    // rather than casting the object pointer to a function pointer directly.
-        FunPtrT fptr;
+    FunPtrT fptr;
-        *reinterpret_cast<void **>(&fptr) = cache;
+    *reinterpret_cast<void **>(&fptr) = cache;
-        typename return_type<FunPtrT>::type res = (*fptr)(args...);
+    typename return_type<FunPtrT>::type res = (*fptr)(args...);
-        check(res);
+    check(res);
-        return res;
+    return res;
-    }
+  }
 public:
-    static bool clinit();
+  static bool nvrtcinit();
-    static bool cublasinit();
+  static bool nvmlinit();
-    static bool nvrtcinit();
+  static bool cuinit();
-    static bool cuinit();
+  static bool cublasinit();
  static bool cudnninit();
-    static void release();
+  static void release();
-    //OpenCL
+  //CUDA
-    static cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *);
+  static CUresult cuCtxGetCurrent(CUcontext *pctx);
-    static cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+  static CUresult cuCtxDestroy_v2(CUcontext ctx);
-    static cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *);
+  static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
-    static cl_int clReleaseMemObject(cl_mem);
+  static CUresult cuDeviceGet(CUdevice *device, int ordinal);
-    static cl_int clFinish(cl_command_queue);
+  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
-    static cl_int clGetMemObjectInfo(cl_mem, cl_mem_info, size_t, void *, size_t *);
+  static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
-    static cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *);
+  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
-    static cl_int clReleaseContext(cl_context);
+  static CUresult cuMemFree_v2(CUdeviceptr dptr);
-    static cl_int clReleaseEvent(cl_event);
+  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-    static cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+  static CUresult cuDriverGetVersion(int *driverVersion);
-    static cl_int clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+  static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
-    static cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
+  static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
    static cl_int clReleaseDevice(cl_device_id);
    static cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *);
    static cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
    static cl_int clGetContextInfo(cl_context, cl_context_info, size_t, void *, size_t *);
    static cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *);
    static cl_int clReleaseCommandQueue(cl_command_queue);
    static cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *);
    static cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *);
    static cl_int clGetEventProfilingInfo(cl_event, cl_profiling_info, size_t, void *, size_t *);
    static cl_program clCreateProgramWithBinary(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
    static cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
    static cl_int clRetainEvent(cl_event);
    static cl_int clReleaseProgram(cl_program);
    static cl_int clFlush(cl_command_queue);
    static cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *);
    static cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *);
    static cl_int clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
    static cl_kernel clCreateKernel(cl_program, const char *, cl_int *);
    static cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *);
    static cl_mem clCreateImage(cl_context, cl_mem_flags, const cl_image_format *, const cl_image_desc *, void *, cl_int *);
    static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *);
    static cl_int clReleaseKernel(cl_kernel);
    static cl_int clEnqueueCopyBufferToImage(cl_command_queue, cl_mem, cl_mem, size_t, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
    static cl_int clSetEventCallback(cl_event, cl_int, void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), void *);
-    //CUDA
+  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-    static CUresult cuCtxDestroy_v2(CUcontext ctx);
+  static CUresult cuModuleLoad(CUmodule *module, const char *fname);
-    static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
+  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
-    static CUresult cuDeviceGet(CUdevice *device, int ordinal);
+  static CUresult cuModuleUnload(CUmodule hmod);
-    static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-    static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
+  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
-    static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+  static CUresult cuDeviceGetCount(int *count);
-    static CUresult cuMemFree_v2(CUdeviceptr dptr);
+  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
-    static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+  static CUresult cuInit(unsigned int Flags);
-    static CUresult cuDriverGetVersion(int *driverVersion);
+  static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
-    static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
+  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
-    static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
-    static CUresult cuModuleLoad(CUmodule *module, const char *fname);
+  static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
-    static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
-    static CUresult cuModuleUnload(CUmodule hmod);
+  static CUresult cuStreamSynchronize(CUstream hStream);
-    static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+  static CUresult cuStreamDestroy_v2(CUstream hStream);
-    static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+  static CUresult cuEventDestroy_v2(CUevent hEvent);
-    static CUresult cuDeviceGetCount(int *count);
+  static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
-    static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+  static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
-    static CUresult cuInit(unsigned int Flags);
+  static CUresult cuCtxGetDevice(CUdevice* result);
-    static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
+  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
    static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
    static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
    static CUresult cuStreamSynchronize(CUstream hStream);
    static CUresult cuStreamDestroy_v2(CUstream hStream);
    static CUresult cuEventDestroy_v2(CUevent hEvent);
    static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
    static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
    static CUresult cuCtxGetDevice(CUdevice* result);
-    static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
+  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
-    static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
+  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-    static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
+  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
    static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
    static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
    static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
-    static cublasHandle_t cublasHandle(Context const & ctx);
+  static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
-    static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
+  static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
-    static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
+  static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
-    static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
+  static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
-    static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
+  static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
-    static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
+  static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
  static cublasHandle_t cublasHandle(Context const & ctx);
  static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
  static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
  static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
  static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
  static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
  static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc);
  static cudnnHandle_t cudnnHandle(Context const & ctx);
  static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc);
  static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
  static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
  static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
  static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w);
  static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w);
  static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode);
  static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType);
  static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo);
  static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes);
  static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
  static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
 private:
-    static void* opencl_;
+  static void* cuda_;
-    static void* cuda_;
+  static void* nvrtc_;
-    static void* nvrtc_;
+  static void* nvml_;
-    static void* cublas_;
+  static void* cublas_;
  static void* cudnn_;
  //CUDA
  static void* cuCtxGetCurrent_;
  static void* cuCtxDestroy_v2_;
  static void* cuEventCreate_;
  static void* cuDeviceGet_;
  static void* cuMemcpyDtoH_v2_;
  static void* cuStreamCreate_;
  static void* cuEventElapsedTime_;
  static void* cuMemFree_v2_;
  static void* cuMemcpyDtoHAsync_v2_;
  static void* cuDriverGetVersion_;
  static void* cuDeviceGetName_;
  static void* cuDeviceGetPCIBusId_;
-    //OpenCL
+  static void* cuMemcpyHtoDAsync_v2_;
-    static void* clBuildProgram_;
+  static void* cuModuleLoad_;
-    static void* clEnqueueNDRangeKernel_;
+  static void* cuLaunchKernel_;
-    static void* clSetKernelArg_;
+  static void* cuModuleUnload_;
-    static void* clReleaseMemObject_;
+  static void* cuModuleLoadDataEx_;
-    static void* clFinish_;
+  static void* cuDeviceGetAttribute_;
-    static void* clGetMemObjectInfo_;
+  static void* cuDeviceGetCount_;
-    static void* clGetCommandQueueInfo_;
+  static void* cuMemcpyHtoD_v2_;
-    static void* clReleaseContext_;
+  static void* cuInit_;
-    static void* clReleaseEvent_;
+  static void* cuEventRecord_;
-    static void* clEnqueueWriteBuffer_;
+  static void* cuCtxCreate_v2_;
-    static void* clEnqueueReadBuffer_;
+  static void* cuModuleGetFunction_;
-    static void* clGetProgramBuildInfo_;
+  static void* cuStreamSynchronize_;
-    static void* clReleaseDevice_;
+  static void* cuStreamDestroy_v2_;
-    static void* clCreateContext_;
+  static void* cuEventDestroy_v2_;
-    static void* clGetDeviceIDs_;
+  static void* cuMemAlloc_v2_;
-    static void* clGetContextInfo_;
+  static void* cuPointerGetAttribute_;
-    static void* clGetDeviceInfo_;
+  static void* cuCtxGetDevice_;
-    static void* clReleaseCommandQueue_;
+  static void* cuMemsetD8Async_;
-    static void* clGetPlatformIDs_;
+  static void* cuCtxPushCurrent_v2_;
-    static void* clGetPlatformInfo_;
+  static void* cuCtxPopCurrent_v2_;
    static void* clGetEventProfilingInfo_;
    static void* clCreateProgramWithBinary_;
    static void* clCreateCommandQueue_;
    static void* clRetainEvent_;
    static void* clReleaseProgram_;
    static void* clFlush_;
    static void* clGetProgramInfo_;
    static void* clGetKernelInfo_;
    static void* clGetKernelWorkGroupInfo_;
    static void* clCreateKernel_;
    static void* clCreateBuffer_;
    static void* clCreateImage_;
    static void* clCreateProgramWithSource_;
    static void* clReleaseKernel_;
    static void* clEnqueueCopyBufferToImage_;
    static void* clSetEventCallback_;
-    //CUDA
+  static void* nvmlInit_v2_;
-    static void* cuCtxDestroy_v2_;
+  static void* nvmlDeviceGetHandleByPciBusId_v2_;
-    static void* cuEventCreate_;
+  static void* nvmlDeviceGetClockInfo_;
-    static void* cuDeviceGet_;
+  static void* nvmlDeviceGetMaxClockInfo_;
    static void* cuMemcpyDtoH_v2_;
    static void* cuStreamCreate_;
    static void* cuEventElapsedTime_;
    static void* cuMemFree_v2_;
    static void* cuMemcpyDtoHAsync_v2_;
    static void* cuDriverGetVersion_;
    static void* cuDeviceGetName_;
    static void* cuMemcpyHtoDAsync_v2_;
    static void* cuModuleLoad_;
    static void* cuLaunchKernel_;
    static void* cuModuleUnload_;
    static void* cuModuleLoadDataEx_;
    static void* cuDeviceGetAttribute_;
    static void* cuDeviceGetCount_;
    static void* cuMemcpyHtoD_v2_;
    static void* cuInit_;
    static void* cuEventRecord_;
    static void* cuCtxCreate_v2_;
    static void* cuModuleGetFunction_;
    static void* cuStreamSynchronize_;
    static void* cuStreamDestroy_v2_;
    static void* cuEventDestroy_v2_;
    static void* cuMemAlloc_v2_;
    static void* cuPointerGetAttribute_;
    static void* cuCtxGetDevice_;
-    static void* nvrtcCompileProgram_;
+  static void* nvrtcCompileProgram_;
-    static void* nvrtcGetProgramLogSize_;
+  static void* nvrtcGetProgramLogSize_;
-    static void* nvrtcGetPTX_;
+  static void* nvrtcGetPTX_;
-    static void* nvrtcGetPTXSize_;
+  static void* nvrtcGetPTXSize_;
-    static void* nvrtcCreateProgram_;
+  static void* nvrtcCreateProgram_;
-    static void* nvrtcGetProgramLog_;
+  static void* nvrtcGetProgramLog_;
  static void* cublasCreate_v2_;
  static void* cublasGetStream_v2_;
  static void* cublasSetStream_v2_;
  static void* cublasHgemm_;
  static void* cublasSgemm_v2_;
  static void* cublasDgemm_v2_;
  static void* cudnnCreateConvolutionDescriptor_;
  static void* cudnnCreateTensorDescriptor_;
  static void* cudnnCreateFilterDescriptor_;
  static void* cudnnCreate_;
  static void* cudnnSetTensor4dDescriptor_;
  static void* cudnnSetFilter4dDescriptor_;
  static void* cudnnSetConvolution2dDescriptor_;
  static void* cudnnSetConvolutionNdDescriptor_;
  static void* cudnnGetConvolutionForwardAlgorithm_;
  static void* cudnnGetConvolutionForwardWorkspaceSize_;
  static void* cudnnConvolutionForward_;
  static void* cudnnSetStream_;
    static void* cublasCreate_v2_;
    static void* cublasGetStream_v2_;
    static void* cublasSetStream_v2_;
    static void* cublasSgemm_v2_;
    static void* cublasDgemm_v2_;
 };
 }
--- a/include/isaac/driver/error.h
+++ b/include/isaac/driver/error.h
@@ -0,0 +1,224 @@
 /* Copyright 2015-2017 Philippe Tillet
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_EXCEPTION_DRIVER_H
 #define ISAAC_EXCEPTION_DRIVER_H
 #include <exception>
 #include "isaac/driver/dispatch.h"
 namespace isaac
 {
  namespace driver
  {
  namespace exception
  {
  // Exception types for failures reported by NVRTC.
  // Every class derives directly from std::exception and its what() returns
  // the fixed prefix "NVRTC: Error- " followed by the per-class message.
  namespace nvrtc
  {
 // Declares one exception class `name` whose what() yields the prefix
 // concatenated (at compile time) with the string literal `msg`.
 // The terminating ';' is supplied at the point of use.
 #define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) \
   class name: public std::exception \
   { \
   public: \
     const char * what() const throw() \
     { return "NVRTC: Error- " msg; } \
   }
  ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory,             "out of memory");
  ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure,  "program creation failure");
  ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input,             "invalid input");
  ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program,           "invalid program");
  ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option,            "invalid option");
  ISAAC_CREATE_NVRTC_EXCEPTION(compilation,               "compilation");
  ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure, "builtin operation failure");
  ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error,             "unknown error");
 // The helper macro is local to this namespace block.
 #undef ISAAC_CREATE_NVRTC_EXCEPTION
  }
  // Exception types for CUDA errors.
  // All classes derive from cuda::base (itself a std::exception), so a single
  // `catch(cuda::base const &)` handler matches any of them. what() returns
  // the fixed prefix "CUDA: Error- " followed by the per-class message.
  namespace cuda
  {
  // Common ancestor of every CUDA exception declared below.
  class base: public std::exception{};
 // Declares one exception class `name` deriving from `base`; `msg` must be a
 // string literal because it is concatenated with the prefix at compile time.
 #define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_value                   ,"invalid value");
  ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory                   ,"out of memory");
  ISAAC_CREATE_CUDA_EXCEPTION(not_initialized                 ,"not initialized");
  ISAAC_CREATE_CUDA_EXCEPTION(deinitialized                   ,"deinitialized");
  ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled               ,"profiler disabled");
  ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
  ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started        ,"profiler already started");
  ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
  ISAAC_CREATE_CUDA_EXCEPTION(no_device                       ,"no device");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_device                  ,"invalid device");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_image                   ,"invalid image");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_context                 ,"invalid context");
  ISAAC_CREATE_CUDA_EXCEPTION(context_already_current         ,"context already current");
  ISAAC_CREATE_CUDA_EXCEPTION(map_failed                      ,"map failed");
  ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed                    ,"unmap failed");
  ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped                 ,"array is mapped");
  ISAAC_CREATE_CUDA_EXCEPTION(already_mapped                  ,"already mapped");
  ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
  ISAAC_CREATE_CUDA_EXCEPTION(already_acquired                ,"already acquired");
  ISAAC_CREATE_CUDA_EXCEPTION(not_mapped                      ,"not mapped");
  ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
  ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
  ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
  ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit               ,"unsupported limit");
  ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use          ,"context already in use");
  ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx                     ,"invalid ptx");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_source                  ,"invalid source");
  ISAAC_CREATE_CUDA_EXCEPTION(file_not_found                  ,"file not found");
  ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
  ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
  ISAAC_CREATE_CUDA_EXCEPTION(operating_system                ,"operating system");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle                  ,"invalid handle");
  ISAAC_CREATE_CUDA_EXCEPTION(not_found                       ,"not found");
  ISAAC_CREATE_CUDA_EXCEPTION(not_ready                       ,"not ready");
  ISAAC_CREATE_CUDA_EXCEPTION(illegal_address                 ,"illegal address");
  ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
  ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout                  ,"launch timeout");
  ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
  ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
  ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
  ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active          ,"primary context active");
  ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed            ,"context is destroyed");
  ISAAC_CREATE_CUDA_EXCEPTION(assert_error                    ,"assert");
  ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers                  ,"too many peers");
  ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
  // Message previously read "hot memory not registered" (typo).
  ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered      ,"host memory not registered");
  ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error            ,"hardware stack error");
  ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction             ,"illegal instruction");
  ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address              ,"misaligned address");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space           ,"invalid address space");
  ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc                      ,"invalid pc");
  ISAAC_CREATE_CUDA_EXCEPTION(launch_failed                   ,"launch failed");
  ISAAC_CREATE_CUDA_EXCEPTION(not_permitted                   ,"not permitted");
  ISAAC_CREATE_CUDA_EXCEPTION(not_supported                   ,"not supported");
  ISAAC_CREATE_CUDA_EXCEPTION(unknown                         ,"unknown");
 // The helper macro is local to this namespace block.
 #undef ISAAC_CREATE_CUDA_EXCEPTION
  }
  // Exception types for cuBLAS errors.
  // Every class derives directly from std::exception and its what() returns
  // the fixed prefix "CUBLAS: Error- " followed by the per-class message.
  namespace cublas
  {
 // Declares one exception class `name`; `msg` must be a string literal since
 // it is pasted after the prefix by adjacent-literal concatenation.
 #define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) \
   class name: public std::exception \
   { \
   public: \
     const char * what() const throw() \
     { return "CUBLAS: Error- " msg; } \
   }
  ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized,  "not initialized");
  ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed,     "alloc failed");
  ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value,    "invalid value");
  ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch,    "arch mismatch");
  ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error,    "mapping error");
  ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed");
  ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error,   "internal error");
  ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported,    "not supported");
  ISAAC_CREATE_CUBLAS_EXCEPTION(license_error,    "license error");
  ISAAC_CREATE_CUBLAS_EXCEPTION(unknown,          "unknown");
 // The helper macro is local to this namespace block.
 #undef ISAAC_CREATE_CUBLAS_EXCEPTION
  }
  // Exception types for cuDNN errors.
  // Every class derives directly from std::exception and its what() returns
  // the fixed prefix "CUDNN: Error- " followed by the per-class message.
  namespace cudnn
  {
 // Declares one exception class `name`; `msg` must be a string literal since
 // it is concatenated with the prefix at compile time.
 #define ISAAC_CREATE_CUDNN_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "CUDNN: Error- " msg; } }
  ISAAC_CREATE_CUDNN_EXCEPTION(not_initialized              ,"not initialized");
  ISAAC_CREATE_CUDNN_EXCEPTION(alloc_failed                 ,"allocation failed");
  ISAAC_CREATE_CUDNN_EXCEPTION(bad_param                    ,"bad param");
  ISAAC_CREATE_CUDNN_EXCEPTION(internal_error               ,"internal error");
  ISAAC_CREATE_CUDNN_EXCEPTION(invalid_value                ,"invalid value");
  ISAAC_CREATE_CUDNN_EXCEPTION(arch_mismatch                ,"arch mismatch");
  ISAAC_CREATE_CUDNN_EXCEPTION(mapping_error                ,"mapping error");
  ISAAC_CREATE_CUDNN_EXCEPTION(execution_failed             ,"execution failed");
  ISAAC_CREATE_CUDNN_EXCEPTION(not_supported                ,"not supported");
  ISAAC_CREATE_CUDNN_EXCEPTION(license_error                ,"license error");
 // Keep the helper macro from leaking into every translation unit that
 // includes this header (matches the nvrtc/cuda/cublas blocks).
 #undef ISAAC_CREATE_CUDNN_EXCEPTION
  }
  // Exception types for OpenCL errors.
  // All classes derive from ocl::base (itself a std::exception), so a single
  // `catch(ocl::base const &)` handler matches any of them. what() returns
  // the fixed prefix "OpenCL: Error- " followed by the per-class message.
  namespace ocl
  {
  // Common ancestor of every OpenCL exception declared below.
  class base: public std::exception{};
 // Declares one exception class `name` deriving from `base`; `msg` must be a
 // string literal because it is concatenated with the prefix at compile time.
 #define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
  ISAAC_CREATE_CL_EXCEPTION(device_not_found,                  "device not found");
  ISAAC_CREATE_CL_EXCEPTION(device_not_available,              "device not available");
  ISAAC_CREATE_CL_EXCEPTION(compiler_not_available,            "compiler not available");
  ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure,     "object allocation failure");
  ISAAC_CREATE_CL_EXCEPTION(out_of_resources,                  "launch out of resources");
  ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory,                "out of host memory");
  ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available,      "profiling info not available");
  ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap,                  "mem copy overlap");
  ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch,             "image format mismatch");
  ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported,        "image format not supported");
  ISAAC_CREATE_CL_EXCEPTION(build_program_failure,             "build program failure");
  ISAAC_CREATE_CL_EXCEPTION(map_failure,                       "map failure");
  ISAAC_CREATE_CL_EXCEPTION(invalid_value,                     "invalid value");
  ISAAC_CREATE_CL_EXCEPTION(invalid_device_type,               "invalid device type");
  ISAAC_CREATE_CL_EXCEPTION(invalid_platform,                  "invalid platform");
  ISAAC_CREATE_CL_EXCEPTION(invalid_device,                    "invalid device");
  ISAAC_CREATE_CL_EXCEPTION(invalid_context,                   "invalid context");
  ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties,          "invalid queue properties");
  ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue,             "invalid command queue");
  ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr,                  "invalid host pointer");
  ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object,                "invalid mem object");
  ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor,   "invalid image format descriptor");
  ISAAC_CREATE_CL_EXCEPTION(invalid_image_size,                "invalid image size");
  ISAAC_CREATE_CL_EXCEPTION(invalid_sampler,                   "invalid sampler");
  ISAAC_CREATE_CL_EXCEPTION(invalid_binary,                    "invalid binary");
  ISAAC_CREATE_CL_EXCEPTION(invalid_build_options,             "invalid build options");
  ISAAC_CREATE_CL_EXCEPTION(invalid_program,                   "invalid program");
  ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable,        "invalid program executable");
  ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name,               "invalid kernel name");
  ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition,         "invalid kernel definition");
  ISAAC_CREATE_CL_EXCEPTION(invalid_kernel,                    "invalid kernel");
  ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index,                 "invalid arg index");
  ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value,                 "invalid arg value");
  ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size,                  "invalid arg size");
  ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args,               "invalid kernel args");
  ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension,            "invalid work dimension");
  ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size,           "invalid work group size");
  ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size,            "invalid work item size");
  ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset,             "invalid global offset");
  ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list,           "invalid event wait list");
  ISAAC_CREATE_CL_EXCEPTION(invalid_event,                     "invalid event");
  ISAAC_CREATE_CL_EXCEPTION(invalid_operation,                 "invalid operation");
  ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object,                 "invalid GL object");
  ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size,               "invalid buffer size");
  ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level,                 "invalid MIP level");
  ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size,          "invalid global work size");
 // Only declared when the included OpenCL headers define CL_INVALID_PROPERTY.
 #ifdef CL_INVALID_PROPERTY
  ISAAC_CREATE_CL_EXCEPTION(invalid_property,                  "invalid property");
 #endif
 // Keep the helper macro from leaking into every translation unit that
 // includes this header (matches the nvrtc/cuda/cublas blocks).
 #undef ISAAC_CREATE_CL_EXCEPTION
  }
  }
  }
 }
 #endif
--- a/include/isaac/driver/event.h
+++ b/include/isaac/driver/event.h
@@ -23,8 +23,6 @@
 #ifndef ISAAC_DRIVER_EVENT_H
 #define ISAAC_DRIVER_EVENT_H
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/handle.h"
 namespace isaac
@@ -34,26 +32,14 @@ namespace driver
 {
 // Event
-class ISAACAPI Event: public has_handle_comparators<Event>
+class Event: public Handle<cu_event_t>
 {
 private:
-  friend class CommandQueue;
+  typedef Handle<cu_event_t> base_type;
 public:
-  typedef Handle<cl_event, cu_event_t> handle_type;
+  using base_type::base_type;
-
+  float elapsed_time() const;
 public:
  //Constructors
  Event(cl_event const & event, bool take_ownership = true);
  Event(backend_type backend);
  //Accessors
  handle_type const & handle() const;
  //Profiling
  long elapsed_time() const;
 private:
  backend_type backend_;
  handle_type h_;
 };
 }
--- a/include/isaac/driver/external/CL/cl.h
+++ b/include/isaac/driver/external/CL/cl.h
--- a/include/isaac/driver/external/CL/cl_ext.h
+++ b/include/isaac/driver/external/CL/cl_ext.h
@@ -1,346 +0,0 @@
 /*******************************************************************************
 * Copyright (c) 2008-2013 The Khronos Group Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and/or associated documentation files (the
 * "Materials"), to deal in the Materials without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Materials, and to
 * permit persons to whom the Materials are furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Materials.
 *
 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 ******************************************************************************/
 /* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
 /* cl_ext.h contains OpenCL extensions which don't have external */
 /* (OpenGL, D3D) dependencies.                                   */
 #ifndef __CL_EXT_H
 #define __CL_EXT_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "isaac/driver/external/CL/cl_ext.h"
 /* cl_khr_fp64 extension - no extension #define since it has no functions  */
 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
 /* cl_khr_fp16 extension - no extension #define since it has no functions  */
 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
 /* Memory object destruction
 *
 * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
 *
 * Registers a user callback function that will be called when the memory object is deleted and its resources 
 * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback 
 * stack associated with memobj. The registered user callback functions are called in the reverse order in 
 * which they were registered. The user callback functions are called and then the memory object is deleted 
 * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
 * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
 * the storage bits for the memory object, can be reused or freed.
 *
 * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
 *
 * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
 * before using.
 */
 #define cl_APPLE_SetMemObjectDestructor 1
 cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
 /* Context Logging Functions
 *
 * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
 * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
 * before using.
 *
 * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
 */
 #define cl_APPLE_ContextLoggingFunctions 1
 extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
                                            const void * /* private_info */, 
                                            size_t       /* cb */, 
                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
 /* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
 extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
                                          const void * /* private_info */, 
                                          size_t       /* cb */, 
                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
 /* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
 extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
                                          const void * /* private_info */, 
                                          size_t       /* cb */, 
                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
 /************************ 
 * cl_khr_icd extension *                                                  
 ************************/
 #define cl_khr_icd 1
 /* cl_platform_info                                                        */
 #define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
 /* Additional Error Codes                                                  */
 #define CL_PLATFORM_NOT_FOUND_KHR                   -1001
 extern CL_API_ENTRY cl_int CL_API_CALL
 clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
                       cl_platform_id * /* platforms */,
                       cl_uint *        /* num_platforms */);
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
    cl_uint          /* num_entries */,
    cl_platform_id * /* platforms */,
    cl_uint *        /* num_platforms */);
 /* Extension: cl_khr_image2D_buffer
 *
 * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
 * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
 * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
 * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
 * for 2D images created from a buffer.
 *
 * When the 2D image from buffer is created, the client must specify the width,
 * height, image format (i.e. channel order and channel data type) and optionally the row pitch
 *
 * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
 * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
 */
 /**************************************
 * cl_khr_initialize_memory extension *
 **************************************/
 #define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
 /**************************************
 * cl_khr_terminate_context extension *
 **************************************/
 #define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x200F
 #define CL_CONTEXT_TERMINATE_KHR                    0x2010
 #define cl_khr_terminate_context 1
 extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
 typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
 /*
 * Extension: cl_khr_spir
 *
 * This extension adds support to create an OpenCL program object from a 
 * Standard Portable Intermediate Representation (SPIR) instance
 */
 /******************************************
 * cl_nv_device_attribute_query extension *
 ******************************************/
 /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
 #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
 #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
 #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
 #define CL_DEVICE_WARP_SIZE_NV                      0x4003
 #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
 /*********************************
 * cl_amd_device_memory_flags *
 *********************************/
 #define cl_amd_device_memory_flags 1
 #define CL_MEM_USE_PERSISTENT_MEM_AMD       (1 << 6)        // Alloc from GPU's CPU visible heap
 /* cl_device_info */
 #define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
 /*********************************
 * cl_amd_device_attribute_query *
 *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
 #define CL_DEVICE_TOPOLOGY_AMD                      0x4037
 #define CL_DEVICE_BOARD_NAME_AMD                    0x4038
 #define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD            0x4039
 #define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD         0x4040
 #define CL_DEVICE_SIMD_WIDTH_AMD                    0x4041
 #define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
 #define CL_DEVICE_WAVEFRONT_WIDTH_AMD               0x4043
 #define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD           0x4044
 #define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD      0x4045
 #define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
 #define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD   0x4047
 #define CL_DEVICE_LOCAL_MEM_BANKS_AMD               0x4048
 /* Value type for the CL_DEVICE_TOPOLOGY_AMD query declared above.
  * The union offers two views of the same storage; both start with a
  * `type` member. */
 typedef union
 {
    /* Untyped view: `type` followed by five cl_uint words. */
    struct { cl_uint type; cl_uint data[5]; } raw;
    /* PCIe view: `type`, 17 unused bytes, then bus / device / function.
     * NOTE(review): presumably valid when type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD - confirm. */
    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
 } cl_device_topology_amd;
 #define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD            1
 /**************************
 * cl_amd_offline_devices *
 **************************/
 #define CL_CONTEXT_OFFLINE_DEVICES_AMD              0x403F
 #ifdef CL_VERSION_1_1
   /***********************************
    * cl_ext_device_fission extension *
    ***********************************/
    #define cl_ext_device_fission   1
    extern CL_API_ENTRY cl_int CL_API_CALL
    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
    typedef CL_API_ENTRY cl_int 
    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
    extern CL_API_ENTRY cl_int CL_API_CALL
    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
    typedef CL_API_ENTRY cl_int 
    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
    typedef cl_ulong  cl_device_partition_property_ext;
    extern CL_API_ENTRY cl_int CL_API_CALL
    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
                            const cl_device_partition_property_ext * /* properties */,
                            cl_uint /*num_entries*/,
                            cl_device_id * /*out_devices*/,
                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
    typedef CL_API_ENTRY cl_int 
    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
                                                const cl_device_partition_property_ext * /* properties */,
                                                cl_uint /*num_entries*/,
                                                cl_device_id * /*out_devices*/,
                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
    /* cl_device_partition_property_ext */
    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
    /* clDeviceGetInfo selectors */
    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
    /* error codes */
    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
    #define CL_INVALID_PARTITION_NAME_EXT               -1059
    /* CL_AFFINITY_DOMAINs */
    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
    /* cl_device_partition_property_ext list terminators */
    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
    /* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
     * no extension #define since they have no functions
     */
    #define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
 /*********************************
 * cl_qcom_ext_host_ptr extension
 *********************************/
 #define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
 #define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
 #define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
 #define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
 #define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
 #define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
 #define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
 #define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
 #define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
 typedef cl_uint                                   cl_image_pitch_info_qcom;
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetDeviceImageInfoQCOM(cl_device_id             device,
                         size_t                   image_width,
                         size_t                   image_height,
                         const cl_image_format   *image_format,
                         cl_image_pitch_info_qcom param_name,
                         size_t                   param_value_size,
                         void                    *param_value,
                         size_t                  *param_value_size_ret);
 /* Descriptor of an external host memory allocation (cl_qcom_ext_host_ptr
  * section). Also embedded as the first member of cl_mem_ion_host_ptr. */
 typedef struct _cl_mem_ext_host_ptr
 {
    // Type of external memory allocation.
    // Legal values will be defined in layered extensions.
    cl_uint  allocation_type;
    // Host cache policy for this external memory allocation.
    // NOTE(review): presumably one of the CL_MEM_HOST_*_QCOM values defined above - confirm.
    cl_uint  host_cache_policy;
 } cl_mem_ext_host_ptr;
 /*********************************
 * cl_qcom_ion_host_ptr extension
 *********************************/
 #define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
 /* Descriptor of an ION-backed host allocation (cl_qcom_ion_host_ptr section):
  * the generic cl_mem_ext_host_ptr header followed by the ION file descriptor
  * and the host pointer to the allocation. */
 typedef struct _cl_mem_ion_host_ptr
 {
    // Type of external memory allocation.
    // Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations.
    cl_mem_ext_host_ptr  ext_host_ptr;
    // ION file descriptor
    int                  ion_filedesc;
    // Host pointer to the ION allocated memory
    void*                ion_hostptr;
 } cl_mem_ion_host_ptr;
 #endif /* CL_VERSION_1_1 */
 #ifdef __cplusplus
 }
 #endif
 #endif /* __CL_EXT_H */
--- a/include/isaac/driver/external/CL/cl_platform.h
+++ b/include/isaac/driver/external/CL/cl_platform.h
--- a/include/isaac/driver/external/CUDA/cublas.h
+++ b/include/isaac/driver/external/CUDA/cublas.h
@@ -57,7 +57,7 @@
 #if !defined(CUBLAS_H_)
 #define CUBLAS_H_
-#include <cuda_runtime.h>
+#include "cuda_runtime.h"
 #ifndef CUBLASWINAPI
 #ifdef _WIN32
--- a/include/isaac/driver/external/CUDA/cuda.h
+++ b/include/isaac/driver/external/CUDA/cuda.h
--- a/include/isaac/driver/external/CUDA/cudnn.h
+++ b/include/isaac/driver/external/CUDA/cudnn.h
--- a/include/isaac/driver/external/CUDA/nvml.h
+++ b/include/isaac/driver/external/CUDA/nvml.h
--- a/include/isaac/driver/handle.h
+++ b/include/isaac/driver/handle.h
@@ -24,10 +24,11 @@
 #define ISAAC_DRIVER_HANDLE_H
 #include <memory>
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include <iostream>
 #include <functional>
 #include <type_traits>
 #include "isaac/driver/dispatch.h"
 namespace isaac
 {
@@ -35,68 +36,59 @@ namespace driver
 {
 struct cu_event_t{
-    operator bool() const { return first && second; }
+  operator bool() const { return first && second; }
-    CUevent first;
+  CUevent first;
-    CUevent second;
+  CUevent second;
 };
-template<class CLType, class CUType>
+struct cu_platform{
-class ISAACAPI Handle
+  cu_platform() : status_(dispatch::cuInit(0)) {}
-{
+  operator bool() const { return status_; }
 private:
-  static void _delete(CUcontext x);
+  CUresult status_;
-  static void _delete(CUdeviceptr x);
+};
  static void _delete(CUstream x);
  static void _delete(CUdevice);
  static void _delete(CUevent x);
  static void _delete(CUfunction);
  static void _delete(CUmodule x);
  static void _delete(cu_event_t x);
-  static void release(cl_context x);
+template<typename T> struct remove_class { };
-  static void release(cl_mem x);
+template<typename C, typename R, typename... A>
-  static void release(cl_command_queue x);
+struct remove_class<R(C::*)(A...)> { using type = R(A...); };
-  static void release(cl_device_id x);
+template<typename C, typename R, typename... A>
-  static void release(cl_event x);
+struct remove_class<R(C::*)(A...) const> { using type = R(A...); };
-  static void release(cl_kernel x);
+template<typename C, typename R, typename... A>
-  static void release(cl_program x);
+struct remove_class<R(C::*)(A...) volatile> { using type = R(A...); };
 template<typename C, typename R, typename... A>
 struct remove_class<R(C::*)(A...) const volatile> { using type = R(A...); };
 // get_signature<T> names the plain function type Ret(Args...) associated
 // with T.
 //
 // Primary template: takes the address of the operator() of T (after
 // stripping any reference from T) and lets remove_class turn that
 // pointer-to-member type into a plain function type.
 template<typename Callable>
 struct get_signature_impl
 {
   using type = typename remove_class<
       decltype(&std::remove_reference<Callable>::type::operator())
     >::type;
 };

 // Plain function type: already in the wanted form.
 template<typename Ret, typename... Args>
 struct get_signature_impl<Ret(Args...)>
 {
   using type = Ret(Args...);
 };

 // Reference to function.
 template<typename Ret, typename... Args>
 struct get_signature_impl<Ret(&)(Args...)>
 {
   using type = Ret(Args...);
 };

 // Pointer to function.
 template<typename Ret, typename... Args>
 struct get_signature_impl<Ret(*)(Args...)>
 {
   using type = Ret(Args...);
 };

 // Convenience alias for get_signature_impl<T>::type.
 template<typename T>
 using get_signature = typename get_signature_impl<T>::type;
 template<class CUType>
 class Handle
 {
 public:
  //Constructors
-  Handle(backend_type backend, bool take_ownership = true);
+  Handle(CUType cu, bool take_ownership = true);
  Handle(bool take_ownership = true);
  ~Handle();
  //Comparison
  bool operator==(Handle const & other) const;
  bool operator!=(Handle const & other) const;
  bool operator<(Handle const & other) const;
  //Accessors
-  backend_type backend() const;
+  operator CUType() const;
  CLType & cl();
  CLType const & cl() const;
  CUType & cu();
  CUType const & cu() const;
  ~Handle();
-private:
+protected:
 DISABLE_MSVC_WARNING_C4251
  std::shared_ptr<CLType> cl_;
  std::shared_ptr<CUType> cu_;
 RESTORE_MSVC_WARNING_C4251
 private:
  backend_type backend_;
  bool has_ownership_;
 };
 // Mixin that supplies comparison operators to a class T.
 // Deriving T from has_handle_comparators<T> defines ==, != and < for T;
 // each one applies the same operator to the values returned by
 // T::handle() on the two operands.
 template<class T>
 class has_handle_comparators
 {
 public:
   // True when both handles compare equal.
   friend bool operator==(T const & lhs, T const & rhs)
   {
     return lhs.handle() == rhs.handle();
   }

   // True when the handles compare unequal (uses the handle's own !=).
   friend bool operator!=(T const & lhs, T const & rhs)
   {
     return lhs.handle() != rhs.handle();
   }

   // Orders two objects by their handles (uses the handle's own <).
   friend bool operator<(T const & lhs, T const & rhs)
   {
     return lhs.handle() < rhs.handle();
   }
 };
 }
 }
--- a/include/isaac/driver/kernel.h
+++ b/include/isaac/driver/kernel.h
@@ -23,11 +23,8 @@
 #ifndef ISAAC_DRIVER_KERNEL_H
 #define ISAAC_DRIVER_KERNEL_H
-#include "isaac/defines.h"
+#include "isaac/driver/module.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/program.h"
 #include "isaac/driver/handle.h"
 #include "isaac/value_scalar.h"
 #include <memory>
@@ -40,30 +37,25 @@ namespace driver
 class Buffer;
 // Kernel
-class ISAACAPI Kernel: public has_handle_comparators<Kernel>
+class Kernel: public Handle<CUfunction>
 {
  friend class CommandQueue;
 public:
-  typedef Handle<cl_kernel, CUfunction> handle_type;
+  typedef Handle<CUfunction> base_type;
 public:
  //Constructors
-  Kernel(Program const & program, const char * name);
+  Kernel(Module const & program, const char * name);
  //Accessors
  handle_type const & handle() const;
  //Arguments setters
  void setArg(unsigned int index, value_scalar const & scal);
  void setArg(unsigned int index, std::size_t size, void* ptr);
  void setArg(unsigned int index, Buffer const &);
  void setSizeArg(unsigned int index, std::size_t N);
  template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
-
+  //Arguments getters
  void* const* cu_params() const;
 private:
-  backend_type backend_;
+  Module program_;
  unsigned int address_bits_;
  std::vector<std::shared_ptr<void> >  cu_params_store_;
  std::vector<void*>  cu_params_;
  handle_type h_;
 };
 }
--- a/include/isaac/tools/sys/cpuid.hpp
+++ b/include/isaac/tools/sys/cpuid.hpp
@@ -20,28 +20,38 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
-#include <string>
+#ifndef ISAAC_DRIVER_MODULE_H
 #define ISAAC_DRIVER_MODULE_H
 #include <map>
 #include "isaac/driver/handle.h"
 #include "isaac/driver/context.h"
 namespace isaac
 {
-namespace tools
+
 namespace driver
 {
-inline void cpuid(int code, int *a, int *b, int *c, int *d) {
+class Context;
-  __asm__ __volatile__("cpuid":"=a"(*a),"=b"(*b),
+class Device;
                        "=c"(*c),"=d"(*d):"a"(code));
 }
-inline std::string cpu_brand(){
+class Module: public Handle<CUmodule>
-  char name[48];
+{
-  int* ptr = (int*)name;
+  typedef Handle<CUmodule> base_type;
  cpuid(0x80000002, ptr, ptr+1, ptr+2, ptr+3);
  cpuid(0x80000003, ptr+4, ptr+5, ptr+6, ptr+7);
  cpuid(0x80000004, ptr+8, ptr+9, ptr+10, ptr+11);
  return std::string(name, name+48);
 }
 public:
  Module(Context const & context, std::string const & source, bool is_ir = true);
  Context const & context() const;
 private:
  Context context_;
  std::string source_;
 };
 }
 }
 #endif
--- a/include/isaac/driver/platform.h
+++ b/include/isaac/driver/platform.h
@@ -26,8 +26,7 @@
 #include <vector>
 #include <string>
-#include "isaac/defines.h"
+#include "isaac/driver/handle.h"
 #include "isaac/driver/common.h"
 namespace isaac
 {
@@ -37,20 +36,15 @@ namespace driver
 class Device;
-class ISAACAPI Platform
+class Platform: public Handle<cu_platform>
 {
  typedef Handle<cu_platform> base_type;
 public:
-  //Constructors
+  using base_type::base_type;
  Platform(backend_type);
  Platform(cl_platform_id const &);
  //Accessors
  std::string name() const;
  std::string version() const;
-  void devices(std::vector<Device> &) const;
+  std::vector<Device> devices() const;
  cl_platform_id cl_id() const;
 private:
  backend_type backend_;
  cl_platform_id cl_platform_;
 };
 }
--- a/include/isaac/driver/program.h
+++ b/include/isaac/driver/program.h
@@ -1,70 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_DRIVER_PROGRAM_H
 #define ISAAC_DRIVER_PROGRAM_H
 #include <map>
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/handle.h"
 #include "isaac/driver/context.h"
 namespace isaac
 {
 namespace driver
 {
 class Context;
 class Device;
 // Program object of the driver layer.
 // Holds a handle that pairs an OpenCL cl_program with a CUDA CUmodule,
 // together with the context and the source string it was created from.
 // Comparison operators (==, !=, <) come from has_handle_comparators and
 // therefore compare the values returned by handle().
 class ISAACAPI Program: public has_handle_comparators<Program>
 {
 public:
  // Handle type wrapping the OpenCL program / CUDA module pair.
  typedef Handle<cl_program, CUmodule> handle_type;
 private:
  // Kernel is granted access to the private members below.
  friend class Kernel;
 public:
  //Constructors
  // NOTE(review): presumably builds `source` for `context`; the definition is
  // not visible in this header -- confirm against the implementation file.
  Program(Context const & context, std::string const & source);
  //Accessors
  handle_type const & handle() const;
  Context const & context() const;
 private:
 // C4251 (MSVC: member needs a DLL interface) is disabled around the data members.
 DISABLE_MSVC_WARNING_C4251
  backend_type backend_;
  Context context_;
  std::string source_;
  handle_type h_;
 RESTORE_MSVC_WARNING_C4251
 };
 }
 }
 #endif
--- a/include/isaac/driver/program_cache.h
+++ b/include/isaac/driver/program_cache.h
@@ -1,59 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_DRIVER_PROGRAM_CACHE_H
 #define ISAAC_DRIVER_PROGRAM_CACHE_H
 #include <map>
 #include "isaac/defines.h"
 #include "isaac/driver/program.h"
 namespace isaac
 {
 namespace driver
 {
 // Cache of Program objects keyed by name (std::map<std::string, Program>).
 class ISAACAPI ProgramCache
 {
    // `backend` is granted access to the private cache storage.
    friend class backend;
 public:
    //Clearing the cache
    void clear();
    //Adding a program to the cache
    // Takes the context, the cache key `name` and the program source `src`,
    // and returns a reference to a Program.
    // NOTE(review): behaviour when `name` is already present is not visible
    // from this header -- confirm against the implementation.
    Program & add(Context const & context, std::string const & name, std::string const & src);
    //Finding a program in the cache
    // Returns a pointer to a Program.
    // NOTE(review): presumably a null pointer when `name` is absent -- confirm.
    Program const *find(std::string const & name);
 private:
 // C4251 (MSVC: member needs a DLL interface) is disabled around the map member.
 DISABLE_MSVC_WARNING_C4251
    std::map<std::string, Program> cache_;
 RESTORE_MSVC_WARNING_C4251
 };
 }
 }
 #endif
--- a/include/isaac/driver/command_queue.h
+++ b/include/isaac/driver/command_queue.h
@@ -20,12 +20,10 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
-#ifndef ISAAC_DRIVER_COMMAND_QUEUE_H
+#ifndef ISAAC_DRIVER_STREAM_H
-#define ISAAC_DRIVER_COMMAND_QUEUE_H
+#define ISAAC_DRIVER_STREAM_H
 #include <map>
 #include "isaac/defines.h"
 #include "isaac/driver/common.h"
 #include "isaac/driver/context.h"
 #include "isaac/driver/device.h"
 #include "isaac/driver/handle.h"
@@ -38,40 +36,29 @@ namespace driver
 class Kernel;
 class Event;
-class NDRange;
+class Range;
 class Buffer;
 // Command Queue
-class ISAACAPI CommandQueue: public has_handle_comparators<CommandQueue>
+class Stream: public  Handle<CUstream>
 {
-public:
+  typedef Handle<CUstream> base_type;
  typedef Handle<cl_command_queue, CUstream> handle_type;
 public:
  //Constructors
-  CommandQueue(cl_command_queue const & queue, bool take_ownership = true);
+  using base_type::base_type;
-  CommandQueue(Context const & context, Device const & device, cl_command_queue_properties properties = 0);
+  Stream(Context const & context);
  //Accessors
  handle_type & handle();
  handle_type const & handle() const;
  backend_type backend() const;
  Context const & context() const;
  Device const & device() const;
  //Synchronize
  void synchronize();
  //Profiling
  void enable_profiling();
  void disable_profiling();
  //Enqueue calls
-  void enqueue(Kernel const & kernel, NDRange global, driver::NDRange local, std::vector<Event> const *, Event *event);
+  void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
  void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
  void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
 private:
  backend_type backend_;
  Context context_;
  Device device_;
  handle_type h_;
 };
--- a/include/isaac/exception/api.h
+++ b/include/isaac/exception/api.h
@@ -1,82 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_EXCEPTION_API_H
 #define ISAAC_EXCEPTION_API_H
 #include <string>
 #include <exception>
 #include "isaac/defines.h"
 namespace isaac
 {
 /** @brief Exception for the case the generator is unable to deal with the operation
  *
  *  Can be constructed without arguments or with a message string; the
  *  message is stored in message_.
  *  NOTE(review): unlike the other exception classes in this header, this
  *  class is not marked ISAACAPI -- confirm whether that is intentional.
  */
 DISABLE_MSVC_WARNING_C4275
 class operation_not_supported_exception : public std::exception
 {
 public:
  operation_not_supported_exception();
  operation_not_supported_exception(std::string message);
  // Overrides std::exception::what(); presumably returns message_ (defined out of line).
  virtual const char* what() const throw();
 private:
 DISABLE_MSVC_WARNING_C4251
  std::string message_;
 RESTORE_MSVC_WARNING_C4251
 };
 RESTORE_MSVC_WARNING_C4275
 /** @brief Exception reporting an unknown data type
  *
  *  Constructed from an int and stores a message string in message_.
  *  NOTE(review): the int is presumably the unrecognized data-type code that
  *  gets embedded in the message -- confirm against the implementation.
  */
 DISABLE_MSVC_WARNING_C4275
 class ISAACAPI unknown_datatype : public std::exception
 {
 public:
  unknown_datatype(int);
  // Overrides std::exception::what(); presumably returns message_ (defined out of line).
  virtual const char* what() const throw();
 private:
 DISABLE_MSVC_WARNING_C4251
  std::string message_;
 RESTORE_MSVC_WARNING_C4251
 };
 RESTORE_MSVC_WARNING_C4275
 /** @brief Exception reporting a semantic error
  *
  *  Constructed from a message string and stores a message in message_.
  *  NOTE(review): what() presumably returns that message -- the definition
  *  is not visible in this header.
  */
 DISABLE_MSVC_WARNING_C4275
 class ISAACAPI semantic_error : public std::exception
 {
 public:
  semantic_error(std::string const & message);
  // Overrides std::exception::what() (defined out of line).
  virtual const char* what() const throw();
 private:
 DISABLE_MSVC_WARNING_C4251
  std::string message_;
 RESTORE_MSVC_WARNING_C4251
 };
 RESTORE_MSVC_WARNING_C4275
 }
 #endif
--- a/include/isaac/exception/driver.h
+++ b/include/isaac/exception/driver.h
@@ -1,216 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_EXCEPTION_DRIVER_H
 #define ISAAC_EXCEPTION_DRIVER_H
 #include <exception>
 #include "isaac/driver/dispatch.h"
 #include "isaac/defines.h"
 DISABLE_MSVC_WARNING_C4275
 namespace isaac
 {
 namespace exception
 {
  // Exception for an architecture that is not recognized.
  // what() returns "Unrecognized architecture: " followed by the string
  // passed to the constructor.
  class ISAACAPI unknown_architecture: public std::exception
  {
    public:
      // Builds the full message once, at construction time.
      unknown_architecture(std::string const & msg)
        : msg_("Unrecognized architecture: " + msg)
      {}

      // Pointer into msg_; valid for as long as this object lives.
      const char * what() const throw()
      {
        return msg_.c_str();
      }

    private:
      std::string msg_;
  };
 // NVRTC exceptions.
 // ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) declares a class `name` deriving
 // from std::exception whose what() returns the string literal
 // "NVRTC: Error- " immediately followed by `msg`. The macro is undefined
 // again once all classes have been declared.
 namespace nvrtc
 {
 #define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
  ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory              ,"out of memory");
  ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure   ,"program creation failure");
  ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input              ,"invalid input");
  ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program            ,"invalid program");
  ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option             ,"invalid option");
  ISAAC_CREATE_NVRTC_EXCEPTION(compilation                ,"compilation");
  ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure  ,"builtin operation failure");
  ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error              ,"unknown error");
  #undef ISAAC_CREATE_NVRTC_EXCEPTION
 }
 // CUDA exceptions.
 // All classes declared here derive from cuda::base, so a handler can catch
 // any of them through that single type.
 // ISAAC_CREATE_CUDA_EXCEPTION(name, msg) declares a class `name` deriving
 // from base whose what() returns the string literal "CUDA: Error- "
 // immediately followed by `msg`. The macro is undefined again once all
 // classes have been declared.
 namespace cuda
 {
    // Common base class of every CUDA exception below.
    class base: public std::exception{};
 #define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class ISAACAPI name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_value                   ,"invalid value");
    ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory                   ,"out of memory");
    ISAAC_CREATE_CUDA_EXCEPTION(not_initialized                 ,"not initialized");
    ISAAC_CREATE_CUDA_EXCEPTION(deinitialized                   ,"deinitialized");
    ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled               ,"profiler disabled");
    ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
    ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started        ,"profiler already started");
    ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
    ISAAC_CREATE_CUDA_EXCEPTION(no_device                       ,"no device");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_device                  ,"invalid device");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_image                   ,"invalid image");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_context                 ,"invalid context");
    ISAAC_CREATE_CUDA_EXCEPTION(context_already_current         ,"context already current");
    ISAAC_CREATE_CUDA_EXCEPTION(map_failed                      ,"map failed");
    ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed                    ,"unmap failed");
    ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped                 ,"array is mapped");
    ISAAC_CREATE_CUDA_EXCEPTION(already_mapped                  ,"already mapped");
    ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
    ISAAC_CREATE_CUDA_EXCEPTION(already_acquired                ,"already acquired");
    ISAAC_CREATE_CUDA_EXCEPTION(not_mapped                      ,"not mapped");
    ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
    ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
    ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
    ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit               ,"unsupported limit");
    ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use          ,"context already in use");
    ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx                     ,"invalid ptx");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_source                  ,"invalid source");
    ISAAC_CREATE_CUDA_EXCEPTION(file_not_found                  ,"file not found");
    ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
    ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
    ISAAC_CREATE_CUDA_EXCEPTION(operating_system                ,"operating system");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle                  ,"invalid handle");
    ISAAC_CREATE_CUDA_EXCEPTION(not_found                       ,"not found");
    ISAAC_CREATE_CUDA_EXCEPTION(not_ready                       ,"not ready");
    ISAAC_CREATE_CUDA_EXCEPTION(illegal_address                 ,"illegal address");
    ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
    ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout                  ,"launch timeout");
    ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
    ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
    ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
    ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active          ,"primary context active");
    ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed            ,"context is destroyed");
    ISAAC_CREATE_CUDA_EXCEPTION(assert_error                    ,"assert");
    ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers                  ,"too many peers");
    ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
    // Message previously read "hot memory not registered" (typo).
    ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered      ,"host memory not registered");
    ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error            ,"hardware stack error");
    ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction             ,"illegal instruction");
    ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address              ,"misaligned address");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space           ,"invalid address space");
    ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc                      ,"invalid pc");
    ISAAC_CREATE_CUDA_EXCEPTION(launch_failed                   ,"launch failed");
    ISAAC_CREATE_CUDA_EXCEPTION(not_permitted                   ,"not permitted");
    ISAAC_CREATE_CUDA_EXCEPTION(not_supported                   ,"not supported");
    ISAAC_CREATE_CUDA_EXCEPTION(unknown                         ,"unknown");
    #undef ISAAC_CREATE_CUDA_EXCEPTION
 }
 // cuBLAS exceptions.
 // ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) declares a class `name` deriving
 // directly from std::exception whose what() returns the string literal
 // "CUBLAS: Error- " immediately followed by `msg`. The macro is undefined
 // again once all classes have been declared.
 namespace cublas
 {
 #define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
  ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized              ,"not initialized");
  ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed                 ,"alloc failed");
  ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value                ,"invalid value");
  ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch                ,"arch mismatch");
  ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error                ,"mapping error");
  ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed             ,"execution failed");
  ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error               ,"internal error");
  ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported                ,"not supported");
  ISAAC_CREATE_CUBLAS_EXCEPTION(license_error                ,"license error");
  ISAAC_CREATE_CUBLAS_EXCEPTION(unknown                      ,"unknown");
  #undef ISAAC_CREATE_CUBLAS_EXCEPTION
 }
 // OpenCL exceptions.
 // All classes declared here derive from ocl::base, so a handler can catch
 // any of them through that single type.
 // ISAAC_CREATE_CL_EXCEPTION(name, msg) declares a class `name` deriving
 // from base whose what() returns the string literal "OpenCL: Error- "
 // immediately followed by `msg`.
 namespace ocl
 {
    // Common base class of every OpenCL exception below.
    class ISAACAPI base: public std::exception{};
 #define ISAAC_CREATE_CL_EXCEPTION(name, msg) class ISAACAPI name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
   ISAAC_CREATE_CL_EXCEPTION(device_not_found,                  "device not found");
   ISAAC_CREATE_CL_EXCEPTION(device_not_available,              "device not available");
   ISAAC_CREATE_CL_EXCEPTION(compiler_not_available,            "compiler not available");
   ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure,     "object allocation failure");
   ISAAC_CREATE_CL_EXCEPTION(out_of_resources,                  "launch out of resources");
   ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory,                "out of host memory");
   ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available,      "profiling info not available");
   ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap,                  "mem copy overlap");
   ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch,             "image format mismatch");
   ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported,        "image format not supported");
   ISAAC_CREATE_CL_EXCEPTION(build_program_failure,             "build program failure");
   ISAAC_CREATE_CL_EXCEPTION(map_failure,                       "map failure");
   ISAAC_CREATE_CL_EXCEPTION(invalid_value,                     "invalid value");
   ISAAC_CREATE_CL_EXCEPTION(invalid_device_type,               "invalid device type");
   ISAAC_CREATE_CL_EXCEPTION(invalid_platform,                  "invalid platform");
   ISAAC_CREATE_CL_EXCEPTION(invalid_device,                    "invalid device");
   ISAAC_CREATE_CL_EXCEPTION(invalid_context,                   "invalid context");
   ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties,          "invalid queue properties");
   ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue,             "invalid command queue");
   ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr,                  "invalid host pointer");
   ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object,                "invalid mem object");
   ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor,   "invalid image format descriptor");
   ISAAC_CREATE_CL_EXCEPTION(invalid_image_size,                "invalid image size");
   ISAAC_CREATE_CL_EXCEPTION(invalid_sampler,                   "invalid sampler");
   ISAAC_CREATE_CL_EXCEPTION(invalid_binary,                    "invalid binary");
   ISAAC_CREATE_CL_EXCEPTION(invalid_build_options,             "invalid build options");
   ISAAC_CREATE_CL_EXCEPTION(invalid_program,                   "invalid program");
   ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable,        "invalid program executable");
   ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name,               "invalid kernel name");
   ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition,         "invalid kernel definition");
   ISAAC_CREATE_CL_EXCEPTION(invalid_kernel,                    "invalid kernel");
   ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index,                 "invalid arg index");
   ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value,                 "invalid arg value");
   ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size,                  "invalid arg size");
   ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args,               "invalid kernel args");
   ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension,            "invalid work dimension");
   ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size,           "invalid work group size");
   ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size,            "invalid work item size");
   ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset,             "invalid global offset");
   ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list,           "invalid event wait list");
   ISAAC_CREATE_CL_EXCEPTION(invalid_event,                     "invalid event");
   ISAAC_CREATE_CL_EXCEPTION(invalid_operation,                 "invalid operation");
   ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object,                 "invalid GL object");
   ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size,               "invalid buffer size");
   ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level,                 "invalid MIP level");
   ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size,          "invalid global work size");
 // invalid_property is only declared when the OpenCL headers define CL_INVALID_PROPERTY.
 #ifdef CL_INVALID_PROPERTY
   ISAAC_CREATE_CL_EXCEPTION(invalid_property,                  "invalid property");
 #endif
   // Keep the helper macro local to this header, as the NVRTC, CUDA and
   // cuBLAS sections do with theirs.
   #undef ISAAC_CREATE_CL_EXCEPTION
 }
 }
 }
 RESTORE_MSVC_WARNING_C4275
 #endif
--- a/include/isaac/external/half.hpp
+++ b/include/isaac/external/half.hpp
--- a/include/isaac/jit/generation/base.h
+++ b/include/isaac/jit/generation/base.h
@@ -1,118 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_TEMPLATES_base_
 #define ISAAC_TEMPLATES_base_
 #include <list>
 #include <set>
 #include <cmath>
 #include <stdint.h>
 #include "isaac/types.h"
 #include "isaac/jit/generation/engine/stream.h"
 #include "isaac/runtime/handler.h"
 #include "isaac/jit/syntax/engine/binder.h"
 #include "isaac/jit/syntax/engine/object.h"
 namespace isaac
 {
 namespace templates
 {
 //Error codes
 // TEMPLATE_VALID is 0; every other code is a distinct negative value,
 // numbered consecutively from -1 to -20.
 // NOTE(review): presumably these are the values returned by the
 // is_invalid() member functions declared below -- confirm against the
 // implementations.
 static const int TEMPLATE_VALID = 0;
 static const int TEMPLATE_LOCAL_MEMORY_OVERFLOW = -1;
 static const int TEMPLATE_WORK_GROUP_SIZE_OVERFLOW = -2;
 static const int TEMPLATE_LOCAL_SIZE_0_OVERFLOW = -3;
 static const int TEMPLATE_LOCAL_SIZE_1_OVERFLOW = -4;
 static const int TEMPLATE_LOCAL_SIZE_2_OVERFLOW = -5;
 static const int TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE = -6;
 static const int TEMPLATE_INVALID_SIMD_WIDTH = -7;
 static const int TEMPLATE_ALIGNMENT_MUST_BE_BLOCK_SIZE_MULTIPLE = -8;
 static const int TEMPLATE_INVALID_FETCHING_POLICY_TYPE= -9;
 static const int TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH = -10;
 static const int TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE = -11;
 static const int TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL = -12;
 static const int TEMPLATE_SIMD_WIDTH_MUST_BE_ONE = -13;
 static const int TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT = -14;
 static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE = -15;
 static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE = -16;
 static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE = -17;
 static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE = -18;
 static const int TEMPLATE_TEMPORARY_TOO_LARGE = -19;
 static const int TEMPLATE_BLOCK_SIZE_TOO_LARGE = -20;
 // Abstract interface of a kernel-generation template.
 // Derived classes supply the source generation (generate_impl), the input
 // sizes, the validity check, the enqueue step and the expression type.
 // Inherits std::enable_shared_from_this<base>; getptr() returns a
 // std::shared_ptr<base>.
 class base: public std::enable_shared_from_this<base>
 {
 private:
  // Customization point; receives a symbols table in addition to the
  // arguments of the public generate().
  virtual std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const = 0;
 public:
  base();
  virtual ~base();
  // Resource queries for a given expression tree; overridable, with
  // defaults defined out of line.
  virtual unsigned int temporary_workspace(expression_tree const &) const;
  virtual unsigned int lmem_usage(expression_tree const &) const;
  virtual unsigned int registers_usage(expression_tree const &) const;
  virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
  // Returns an int status for the given expressions/device pair.
  // NOTE(review): presumably one of the TEMPLATE_* codes above -- confirm.
  virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0;
  virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0;
  virtual expression_type type() const = 0;
  // Public, non-virtual entry point for source generation.
  // NOTE(review): presumably builds the symbols table and forwards to
  // generate_impl() -- the definition is not visible in this header.
  std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device);
  std::shared_ptr<base> getptr();
 };
 // Intermediate base class that overrides generate_impl() and the three
 // resource queries of `base`. input_sizes() is re-declared pure virtual,
 // and is_invalid(), enqueue() and type() are not overridden, so the class
 // remains abstract.
 // NOTE(review): the name suggests templates backed by an external library;
 // confirm against the implementation.
 class external_base: public base
 {
 private:
  virtual std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
 public:
  external_base();
  virtual unsigned int temporary_workspace(expression_tree const &) const;
  virtual unsigned int lmem_usage(expression_tree const &) const;
  virtual unsigned int registers_usage(expression_tree const &) const;
  virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
 };
 // Base class for templates described by three unsigned parameters:
 // vwidth_, ls0_ and ls1_ (kept as protected members for derived classes).
 // NOTE(review): presumably the vector width and the two local sizes --
 // confirm against the implementation.
 class parameterized_base : public base
 {
 private:
  // Overridable hook for additional validity checks. Note the argument order
  // (device first) is the reverse of the public is_invalid().
  virtual int is_invalid_impl(driver::Device const &, expression_tree const &) const;
 public:
  // The local sizes are taken as int_t but stored as unsigned int.
  parameterized_base(unsigned int _vwidth, int_t _ls0, int_t _ls1);
  unsigned int ls0() const;
  unsigned int ls1() const;
  /** @brief returns whether or not the profile has undefined behavior on particular device */
  int is_invalid(expression_tree const & expressions, driver::Device const & device) const;
 protected:
  unsigned int vwidth_;
  unsigned int ls0_;
  unsigned int ls1_;
 };
 }
 }
 #endif
--- a/include/isaac/jit/generation/elementwise_1d.h
+++ b/include/isaac/jit/generation/elementwise_1d.h
@@ -1,49 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_TEMPLATES_VAXPY_H
 #define ISAAC_BACKEND_TEMPLATES_VAXPY_H
 #include "isaac/jit/generation/base.h"
 namespace isaac
 {
 namespace templates
 {
 class elementwise_1d : public parameterized_base
 {
 private:
  std::string generate_impl(std::string const & suffix, expression_tree const  & expressions, driver::Device const & device, symbolic::symbols_table const & symbols) const;
 public:
  elementwise_1d(unsigned int vwidth, unsigned int ls, unsigned int ng);
  std::vector<int_t> input_sizes(expression_tree const  & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
  expression_type type() const;
 private:
  unsigned int ng_;
 };
 }
 }
 #endif
--- a/include/isaac/jit/generation/elementwise_2d.h
+++ b/include/isaac/jit/generation/elementwise_2d.h
@@ -1,52 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_TEMPLATES_MAXPY_H
 #define ISAAC_BACKEND_TEMPLATES_MAXPY_H
 #include <vector>
 #include "isaac/jit/generation/base.h"
 namespace isaac
 {
 namespace templates
 {
 class elementwise_2d : public parameterized_base
 {
 private:
  int is_invalid_impl(driver::Device const &, expression_tree const  &) const;
  std::string generate_impl(std::string const & suffix, expression_tree const  & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
 public:
  elementwise_2d(unsigned int vwidth, unsigned int ls0, unsigned int ls1,  unsigned int ng0, unsigned int ng1);
  std::vector<int_t> input_sizes(expression_tree const  & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
  expression_type type() const;
 private:
  unsigned int ng0_;
  unsigned int ng1_;
 };
 }
 }
 #endif
--- a/include/isaac/jit/generation/engine/keywords.h
+++ b/include/isaac/jit/generation/engine/keywords.h
@@ -1,98 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_KEYWORDS_H
 #define ISAAC_BACKEND_KEYWORDS_H
 #include <stdexcept>
 #include <string>
 #include "isaac/driver/common.h"
 #include "isaac/driver/device.h"
 namespace isaac
 {
 /**
  * @brief Holds a backend tag together with an OpenCL spelling and a CUDA spelling.
  *
  * get() hands back one string by const reference; the selection logic is in
  * the out-of-line definition (presumably keyed on backend_ -- confirm there).
  */
 class keyword
 {
 public:
  keyword(driver::backend_type backend,
          const std::string & opencl,
          const std::string & cuda);
  const std::string & get() const;
 private:
  driver::backend_type backend_;
  std::string opencl_;
  std::string cuda_;
 };
 /**
  * @brief Returns the string "int" for devices whose backend is CUDA or OpenCL.
  *
  * @param device Device whose backend() selects the branch.
  * @return "int" for driver::CUDA and driver::OPENCL.
  * @throws std::invalid_argument for any other backend value.
  */
 static inline std::string size_type(driver::Device const & device)
 {
  switch(device.backend())
  {
    case driver::CUDA:
    case driver::OPENCL:
      return "int";
    default:
      // A bare `throw;` with no exception in flight calls std::terminate();
      // raise a catchable error instead so callers can report the problem.
      throw std::invalid_argument("isaac::size_type: unsupported backend");
  }
 }
 // Stream insertion for keyword objects; declared here, defined out of line.
 std::ostream &  operator<<(std::ostream & ss, keyword const & kw);
 // ADD_KEYWORD(NAME, OCLKW, CUDAKW) declares a class NAME deriving from keyword whose
 // constructor forwards the backend plus the OpenCL spelling OCLKW and the CUDA spelling CUDAKW.
 // The macro body already ends with ';', so the invocations below carry none.
 #define ADD_KEYWORD(NAME, OCLKW, CUDAKW) class NAME : public keyword { public: NAME(driver::backend_type backend) : keyword(backend, OCLKW, CUDAKW){} };
 // Kernel entry-point qualifier and address-space qualifiers.
 ADD_KEYWORD(KernelPrefix, "__kernel", "extern \"C\" __global__")
 ADD_KEYWORD(Local, "__local", "__shared__")
 ADD_KEYWORD(Global, "__global", "")
 ADD_KEYWORD(LocalPtr, "__local", "")
 // Global work-item index / global size, per dimension.
 ADD_KEYWORD(GlobalIdx0, "get_global_id(0)", "(blockIdx.x*blockDim.x + threadIdx.x)")
 ADD_KEYWORD(GlobalIdx1, "get_global_id(1)", "(blockIdx.y*blockDim.y + threadIdx.y)")
 ADD_KEYWORD(GlobalIdx2, "get_global_id(2)", "(blockIdx.z*blockDim.z + threadIdx.z)")
 ADD_KEYWORD(GlobalSize0, "get_global_size(0)", "(blockDim.x*gridDim.x)")
 ADD_KEYWORD(GlobalSize1, "get_global_size(1)", "(blockDim.y*gridDim.y)")
 ADD_KEYWORD(GlobalSize2, "get_global_size(2)", "(blockDim.z*gridDim.z)")
 // Index inside the work-group / block, and work-group / block size.
 ADD_KEYWORD(LocalIdx0, "get_local_id(0)", "threadIdx.x")
 ADD_KEYWORD(LocalIdx1, "get_local_id(1)", "threadIdx.y")
 ADD_KEYWORD(LocalIdx2, "get_local_id(2)", "threadIdx.z")
 ADD_KEYWORD(LocalSize0, "get_local_size(0)", "blockDim.x")
 ADD_KEYWORD(LocalSize1, "get_local_size(1)", "blockDim.y")
 ADD_KEYWORD(LocalSize2, "get_local_size(2)", "blockDim.z")
 // Work-group / block index.
 ADD_KEYWORD(GroupIdx0, "get_group_id(0)", "blockIdx.x")
 ADD_KEYWORD(GroupIdx1, "get_group_id(1)", "blockIdx.y")
 ADD_KEYWORD(GroupIdx2, "get_group_id(2)", "blockIdx.z")
 // Number of work-groups / blocks. The OpenCL built-in is get_num_groups() and the CUDA
 // built-in is gridDim (lower-case 'g'); the earlier "get_ng"/"GridDim" spellings name
 // neither and would not compile in a generated kernel.
 ADD_KEYWORD(GroupSize0, "get_num_groups(0)", "gridDim.x")
 ADD_KEYWORD(GroupSize1, "get_num_groups(1)", "gridDim.y")
 ADD_KEYWORD(GroupSize2, "get_num_groups(2)", "gridDim.z")
 // Work-group / block level barrier.
 ADD_KEYWORD(LocalBarrier, "barrier(CLK_LOCAL_MEM_FENCE)", "__syncthreads()")
 // Keyword whose OpenCL spelling is "convert_<datatype>" and whose CUDA spelling is "make_<datatype>".
 struct CastPrefix : public keyword
 {
  CastPrefix(driver::backend_type backend, std::string const & datatype)
    : keyword(backend, "convert_" + datatype, "make_" + datatype)
  {}
 };
 // Keyword whose OpenCL spelling is empty and whose CUDA spelling is "make_<datatype>".
 struct InitPrefix : public keyword
 {
  InitPrefix(driver::backend_type backend, std::string const & datatype)
    : keyword(backend, "", "make_" + datatype)
  {}
 };
 // Keyword whose OpenCL spelling is "INFINITY" and whose CUDA spelling is "infinity<datatype>()".
 struct Infinity : public keyword
 {
  Infinity(driver::backend_type backend, std::string const & datatype)
    : keyword(backend, "INFINITY", "infinity<" + datatype + ">()")
  {}
 };
 // Conditional expression: OpenCL gets "select(else,if,cond)", CUDA gets "(cond)?if:else".
 struct Select : public keyword
 {
  Select(driver::backend_type backend, std::string cond, std::string if_value, std::string else_value)
    : keyword(backend,
              "select(" + else_value + "," + if_value + "," + cond + ")",
              "(" + cond + ")?" + if_value + ":" + else_value)
  {}
 };
 #undef ADD_KEYWORD
 }
 #endif
--- a/include/isaac/jit/generation/engine/stream.h
+++ b/include/isaac/jit/generation/engine/stream.h
@@ -1,62 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_STREAM_H
 #define ISAAC_BACKEND_STREAM_H
 #include <sstream>
 #include "isaac/driver/common.h"
 namespace isaac
 {
 /**
  * @brief std::ostream subclass that owns an std::ostringstream, a tab counter and a backend tag.
  *
  * The nested kgenstream is an std::stringbuf that keeps references to an
  * output stream and to a tab counter. All member functions are defined
  * outside this header.
  */
 class kernel_generation_stream : public std::ostream
 {
 private:
  // Stream buffer; note that it stores references, not copies.
  class kgenstream : public std::stringbuf
  {
  public:
    kgenstream(std::ostringstream & oss, const unsigned int & tab_count);
    int sync();
    ~kgenstream();
  private:
    std::ostream & oss_;
    const unsigned int & tab_count_;
  };
  void process(std::string & str);
 public:
  kernel_generation_stream(driver::backend_type backend);
  ~kernel_generation_stream();
  std::string str();
  // Declared as a pair; presumably they raise / lower tab_count_ (see the .cpp).
  void inc_tab();
  void dec_tab();
 private:
  unsigned int tab_count_;
  driver::backend_type backend_;
  std::ostringstream oss;
 };
 }
 #endif
--- a/include/isaac/jit/generation/gemm.h
+++ b/include/isaac/jit/generation/gemm.h
@@ -1,155 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_TEMPLATES_MPRODUCT_H
 #define ISAAC_BACKEND_TEMPLATES_MPRODUCT_H
 #include "isaac/jit/generation/base.h"
 #include "isaac/jit/syntax/expression/expression.h"
 #include "isaac/jit/syntax/expression/preset.h"
 namespace isaac
 {
 namespace templates
 {
 class cublas_gemm : public external_base
 {
  bool init();
 public:
  cublas_gemm(char A_trans, char B_trans);
  int is_invalid(expression_tree const  &, driver::Device const &) const;
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
  expression_type type() const;
 private:
  const char A_trans_;
  const char B_trans_;
  bool init_;
 };
 class intelblas_gemm : public external_base
 {
  bool init();
 public:
  intelblas_gemm(char A_trans, char B_trans);
  int is_invalid(expression_tree const  &, driver::Device const &) const;
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
  expression_type type() const;
 private:
  std::string generate_impl(std::string const & suffix, expression_tree const &, driver::Device const & device, symbolic::symbols_table const &) const;
  const char A_trans_;
  const char B_trans_;
  bool init_;
 };
 class intelblas_gemm_image : public external_base
 {
  bool init();
 public:
  intelblas_gemm_image(char A_trans, char B_trans);
  int is_invalid(expression_tree const  &, driver::Device const &) const;
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
  expression_type type() const;
 private:
  std::string generate_impl(std::string const & suffix, expression_tree const &, driver::Device const & device, symbolic::symbols_table const &) const;
  const char A_trans_;
  const char B_trans_;
  bool init_;
 };
 class gemm : public parameterized_base
 {
 private:
  unsigned int temporary_workspace(expression_tree const & expressions) const;
  unsigned int lmem_usage(expression_tree const & expressions) const;
  unsigned int registers_usage(expression_tree const & expressions) const;
  int is_invalid_impl(driver::Device const &, expression_tree const &) const;
  std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const &) const;
  void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, const expression_tree::node &A, const expression_tree::node &B, const expression_tree::node &C,
                     value_scalar const &alpha, value_scalar const &beta, driver::Program const & program, std::string const & suffix, runtime::execution_options_type const & options);
 public:
  gemm(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
       , int_t ms, int_t ks, int_t ns, int_t lf0, int_t lf1
       , char A_trans, char B_trans);
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & h);
  expression_type type() const;
 private:
  //Parameters
  unsigned int mL_;
  unsigned int kL_;
  unsigned int nL_;
  unsigned int depth_;
  unsigned int mS_;
  unsigned int kS_;
  unsigned int nS_;
  unsigned int lf0_;
  unsigned int lf1_;
  bool prefetch_;
  bool unroll_outer_;
  //
  const char A_trans_;
  const char B_trans_;
  expression_type type_;
 };
 // gemm subclass that only adds a constructor; it takes the same arguments
 // as gemm minus the two transposition characters.
 class gemm_nn : public gemm
 {
 public:
  gemm_nn(unsigned int vwidth,
          int_t ls0, int_t KL, int_t ls1, int_t D,
          int_t ms, int_t ks, int_t ns,
          int_t lf0, int_t lf1);
 };
 // gemm subclass that only adds a constructor; it takes the same arguments
 // as gemm minus the two transposition characters.
 class gemm_tn : public gemm
 {
 public:
  gemm_tn(unsigned int vwidth,
          int_t ls0, int_t KL, int_t ls1, int_t D,
          int_t ms, int_t ks, int_t ns,
          int_t lf0, int_t lf1);
 };
 // gemm subclass that only adds a constructor; it takes the same arguments
 // as gemm minus the two transposition characters.
 class gemm_nt : public gemm
 {
 public:
  gemm_nt(unsigned int vwidth,
          int_t ls0, int_t KL, int_t ls1, int_t D,
          int_t ms, int_t ks, int_t ns,
          int_t lf0, int_t lf1);
 };
 // gemm subclass that only adds a constructor; it takes the same arguments
 // as gemm minus the two transposition characters.
 class gemm_tt : public gemm
 {
 public:
  gemm_tt(unsigned int vwidth,
          int_t ls0, int_t KL, int_t ls1, int_t D,
          int_t ms, int_t ks, int_t ns,
          int_t lf0, int_t lf1);
 };
 }
 }
 #endif
--- a/include/isaac/jit/generation/reduce_1d.h
+++ b/include/isaac/jit/generation/reduce_1d.h
@@ -1,57 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_TEMPLATES_DOT_H
 #define ISAAC_BACKEND_TEMPLATES_DOT_H
 #include "isaac/jit/generation/base.h"
 namespace isaac
 {
 namespace templates
 {
 class reduce_1d : public parameterized_base
 {
 private:
  unsigned int lmem_usage(expression_tree const  & expressions) const;
  unsigned int temporary_workspace(expression_tree const & expressions) const;
  inline void reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<symbolic::reduce_1d*> exprs,
                                     std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const;
  std::string generate_impl(std::string const & suffix,  expression_tree const  & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
 public:
  reduce_1d(unsigned int vwidth, unsigned int ls, unsigned int ng);
  std::vector<int_t> input_sizes(expression_tree const  & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
  expression_type type() const;
 private:
  unsigned int ng_;
  std::vector< driver::Buffer > tmp_;
  std::vector< driver::Buffer > tmpidx_;
 };
 }
 }
 #endif
--- a/include/isaac/jit/generation/reduce_2d.h
+++ b/include/isaac/jit/generation/reduce_2d.h
@@ -1,69 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_TEMPLATES_MDOT_H
 #define ISAAC_BACKEND_TEMPLATES_MDOT_H
 #include <vector>
 #include "isaac/jit/syntax/expression/expression.h"
 #include "isaac/jit/generation/base.h"
 namespace isaac
 {
 namespace templates
 {
 class reduce_2d : public parameterized_base
 {
 protected:
  reduce_2d(unsigned int vwidth, unsigned int ls0, unsigned int ls1, unsigned int ng0, unsigned int ng1, operation_type_family);
 private:
  unsigned int lmem_usage(expression_tree const &) const;
  unsigned int temporary_workspace(expression_tree const & expressions) const;
  std::string generate_impl(std::string const & suffix, expression_tree const &, driver::Device const & device, symbolic::symbols_table const &) const;
 public:
  virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const;
  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
  expression_type type() const;
 private:
  unsigned int ng0_;
  unsigned int ng1_;
  operation_type_family reduction_type_;
 };
 // Publicly constructible reduce_2d subclass; it adds only a constructor,
 // which takes every reduce_2d argument except the operation_type_family.
 class reduce_2d_rows : public reduce_2d
 {
 public:
  reduce_2d_rows(unsigned int vwidth,
                 unsigned int ls0, unsigned int ls1,
                 unsigned int ng0, unsigned int ng1);
 };
 // Publicly constructible reduce_2d subclass; it adds only a constructor,
 // which takes every reduce_2d argument except the operation_type_family.
 class reduce_2d_cols : public reduce_2d
 {
 public:
  reduce_2d_cols(unsigned int vwidth,
                 unsigned int ls0, unsigned int ls1,
                 unsigned int ng0, unsigned int ng1);
 };
 }
 }
 #endif
--- a/include/isaac/jit/syntax/engine/binder.h
+++ b/include/isaac/jit/syntax/engine/binder.h
@@ -1,85 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_BACKEND_BINDER_H
 #define ISAAC_BACKEND_BINDER_H
 #include <map>
 #include "isaac/driver/buffer.h"
 #include "isaac/jit/syntax/expression/expression.h"
 namespace isaac
 {
 class array_base;
 class symbolic_binder
 {
  class cmp
  {
  public:
    cmp(driver::backend_type backend) : backend_(backend) {}
    bool operator()(handle_t const & x, handle_t const & y) const
    {
      if(backend_==driver::OPENCL)
        return x.cl < y.cl;
      else
        return x.cu < y.cu;
    }
  private:
    driver::backend_type backend_;
  };
 public:
  symbolic_binder(driver::backend_type backend);
  virtual ~symbolic_binder();
  virtual bool bind(handle_t const &, bool) = 0;
  virtual unsigned int get(handle_t const &, bool) = 0;
  unsigned int get();
 protected:
  unsigned int current_arg_;
  std::map<handle_t,unsigned int, cmp> memory;
 };
 class bind_sequential : public symbolic_binder
 {
 public:
  bind_sequential(driver::backend_type backend);
  bool bind(handle_t const & a, bool);
  unsigned int get(handle_t const & a, bool);
 };
 // Concrete symbolic_binder: declares bind() and get() with the signatures
 // that are pure virtual in the base. Definitions live outside this header.
 class bind_independent : public symbolic_binder
 {
 public:
  bind_independent(driver::backend_type backend);
  bool bind(const handle_t & a, bool);
  unsigned int get(const handle_t & a, bool);
 };
 }
 #endif
--- a/include/isaac/jit/syntax/engine/macro.h
+++ b/include/isaac/jit/syntax/engine/macro.h
@@ -1,54 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_SYMBOLIC_ENGINE_MACRO_H
 #define ISAAC_SYMBOLIC_ENGINE_MACRO_H
 #include <string>
 #include <vector>
 namespace isaac
 {
 namespace symbolic
 {
 //Macro
 /**
  * @brief Object constructible from a code string (std::string or C string).
  *
  * It stores the code, a name, and two string vectors (args_ and tokens_).
  * expand() receives a string by mutable reference and returns an int;
  * operator< gives macros an ordering. Definitions live outside this header.
  */
 class macro
 {
 public:
  macro(const std::string & code);
  macro(const char * code);
  int expand(std::string & str) const;
  bool operator<(const macro & o) const;
 private:
  std::string code_;
  std::string name_;
  std::vector<std::string> args_;
  std::vector<std::string> tokens_;
 };
 }
 }
 #endif
--- a/include/isaac/jit/syntax/engine/object.h
+++ b/include/isaac/jit/syntax/engine/object.h
@@ -1,207 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_MAPPED_OBJECT_H
 #define ISAAC_MAPPED_OBJECT_H
 #include <set>
 #include <map>
 #include <string>
 #include "isaac/jit/syntax/engine/macro.h"
 #include "isaac/jit/syntax/expression/expression.h"
 #include "isaac/jit/generation/engine/stream.h"
 #include "isaac/types.h"
 namespace isaac
 {
 namespace symbolic
 {
 class object;
 typedef std::map<size_t, std::shared_ptr<object> > symbols_table;
 //Node
 class node
 {
 public:
  node(size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
  op_element op() const;
  object const * lhs() const;
  object const * rhs() const;
  size_t root() const;
 protected:
  op_element op_;
  object* lhs_;
  object* rhs_;
  size_t root_;
 };
 //Object
 class object
 {
 protected:
  void add_base(std::string const & name);
  void add_load(bool contiguous);
 public:
  object(driver::Context const & context, std::string const & scalartype, unsigned int id);
  object(driver::Context const & context, std::string const & scalartype, std::string const & name);
  virtual ~object();
  bool hasattr(std::string const & name) const;
  std::string process(std::string const & in) const;
  virtual std::string evaluate(std::map<std::string, std::string> const & table) const;
 protected:
  driver::Context const & context_;
  std::map<std::string, std::string> attributes_;
  std::set<macro> macros_;
  std::list<std::string> hierarchy_;
 };
 //Leaf
 class leaf: public object
 {
 public:
  leaf(driver::Context const & context, std::string const & scalartype, unsigned int id);
  leaf(driver::Context const & context, std::string const & scalartype, std::string const & name);
 };
 //Arithmetic node
 class arithmetic_node : public object, public node
 {
 public:
  arithmetic_node(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 protected:
  std::string op_str_;
 };
 //Binary arithmetic
 class binary_arithmetic_node: public arithmetic_node
 {
 public:
  binary_arithmetic_node(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
  std::string evaluate(std::map<std::string, std::string> const & table) const;
 };
 //Unary arithmetic
 class unary_arithmetic_node: public arithmetic_node
 {
 public:
  unary_arithmetic_node(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
  std::string evaluate(std::map<std::string, std::string> const & table) const;
 };
 //Sfor
 class sfor: public object, public node
 {
 public:
  sfor(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 //Reductions
 class reduction : public object, public node
 {
 public:
  reduction(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 class reduce_1d : public reduction
 {
 public:
  reduce_1d(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 class reduce_2d : public reduction
 {
 public:
  reduce_2d(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 //Host scalar
 class host_scalar : public leaf
 {
 public:
  host_scalar(driver::Context const & context, std::string const & scalartype, unsigned int id);
 };
 //Placeholder
 class placeholder : public leaf
 {
 public:
  placeholder(driver::Context const & context, unsigned int level);
 };
 //Arrays
 class array : public leaf
 {
 protected:
  std::string make_broadcast(tuple const & shape);
 public:
  array(driver::Context const & context, std::string const & scalartype, unsigned int id);
 };
 //Buffer
 class buffer : public array
 {
 public:
  buffer(driver::Context const & context, std::string const & scalartype, unsigned int id, tuple const & shape, tuple const &strides);
  unsigned int dim() const { return dim_; }
 private:
  std::string ld_;
  std::string start_;
  std::string stride_;
  unsigned int dim_;
 };
 //Index modifier
 class index_modifier: public array, public node
 {
 public:
  index_modifier(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 class reshape : public index_modifier
 {
 public:
  reshape(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 class trans : public index_modifier
 {
 public:
  trans(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 class diag_vector : public index_modifier
 {
 public:
  diag_vector(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
 };
 }
 }
 #endif
--- a/include/isaac/jit/syntax/engine/process.h
+++ b/include/isaac/jit/syntax/engine/process.h
@@ -1,123 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef ISAAC_SYMBOLIC_ENGINE_PROCESS
 #define ISAAC_SYMBOLIC_ENGINE_PROCESS
 #include <functional>
 #include <typeinfo>
 #include "isaac/tools/cpp/string.hpp"
 #include "isaac/jit/syntax/expression/expression.h"
 #include "isaac/jit/syntax/engine/binder.h"
 #include "isaac/jit/syntax/engine/object.h"
 #include "isaac/array.h"
 namespace isaac
 {
 namespace symbolic
 {
 //Traverse
 // Post-order walk of `tree` starting at index `root`.
 //  - For a COMPOSITE_OPERATOR_TYPE node, `recurse(root)` is consulted; when it
 //    returns true the lhs and then the rhs child are visited first.
 //  - `fun(root)` is then called for every node whose type is not INVALID_SUBTYPE.
 // `recurse` is never called for non-composite nodes.
 template<class FUN>
 inline void traverse(expression_tree const & tree, size_t root, FUN const & fun,
                     std::function<bool(size_t)> const & recurse)
 {
   expression_tree::node const & current = tree[root];
   bool const is_composite = (current.type == COMPOSITE_OPERATOR_TYPE);
   // Short-circuit keeps `recurse` from being invoked on leaves.
   if (is_composite && recurse(root))
   {
     traverse(tree, current.binary_operator.lhs, fun, recurse);
     traverse(tree, current.binary_operator.rhs, fun, recurse);
   }
   // Invalid nodes are skipped by the visitor.
   if (current.type == INVALID_SUBTYPE)
     return;
   fun(root);
 }
 // Convenience overload: walks the sub-tree under `root` and always descends
 // into composite nodes (the recursion predicate is constantly true).
 template<class FUN>
 inline void traverse(expression_tree const & tree, size_t root, FUN const & fun)
 {
   std::function<bool(size_t)> const always_descend = [](size_t){ return true; };
   traverse(tree, root, fun, always_descend);
 }
 // Convenience overload: walks the whole tree, starting from tree.root().
 template<class FUN>
 inline void traverse(expression_tree const & tree, FUN const & fun)
 {
   size_t const start = tree.root();
   traverse(tree, start, fun);
 }
 //Extract symbolic types
 // Walks the sub-tree under `root` and appends to `result` every symbol in
 // `table` that dynamic_casts to T and has not been collected before.
 //  - `processed` holds the strings returned by obj->process("#name") for the
 //    symbols already collected; a symbol is appended only if its string is new.
 //  - When `array_recurse` is false, the walk does not descend below composite
 //    nodes whose symbol is an index_modifier (the node itself is still visited).
 // Note: the recursion predicate uses table.at(), which throws if a composite
 // node has no entry in `table` while `array_recurse` is false.
 template<class T>
 inline void extract(expression_tree const & tree, symbols_table const & table,
                    size_t root, std::set<std::string>& processed, std::vector<T*>& result, bool array_recurse = true)
 {
   auto collect = [&](size_t index)
   {
     symbols_table::const_iterator found = table.find(index);
     if(found == table.end())
       return;
     T* candidate = dynamic_cast<T*>(&*found->second);
     if(!candidate)
       return;
     // insert().second is true only for a name not seen before
     if(processed.insert(candidate->process("#name")).second)
       result.push_back(candidate);
   };
   auto descend = [&](size_t index) -> bool
   {
     if(array_recurse)
       return true;
     return dynamic_cast<index_modifier*>(&*table.at(index)) == 0;
   };
   traverse(tree, root, collect, descend);
 }
 template<class T>
 inline std::vector<T*> extract(expression_tree const & tree, symbols_table const & table, std::vector<size_t> roots, bool array_recurse = true)
 {
  std::vector<T*> result;
  std::set<std::string> processed;
  for(size_t root: roots)
     extract(tree, table, root, processed, result, array_recurse);
  return result;
 }
 // Single-root convenience overload: wraps `root` in a one-element list and
 // delegates to the multi-root overload.
 template<class T>
 inline std::vector<T*> extract(expression_tree const & tree, symbols_table const & table, size_t root, bool array_recurse = true)
 {
   std::vector<size_t> const single_root(1, root);
   return extract<T>(tree, table, single_root, array_recurse);
 }
 // Whole-tree convenience overload: starts at tree.root() with the default
 // `array_recurse` of the single-root overload.
 template<class T>
 inline std::vector<T*> extract(expression_tree const & tree, symbols_table const & table)
 {
   size_t const start = tree.root();
   return extract<T>(tree, table, start);
 }
 // Filter nodes
 // NOTE(review): everything below is declared only; the descriptions are read off
 // the signatures and names and should be confirmed against the definitions.
 // Returns node indices selected by `pred`, searching from `root`.
 std::vector<size_t> find(expression_tree const & tree, size_t root, std::function<bool (expression_tree::node const &)> const & pred);
 // Same as above without an explicit start index (presumably tree.root() -- confirm).
 std::vector<size_t> find(expression_tree const & tree, std::function<bool (expression_tree::node const &)> const & pred);
 // Returns node indices; presumably those of the assignment operators in `tree` -- confirm.
 std::vector<size_t> assignments(expression_tree const & tree);
 // Map a list of node indices `in` to another list of indices; presumably the
 // lhs / rhs child of each input node -- confirm.
 std::vector<size_t> lhs_of(expression_tree const & tree, std::vector<size_t> const & in);
 std::vector<size_t> rhs_of(expression_tree const & tree, std::vector<size_t> const & in);
 // Hash
 // Produces a string from `tree`.
 std::string hash(expression_tree const & tree);
 //Set arguments
 // Takes the kernel and the argument counter by non-const reference, so both may be modified.
 void set_arguments(expression_tree const & tree, driver::Kernel & kernel, unsigned int& current_arg);
 //Symbolize
 // Builds a symbols_table from an expression tree.
 symbols_table symbolize(isaac::expression_tree const & expression);
 }
 }
 #endif
--- a/include/isaac/jit/syntax/expression/expression.h
+++ b/include/isaac/jit/syntax/expression/expression.h
@@ -1,154 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef _ISAAC_SYMBOLIC_EXPRESSION_H
 #define _ISAAC_SYMBOLIC_EXPRESSION_H
 #include <utility>
 #include <vector>
 #include <list>
 #include "isaac/driver/backend.h"
 #include "isaac/driver/context.h"
 #include "isaac/driver/command_queue.h"
 #include "isaac/driver/event.h"
 #include "isaac/driver/kernel.h"
 #include "isaac/driver/ndrange.h"
 #include "isaac/driver/buffer.h"
 #include "isaac/jit/syntax/expression/operations.h"
 #include "isaac/tools/cpp/tuple.hpp"
 #include "isaac/types.h"
 #include "isaac/value_scalar.h"
 #include <memory>
 #include <iostream>
 namespace isaac
 {
 // Forward declaration; only pointers/references to array_base are used in this header.
 class array_base;
 // Empty tag type accepted by the expression_tree::node(invalid_node) constructor.
 struct invalid_node{};
 // Discriminator stored in expression_tree::node::type.
 // Only the first enumerator has an explicit value (0); the others follow
 // implicitly, so reordering or inserting entries changes their numeric values.
 enum node_type
 {
  INVALID_SUBTYPE = 0,
  COMPOSITE_OPERATOR_TYPE,
  VALUE_SCALAR_TYPE,
  DENSE_ARRAY_TYPE,
 };
 // Untagged union of the two device handle types (cl_mem and CUdeviceptr).
 // Nothing in the union records which member is active.
 // NOTE(review): presumably the owning context's backend decides -- confirm.
 union handle_t
 {
  cl_mem cl;
  CUdeviceptr cu;
 };
 // Payload of the array member in expression_tree::node's union.
 // `base` is a raw, non-owning-looking pointer.
 // NOTE(review): lifetime/ownership of `base` and the unit of `start` are not visible here -- confirm.
 struct array_holder
 {
  int_t start;
  handle_t handle;
  array_base* base;
 };
 // Expression tree stored as a flat std::vector of nodes (`data_type`).
 // Operator nodes name their children by integer index into that vector
 // (node::binary_operator.lhs / .rhs), and root() gives the index of the top node.
 class expression_tree
 {
 public:
  // One entry of the tree. The fields under "Common" are always present; which
  // union member is meaningful is expected to follow `type`
  // (traverse() in process.h reads binary_operator only for COMPOSITE_OPERATOR_TYPE).
  struct node
  {
    //Constructors
    node();
    node(invalid_node);
    node(value_scalar const & x);
    node(array_base const & x);
    node(int_t lhs, op_element op, int_t rhs, numeric_type dtype, tuple const & shape);
    //Common
    node_type type;
    numeric_type dtype;
    tuple shape;
    tuple ld;
    //Type-specific
    union
    {
      //Operator
      struct{
        int_t lhs;
        op_element op;
        int_t rhs;
      }binary_operator;
      //Scalar
      values_holder scalar;
      //Array
      array_holder array;
    };
  };
  typedef std::vector<node>     data_type;
 public:
  // Constructors: combine two operands (each either a single node or a whole
  // tree) under operator `op`. `context` is passed as a pointer and stored in context_.
  // NOTE(review): presumably the caller keeps the context alive -- confirm.
  expression_tree(node const & lhs, node const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
  expression_tree(expression_tree const & lhs, node const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
  expression_tree(node const & lhs, expression_tree const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
  expression_tree(expression_tree const & lhs, expression_tree const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
  // Accessors (declared only).
  tuple shape() const;
  int_t dim() const;
  data_type const & data() const;
  std::size_t root() const;
  driver::Context const & context() const;
  numeric_type const & dtype() const;
  // Node access by index into the flat node vector.
  node const & operator[](size_t) const;
  node & operator[](size_t);
  // Unary operators returning a new expression_tree; note they are non-const members.
  expression_tree operator-();
  expression_tree operator!();
 private:
  data_type tree_;
  std::size_t root_;
  driver::Context const * context_;
 };
 // Pass-through overload, selected for non-arithmetic T: hands back the very
 // same object by const reference (no copy is made).
 template<class T>
 typename std::enable_if<!std::is_arithmetic<T>::value, T const &>::type
 wrap_generic(T const & x)
 {
   return x;
 }
 // Arithmetic overload: the value is converted to a value_scalar, returned by value.
 template<class T>
 typename std::enable_if<std::is_arithmetic<T>::value, value_scalar>::type
 wrap_generic(T x)
 {
   return value_scalar(x);
 }
 // Terminal case of make_tuple: a single element is simply wrapped.
 // Arithmetic T yields a value_scalar by value; any other T is returned as the
 // same object by const reference. The context argument is unused here.
 template<typename T>
 ISAACAPI
 typename std::conditional<std::is_arithmetic<T>::value, value_scalar, T const &>::type
 make_tuple(driver::Context const &, T const & x)
 {
   return wrap_generic(x);
 }
 // Recursive case of make_tuple: pairs the first element with the tuple built
 // from the remaining ones, giving a right-nested chain of PAIR_TYPE nodes.
 // The resulting tree takes its dtype from the first element and has shape {1}.
 template<typename T, typename... Args>
 ISAACAPI expression_tree make_tuple(driver::Context const & context, T const & x, Args... args)
 {
   op_element const pair_op(BINARY_ARITHMETIC, PAIR_TYPE);
   return expression_tree(wrap_generic(x),
                          make_tuple(context, args...),
                          pair_op,
                          &context,
                          numeric_type_of(x),
                          {1});
 }
 //io
 // String conversion and stream output helpers (declared only; the exact text
 // format is determined by the definitions, which are not visible here).
 // Converts a node_type enumerator to a string.
 std::string to_string(node_type const & f);
 // Converts a single tree node to a string.
 std::string to_string(expression_tree::node const & e);
 // Writes a single tree node to `os`.
 std::ostream & operator<<(std::ostream & os, expression_tree::node const & s_node);
 // Converts a whole expression tree to a string.
 std::string to_string(isaac::expression_tree const & s);
 }
 #endif
--- a/include/isaac/jit/syntax/expression/operations.h
+++ b/include/isaac/jit/syntax/expression/operations.h
@@ -1,157 +0,0 @@
 /* Copyright 2015-2017 Philippe Tillet
 * 
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files 
 * (the "Software"), to deal in the Software without restriction, 
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 #ifndef _ISAAC_SYMBOLIC_OPERATIONS_H
 #define _ISAAC_SYMBOLIC_OPERATIONS_H
 #include <string>
 namespace isaac
 {
 /** @brief Coarse family of an operation (element-wise unary/binary, reductions, GEMM), kept alongside the
  *         precise operation_type in op_element. Per the original note, this exists to make lookups cheaper.
  *
  *  Only INVALID_ has an explicit value (0); the rest follow implicitly, so reordering changes their values.
  */
 enum operation_type_family
 {
  INVALID_ = 0,
  // BLAS1-type
  UNARY_ARITHMETIC,
  BINARY_ARITHMETIC,
  REDUCE,
  // BLAS2-type
  REDUCE_ROWS,
  REDUCE_COLUMNS,
  // BLAS3-type
  GEMM
 };
 /** @brief Enumeration for identifying the possible operations
  *
  *  Only INVALID_TYPE has an explicit value (0); every other enumerator is
  *  numbered implicitly, so inserting or reordering entries changes the values
  *  of all enumerators that follow.
  *  NOTE(review): the is_*() helpers declared in this header may rely on this grouping -- confirm before editing.
  */
 enum operation_type
 {
  INVALID_TYPE = 0,
  // unary operator
  MINUS_TYPE,
  NEGATE_TYPE,
  // unary expression
  // casts
  CAST_BOOL_TYPE,
  CAST_CHAR_TYPE,
  CAST_UCHAR_TYPE,
  CAST_SHORT_TYPE,
  CAST_USHORT_TYPE,
  CAST_INT_TYPE,
  CAST_UINT_TYPE,
  CAST_LONG_TYPE,
  CAST_ULONG_TYPE,
  CAST_HALF_TYPE,
  CAST_FLOAT_TYPE,
  CAST_DOUBLE_TYPE,
  // element-wise math functions
  ABS_TYPE,
  ACOS_TYPE,
  ASIN_TYPE,
  ATAN_TYPE,
  CEIL_TYPE,
  COS_TYPE,
  COSH_TYPE,
  EXP_TYPE,
  FABS_TYPE,
  FLOOR_TYPE,
  LOG_TYPE,
  LOG10_TYPE,
  SIN_TYPE,
  SINH_TYPE,
  SQRT_TYPE,
  TAN_TYPE,
  TANH_TYPE,
  TRANS_TYPE,
  // binary expression
  // assignment forms
  ASSIGN_TYPE,
  INPLACE_ADD_TYPE,
  INPLACE_SUB_TYPE,
  // arithmetic operators
  ADD_TYPE,
  SUB_TYPE,
  MULT_TYPE,
  DIV_TYPE,
  // element-wise binary operations
  ELEMENT_ARGFMAX_TYPE,
  ELEMENT_ARGFMIN_TYPE,
  ELEMENT_ARGMAX_TYPE,
  ELEMENT_ARGMIN_TYPE,
  ELEMENT_PROD_TYPE,
  ELEMENT_DIV_TYPE,
  ELEMENT_EQ_TYPE,
  ELEMENT_NEQ_TYPE,
  ELEMENT_GREATER_TYPE,
  ELEMENT_GEQ_TYPE,
  ELEMENT_LESS_TYPE,
  ELEMENT_LEQ_TYPE,
  ELEMENT_POW_TYPE,
  ELEMENT_FMAX_TYPE,
  ELEMENT_FMIN_TYPE,
  ELEMENT_MAX_TYPE,
  ELEMENT_MIN_TYPE,
  //Products
  OUTER_PROD_TYPE,
  GEMM_NN_TYPE,
  GEMM_TN_TYPE,
  GEMM_NT_TYPE,
  GEMM_TT_TYPE,
  //Access modifiers
  RESHAPE_TYPE,
  SHIFT_TYPE,
  DIAG_MATRIX_TYPE,
  DIAG_VECTOR_TYPE,
  ACCESS_INDEX_TYPE,
  // PAIR_TYPE is the operator used by make_tuple() in expression.h to chain elements
  PAIR_TYPE,
  OPERATOR_FUSE,
  SFOR_TYPE,
 };
 // Pairs an operation with the family it belongs to. Plain aggregate of two
 // enums with a default constructor and a two-argument constructor (both declared only).
 struct op_element
 {
  op_element();
  op_element(operation_type_family const & _type_family, operation_type const & _type);
  operation_type_family   type_family;
  operation_type          type;
 };
 // Converts an operation_type enumerator to a string (format defined by the implementation).
 std::string to_string(operation_type type);
 // Classification predicates over operation_type (declared only).
 // NOTE(review): which enumerators each predicate accepts is decided in the
 // definitions -- check them before adding new operation_type entries.
 bool is_assignment(operation_type op);
 bool is_operator(operation_type op);
 bool is_function(operation_type op);
 bool is_cast(operation_type op);
 bool is_indexing(operation_type op);
 }
 #endif
--- a/Show More
+++ b/Show More
		`@@ -1 +0,0 @@`
			`cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/android.cmake -DANDROID_NDK=/opt/android-ndk-r10d/ -DANDROID_ABI="armeabi-v7a with NEON" -DANDROID_NATIVE_API_LEVEL=19 -DANDROID_APK_API_LEVEL=19 -DANDROID_APK_RUN=1 ../`