[GENERAL] Deleted ISAAC Files

Philippe Tillet
2020-02-06 00:48:45 -05:00
parent 77c6b750bc
commit fa29e63838
158 changed files with 0 additions and 110955 deletions


@@ -1,29 +0,0 @@
cmake_minimum_required(VERSION 2.8.7)
project(isaac)
include(CTest)
#Default build type
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Default build type: Release")
set(CMAKE_BUILD_TYPE "Release")
endif()
#QtCreator: add visibility of headers
file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py)
add_custom_target( ALL SOURCES ${ALL_SRC} )
#Compiler flags
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include/isaac/external/CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic -Wno-strict-aliasing")
#Source
file(GLOB_RECURSE LIBISAAC_SRC lib/*.cpp)
add_library(isaac SHARED ${LIBISAAC_SRC})
target_link_libraries(isaac "dl")
#Examples
add_subdirectory(examples)
#Tests
add_subdirectory(tests)

LICENSE

@@ -1,22 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/


@@ -1,76 +0,0 @@
# ISAAC
This is the development repository for ISAAC, an input-aware auto-tuning framework and code generator for HPC/DL. This version is only compatible with NVIDIA hardware (it generates PTX source code). For OpenCL/CUDA compatibility, see the Intel fork (https://github.com/intel/isaac) or the deprecated v1.0 branch.
### License
ISAAC is distributed under the MIT/X11 license.
### Getting started - Deep Learning Inference
Execute the following commands in a Python environment that contains a recent version of PyTorch:
```
git clone https://github.com/ptillet/isaac.git
cd isaac/python;
python setup.py build;
python setup.py install;
cd examples/pytorch;
python imagenet.py --arch resnet152 /path/to/imagenet/;
```
This should give you 78.1% accuracy and roughly a 4x speed-up over PyTorch.
### Getting started - C++ API
In order to compile and use the ISAAC C++ API, only a proprietary NVIDIA driver is necessary. No CUDA SDK is required (except for testing and benchmarking against cuBLAS/cuDNN):
```
git clone https://github.com/ptillet/isaac.git
cd isaac;
mkdir build;
cd build;
cmake ../ ; make -j8;
./examples/isaac-tools --gemm --bench --suite deepbench --dtype float32
./examples/isaac-tools --conv --bench --suite deepbench --dtype float32
```
If you want, you can also dump the PTX source code generated by ISAAC for some shapes:
```
./examples/isaac-tools --gemm --dump --format ptx --shape 2048,2048,2048 --layout NT --dtype float32
```
If you really know what you're doing, you can also capture the tiling parameters found by ISAAC:
```
./examples/isaac-tools --gemm --dump --format params --shape 2048,2048,2048 --layout NT --dtype float32
```
You will get the following output:
```
Tuning parameters: 4, 16, 8, 8, 8, 8, 16, 8, 16, 8, 1, 1, 1
```
The parameters mean, respectively:
(1) shared-memory loads have a width of **4**;
(2) each block comprises **16**x**8** threads;
(3) each thread computes a tile of **8**x**8** elements;
(4) each loop iteration processes **8** elements along the K axis;
(5) threads are rearranged as a **16**x**8** block for loading A, and a **16**x**8** block for loading B;
(6) the reduction is split across **1**, **1** and **1** independent batches within each thread, thread-block and grid respectively, and the results are accumulated after the inner loop.
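If you want to consume this output programmatically, here is a minimal C++ sketch that maps the 13 dumped values onto named fields. The struct and field names are illustrative only; they follow the description above, not ISAAC's internal identifiers.
```
#include <array>
#include <cstddef>
#include <sstream>
#include <string>

// Illustrative names for the 13 values printed by
// `isaac-tools --gemm --dump --format params`; the grouping follows the
// numbered description above, not ISAAC's internal naming.
struct GemmParams {
  std::size_t load_width;                             // (1) width of shared-memory loads
  std::size_t block_x, block_y;                       // (2) threads per block
  std::size_t tile_x, tile_y;                         // (3) elements computed per thread
  std::size_t k_unroll;                               // (4) K elements per loop iteration
  std::size_t load_a_x, load_a_y;                     // (5) thread layout for loading A...
  std::size_t load_b_x, load_b_y;                     //     ...and for loading B
  std::size_t split_thread, split_block, split_grid;  // (6) reduction splits
};

// Parses a comma-separated list such as "4, 16, 8, 8, 8, 8, 16, 8, 16, 8, 1, 1, 1".
GemmParams parse_gemm_params(const std::string& line) {
  std::array<std::size_t, 13> v{};
  std::istringstream iss(line);
  std::string token;
  for (std::size_t i = 0; i < v.size() && std::getline(iss, token, ','); ++i)
    v[i] = std::stoul(token);
  return {v[0], v[1], v[2], v[3], v[4], v[5], v[6],
          v[7], v[8], v[9], v[10], v[11], v[12]};
}
```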
### Benchmarks - C++ API
ISAAC often provides performance competitive with vendor libraries (cuBLAS, cuDNN), as illustrated by the benchmarks below.
Tesla P100 - SGEMM:
![sgemm-gv100](https://github.com/ptillet/isaac/blob/master/documentation/bench/gv100/sgemm.png?raw=true)
Tesla P100 - DGEMM:
![dgemm-gv100](https://github.com/ptillet/isaac/blob/master/documentation/bench/gv100/dgemm.png?raw=true)
Tesla P100 - SCONV (vs cuDNN's IMPLICIT_PRECOMP_GEMM):
![sconv-gv100](https://github.com/ptillet/isaac/blob/master/documentation/bench/gv100/sconv.png?raw=true)
### Acknowledgments
This work was partially supported by the National Science Foundation (IIS 1409097) and by IARPA (contract D16PC00002).


@@ -1,7 +0,0 @@
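# Prepend the LICENSE header (../LICENSE) to every tracked source file that does not already contain a Copyright notice.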
for i in $(find ../lib/ ../include/isaac/ ../python/src/bind -name '*.cpp' -or -name '*.hpp' -or -name '*.h' | grep -v "../lib/external" | grep -v "../include/isaac/driver/external/");
do
if ! grep -q Copyright $i
then
cat ../LICENSE $i >$i.new && mv $i.new $i
fi
done

Binary files not shown (deleted binary files, including benchmark images of 55-62 KiB).

@@ -1,6 +0,0 @@
foreach(PROG isaac-tools)
add_executable(${PROG} ${PROG}.cpp)
set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG})
include_directories(/usr/local/cuda/include/)
target_link_libraries(${PROG} isaac)
endforeach(PROG)


@@ -1,649 +0,0 @@
#include "opts.hpp"
#include "isaac/scalar.h"
#include "isaac/api.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/context.h"
#include "isaac/driver/stream.h"
#include "isaac/runtime/predict.h"
#include "isaac/templates/gemm.h"
#include "isaac/templates/error.hpp"
#include "isaac/tools/bench.hpp"
namespace sc = isaac;
namespace drv = sc::driver;
using sc::param_t;
enum Code {
RESET = 0,
BOLD = 1,
ITALIC = 3,
FG_RED = 31,
FG_GREEN = 32,
FG_YELLOW = 33,
FG_BLUE = 34,
FG_MAGENTA = 35,
FG_CYAN = 36,
FG_LIGHT_GRAY = 37,
FG_DARK_GRAY = 90,
FG_LIGHT_RED = 91,
FG_LIGHT_GREEN = 92,
FG_LIGHT_YELLOW = 93,
FG_LIGHT_BLUE = 94,
FG_LIGHT_MAGENTA = 95,
FG_LIGHT_CYAN = 96,
FG_WHITE = 97
};
class color_stream {
Code code;
public:
color_stream(Code pCode) : code(pCode) {}
friend std::ostream&
operator<<(std::ostream& os, const color_stream& mod) {
return os << "\033[" << mod.code << "m";
}
};
/* Helpers for benchmarking */
typedef std::tuple<sc::DType, sc::IsaacOperation_t, sc::IsaacOperation_t, sc::param_t, sc::param_t, sc::param_t> gemm_params_t;
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_params_t;
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> pool_params_t;
struct SC17{
// GEMM
static std::vector<gemm_params_t> gemm(sc::DType dtype){
std::vector<gemm_params_t> shapes;
// LinPack
for(param_t N: std::vector<param_t>{512, 1024, 2048})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
// DeepBench
for(sc::IsaacOperation_t AT: std::vector<sc::IsaacOperation_t>{sc::ISAAC_OP_N, sc::ISAAC_OP_T})
for(param_t M: std::vector<param_t>{1760})
for(param_t N: std::vector<param_t>{16, 32, 64, 128})
shapes.push_back(std::make_tuple(dtype, AT, sc::ISAAC_OP_N, M, N, M));
// PCA/ICA
for(param_t N: std::vector<param_t>{16, 64, 256})
for(param_t K: std::vector<param_t>{64000})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
// LaPACK
for(param_t N: std::vector<param_t>{1024, 2048, 4096})
for(param_t K: std::vector<param_t>{32})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
return shapes;
}
// CONV
static std::vector<conv_params_t> conv(sc::DType dtype){
// Vector of (dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)
std::vector<conv_params_t> shapes;
// // DeepSpeech
// for(size_t N: std::vector<size_t>{8})
// shapes.push_back(std::make_tuple(dtype, 1, 700, 161, 1, N, 32, 1, 5, 20, 0, 0, 0, 1, 1, 1));
// for(size_t N: std::vector<size_t>{8})
// shapes.push_back(std::make_tuple(dtype, 1, 341, 79, 32, N, 32, 1, 5, 10, 0, 0, 0, 1, 1, 1));
// // OCR
// shapes.push_back(std::make_tuple(dtype, 1, 480, 48, 1, 16, 16, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 240, 24, 16, 16, 32, 1, 3, 3, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 120, 12, 32, 16, 64, 1, 3, 3, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 60, 6, 64, 16, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// // Face Recognition
// shapes.push_back(std::make_tuple(dtype, 1, 108, 108, 3, 8, 64, 1, 3, 3, 0, 1, 1, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 54, 54, 64, 8, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 27, 27, 128, 8, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 128, 8, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 256, 8, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// // Vision
// for(size_t N: std::vector<size_t>{8}){
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, N, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 128, N, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 256, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// }
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, 16, 64, 1, 7, 7, 0, 3, 3, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 32, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 48, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 192, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 128, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// // Speaker ID
// shapes.push_back(std::make_tuple(dtype, 1, 350, 80, 64, 16, 128, 1, 5, 5, 0, 1, 1, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 175, 40, 128, 16, 256, 1, 5, 5, 0, 1, 1, 1, 2, 2));
// // ResNET
// for(size_t N: std::vector<size_t>{8}){
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 64, N, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 128, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 128, N, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 128, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 256, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 256, N, 1024, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 1024, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 2048, 1, 1, 1, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 1, 1, 1, 2, 2));
// }
// 3D-Unet
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 4, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 24, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 72, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 216, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 648, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 648, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 432, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 216, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 144, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 72, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 48, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 4 , 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
return shapes;
}
// POOL
static std::vector<pool_params_t> pool(sc::DType dtype){
std::vector<pool_params_t> shapes;
// 3D-Unet
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
return shapes;
}
};
/* Metrics for benchmarking */
struct Metric{
virtual std::function<bool(double, double)> cmp() const = 0;
virtual double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const = 0;
virtual double gemm(param_t M, param_t N, param_t K, double tsec) const = 0;
virtual double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t, param_t, param_t, double tsec) const = 0;
};
class FLOPS: public Metric{
public:
FLOPS(double scale): scale_(scale){}
std::function<bool(double, double)> cmp() const { return std::greater<double>(); }
double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const
{ return sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec) * 1e12 / scale_; }
double gemm(param_t M, param_t N, param_t K, double tsec) const
{ return sc::templates::GEMM::tflops(M, N, K, tsec) * 1e12 / scale_; }
double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double tsec) const
{ return sc::templates::Pool::tflops(P, Q, M, K, N, T, R, S, tsec) * 1e12 / scale_;}
private:
double scale_;
};
class Time: public Metric{
public:
Time(double scale): scale_(scale){}
std::function<bool(double, double)> cmp() const { return std::less<double>(); }
double conv(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
double gemm(param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
double pool(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
private:
double scale_;
};
void print_results_header(std::vector<std::string> sections){
std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
std::copy(sections.begin(), sections.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
std::cout << color_stream(RESET) << std::endl;
}
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<bool(double, double)> cmp, std::function<double(double)> fn){
std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
std::vector<double> perf;
std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
auto fastest = perf;
std::sort(fastest.begin(), fastest.end(), cmp);
for(auto x: perf){
if(x == fastest[0] && x / fastest[1] > 1.05)
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
else
std::cout << x;
std::cout << "\t";
}
std::cout << std::endl;
}
void benchmark_gemm(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType dtype, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, size_t M, size_t N, size_t K,
sc::templates::Generator* generator){
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
size_t dtsize = sc::size_of(dtype);
sc::scalar alpha(1., dtype);
sc::scalar beta(0., dtype);
char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
sc::driver::Buffer C(ctx, M*N*dtsize);
sc::driver::Buffer A(ctx, M*K*dtsize);
sc::driver::Buffer B(ctx, K*N*dtsize);
std::vector<double> times;
times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C, 1., 1., 1., NULL, (sc::templates::GEMM*)generator, 10); }, [&](){ stream.synchronize(); }, device));
if(sc::driver::dispatch::cublasinit()){
cublasGemmAlgo_t fastest;
sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, &fastest);
times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, NULL, fastest); }, [&](){ stream.synchronize(); }, device));
//times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device));
}
print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, metric.cmp(), [&](double tsec){ return metric.gemm(M, N, K, tsec);});
}
void benchmark_conv(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType in_dtype, sc::DType out_dtype, size_t D, size_t H, size_t W, size_t C, size_t N, size_t K, size_t T, size_t R, size_t S,
size_t pad_d, size_t pad_h, size_t pad_w,
size_t stride_d, size_t stride_h, size_t stride_w,
size_t upsample_d, size_t upsample_h, size_t upsample_w,
sc::templates::Generator* generator){
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
sc::ActivationType activation = sc::Linear;
size_t vect_c = (in_dtype==sc::INT8X4_TYPE)?4:1;
size_t vect_k = (out_dtype==sc::INT8X4_TYPE)?4:1;
sc::DType ab_dtype = (out_dtype==sc::INT8X4_TYPE)?sc::FLOAT_TYPE:out_dtype;
sc::scalar alpha(1., ab_dtype);
sc::scalar beta(0., ab_dtype);
sc::driver::Buffer O(ctx, N*K/vect_k*M*P*Q*sc::size_of(out_dtype));
sc::driver::Buffer I(ctx, C/vect_c*D*H*W*N*sc::size_of(in_dtype));
sc::driver::Buffer F(ctx, K*C/vect_c*T*R*S*sc::size_of(in_dtype));
std::vector<double> times;
times.push_back(bench([&](){ sc::CONV(device, stream, in_dtype, out_dtype, N, K, M, P, Q, C, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, I, F, &O, 1, NULL, activation, 0., 1., 1., {1.}, 1., sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, NULL, (sc::templates::Conv*)generator, 10); }, [&](){ stream.synchronize(); }, device));
// if(sc::driver::dispatch::cudnninit())
// times.push_back(bench([&](){ sc::driver::cudnnConv(out_dtype, stream, D, H, W, N, K, M, P, Q, C, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(C), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.conv(M, P, Q, K, N, C, T, R, S, tsec);});
}
void benchmark_pool(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType dtype, size_t D, size_t H, size_t W, size_t N, size_t K, size_t T, size_t R, size_t S, size_t pad_d, size_t pad_h, size_t pad_w, size_t stride_d, size_t stride_h, size_t stride_w,
sc::templates::Generator* generator){
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
size_t dtsize = sc::size_of(dtype);
sc::scalar alpha(1., dtype);
sc::scalar beta(0., dtype);
sc::driver::Buffer O(ctx, N*K*M*P*Q*dtsize);
sc::driver::Buffer I(ctx, K*D*H*W*N*dtsize);
std::vector<double> times;
times.push_back(bench([&](){ sc::POOL(device, stream, dtype, dtype, sc::MaxPool, K, M, P, Q, N, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, I, O, 1., 1., (sc::templates::Pool*)generator); }, [&](){ stream.synchronize(); }, device));
if(sc::driver::dispatch::cudnninit())
times.push_back(bench([&](){ sc::driver::cudnnPool(dtype, stream, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, beta, O); }, [&](){ stream.synchronize(); }, device));
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.pool(M, P, Q, K, N, T, R, S, tsec);});
}
/* ------------------------------- */
void loop_nest(std::vector<size_t> const & ranges, std::function<void(std::vector<size_t> const &)> const & f){
size_t D = ranges.size();
std::vector<size_t> values(D, 0);
// Start with innermost loop
size_t i = D - 1;
while(true){
//Execute function
f(values);
//Increment counters
while(values[i]++ == ranges[i] - 1){
if(i == 0)
return;
values[i--] = 0;
}
i = D - 1;
}
}
template<class T>
void loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f){
//Ranges to iterate over
std::vector<size_t> ranges;
for(auto const & x: iterates)
ranges.push_back(x.size());
//Proxy function
auto proxy = [&](std::vector<size_t> const & idx){
std::vector<T> x(iterates.size());
for(size_t i = 0; i < x.size(); ++i)
x[i] = iterates[i][idx[i]];
f(x);
};
//Iterate
loop_nest(ranges, proxy);
}
void search_conv(int32_t D, int32_t H, int32_t W,
int32_t C, int32_t N, int32_t K,
int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w,
int32_t stride_d, int32_t stride_h, int32_t stride_w,
int32_t upsample_d, int32_t upsample_h, int32_t upsample_w,
sc::ActivationType activation, sc::DType in_dtype, sc::DType out_dtype)
{
auto ctx = drv::backend::contexts::get_default();
size_t P = (H - R + 1 + 2*pad_h + stride_h - 1)/stride_h;
size_t Q = (W - S + 1 + 2*pad_w + stride_w - 1)/stride_w;
size_t M = (D - T + 1 + 2*pad_d + stride_d - 1)/stride_d;
size_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
//Setup
drv::Buffer O(ctx, K*P*Q*M*N*sc::size_of(out_dtype));
drv::Buffer I(ctx, C*H*W*D*N*sc::size_of(in_dtype));
drv::Buffer F(ctx, C*R*S*T*K*sc::size_of(in_dtype));
drv::Stream stream(ctx);
//Exhaustive search
std::vector<sc::param_t> r1 = {1};
std::vector<sc::param_t> rv = {4};
std::vector<sc::param_t> rr = {1, 2, 4, 8};
std::vector<sc::param_t> rl = {4, 8, 16, 32};
std::vector<sc::param_t> rs = {4, 8, 16};
double best = 0;
loop_nest<sc::param_t>({rv, rl, rl, rs, rs, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
sc::templates::Conv generator(in_dtype, out_dtype,
C, D, H, W, N, K, M, P, Q, T, R, S,
pad_d, pad_h, pad_w,
stride_d, stride_h, stride_w,
upsample_d, upsample_h, upsample_w,
activation, 1,
sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1,
x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]);
//Compile
try{
std::string src = generator.dump(ctx.device(), "conv");
drv::Module program(ctx, src);
drv::Kernel kernel(program, "conv");
double tsec = bench([&](){ generator.enqueue(kernel, stream, I, F, &O); }, [&](){ stream.synchronize(); }, ctx.device());
double tflops = sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec);
best = std::max(tflops, best);
std::cout << "//";
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
}catch(isaac::templates::invalid_parameters const &){
return;
}catch(drv::exception::cuda::launch_out_of_resources const &){
return;
}
});
std::cout << "ISAAC: " << best << std::endl;
}
void search_gemm(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){
auto ctx = drv::backend::contexts::get_default();
size_t dtsize = sc::size_of(dtype);
// Setup
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
int32_t offc = 0, offa = 0, offb = 0;
drv::Buffer C(ctx, M*N*dtsize);
drv::Buffer A(ctx, M*K*dtsize);
drv::Buffer B(ctx, K*N*dtsize);
drv::Stream stream(ctx);
sc::scalar alpha(1., dtype), beta(0., dtype);
// Exhaustive search
std::vector<sc::param_t> r1 = {1};
std::vector<sc::param_t> rv = {4};
std::vector<sc::param_t> rr = {1, 2, 4, 8};
std::vector<sc::param_t> rl = {4, 8, 16, 32};
std::vector<sc::param_t> rs = {4, 8, 16};
double best = 0;
loop_nest<sc::param_t>({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
isaac::templates::GEMM generator(dtype, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
// Compile
try{
std::string src = generator.dump(ctx.device(), "gemm");
drv::Module program(ctx, src);
drv::Kernel kernel(program, "gemm");
double time = bench([&](){ generator.enqueue(kernel, stream, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, ctx.device());
double tflops = 2*1e-3*M*N*K/time;
best = std::max(tflops, best);
std::cout << "//";
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
}catch(isaac::templates::invalid_parameters const &){
return;
}catch(drv::exception::cuda::launch_out_of_resources const &){
return;
}
});
std::cout << "ISAAC: " << best << std::endl;
}
/* Helpers for dumping source code */
void dump_source(sc::driver::Device const & device, sc::templates::Generator& generator, opts::Options* options, std::string const & name){
if(options->get<std::string>("format") == "ptx")
std::cout << generator.dump(device, name) << std::endl;
else{
auto x = generator.tuning_params();
std::cout << "Tuning parameters: " << std::flush;
for(size_t i = 0; i < x.size(); ++i)
std::cout << ((i>0)?", ":"") << x[i] << std::flush;
std::cout << std::endl;
}
}
/* Application code */
int main(int argc, char* argv[]){
opts::Application program("isaac-tools", "Command-line interface for ISAAC");
// Options
opts::Options* options = program.options();
options->add<size_t>("device", "Device to run on", 0);
options->add<sc::DType>("dtype", "Data-type to use for computations", "float32", {{"int8x4", sc::INT8X4_TYPE}, {"float32", sc::FLOAT_TYPE}, {"float64", sc::DOUBLE_TYPE}});
options->add<std::string>("name", "Name to give to the generated kernel", "kernel");
options->add_group("search", "Exhaustively search for best tuning parameters");
opts::Options* dump = options->add_group("dump", "Dump source-code generated by ISAAC");
dump->add("format", "Format to generate", "ptx", {"ptx", "params"});
dump->add("target", "Target GPU (sm_xx)", {"sm_50", "sm_52", "sm_60", "sm_61", "sm_70"});
opts::Options* bench = options->add_group("bench", "Benchmark source code generated by ISAAC");
bench->add("suite", "Benchmarking suite to run", "custom", {"custom", "deepbench"});
bench->add<std::shared_ptr<Metric>>("metric", "performance metric for the results", "tflops", {{"tflops", std::make_shared<FLOPS>(1e12)}, {"ms", std::make_shared<Time>(1e-3)}, {"us", std::make_shared<Time>(1e-6)}});
// Constraints
options->add_constraint(opts::OneOf({"bench", "dump", "search"}));
options->add_constraint(opts::OneOf({"gemm", "conv", "pool"}));
// GEMM
opts::Options* gemm = options->add_group("gemm", "Use matrix-multiplication");
gemm->add("layout", "Transposition layout for A and B", "NT", {"NN", "NT", "TN", "TT"});
gemm->add<std::vector<size_t>>("shape", "Matrix shapes (M,N,K)", {2048, 2048, 2048}, opts::SizeConstraint(3));
gemm->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(14));
// CONV
opts::Options* conv = options->add_group("conv", "Use convolutions");
conv->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 512, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(15));
conv->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(9));
// POOL
opts::Options* pool = options->add_group("pool", "Use pooling");
pool->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(14));
pool->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(4));
program.parse(argc, argv);
if(options->has("bench"))
std::cout << std::fixed << std::setprecision(2);
//Device
sc::driver::Device device = sc::driver::backend::devices()[options->get<size_t>("device")];
if(options->has("dump") && dump->has("target")){
std::string target = dump->get<std::string>("target");
char major = target[3];
char minor = target[4];
device.interpret_as(std::make_pair((size_t)std::atoi(&major), (size_t)std::atoi(&minor)));
}
static sc::driver::Context context(device);
sc::driver::Stream stream(context);
// Data-Type
sc::DType dtype = options->get<sc::DType>("dtype");
// Kernel name
std::string name = options->get<std::string>("name");
/* Get optimized kernel generator */
std::unique_ptr<sc::templates::Generator> generator;
// GEMM
if(options->has("gemm")){
std::string layout = gemm->get<std::string>("layout");
sc::IsaacOperation_t AT = layout[0]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
sc::IsaacOperation_t BT = layout[1]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
auto shape = gemm->get<std::vector<size_t>>("shape");
size_t M = shape[0], N = shape[1], K = shape[2];
//Get Source
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
if(options->has("search")){
search_gemm(M, N, K, AT, BT, dtype);
}
if(gemm->has("kernel")){
auto x = gemm->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::GEMM(dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]));
}
else{
sc::runtime::GEMMProfile* profile = (sc::runtime::GEMMProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::GEMM}).get();
generator.reset(new sc::templates::GEMM(profile->predict(stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc)));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"AT", "BT", "M", "N", "K", "ISAAC", "cuBLAS"});
std::vector<gemm_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, AT, BT, M, N, K)};
//SC17 paper shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::gemm(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, AT, BT, M, N, K) = x;
benchmark_gemm(*metric, context, device, stream, dtype, AT, BT, M, N, K, gemm->has("kernel")?generator.get():NULL);
}
}
}
// CONV
if(options->has("conv")){
sc::DType in_dtype = dtype;
sc::DType out_dtype = dtype;
auto x = conv->get<std::vector<size_t>>("shape");
param_t D = x[0], H = x[1], W = x[2], C = x[3], N = x[4], K = x[5], T = x[6], R = x[7], S = x[8], pad_d = x[9], pad_h = x[10], pad_w = x[11], stride_d = x[12], stride_h = x[13], stride_w = x[14];
param_t M, P, Q;
param_t upsample_d = 1, upsample_h = 1, upsample_w = 1;
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
sc::ActivationType activation = sc::Linear;
if(options->has("search"))
search_conv(D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, in_dtype, out_dtype);
if(conv->has("kernel")){
auto x = conv->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::Conv(in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]));
}
else{
sc::runtime::ConvProfile* profile = (sc::runtime::ConvProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::CONV}).get();
generator.reset(new sc::templates::Conv(profile->predict(stream, in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1)));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"N", "K", "M", "P", "Q", "C", "T", "R", "S", "ISAAC", "cuDNN"});
std::vector<conv_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
//SuperComputing17 shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::conv(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
benchmark_conv(*metric, context, device, stream, in_dtype, out_dtype, D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, conv->has("kernel")?generator.get():NULL);
}
}
}
// POOL
if(options->has("pool")){
auto x = pool->get<std::vector<size_t>>("shape");
param_t D = x[0], W = x[1], H = x[2], N = x[3], K = x[4], T = x[5], R = x[6], S = x[7], pad_d = x[8], pad_h = x[9], pad_w = x[10], stride_d = x[11], stride_h = x[12], stride_w = x[13];
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
if(pool->has("kernel")){
auto x = pool->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, x[0], x[1], x[2], x[3]));
}
else{
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"N", "K", "M", "P", "Q", "T", "R", "S", "ISAAC", "cuDNN"});
std::vector<pool_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
//SuperComputing17 shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::pool(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
benchmark_pool(*metric, context, device, stream, dtype, D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, pool->has("kernel")?generator.get():NULL);
}
}
}
}


@@ -1,395 +0,0 @@
#ifndef OPTS_HPP
#define OPTS_HPP
#include <string>
#include <set>
#include <vector>
#include <iostream>
#include <sstream>
#include <memory>
#include <map>
#include <algorithm>
#include <type_traits>
#include <functional>
#include <sstream>
#include <stdexcept>
namespace opts{
class InvalidOptions: public std::exception{
public:
InvalidOptions(std::string const & msg): msg_("Invalid options: " + msg){}
const char* what() const throw(){ return msg_.c_str();}
private:
std::string msg_;
};
/**
* @class OptionBase
* @brief Base class for command-line options
*/
class OptionBase{
protected:
template<class ItType>
std::vector<std::string>::const_iterator get_option(ItType const & begin, ItType const & end){
auto it = std::find(begin, end, "--" + name_);
if(it==end && required_)
throw InvalidOptions("parameter '" + name_ + "' is mandatory");
if(parent_ && parent_->parent_ && parent_->get_option(begin, it)==it)
throw InvalidOptions("parameter '" + name_ + "' needs to be nested in group '" + parent_->name_ + "'");
return it;
}
public:
OptionBase(std::string const & name, std::string const & desc, bool required = false, OptionBase* parent = NULL): name_(name), desc_(desc), required_(required), parent_(parent)
{}
virtual std::ostream& usage(std::ostream& os, size_t indent) const{
if(!desc_.empty())
os << std::string(indent, ' ') << "--" << "\033[1m" << name_ << "\033[0m" << ": " << desc_ << std::endl;
return os;
}
virtual void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values) = 0;
std::string const & name() const
{ return name_; }
protected:
const std::string name_;
const std::string desc_;
bool required_;
OptionBase* parent_;
};
/**
* @class OptionHelp
* @brief Automatically added --help option
*/
class OptionHelp: public OptionBase{
public:
OptionHelp() : OptionBase("help", "Display this message", false){}
void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
if(get_option(args.begin(), args.end()) != args.end())
values[name_] = (void*)this;
}
};
/**
* @class Option
* @brief Standard, typed option
*/
template<class T>
class Option: public OptionBase{
public:
typedef std::function<T(std::string const &)> converter_t;
typedef std::function<void(T const &)> constraint_t;
public:
Option(std::string const & name, std::string const & desc, T dft, converter_t convert, constraint_t constraint, OptionBase* parent):
OptionBase(name, desc, false, parent), default_(new T(dft)), convert_(convert), constraint_(constraint){}
Option(std::string const & name, std::string const & desc, bool required, converter_t convert, constraint_t constraint, OptionBase* parent):
OptionBase(name, desc, required, parent), convert_(convert), constraint_(constraint){}
void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
value_ = default_;
auto it = get_option(args.begin(), args.end());
if(it!=args.end()){
auto next = it + 1;
if(next==args.end() || next->compare(0, 2, "--")==0)
throw InvalidOptions("parameter " + name_ + " requires an argument");
else{
value_.reset(new T(convert_(*next)));
constraint_(*value_);
}
}
values[name_] = (void*)value_.get();
}
std::ostream& usage(std::ostream& os, size_t indent) const{
OptionBase::usage(os, indent);
return os;
}
private:
std::shared_ptr<T> default_;
std::shared_ptr<T> value_;
converter_t convert_;
constraint_t constraint_;
};
/**
* @class SwitchOption
* @brief Boolean option activated with --flag or --no-flag
*/
class SwitchOption: public OptionBase{
public:
SwitchOption(std::string const & name, std::string const & desc, bool dft, OptionBase* parent):
OptionBase(name, desc, false, parent), default_(dft)
{}
void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
auto it_true = std::find(args.begin(), args.end(), "--" + name_);
auto it_false = std::find(args.begin(), args.end(), "--no-" + name_);
value_.reset(new bool(default_));
if(it_true != args.end()) value_.reset(new bool(true));
if(it_false != args.end()) value_.reset(new bool(false));
values[name_] = (void*)value_.get();
}
private:
bool default_;
std::shared_ptr<bool> value_;
};
/* Pre-defined converters */
template<class T>
class MapConverter{
public:
MapConverter(std::map<std::string, T> const & values): values_(values){}
inline T operator()(std::string const & str){
if(values_.find(str) == values_.end())
throw InvalidOptions("value " + str + " is invalid");
return values_.at(str);
}
private:
std::map<std::string, T> values_;
};
//Read type from stream
template<class T>
class StreamConverter{
public:
T operator()(std::string const & str){
T value;
std::istringstream iss(str);
iss >> value;
return value;
}
};
//Read vector from stream
template<class T>
class StreamConverter<std::vector<T>>{
public:
std::vector<T> operator()(std::string const & str){
std::vector<T> result;
std::istringstream iss(str);
std::string token;
while(std::getline(iss, token, ','))
result.push_back(StreamConverter<T>()(token));
return result;
}
};
//Read tuple from stream
template<class... Args>
class StreamConverter<std::tuple<Args...>>{
template<size_t I, class T, class... U>
struct TupleReader{
static std::tuple<T, U...> get(std::istringstream& iss){
auto x = TupleReader<0,T>::get(iss);
auto y = TupleReader<I-1, U...>::get(iss);
return std::tuple_cat(x, y);
}
};
template<class T>
struct TupleReader<0, T>{
static std::tuple<T> get(std::istringstream& iss){
std::string token;
std::getline(iss, token, ',');
return std::make_tuple(StreamConverter<T>()(token));
}
};
public:
inline std::tuple<Args...> operator()(std::string const & str){
std::istringstream iss(str);
return TupleReader<sizeof...(Args) - 1, Args...>::get(iss);
}
};
/* Pre-defined constraints */
struct NoOp {
template<class T>
void operator()(T const &) {}
};
class SizeConstraint{
public:
SizeConstraint(size_t size): size_(size){}
template<class T>
void operator()(std::vector<T> const & x) const {
if(x.size()!=size_)
throw InvalidOptions("parameter must have size " + std::to_string(size_));
}
private:
size_t size_;
};
class OneOf{
public:
OneOf(std::vector<std::string> keys): keys_(keys){}
void operator()(std::map<std::string, void*> values){
std::vector<std::string> keys;
for(auto& x: values)
keys.push_back(x.first);
size_t found = 0;
for(auto& x: keys_)
if(std::find(keys.begin(), keys.end(), x) != keys.end())
found++;
std::string msg;
for(size_t i = 0; i < keys_.size(); ++i)
msg += (i>0?", ":"") + keys_[i];
if(found != 1)
throw InvalidOptions(std::string(found<1?"At least":"Only") + " one of the following flags must be specified: " + msg);
}
private:
std::vector<std::string> keys_;
};
/**
* @class Options
* @brief Container for multiple options
*/
class Options: public OptionBase{
public:
typedef std::function<void(std::map<std::string, void*> const &)> constraint_t;
std::map<std::string, std::string> set_to_map(std::set<std::string> const & set){
std::map<std::string, std::string> tmp;
for(std::string x: set)
tmp.insert(std::make_pair(x, x));
return tmp;
}
public:
Options(std::string const & name, std::string const & desc, OptionBase* parent): OptionBase(name, desc, false, parent)
{}
std::ostream& usage(std::ostream& os, size_t indent) const{
OptionBase::usage(os, indent);
for(auto& opt: opts_)
opt->usage(os, indent + ((parent_==NULL)?0:2));
return os;
}
void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
if(parent_==NULL || get_option(args.begin(), args.end()) != args.end()){
for(auto& opt: opts_)
opt->parse(args, values_);
for(auto& constraint: constraints_)
constraint(values_);
values[name_] = (void*)&values_;
}
}
void parse(int argc, char* argv[]){
std::vector<std::string> args(argv, argv + argc);
parse(args, values_);
}
template<class T>
void add(std::string const & name, std::string const & desc, T dft, typename Option<T>::constraint_t constraint = NoOp())
{ opts_.push_back(std::make_shared<Option<T>>(name, desc, dft, StreamConverter<T>(), constraint, this));}
template<class T>
void add(std::string const & name, std::string const & desc, typename Option<T>::constraint_t constraint = NoOp())
{ opts_.push_back(std::make_shared<Option<T>>(name, desc, false, StreamConverter<T>(), constraint, this));}
void add(std::string const & name, std::string const & desc, std::string dft, std::set<std::string> values)
{ add<std::string>(name, desc, dft, set_to_map(values)); }
void add(std::string const & name, std::string const & desc, std::set<std::string> values)
{ add<std::string>(name, desc, set_to_map(values)); }
template<class T>
void add(std::string const & name, std::string const & desc, std::string dft, std::map<std::string, T> values, typename Option<T>::constraint_t constraint = NoOp())
{ opts_.push_back(std::make_shared<Option<T>>(name, desc, values.at(dft), MapConverter<T>(values), constraint, this)); }
template<class T>
void add(std::string const & name, std::string const & desc, std::map<std::string, T> values, typename Option<T>::constraint_t constraint = NoOp())
{ opts_.push_back(std::make_shared<Option<T>>(name, desc, false, MapConverter<T>(values), constraint, this)); }
void add_switch(std::string const & name, std::string const & desc, bool dft = true)
{ opts_.push_back(std::make_shared<SwitchOption>(name, desc, dft, this)); }
void add(OptionBase* opt)
{ opts_.push_back(std::shared_ptr<OptionBase>(opt)); }
Options* add_group(std::string const & name, std::string const & desc){
opts_.push_back(std::make_shared<Options>(name, desc, this));
return (Options*)opts_.back().get();
}
void add_constraint(constraint_t const & constraint){
constraints_.push_back(constraint);
}
bool has(std::string const & name)
{ return values_.find(name) != values_.end() && values_.at(name)!=NULL; }
template<class T>
T get(std::string const & name)
{ return *((T*)values_[name]); }
private:
std::vector<std::shared_ptr<OptionBase>> opts_;
std::map<std::string, void*> values_;
std::vector<constraint_t> constraints_;
};
/* Application */
class Application{
private:
void show_help() const{
std::cerr << "Usage: " << name_ << " [OPTS]" << std::endl;
std::cerr << "Description: " << desc_ << std::endl;
opts_.usage(std::cerr, 0);
}
public:
Application(std::string const & name, std::string const & desc): name_(name), desc_(desc), opts_("root","",NULL)
{ opts_.add(new OptionHelp()); }
void parse(int argc, char* argv[]){
try{
opts_.parse(argc, argv);
}catch(InvalidOptions const & e){
std::cerr << e.what() << std::endl;
show_help();
exit(EXIT_FAILURE);
}
if(opts_.has("help")){
show_help();
exit(EXIT_FAILURE);
}
}
Options* options()
{ return &opts_; }
private:
std::string name_;
std::string desc_;
Options opts_;
};
}
#endif
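
For orientation, here is a minimal usage sketch of this options API, assembled from the calls that isaac-tools.cpp (above) actually makes; the program name and flags below are placeholders, not part of ISAAC.
```
#include <cstdlib>
#include <iostream>
#include <vector>
#include "opts.hpp"

int main(int argc, char* argv[]){
  opts::Application program("demo-tool", "Minimal demonstration of the opts API");
  opts::Options* options = program.options();
  // Typed option with a default value.
  options->add<size_t>("device", "Device to run on", 0);
  // String option restricted to a fixed set of values.
  options->add("mode", "What to do", "bench", {"bench", "dump"});
  // Vector option parsed from a comma-separated argument, with a size constraint.
  options->add<std::vector<size_t>>("shape", "Problem shape (M,N,K)",
                                    {1024, 1024, 1024}, opts::SizeConstraint(3));
  program.parse(argc, argv);   // prints usage and exits on --help or invalid input
  std::cout << "device = " << options->get<size_t>("device") << "\n"
            << "mode   = " << options->get<std::string>("mode") << std::endl;
  return EXIT_SUCCESS;
}
```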


@@ -1,69 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <tuple>
#include "isaac/runtime/predict.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/context.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/tools/bench.hpp"
#include "isaac/tools/collections.hpp"
#include "isaac/templates/conv.h"
#include "isaac/templates/gemm.h"
#include "isaac/templates/pool.h"
namespace isaac{
void GEMM(driver::Device const & device, driver::Stream & stream,
DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc,
scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C,
float a_scale, float b_scale, float c_scale,
driver::Buffer const *bias = NULL,
templates::GEMM* generator = NULL, size_t optimization_level = 1);
void CONV(driver::Device const &, driver::Stream & stream,
DType in_dtype, DType out_dtype, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t C, param_t T, param_t R, param_t S,
param_t D, param_t H, param_t W, param_t pad_d, param_t pad_h, param_t pad_w,
param_t stride_d, param_t stride_h, param_t stride_w,
param_t upsample_d, param_t upsample_h, param_t upsample_w,
driver::Buffer const & I, driver::Buffer const & F, driver::Buffer *O, param_t num_outputs,
driver::Buffer const *bias = NULL, ActivationType activation = Linear, float alpha = 0, float iscale = 1, float fscale = 1, std::vector<float> const & oscale = {1}, float z_scale = 1,
ResidualType residual = NoResidual, param_t Zk = 0, param_t crop_z_m0 = 0, param_t crop_z_m1 = 0, param_t crop_z_p0 = 0, param_t crop_z_p1 = 0, param_t crop_z_q0 = 0, param_t crop_z_q1 = 0, driver::Buffer const *Z = NULL,
templates::Conv* generator = NULL, size_t optimization_level = 1);
void POOL(driver::Device const & device, driver::Stream & stream,
DType in_dtype, DType out_dtype, PoolType pool_type, param_t C, param_t M, param_t P, param_t Q, param_t N, param_t T, param_t R, param_t S,
param_t D, param_t H, param_t W, param_t pad_d, param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w,
driver::Buffer const & I, driver::Buffer& O,
float iscale, float oscale,
templates::Pool* generator = NULL, size_t optimization_level = 1);
void TRANSFORM(driver::Stream & stream,
DType in_dtype, DType out_dtype, param_t N, param_t C, param_t D, param_t H, param_t W,
driver::Buffer const & I, driver::Buffer& O);
}


@@ -1,116 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_CL_QUEUES_H
#define ISAAC_CL_QUEUES_H
#include <map>
#include <list>
#include <vector>
namespace isaac
{
namespace driver
{
class Buffer;
class Stream;
class Device;
class Context;
class Platform;
class Module;
class Kernel;
struct backend
{
class modules
{
friend class backend;
public:
static void release();
static Module& get(Stream const & stream, std::string const & name, std::string const &src);
private:
static std::map<std::tuple<Stream, std::string>, Module * > cache_;
};
class kernels
{
friend class backend;
public:
static void release();
static Kernel & get(Module const & program, std::string const & name);
private:
static std::map<std::tuple<Module, std::string>, Kernel * > cache_;
};
class contexts
{
friend class backend;
private:
static void init(std::vector<Platform> const &);
static void release();
public:
static Context const & get_default();
template<class T>
static Context const & import(T context)
{
for(driver::Context const * x: cache_)
if((T)*x==context)
return *x;
cache_.emplace_back(new Context(context, false));
return *cache_.back();
}
static void get(std::list<Context const *> &);
private:
static std::list<Context const *> cache_;
};
class streams
{
friend class backend;
private:
static void init(std::list<Context const *> const &);
static void release();
public:
static void get(Context const &, std::vector<Stream *> &streams);
static Stream & get(Context const &, unsigned int id = 0);
static Stream & get_default();
private:
static std::map< Context, std::vector<Stream*> > cache_;
};
static void init();
static void release();
static std::vector<Device> devices();
static std::vector<Platform> platforms();
static void synchronize(Context const &);
static unsigned int default_device;
};
}
}
#endif


@@ -1,54 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_BUFFER_H
#define ISAAC_DRIVER_BUFFER_H
#include "isaac/driver/handle.h"
#include "isaac/driver/context.h"
namespace isaac
{
namespace driver
{
class Stream;
// Buffer
class Buffer: public HandleInterface<Buffer, CUdeviceptr>
{
public:
Buffer(Context const & context, size_t size);
Buffer(Context const & context, CUdeviceptr cu, bool take_ownership);
void set_zero(Stream const & queue, size_t size);
Handle<CUdeviceptr> const & cu() const;
Handle<CUdeviceptr> & cu();
private:
Context context_;
Handle<CUdeviceptr> cu_;
};
}
}
#endif
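
A short sketch of the two constructors above: a Buffer either allocates its own device memory or adopts an existing CUdeviceptr, with take_ownership deciding whether it will release it. The stream used by set_zero is assumed to come from the backend shown earlier.

```
#include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"

void buffer_example(CUdeviceptr existing){
  namespace drv = isaac::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get(ctx, 0);
  drv::Buffer owned(ctx, 1024);               // allocates 1024 bytes of device memory
  owned.set_zero(stream, 1024);               // memset on the given stream
  drv::Buffer borrowed(ctx, existing, false); // wraps a pointer it will not free
  (void) borrowed;
}
```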

View File

@@ -1,66 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_CONTEXT_H
#define ISAAC_DRIVER_CONTEXT_H
#include "isaac/driver/device.h"
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
class Context: public HandleInterface<Context, CUcontext>
{
private:
static std::string get_cache_path();
static CUdevice device(CUcontext);
public:
//Constructors
explicit Context(CUcontext context, bool take_ownership = true);
explicit Context(Device const & device);
//Accessors
Device const & device() const;
std::string const & cache_path() const;
Handle<CUcontext> const & cu() const;
private:
Handle<CUcontext> cu_;
Device device_;
std::string cache_path_;
};
class ContextSwitcher{
public:
ContextSwitcher(Context const & ctx);
~ContextSwitcher();
private:
Context const & ctx_;
};
}
}
#endif
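
ContextSwitcher is the RAII guard used by the cuBLAS/cuDNN wrappers later in this commit: it makes ctx current in its constructor and restores the previous context in its destructor. A minimal sketch, assuming a device obtained from the backend:

```
#include "isaac/driver/backend.h"
#include "isaac/driver/context.h"

void with_context(){
  namespace drv = isaac::driver;
  drv::Device device = drv::backend::devices()[0];
  drv::Context ctx(device);            // owns a fresh CUcontext for this device
  {
    drv::ContextSwitcher guard(ctx);   // ctx is current inside this scope
    // ... issue driver API calls here ...
  }                                    // previous context restored here
}
```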

View File

@@ -1,229 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_CUBLAS_H
#define ISAAC_DRIVER_CUBLAS_H
#include "isaac/templates/common.hpp"
#include "isaac/driver/dispatch.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/error.h"
#include "isaac/tools/bench.hpp"
#include "isaac/tools/collections.hpp"
namespace isaac
{
namespace driver
{
enum cublasStrategy_t{
CUBLAS_PREFER_FASTEST,
CUBLAS_HEURISTICS
};
static const std::vector<cublasGemmAlgo_t> cublasAlgorithms = {
CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3,
CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7
};
static const std::map<DType, cudaDataType> cudtype = {{FLOAT_TYPE, CUDA_R_32F}, {DOUBLE_TYPE,CUDA_R_64F}};
static const std::map<char, cublasOperation_t> cuop = {{'N', CUBLAS_OP_N}, {'T', CUBLAS_OP_T}};
inline cublasGemmAlgo_t cublasGemmFastest(Stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc){
typedef std::tuple<cudaDataType_t, cublasOperation_t, cublasOperation_t, int32_t, int32_t, int32_t> key_t;
// Benchmark fastest algorithm in cublasGemmEx
auto benchmark_fastest = [&](key_t const &){
std::vector<double> times;
for(cublasGemmAlgo_t a: cublasAlgorithms){
try{
times.push_back(bench([&](){ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, a); },
[&](){ stream.synchronize(); },
stream.context().device()));
}catch(driver::exception::cublas::base const &){
times.push_back(INFINITY);
}
}
size_t argmin = std::min_element(times.begin(), times.end()) - times.begin();
return cublasAlgorithms[argmin];
};
// Cache result
static cpp::CachedMap<key_t, cublasGemmAlgo_t> cache(benchmark_fastest);
return cache.get(std::make_tuple(cudt, AT, BT, M, N, K));
}
/* Wrapper for cublasGemmEx */
inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo)
{ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, algo); }
/* Simplified API for default GEMM */
inline void cublasGemm(DType dtype, Stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){
ContextSwitcher ctx_switch(stream.context());
cublasHandle_t handle = dispatch::cublasHandle(stream.context());
dispatch::cublasSetStream_v2(handle, (CUstream)stream);
if(fastest)
*fastest = cublasGemmFastest(stream, handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
else
cublasGemmEx(handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc, algo);
}
inline cudnnDataType_t cudnnDtype(DType dtype){
switch(dtype){
case INT8X4_TYPE: return CUDNN_DATA_INT8x4;
case INT32_TYPE: return CUDNN_DATA_INT32;
case FLOAT_TYPE: return CUDNN_DATA_FLOAT;
case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE;
}
throw;
}
inline cudnnTensorFormat_t format(cudnnDataType_t cutype){
switch(cutype){
case CUDNN_DATA_INT8x4: return CUDNN_TENSOR_NCHW_VECT_C;
default: return CUDNN_TENSOR_NCHW;
}
}
inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
std::vector<int> stride = {stride_d, stride_h, stride_w};
std::vector<int> upscale = {1, 1, 1};
std::vector<int> Oshapes = {N, K, M, P, Q};
std::vector<int> Fshapes = {K, C, T, R, S};
std::vector<int> Ishapes = {N, C, D, H, W};
if(M == 1 && T == 1 && D == 1){
pad.erase(pad.begin());
stride.erase(stride.begin());
upscale.erase(upscale.begin());
Oshapes.erase(Oshapes.begin() + 2);
Ishapes.erase(Ishapes.begin() + 2);
Fshapes.erase(Fshapes.begin() + 2);
}
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t in_cutype = cudnnDtype(dtype);
cudnnDataType_t conv_cutype = (dtype == INT8X4_TYPE)?CUDNN_DATA_INT32:in_cutype;
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
cudnnFilterDescriptor_t tF;
cudnnConvolutionDescriptor_t conv;
cudnnConvolutionFwdAlgo_t algo;
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnCreateFilterDescriptor(&tF);
dispatch::cudnnSetTensorNdDescriptorEx(tO, format(in_cutype), in_cutype, Oshapes.size(), Oshapes.data());
dispatch::cudnnSetFilterNdDescriptor(tF, in_cutype, format(in_cutype), Fshapes.size(), Fshapes.data());
dispatch::cudnnSetTensorNdDescriptorEx(tI, format(in_cutype), in_cutype, Ishapes.size(), Ishapes.data());
dispatch::cudnnCreateConvolutionDescriptor(&conv);
dispatch::cudnnSetConvolutionNdDescriptor(conv, pad.size(), pad.data(), stride.data(), upscale.data(), CUDNN_CROSS_CORRELATION, conv_cutype);
dispatch::cudnnGetConvolutionForwardAlgorithm(handle, tI, tF, conv, tO, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 1024*1024*64, &algo);
size_t workspace_size;
dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size);
static Buffer work(ctx, 1024*1024*64);
CUdeviceptr twork = work;
CUdeviceptr pI = I, pF = F, pO = O;
dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO);
}
inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
std::vector<int> stride = {stride_d, stride_h, stride_w};
std::vector<int> upscale = {1, 1, 1};
std::vector<int> Oshapes = {N, K, M, P, Q};
std::vector<int> Ishapes = {N, K, D, H, W};
std::vector<int> window = {T, R, S};
if(M == 1 && T == 1 && D == 1){
window.erase(window.begin());
pad.erase(pad.begin());
stride.erase(stride.begin());
upscale.erase(upscale.begin());
Oshapes.erase(Oshapes.begin() + 2);
Ishapes.erase(Ishapes.begin() + 2);
}
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t cutype = cudnnDtype(dtype);
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
cudnnPoolingDescriptor_t desc;
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnSetTensorNdDescriptorEx(tO, CUDNN_TENSOR_NCHW, cutype, Oshapes.size(), Oshapes.data());
dispatch::cudnnSetTensorNdDescriptorEx(tI, CUDNN_TENSOR_NCHW, cutype, Ishapes.size(), Ishapes.data());
dispatch::cudnnCreatePoolingDescriptor(&desc);
dispatch::cudnnSetPoolingNdDescriptor(desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, window.size(), window.data(), pad.data(), stride.data());
CUdeviceptr pI = I, pO = O;
dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
inline void cudnnTransformTensor(driver::Stream & stream,
DType in_dtype, DType out_dtype,
cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout,
int32_t N, int32_t C, int32_t D, int32_t H, int32_t W,
scalar alpha, driver::Buffer const & I, scalar beta, driver::Buffer& O)
{
cudnnHandle_t handle = dispatch::cudnnHandle(stream.context());
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
std::vector<int> shapes = {N, C, D, H, W};
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnSetTensorNdDescriptorEx(tI, in_layout, cudnnDtype(in_dtype), shapes.size(), shapes.data());
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnSetTensorNdDescriptorEx(tO, out_layout, cudnnDtype(out_dtype), shapes.size(), shapes.data());
CUdeviceptr pI = I, pO = O;
dispatch::cudnnTransformTensor(handle, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
}
}
#endif
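
Note the two-phase contract of cublasGemm above: when fastest is non-NULL the call only benchmarks the candidate cublasGemmEx algorithms (via cublasGemmFastest) and stores the winner, and only a call with fastest == NULL actually executes the GEMM with the chosen algo. A hedged sketch of that flow; the scalar constructors taking a float, the isaac::FLOAT_TYPE qualification and the "isaac/driver/cublas.h" path (inferred from the include guard) are assumptions about code outside this excerpt.

```
#include <cstdint>
#include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/cublas.h"

void gemm_example(int32_t M, int32_t N, int32_t K){
  namespace drv = isaac::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get(ctx, 0);
  drv::Buffer A(ctx, (size_t)M*K*4), B(ctx, (size_t)K*N*4), C(ctx, (size_t)M*N*4);
  isaac::scalar alpha(1.f), beta(0.f);  // assumed constructors
  cublasGemmAlgo_t fastest;
  // Pass 1: benchmark all candidate algorithms and remember the fastest
  drv::cublasGemm(isaac::FLOAT_TYPE, stream, 'N', 'T', M, N, K,
                  alpha, A, M, B, N, beta, C, M, &fastest);
  // Pass 2: run the GEMM with the algorithm found above
  drv::cublasGemm(isaac::FLOAT_TYPE, stream, 'N', 'T', M, N, K,
                  alpha, A, M, B, N, beta, C, M, NULL, fastest);
  stream.synchronize();
}
```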

View File

@@ -1,98 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_DEVICE_H
#define ISAAC_DRIVER_DEVICE_H
#include "isaac/driver/platform.h"
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
// Device
class Device: public HandleInterface<Device, CUdevice>
{
public:
//Supported architectures
enum class Architecture{
//NVidia
SM_2_0,
SM_2_1,
SM_3_0,
SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
SM_7_0,
UNKNOWN
};
private:
//Metaprogramming helper to get CUDA info from attribute
template<CUdevice_attribute attr>
int cuGetInfo() const;
inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
inline nvmlDevice_t nvml_device() const;
public:
Device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){}
//Accessors
Architecture architecture() const;
Handle<CUdevice> const & cu() const;
//Information
std::string infos() const;
size_t address_bits() const;
driver::Platform platform() const;
std::vector<size_t> max_block_dim() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t warp_size() const;
//Compute Capability
void interpret_as(std::pair<size_t, size_t> cc);
std::pair<size_t, size_t> compute_capability() const;
//Identifier
std::string name() const;
std::string pci_bus_id() const;
//Clocks
size_t current_sm_clock() const;
size_t current_mem_clock() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
private:
Handle<CUdevice> cu_;
std::shared_ptr<std::pair<size_t, size_t>> interpreted_as_;
};
}
}
#endif
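
A small sketch that enumerates the devices discovered by the backend and prints some of the identifiers and limits declared above:

```
#include <iostream>
#include "isaac/driver/backend.h"
#include "isaac/driver/device.h"

int main(){
  namespace drv = isaac::driver;
  for(drv::Device const & device: drv::backend::devices()){
    std::cout << device.name() << " [" << device.pci_bus_id() << "]\n"
              << "  max threads/block: " << device.max_threads_per_block() << "\n"
              << "  shared memory:     " << device.max_shared_memory() << " bytes\n"
              << "  warp size:         " << device.warp_size() << std::endl;
  }
  return 0;
}
```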

View File

@@ -1,258 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_DISPATCHER_H
#define ISAAC_DRIVER_DISPATCHER_H
#include <type_traits>
#include <dlfcn.h>
//CUDA Backend
#include "isaac/external/CUDA/cuda.h"
#include "isaac/external/CUDA/nvrtc.h"
#include "isaac/external/CUDA/cublas_v2.h"
#include "isaac/external/CUDA/cudnn.h"
#include "isaac/external/CUDA/nvml.h"
//Exceptions
#include <iostream>
#include <stdexcept>
namespace isaac
{
namespace driver
{
class Context;
template<class T> void check(T){}
void check(nvrtcResult err);
void check(CUresult err);
void check(cublasStatus_t err);
void check(cudnnStatus_t err);
class dispatch
{
private:
template <class F>
struct return_type;
template <class R, class... A>
struct return_type<R (*)(A...)>
{ typedef R type; };
typedef bool (*f_init_t)();
template<f_init_t initializer, typename FunPtrT, typename... Args>
static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
{
initializer();
if(cache == nullptr){
cache = dlsym(lib_h, name);
if(cache == 0)
throw std::runtime_error("dlsym unable to load function");
}
FunPtrT fptr;
*reinterpret_cast<void **>(&fptr) = cache;
typename return_type<FunPtrT>::type res = (*fptr)(args...);
check(res);
return res;
}
public:
static bool nvrtcinit();
static bool nvmlinit();
static bool cuinit();
static bool cublasinit();
static bool cudnninit();
static void release();
//CUDA
static CUresult cuCtxGetCurrent(CUcontext *pctx);
static CUresult cuCtxSetCurrent(CUcontext ctx);
static CUresult cuCtxDestroy_v2(CUcontext ctx);
static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
static CUresult cuDeviceGet(CUdevice *device, int ordinal);
static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
static CUresult cuMemFree_v2(CUdeviceptr dptr);
static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
static CUresult cuDriverGetVersion(int *driverVersion);
static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
static CUresult cuModuleLoad(CUmodule *module, const char *fname);
static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
static CUresult cuModuleUnload(CUmodule hmod);
static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
static CUresult cuDeviceGetCount(int *count);
static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
static CUresult cuInit(unsigned int Flags);
static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
static CUresult cuStreamSynchronize(CUstream hStream);
static CUresult cuStreamDestroy_v2(CUstream hStream);
static CUresult cuEventDestroy_v2(CUevent hEvent);
static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
static CUresult cuCtxGetDevice(CUdevice* result);
static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static cublasHandle_t cublasHandle(Context const & ctx);
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc);
static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo);
static cudnnHandle_t cudnnHandle(Context const & ctx);
static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc);
static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w);
static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w);
static cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int dimA[]);
static cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int filterDimA[]);
static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode);
static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType);
static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]);
static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo);
static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes);
static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
private:
static void* cuda_;
static void* nvrtc_;
static void* nvml_;
static void* cublas_;
static void* cudnn_;
//CUDA
static void* cuCtxGetCurrent_;
static void* cuCtxSetCurrent_;
static void* cuCtxDestroy_v2_;
static void* cuEventCreate_;
static void* cuDeviceGet_;
static void* cuMemcpyDtoH_v2_;
static void* cuStreamCreate_;
static void* cuEventElapsedTime_;
static void* cuMemFree_v2_;
static void* cuMemcpyDtoHAsync_v2_;
static void* cuDriverGetVersion_;
static void* cuDeviceGetName_;
static void* cuDeviceGetPCIBusId_;
static void* cuModuleGetGlobal_v2_;
static void* cuMemcpyHtoDAsync_v2_;
static void* cuModuleLoad_;
static void* cuLaunchKernel_;
static void* cuModuleUnload_;
static void* cuModuleLoadDataEx_;
static void* cuDeviceGetAttribute_;
static void* cuDeviceGetCount_;
static void* cuMemcpyHtoD_v2_;
static void* cuInit_;
static void* cuEventRecord_;
static void* cuCtxCreate_v2_;
static void* cuModuleGetFunction_;
static void* cuStreamSynchronize_;
static void* cuStreamDestroy_v2_;
static void* cuEventDestroy_v2_;
static void* cuMemAlloc_v2_;
static void* cuPointerGetAttribute_;
static void* cuCtxGetDevice_;
static void* cuMemsetD8Async_;
static void* cuCtxPushCurrent_v2_;
static void* cuCtxPopCurrent_v2_;
static void* nvmlInit_v2_;
static void* nvmlDeviceGetHandleByPciBusId_v2_;
static void* nvmlDeviceGetClockInfo_;
static void* nvmlDeviceGetMaxClockInfo_;
static void* nvrtcCompileProgram_;
static void* nvrtcGetProgramLogSize_;
static void* nvrtcGetPTX_;
static void* nvrtcGetPTXSize_;
static void* nvrtcCreateProgram_;
static void* nvrtcGetProgramLog_;
static void* cublasCreate_v2_;
static void* cublasGetStream_v2_;
static void* cublasSetStream_v2_;
static void* cublasHgemm_;
static void* cublasSgemm_v2_;
static void* cublasDgemm_v2_;
static void* cublasGemmEx_;
static void* cudnnCreateConvolutionDescriptor_;
static void* cudnnCreatePoolingDescriptor_;
static void* cudnnCreateTensorDescriptor_;
static void* cudnnCreateFilterDescriptor_;
static void* cudnnCreate_;
static void* cudnnSetTensor4dDescriptor_;
static void* cudnnSetFilter4dDescriptor_;
static void* cudnnSetTensorNdDescriptorEx_;
static void* cudnnSetFilterNdDescriptor_;
static void* cudnnSetConvolution2dDescriptor_;
static void* cudnnSetConvolutionNdDescriptor_;
static void* cudnnSetPoolingNdDescriptor_;
static void* cudnnGetConvolutionForwardAlgorithm_;
static void* cudnnGetConvolutionForwardWorkspaceSize_;
static void* cudnnConvolutionForward_;
static void* cudnnPoolingForward_;
static void* cudnnSetStream_;
static void* cudnnTransformTensor_;
};
}
}
#endif
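
The dispatch class above is why no CUDA SDK is required at build time: every CUDA/NVRTC/cuBLAS/cuDNN/NVML entry point is declared as a static wrapper, the corresponding shared library is dlopen'ed on first use by the *init() functions, and each symbol is resolved once with dlsym and cached in the matching void* member (see f_impl). The self-contained snippet below illustrates that lazy-binding pattern with libm's cos standing in for a driver symbol; it is an illustration of the technique, not ISAAC code.

```
#include <dlfcn.h>
#include <iostream>
#include <stdexcept>

// One cached library handle and one cached symbol, mirroring dispatch::cuda_
// and, e.g., dispatch::cuMemAlloc_v2_ above.
static void* libm_handle = nullptr;
static void* cos_sym = nullptr;

double lazy_cos(double x){
  if(!libm_handle)
    libm_handle = dlopen("libm.so.6", RTLD_LAZY);  // loaded on first call only
  if(!libm_handle)
    throw std::runtime_error("could not dlopen libm");
  if(!cos_sym){
    cos_sym = dlsym(libm_handle, "cos");           // resolved once, then cached
    if(!cos_sym)
      throw std::runtime_error("dlsym unable to load function");
  }
  typedef double (*fptr_t)(double);
  fptr_t fptr;
  *reinterpret_cast<void**>(&fptr) = cos_sym;      // same cast trick as f_impl
  return (*fptr)(x);
}

int main(){
  std::cout << lazy_cos(0.0) << std::endl;         // prints 1
  return 0;
}
```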

View File

@@ -1,228 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_EXCEPTION_DRIVER_H
#define ISAAC_EXCEPTION_DRIVER_H
#include <exception>
#include "isaac/driver/dispatch.h"
namespace isaac
{
namespace driver
{
namespace exception
{
namespace nvrtc
{
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option ,"invalid option");
ISAAC_CREATE_NVRTC_EXCEPTION(compilation ,"compilation");
ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure ,"builtin operation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
#undef ISAAC_CREATE_NVRTC_EXCEPTION
}
namespace cuda
{
class base: public std::exception{};
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_CUDA_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(deinitialized ,"deinitialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled ,"profiler disabled");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized ,"profiler not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started ,"profiler already started");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped ,"profiler already stopped");
ISAAC_CREATE_CUDA_EXCEPTION(no_device ,"no device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_device ,"invalid device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_image ,"invalid image");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_context ,"invalid context");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_current ,"context already current");
ISAAC_CREATE_CUDA_EXCEPTION(map_failed ,"map failed");
ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed ,"unmap failed");
ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped ,"array is mapped");
ISAAC_CREATE_CUDA_EXCEPTION(already_mapped ,"already mapped");
ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu ,"no binary for gpu");
ISAAC_CREATE_CUDA_EXCEPTION(already_acquired ,"already acquired");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped ,"not mapped");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array ,"not mapped as array");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer");
ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable");
ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit ,"unsupported limit");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use ,"context already in use");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported ,"peer access unsupported");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx ,"invalid ptx");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context ,"invalid graphics context");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_source ,"invalid source");
ISAAC_CREATE_CUDA_EXCEPTION(file_not_found ,"file not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed ,"shared object init failed");
ISAAC_CREATE_CUDA_EXCEPTION(operating_system ,"operating system");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle ,"invalid handle");
ISAAC_CREATE_CUDA_EXCEPTION(not_found ,"not found");
ISAAC_CREATE_CUDA_EXCEPTION(not_ready ,"not ready");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_address ,"illegal address");
ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources ,"launch out of resources");
ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout ,"launch timeout");
ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled ,"peer access already enabled");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled ,"peer access not enabled");
ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active ,"primary context active");
ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed ,"context is destroyed");
ISAAC_CREATE_CUDA_EXCEPTION(assert_error ,"assert");
ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers ,"too many peers");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered ,"host memory already registered");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered ,"host memory not registered");
ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error ,"hardware stack error");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction ,"illegal instruction");
ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address ,"misaligned address");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space ,"invalid address space");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc ,"invalid pc");
ISAAC_CREATE_CUDA_EXCEPTION(launch_failed ,"launch failed");
ISAAC_CREATE_CUDA_EXCEPTION(not_permitted ,"not permitted");
ISAAC_CREATE_CUDA_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUDA_EXCEPTION
}
namespace cublas
{
class base: public std::exception{};
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
}
namespace cudnn
{
#define ISAAC_CREATE_CUDNN_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "CUDNN: Error- " msg; } }
ISAAC_CREATE_CUDNN_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUDNN_EXCEPTION(alloc_failed ,"allocation failed");
ISAAC_CREATE_CUDNN_EXCEPTION(bad_param ,"bad param");
ISAAC_CREATE_CUDNN_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUDNN_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUDNN_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUDNN_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUDNN_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUDNN_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUDNN_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_in_progress ,"runtime in progress");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
}
namespace ocl
{
class base: public std::exception{};
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
ISAAC_CREATE_CL_EXCEPTION(device_not_found, "device not found");
ISAAC_CREATE_CL_EXCEPTION(device_not_available, "device not available");
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available, "compiler not available");
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure, "object allocation failure");
ISAAC_CREATE_CL_EXCEPTION(out_of_resources, "launch out of resources");
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory, "out of host memory");
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available, "profiling info not available");
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap, "mem copy overlap");
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch, "image format mismatch");
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported, "image format not supported");
ISAAC_CREATE_CL_EXCEPTION(build_program_failure, "build program failure");
ISAAC_CREATE_CL_EXCEPTION(map_failure, "map failure");
ISAAC_CREATE_CL_EXCEPTION(invalid_value, "invalid value");
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type, "invalid device type");
ISAAC_CREATE_CL_EXCEPTION(invalid_platform, "invalid platform");
ISAAC_CREATE_CL_EXCEPTION(invalid_device, "invalid device");
ISAAC_CREATE_CL_EXCEPTION(invalid_context, "invalid context");
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties, "invalid queue properties");
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue, "invalid command queue");
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr, "invalid host pointer");
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object, "invalid mem object");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size, "invalid image size");
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler, "invalid sampler");
ISAAC_CREATE_CL_EXCEPTION(invalid_binary, "invalid binary");
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options, "invalid build options");
ISAAC_CREATE_CL_EXCEPTION(invalid_program, "invalid program");
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable, "invalid program executable");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name, "invalid kernel name");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition, "invalid kernel definition");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel, "invalid kernel");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index, "invalid arg index");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value, "invalid arg value");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size, "invalid arg size");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args, "invalid kernel args");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension, "invalid work dimension");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size, "invalid work group size");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size, "invalid work item size");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset, "invalid global offset");
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list, "invalid event wait list");
ISAAC_CREATE_CL_EXCEPTION(invalid_event, "invalid event");
ISAAC_CREATE_CL_EXCEPTION(invalid_operation, "invalid operation");
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object, "invalid GL object");
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size, "invalid buffer size");
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level, "invalid MIP level");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size, "invalid global work size");
#ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
#endif
}
}
}
}
#endif
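
All of these classes are generated by the ISAAC_CREATE_*_EXCEPTION macros, and the CUDA, cuBLAS and OpenCL families share a per-backend base class. That base is what makes the catch in cublasGemmFastest earlier in this commit possible: a failing algorithm is caught as exception::cublas::base and simply scored as INFINITY. A minimal sketch of the same pattern:

```
#include <iostream>
#include "isaac/driver/error.h"

void guarded_call(){
  namespace exc = isaac::driver::exception;
  try{
    // ... some dispatch::cublas* / dispatch::cu* call that may fail ...
  }
  catch(exc::cublas::base const & e){
    std::cerr << e.what() << std::endl;  // e.g. "CUBLAS: Error- not supported"
  }
  catch(exc::cuda::base const & e){
    std::cerr << e.what() << std::endl;  // any of the CUDA errors above
  }
}
```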

View File

@@ -1,49 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_EVENT_H
#define ISAAC_DRIVER_EVENT_H
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
// Event
class Event: public HandleInterface<Event, cu_event_t>
{
public:
float elapsed_time() const;
Handle<cu_event_t> const & cu() const;
private:
Handle<cu_event_t> cu_;
};
}
}
#endif

View File

@@ -1,82 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_HANDLE_H
#define ISAAC_DRIVER_HANDLE_H
#include <memory>
#include <iostream>
#include <functional>
#include <type_traits>
#include "isaac/driver/dispatch.h"
namespace isaac
{
namespace driver
{
struct cu_event_t{
operator bool() const { return first && second; }
CUevent first;
CUevent second;
};
struct cu_platform{
cu_platform() : status_(dispatch::cuInit(0)) { }
operator bool() const { return status_; }
private:
CUresult status_;
};
template<class T, class CUType>
class HandleInterface{
public:
//Accessors
operator CUType() const { return *(((T*)this)->cu().h_); }
//Comparison
bool operator==(HandleInterface const & y) { return (CUType)(*this) == (CUType)(y); }
bool operator!=(HandleInterface const & y) { return (CUType)(*this) != (CUType)(y); }
bool operator<(HandleInterface const & y) { return (CUType)(*this) < (CUType)(y); }
};
template<class CUType>
class Handle{
public:
template<class, class> friend class HandleInterface;
public:
//Constructors
Handle(CUType cu = CUType(), bool take_ownership = true);
~Handle();
CUType& operator*() { return *h_; }
CUType const & operator*() const { return *h_; }
CUType* operator->() const { return h_.get(); }
protected:
std::shared_ptr<CUType> h_;
bool has_ownership_;
};
}
}
#endif
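
Handle<CUType> is a reference-counted wrapper (a std::shared_ptr<CUType> plus an ownership flag), and HandleInterface supplies the implicit conversion to the raw CUDA handle and the comparison operators that let driver objects serve as map keys in the backend caches. A short sketch of what that buys a caller; how exactly the destructor releases the CUDA object is not shown in this excerpt.

```
#include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h"

void handle_example(){
  namespace drv = isaac::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Buffer A(ctx, 256);
  drv::Buffer alias = A;      // copies share the same Handle<CUdeviceptr>
  CUdeviceptr raw = A;        // HandleInterface::operator CUdeviceptr
  bool same = (alias == A);   // compares the raw handles -> true
  (void) raw; (void) same;
}                             // the device allocation is released by its last owner
```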

View File

@@ -1,68 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_KERNEL_H
#define ISAAC_DRIVER_KERNEL_H
#include "isaac/driver/module.h"
#include "isaac/driver/handle.h"
#include <memory>
namespace isaac
{
namespace driver
{
class Buffer;
// Kernel
class Kernel: public HandleInterface<Kernel, CUfunction>
{
public:
//Constructors
Kernel(Module const & program, const char * name);
//Accessors
Handle<CUfunction> const & cu() const;
Module const & module() const;
//Arguments setters
void setArg(unsigned int index, std::size_t size, void* ptr);
void setArg(unsigned int index, Buffer const &);
template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
//Arguments getters
void* const* cu_params() const;
private:
Handle<CUfunction> cu_;
Module program_;
unsigned int address_bits_;
std::vector<std::shared_ptr<void> > cu_params_store_;
std::vector<void*> cu_params_;
};
}
}
#endif

View File

@@ -1,61 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_MODULE_H
#define ISAAC_DRIVER_MODULE_H
#include <map>
#include "isaac/driver/handle.h"
#include "isaac/driver/context.h"
#include "isaac/driver/buffer.h"
namespace isaac
{
namespace driver
{
class Context;
class Device;
class Module: public HandleInterface<Module, CUmodule>
{
static std::string header(Device const & device);
public:
Module(Context const & context, std::string const & source);
Context const & context() const;
Handle<CUmodule> const & cu() const;
Buffer symbol(const char * name) const;
private:
Handle<CUmodule> cu_;
Context context_;
std::string source_;
};
}
}
#endif

View File

@@ -1,54 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_PLATFORM_H
#define ISAAC_DRIVER_PLATFORM_H
#include <vector>
#include <string>
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
class Device;
class Platform
{
public:
//Accessors
std::string name() const;
std::string version() const;
std::vector<Device> devices() const;
private:
Handle<cu_platform> cu_;
};
}
}
#endif

View File

@@ -1,82 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_STREAM_H
#define ISAAC_DRIVER_STREAM_H
#include <map>
#include "isaac/driver/context.h"
#include "isaac/driver/device.h"
#include "isaac/driver/handle.h"
#include "isaac/driver/buffer.h"
namespace isaac
{
namespace driver
{
class Kernel;
class Event;
class Range;
class Buffer;
// Command Queue
class Stream: public HandleInterface<Stream, CUstream>
{
public:
//Constructors
Stream(CUstream stream, bool take_ownership);
Stream(Context const & context);
//Accessors
Handle<CUstream> const & cu() const;
Context const & context() const;
//Synchronize
void synchronize();
//Enqueue
void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
// Write
void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
template<class T> void write(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T> const & x)
{ write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
// Read
void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
template<class T> void read(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T>& x)
{ read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
private:
Context context_;
Handle<CUstream> cu_;
};
}
}
#endif
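
Stream ties this driver layer together: it owns the CUstream, launches kernels with an explicit grid/block, and moves data in and out of Buffers. A hedged end-to-end sketch using the declarations from module.h, kernel.h, buffer.h and stream.h above; the kernel source and entry-point name are placeholders, and the argument layout (pointer, then element count) is only an example.

```
#include <array>
#include <cstdint>
#include <string>
#include <vector>
#include "isaac/driver/backend.h"
#include "isaac/driver/module.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"

void launch_example(std::string const & src /* PTX or CUDA C (placeholder) */){
  namespace drv = isaac::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream stream(ctx);

  // Compile the source and fetch its entry point
  drv::Module module(ctx, src);
  drv::Kernel kernel(module, "scale");            // placeholder name

  // Upload input data
  std::vector<float> host(1024, 1.f);
  drv::Buffer x(ctx, host.size()*sizeof(float));
  stream.write(x, true, 0, host);

  // Bind arguments and launch 4 blocks of 256 threads
  kernel.setArg(0, x);
  kernel.setArg(1, (int32_t)host.size());
  stream.enqueue(kernel, {4, 1, 1}, {256, 1, 1});

  // Read the result back and wait for completion
  stream.read(x, true, 0, host);
  stream.synchronize();
}
```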

View File

@@ -1,64 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"

View File

@@ -1,412 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
#include "cuda_runtime_api.h"
#include "host_defines.h"
#include "vector_types.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
int e = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
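/* Illustrative sketch, not part of the original header: the specialization
 * above makes cudaCreateChannelDesc<float4>() return a four-component 32-bit
 * float descriptor (32, 32, 32, 32, cudaChannelFormatKindFloat).  A typical
 * host-side use, relying on cuda_runtime_api.h included above, allocates a
 * CUDA array with that descriptor; the helper name below is hypothetical. */
static __inline__ __host__ cudaError_t exampleAllocFloat4Array(cudaArray_t *arr, size_t width, size_t height)
{
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
  return cudaMallocArray(arr, &desc, width, height);
}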
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */

View File

@@ -1,266 +0,0 @@
/*
* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
#if defined(__ICC)
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, and ICC 17.0 on Linux x86_64 are supported!
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__PGIC__)
#if (!(__PGIC__ == 17) && \
!(__PGIC__ == 99 && __PGIC_MINOR__ == 99)) || \
!defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported pgc++ configuration! Only pgc++ 17 on Linux x86_64 is supported!
#endif /* (!(__PGIC__ == 17) &&
!(__PGIC__ == 99 && __PGIC_MINOR__ == 99 )) ||
!__GNUC__ || !__LP64__ */
#endif /* __PGIC__ */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
#error -- unsupported xlC version! only xlC 13.1 is supported
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 6
#error -- unsupported GNU version! gcc versions later than 6 are not supported!
#endif /* __GNUC__ > 6 */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1600 || _MSC_VER > 1911
#error -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!
#elif _MSC_VER == 1600 /* _MSC_VER == 1600 */
#pragma message("support for Microsoft Visual Studio 2010 has been deprecated!")
#endif /* _MSC_VER < 1600 || _MSC_VER > 1911 || _MSC_VER == 1600 */
#endif /* _WIN32 */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__HORIZON__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#define _ACRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#if _MSC_VER >= 1900
#include <corecrt.h> /* for _ACRTIMP */
#endif /* _MSC_VER >= 1900 */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
#if __CUDACC_RTC__
typedef char *va_list;
#else /* !__CUDACC_RTC__ */
#include <cstdarg>
#endif /* __CUDACC_RTC__ */
#undef va_start
#undef va_end
#undef va_arg
#ifdef __PGIC__
#undef __builtin_va_end
#define va_start(v,l) __builtin_alt_va_start(v,l)
#define va_end(v) __builtin_va_end(v)
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
#if (__cplusplus >= 201103L)
#undef va_copy
#define va_copy(d,s) __builtin_va_copy(d,s)
#endif
#else /* !__PGIC__ */
#define va_start(ap, x) (__cu_va_start(&ap, x))
#define va_end(ap) (__cu_va_end(&ap))
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
#undef va_copy
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
#endif /* __PGIC__ */
#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */

View File

@@ -1,216 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __host__ \
__location__(host)
#define __device__ \
__location__(device)
#define __global__ \
__location__(global)
#define __shared__ \
__location__(shared)
#define __constant__ \
__location__(constant)
#define __managed__ \
__location__(managed)
#if !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* defined(__CUDACC__) */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* !defined(__CUDACC__) */
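/* Worked example (added for clarity, not part of the original header): with
 * nvcc and a GNU host compiler, a kernel declaration such as
 *
 *     __global__ void __launch_bounds__(256) scale(float *x, float a);
 *
 * expands through __location__/__annotate__ into
 *
 *     __attribute__((global)) void __attribute__((launch_bounds(256))) scale(float *x, float a);
 *
 * while in a plain host compilation (__CUDACC__ undefined) __annotate__ is
 * redefined to nothing, so __global__ vanishes and only an ordinary host
 * prototype remains. */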
#endif /* !__HOST_DEFINES_H__ */

View File

@@ -1,338 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
/* When including a C header file in C++ code, extern "C" is required.
 * But the standard QNX headers already carry their own ifdef'd extern "C"
 * when compiled as C++, and extern "C" cannot be nested.
 * Hence keep this header outside of the extern "C" block.
 */
#include <math.h> /* import fabsf, sqrt */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
typedef float2 cuFloatComplex;
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
cuCimagf(x) + cuCimagf(y));
}
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex prod;
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
(cuCimagf(x) * cuCimagf(y)),
(cuCrealf(x) * cuCimagf(y)) +
(cuCimagf(x) * cuCrealf(y)));
return prod;
}
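/* Numerical example of the overflow noted above (added for clarity, not part
 * of the original header): for x = y = 3e20f + 3e20f*i the exact real part is
 * 3e20*3e20 - 3e20*3e20 = 0, but each partial product overflows float to +Inf,
 * so the unguarded formula yields Inf - Inf = NaN in the real component. */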
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex quot;
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
float oos = 1.0f / s;
float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
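/* Worked example (added for clarity, not part of the original header): with
 * s = |Re(y)| + |Im(y)|, both operands are first scaled by 1/s, which leaves
 * the quotient unchanged since (x/s) / (y/s) = x / y, but keeps brs and bis
 * in [-1, 1] so that brs*brs + bis*bis cannot overflow.  The result is then
 * the usual formula x * conj(y) / |y|^2 applied to the scaled operands:
 *
 *     Re(q) = (ars*brs + ais*bis) / (brs*brs + bis*bis)
 *     Im(q) = (ais*brs - ars*bis) / (brs*brs + bis*bis)
 */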
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now we chose the
* simplest and fastest solution; however, this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
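/* Worked example (added for clarity, not part of the original header): with
 * v = max(|a|, |b|) and w = min(|a|, |b|), the code computes
 *
 *     |x| = v * sqrt(1 + (w/v)^2)
 *
 * which equals sqrt(a*a + b*b) but keeps the squared term (w/v)^2 <= 1, so no
 * intermediate overflow can occur.  The final test repairs the special cases:
 * v == 0 would otherwise produce 0 * sqrt(NaN), and an infinite input should
 * yield +Inf, so both fall back to t = v + w. */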
/* Double precision */
typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
(double r, double i)
{
cuDoubleComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
cuCimag(x) + cuCimag(y));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex prod;
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
(cuCimag(x) * cuCimag(y)),
(cuCreal(x) * cuCimag(y)) +
(cuCimag(x) * cuCreal(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex quot;
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
double oos = 1.0 / s;
double ars = cuCreal(x) * oos;
double ais = cuCimag(x) * oos;
double brs = cuCreal(y) * oos;
double bis = cuCimag(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0 / s;
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now we chose the simplest
* and fastest solution; however, this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
double a = cuCreal(x);
double b = cuCimag(x);
double v, w, t;
a = fabs(a);
b = fabs(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0 + t * t;
t = v * sqrt(t);
if ((v == 0.0) ||
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
t = v + w;
}
return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
(cuFloatComplex c)
{
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */

View File

@@ -1,565 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
cublasStatus CUBLASWINAPI cublasInit (void);
cublasStatus CUBLASWINAPI cublasShutdown (void);
cublasStatus CUBLASWINAPI cublasGetError (void);
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
int incy);
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
float *y, int incy);
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
int incx, double *y, int incy);
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
int incy);
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
int incy);
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss);
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss);
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, cuComplex s);
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, float s);
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
cuComplex *cs);
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
cuDoubleComplex *cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
const float *sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
const double *sy1, double* sparam);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
float alpha, const float *A, int lda,
const float *x, int incx, float beta, float *y,
int incy);
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
double alpha, const double *A, int lda,
const double *x, int incx, double beta, double *y,
int incy);
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
float *x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
char diag, int n, int k, const float *A,
int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
char diag, int n, int k, const double *A,
int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
char diag, int n, int k, const cuComplex *A,
int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
char diag, int n, int k, const cuDoubleComplex *A,
int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
int lda, const float *x, int incx, float beta,
float *y, int incy);
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
int lda, const double *x, int incx, double beta,
double *y, int incy);
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
int lda, const cuComplex *x, int incx, cuComplex beta,
cuComplex *y, int incy);
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
const float *AP, const float *x,
int incx, float beta, float *y, int incy);
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
const double *AP, const double *x,
int incx, double beta, double *y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
const cuComplex *AP, const cuComplex *x,
int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *AP, const cuDoubleComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
const float *y, int incy, float *A, int lda);
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
const double *y, int incy, double *A, int lda);
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
int incx, float *A, int lda);
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
int incx, double *A, int lda);
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
const cuComplex *x, int incx, cuComplex *A, int lda);
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
int incx, float *AP);
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
int incx, double *AP);
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
int incx, cuComplex *AP);
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *A,
int lda);
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
int incx, const double *y, int incy, double *A,
int lda);
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy, cuComplex *A,
int lda);
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *AP);
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *AP);
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
const cuComplex *x, int incx, const cuComplex *y,
int incy, cuComplex *AP);
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy, cuDoubleComplex *AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc);
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C,
int ldc);
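/* Illustrative sketch, not part of the original header: a minimal use of the
 * legacy column-major GEMM declared above.  Matrices are n x n and already in
 * host arrays hA, hB, hC; the helper name exampleSgemm is hypothetical and
 * error checking is elided. */
static __inline__ void exampleSgemm(const float *hA, const float *hB, float *hC, int n)
{
    void *dA, *dB, *dC;
    cublasInit();
    cublasAlloc(n * n, (int)sizeof(float), &dA);
    cublasAlloc(n * n, (int)sizeof(float), &dB);
    cublasAlloc(n * n, (int)sizeof(float), &dC);
    cudaMemcpy(dA, hA, (size_t)n * n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, (size_t)n * n * sizeof(float), cudaMemcpyHostToDevice);
    /* C = 1.0f * A * B + 0.0f * C, no transposition, leading dimensions n */
    cublasSgemm('N', 'N', n, n, n, 1.0f, (const float *)dA, n,
                (const float *)dB, n, 0.0f, (float *)dC, n);
    cudaMemcpy(hC, dC, (size_t)n * n * sizeof(float), cudaMemcpyDeviceToHost);
    cublasFree(dA); cublasFree(dB); cublasFree(dC);
    cublasShutdown();
}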
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
float alpha, const cuComplex *A, int lda,
float beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc);
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, double beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *B, int ldb,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */
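
The declarations above belong to the legacy, handle-less cuBLAS interface: matrices are column-major, `alpha`/`beta` are passed by value, and the routines return `void` (errors are queried separately). A minimal sketch of how a call to the `cublasSgemm` declared above might look is given below; it assumes the CUDA runtime plus the legacy `cublasInit`/`cublasShutdown` entry points declared elsewhere in this header, and it is illustrative only, not code from this repository.

```
/* Minimal sketch (not from this repository): multiply two column-major
   matrices with the legacy, handle-less cublasSgemm declared above.
   Assumes cublasInit/cublasShutdown and the CUDA runtime are available;
   error checking (cublasGetError) is omitted for brevity. */
#include <cuda_runtime.h>
#include <cublas.h>
#include <vector>

int main() {
  const int m = 64, n = 64, k = 64;
  std::vector<float> hA(m * k, 1.0f), hB(k * n, 1.0f), hC(m * n, 0.0f);

  cublasInit();                                   /* legacy API: one global context */
  float *dA, *dB, *dC;
  cudaMalloc((void**)&dA, m * k * sizeof(float));
  cudaMalloc((void**)&dB, k * n * sizeof(float));
  cudaMalloc((void**)&dC, m * n * sizeof(float));
  cudaMemcpy(dA, hA.data(), m * k * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB.data(), k * n * sizeof(float), cudaMemcpyHostToDevice);

  /* C = 1.0 * A * B + 0.0 * C; alpha/beta are passed by value, matrices are
     column-major, and no status is returned by the call itself. */
  cublasSgemm('N', 'N', m, n, k, 1.0f, dA, m, dB, k, 0.0f, dC, m);

  cudaMemcpy(hC.data(), dC, m * n * sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  cublasShutdown();
  return 0;
}
```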

File diff suppressed because it is too large Load Diff

View File

@@ -1,274 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
 * This is the public header file for the new cuBLAS library API; it maps the generic
 * cuBLAS function names to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas1 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */
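
The macros above simply redirect the generic cuBLAS names to the handle-based `_v2` entry points declared in `cublas_api.h` (whose diff is not shown here). As a rough sketch of what that means for a caller, assuming the standard v2 API shipped with CUDA: a `cublasHandle_t` is created explicitly, scalars are passed by address, and every routine returns a `cublasStatus_t`.

```
/* Minimal sketch (assumes the v2 API from cublas_api.h as shipped with CUDA):
   the same SGEMM through the handle-based interface that the macros above
   redirect to. */
#include <cublas_v2.h>

cublasStatus_t sgemm_v2(const float* dA, const float* dB, float* dC,
                        int m, int n, int k) {
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);   /* expands to cublasCreate_v2 */
  if (status != CUBLAS_STATUS_SUCCESS) return status;

  const float alpha = 1.0f, beta = 0.0f;            /* now passed by address */
  status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                       m, n, k, &alpha, dA, m, dB, k, &beta, dC, m);

  cublasDestroy(handle);                            /* expands to cublasDestroy_v2 */
  return status;
}
```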

File diff suppressed because it is too large Load Diff

View File

@@ -1,248 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
#if defined(__cplusplus)
extern "C" {
#endif
struct cudaFuncAttributes;
#if defined(_WIN32)
#define __NV_WEAK__ __declspec(nv_weak)
#else
#define __NV_WEAK__ __attribute__((nv_weak))
#endif
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#undef __NV_WEAK__
#if defined(__cplusplus)
}
#endif
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#endif /* !defined(__CUDACC_RTC__) */
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
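
The header above is the device-side runtime used by CUDA dynamic parallelism: as its comments note, `cudaGetParameterBuffer`/`cudaLaunchDevice` are the low-level PTX path, while user code launches child kernels with `<<< >>>`. A hedged sketch of that documented usage (not code from this repository) follows; it assumes an sm_35-or-newer device and compilation with `-rdc=true` plus linking against `cudadevrt`.

```
/* Minimal dynamic-parallelism sketch: a parent kernel launches a child kernel
   with <<< >>>, which the compiler lowers to the cudaGetParameterBuffer /
   cudaLaunchDevice calls declared above.
   Build (assumption): nvcc -arch=sm_35 -rdc=true parent.cu -lcudadevrt */
#include <cstdio>

__global__ void child(int depth) {
  printf("child kernel, depth %d, thread %d\n", depth, threadIdx.x);
}

__global__ void parent() {
  if (threadIdx.x == 0) {
    child<<<1, 4>>>(1);            /* device-side launch */
    cudaDeviceSynchronize();       /* device-side runtime call declared above */
  }
}

int main() {
  parent<<<1, 32>>>();
  cudaDeviceSynchronize();
  return 0;
}
```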

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,69 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DEVICE_TYPES_H__)
#define __DEVICE_TYPES_H__
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
enum __device_builtin__ cudaRoundMode
{
cudaRoundNearest,
cudaRoundZero,
cudaRoundPosInf,
cudaRoundMinInf
};
#endif /* !__DEVICE_TYPES_H__ */

View File

@@ -1,145 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__
#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"
/**
* \addtogroup CUDART_MEMORY
*
* @{
*/
/**
* \brief Returns a cudaPitchedPtr based on input parameters
*
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
* \p p, \p xsz, and \p ysz.
*
* \param d - Pointer to allocated memory
* \param p - Pitch of allocated memory in bytes
* \param xsz - Logical width of allocation in elements
* \param ysz - Logical height of allocation in elements
*
* \return
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
*
* \sa make_cudaExtent, make_cudaPos
*/
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
{
struct cudaPitchedPtr s;
s.ptr = d;
s.pitch = p;
s.xsize = xsz;
s.ysize = ysz;
return s;
}
/**
* \brief Returns a cudaPos based on input parameters
*
* Returns a ::cudaPos based on the specified input parameters \p x,
* \p y, and \p z.
*
* \param x - X position
* \param y - Y position
* \param z - Z position
*
* \return
* ::cudaPos specified by \p x, \p y, and \p z
*
* \sa make_cudaExtent, make_cudaPitchedPtr
*/
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
{
struct cudaPos p;
p.x = x;
p.y = y;
p.z = z;
return p;
}
/**
* \brief Returns a cudaExtent based on input parameters
*
* Returns a ::cudaExtent based on the specified input parameters \p w,
* \p h, and \p d.
*
* \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
* \param h - Height in elements
* \param d - Depth in elements
*
* \return
* ::cudaExtent specified by \p w, \p h, and \p d
*
* \sa make_cudaPitchedPtr, make_cudaPos
*/
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
{
struct cudaExtent e;
e.width = w;
e.height = h;
e.depth = d;
return e;
}
/** @} */ /* END CUDART_MEMORY */
#endif /* !__DRIVER_FUNCTIONS_H__ */
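
The `make_cudaExtent`/`make_cudaPitchedPtr`/`make_cudaPos` helpers above are normally paired with the 3D memory routines of the runtime API. A minimal sketch, assuming `cudaMalloc3D` and `cudaMemset3D` from the runtime headers (declared elsewhere, not in this file):

```
/* Minimal sketch (assumes cudaMalloc3D/cudaMemset3D from the CUDA runtime):
   allocate and clear a 3D pitched buffer using the helpers defined above. */
#include <cuda_runtime.h>

int alloc3d(size_t width, size_t height, size_t depth) {
  /* for pitched/linear allocations the extent width is given in bytes */
  cudaExtent extent = make_cudaExtent(width * sizeof(float), height, depth);

  cudaPitchedPtr buf;
  if (cudaMalloc3D(&buf, extent) != cudaSuccess) return -1;
  cudaMemset3D(buf, 0, extent);            /* zero the whole pitched volume */

  /* buf.pitch is the padded row size chosen by the allocator */
  cudaFree(buf.ptr);
  return 0;
}
```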

File diff suppressed because it is too large Load Diff

View File

@@ -1,50 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_config.h"

View File

@@ -1,50 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_defines.h"

View File

@@ -1,80 +0,0 @@
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__LIBRARY_TYPES_H__)
#define __LIBRARY_TYPES_H__
typedef enum cudaDataType_t
{
CUDA_R_16F= 2, /* real as a half */
CUDA_C_16F= 6, /* complex as a pair of half numbers */
CUDA_R_32F= 0, /* real as a float */
CUDA_C_32F= 4, /* complex as a pair of float numbers */
CUDA_R_64F= 1, /* real as a double */
CUDA_C_64F= 5, /* complex as a pair of double numbers */
CUDA_R_8I = 3, /* real as a signed char */
CUDA_C_8I = 7, /* complex as a pair of signed char numbers */
CUDA_R_8U = 8, /* real as an unsigned char */
CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */
CUDA_R_32I= 10, /* real as a signed int */
CUDA_C_32I= 11, /* complex as a pair of signed int numbers */
CUDA_R_32U= 12, /* real as an unsigned int */
CUDA_C_32U= 13 /* complex as a pair of unsigned int numbers */
} cudaDataType;
typedef enum libraryPropertyType_t
{
MAJOR_VERSION,
MINOR_VERSION,
PATCH_LEVEL
} libraryPropertyType;
#endif /* !__LIBRARY_TYPES_H__ */
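
`cudaDataType` is how the CUDA libraries describe operand and compute precisions for mixed-precision routines. As one hedged illustration, assuming `cublasGemmEx` from `cublas_api.h` (not shown in this commit) with the pre-CUDA-11 signature that takes a `cudaDataType` compute type:

```
/* Sketch only (assumes cublasGemmEx from cublas_api.h, not shown here):
   cudaDataType tags describe operand and compute precisions of a
   mixed-precision GEMM -- half-precision inputs accumulated in float. */
#include <cublas_v2.h>
#include <cuda_fp16.h>

cublasStatus_t gemm_fp16(cublasHandle_t handle,
                         const __half* dA, const __half* dB, float* dC,
                         int m, int n, int k) {
  const float alpha = 1.0f, beta = 0.0f;
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                      &alpha,
                      dA, CUDA_R_16F, m,     /* A is half precision */
                      dB, CUDA_R_16F, k,     /* B is half precision */
                      &beta,
                      dC, CUDA_R_32F, m,     /* C / accumulator are float */
                      CUDA_R_32F,            /* compute type (pre-CUDA-11 form) */
                      CUBLAS_GEMM_DEFAULT);
}
```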

File diff suppressed because it is too large Load Diff

View File

@@ -1,525 +0,0 @@
//
// NVIDIA_COPYRIGHT_BEGIN
//
// Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
// NVIDIA_COPYRIGHT_END
//
#ifndef __NVRTC_H__
#define __NVRTC_H__
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
/*************************************************************************//**
*
* \defgroup error Error Handling
*
* NVRTC defines the following enumeration type and function for API call
* error handling.
*
****************************************************************************/
/**
* \ingroup error
* \brief The enumerated type nvrtcResult defines API call result codes.
* NVRTC API functions return nvrtcResult to indicate the call
* result.
*/
typedef enum {
NVRTC_SUCCESS = 0,
NVRTC_ERROR_OUT_OF_MEMORY = 1,
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
NVRTC_ERROR_INVALID_INPUT = 3,
NVRTC_ERROR_INVALID_PROGRAM = 4,
NVRTC_ERROR_INVALID_OPTION = 5,
NVRTC_ERROR_COMPILATION = 6,
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
NVRTC_ERROR_INTERNAL_ERROR = 11
} nvrtcResult;
/**
* \ingroup error
* \brief nvrtcGetErrorString is a helper function that returns a string
* describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
* \c "NVRTC_SUCCESS".
* For unrecognized enumeration values, it returns
* \c "NVRTC_ERROR unknown".
*
* \param [in] result CUDA Runtime Compilation API result code.
* \return Message string for the given #nvrtcResult code.
*/
const char *nvrtcGetErrorString(nvrtcResult result);
/*************************************************************************//**
*
* \defgroup query General Information Query
*
* NVRTC defines the following function for general information query.
*
****************************************************************************/
/**
* \ingroup query
* \brief nvrtcVersion sets the output parameters \p major and \p minor
* with the CUDA Runtime Compilation version number.
*
* \param [out] major CUDA Runtime Compilation major version number.
* \param [out] minor CUDA Runtime Compilation minor version number.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
*
*/
nvrtcResult nvrtcVersion(int *major, int *minor);
/*************************************************************************//**
*
* \defgroup compilation Compilation
*
* NVRTC defines the following type and functions for actual compilation.
*
****************************************************************************/
/**
* \ingroup compilation
* \brief nvrtcProgram is the unit of compilation, and an opaque handle for
* a program.
*
* To compile a CUDA program string, an instance of nvrtcProgram must be
* created first with ::nvrtcCreateProgram, then compiled with
* ::nvrtcCompileProgram.
*/
typedef struct _nvrtcProgram *nvrtcProgram;
/**
* \ingroup compilation
* \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
* given input parameters, and sets the output parameter \p prog with
* it.
*
* \param [out] prog CUDA Runtime Compilation program.
* \param [in] src CUDA program source.
* \param [in] name CUDA program name.\n
* \p name can be \c NULL; \c "default_program" is
* used when \p name is \c NULL.
* \param [in] numHeaders Number of headers used.\n
* \p numHeaders must be greater than or equal to 0.
* \param [in] headers Sources of the headers.\n
* \p headers can be \c NULL when \p numHeaders is
* 0.
* \param [in] includeNames Name of each header by which they can be
* included in the CUDA program source.\n
* \p includeNames can be \c NULL when \p numHeaders
* is 0.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcDestroyProgram
*/
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
const char *src,
const char *name,
int numHeaders,
const char * const *headers,
const char * const *includeNames);
/**
* \ingroup compilation
* \brief nvrtcDestroyProgram destroys the given program.
*
* \param [in] prog CUDA Runtime Compilation program.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcCreateProgram
*/
nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
/**
* \ingroup compilation
* \brief nvrtcCompileProgram compiles the given program.
*
* It supports compile options listed in \ref options.
*/
nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
int numOptions, const char * const *options);
/**
* \ingroup compilation
* \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
* generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptxSizeRet Size of the generated PTX (including the trailing
* \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTX
*/
nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetPTX stores the PTX generated by the previous compilation
* of \p prog in the memory pointed by \p ptx.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptx Compiled result.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTXSize
*/
nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
* log generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
 * Note that a compilation log may be generated with warnings and informative
* messages, even when the compilation of \p prog succeeds.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] logSizeRet Size of the compilation log
* (including the trailing \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLog
*/
nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLog stores the log generated by the previous
* compilation of \p prog in the memory pointed by \p log.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] log Compilation log.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLogSize
*/
nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
/**
* \ingroup compilation
* \brief nvrtcAddNameExpression notes the given name expression
* denoting a __global__ function or function template
* instantiation.
*
* The identical name expression string must be provided on a subsequent
* call to nvrtcGetLoweredName to extract the lowered name.
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
*
* \see ::nvrtcGetLoweredName
*/
nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
const char * const name_expression);
/**
* \ingroup compilation
* \brief nvrtcGetLoweredName extracts the lowered (mangled) name
* for a __global__ function or function template instantiation,
* and updates *lowered_name to point to it. The memory containing
* the name is released when the NVRTC program is destroyed by
* nvrtcDestroyProgram.
* The identical name expression must have been previously
* provided to nvrtcAddNameExpression.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \param [out] lowered_name initialized by the function to point to a
* C string containing the lowered (mangled)
* name corresponding to the provided name expression.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
*
* \see ::nvrtcAddNameExpression
*/
nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
const char *const name_expression,
const char** lowered_name);
/**
* \defgroup options Supported Compile Options
*
* NVRTC supports the compile options below.
 * Option names with two preceding dashes (\c --) are long option names and
* option names with one preceding dash (\c -) are short option names.
* Short option names can be used instead of long option names.
* When a compile option takes an argument, an assignment operator (\c =)
* is used to separate the compile option argument from the compile option
* name, e.g., \c "--gpu-architecture=compute_30".
* Alternatively, the compile option name and the argument can be specified in
 * separate strings without an assignment operator, e.g.,
* \c "--gpu-architecture" \c "compute_30".
* Single-character short option names, such as \c -D, \c -U, and \c -I, do
* not require an assignment operator, and the compile option name and the
* argument can be present in the same string with or without spaces between
* them.
* For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
* supported.
*
* The valid compiler options are:
*
* - Compilation targets
* - \c --gpu-architecture=\<arch\> (\c -arch)\n
* Specify the name of the class of GPU architectures for which the
* input must be compiled.\n
* - Valid <c>\<arch\></c>s:
* - \c compute_30
* - \c compute_32
* - \c compute_35
* - \c compute_37
* - \c compute_50
* - \c compute_52
* - \c compute_53
* - \c compute_60
* - \c compute_61
* - \c compute_62
* - \c compute_70
* - \c compute_72
* - Default: \c compute_30
* - Separate compilation / whole-program compilation
* - \c --device-c (\c -dc)\n
* Generate relocatable code that can be linked with other relocatable
* device code. It is equivalent to --relocatable-device-code=true.
* - \c --device-w (\c -dw)\n
* Generate non-relocatable code. It is equivalent to
* \c --relocatable-device-code=false.
* - \c --relocatable-device-code={true|false} (\c -rdc)\n
* Enable (disable) the generation of relocatable device code.
* - Default: \c false
* - Debugging support
* - \c --device-debug (\c -G)\n
* Generate debug information.
* - \c --generate-line-info (\c -lineinfo)\n
* Generate line-number information.
* - Code generation
* - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
 *     Specify the maximum number of registers that GPU functions can use.
 *     Up to a function-specific limit, a higher value will generally
* increase the performance of individual GPU threads that execute this
* function. However, because thread registers are allocated from a
* global register pool on each GPU, a higher value of this option will
* also reduce the maximum thread block size, thereby reducing the amount
* of thread parallelism. Hence, a good maxrregcount value is the result
* of a trade-off. If this option is not specified, then no maximum is
 *     assumed. Values less than the minimum number of registers required by
 *     the ABI will be bumped up by the compiler to the ABI minimum.
* - \c --ftz={true|false} (\c -ftz)\n
* When performing single-precision floating-point operations, flush
* denormal values to zero or preserve denormal values.
* \c --use_fast_math implies \c --ftz=true.
* - Default: \c false
* - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
* For single-precision floating-point square root, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-sqrt=false.
* - Default: \c true
* - \c --prec-div={true|false} (\c -prec-div)\n
* For single-precision floating-point division and reciprocals, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-div=false.
* - Default: \c true
* - \c --fmad={true|false} (\c -fmad)\n
* Enables (disables) the contraction of floating-point multiplies and
* adds/subtracts into floating-point multiply-add operations (FMAD,
* FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
* - Default: \c true
* - \c --use_fast_math (\c -use_fast_math)\n
* Make use of fast math operations.
* \c --use_fast_math implies \c --ftz=true \c --prec-div=false
* \c --prec-sqrt=false \c --fmad=true.
* - Preprocessing
* - \c --define-macro=\<def\> (\c -D)\n
* \c \<def\> can be either \c \<name\> or \c \<name\>=\<definition\>.
* - \c \<name\> \n
* Predefine \c \<name\> as a macro with definition \c 1.
* - \c \<name\>=\<definition\> \n
* The contents of \c \<definition\> are tokenized and preprocessed
* as if they appeared during translation phase three in a \c \#define
* directive. In particular, the definition will be truncated by
* embedded new line characters.
* - \c --undefine-macro=\<def\> (\c -U)\n
* Cancel any previous definition of \c \<def\>.
* - \c --include-path=\<dir\> (\c -I)\n
* Add the directory \c \<dir\> to the list of directories to be
* searched for headers. These paths are searched after the list of
* headers given to ::nvrtcCreateProgram.
* - \c --pre-include=\<header\> (\c -include)\n
* Preinclude \c \<header\> during preprocessing.
* - Language Dialect
* - \c --std={c++11|c++14} (\c -std={c++11|c++14})\n
* Set language dialect to C++11 or C++14.
* - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
* Provide builtin definitions of \c std::move and \c std::forward,
* when C++11 language dialect is selected.
* - Default: \c true
* - \c --builtin-initializer-list={true|false}
* (\c -builtin-initializer-list)\n
* Provide builtin definitions of \c std::initializer_list class and
* member functions when C++11 language dialect is selected.
* - Default: \c true
* - Misc.
* - \c --disable-warnings (\c -w)\n
* Inhibit all warning messages.
* - \c --restrict (\c -restrict)\n
* Programmer assertion that all kernel pointer parameters are restrict
* pointers.
* - \c --device-as-default-execution-space
* (\c -default-device)\n
* Treat entities with no execution space annotation as \c __device__
* entities.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] numOptions Number of compiler options passed.
* \param [in] options Compiler options in the form of C string array.\n
* \p options can be \c NULL when \p numOptions is 0.
*
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
* - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
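For illustration of the option syntax documented above, here is a minimal host-side sketch that compiles a small kernel with NVRTC and passes a few of the listed options (architecture given as two separate strings, FMA contraction, a macro definition, a dialect). The kernel string, option values, and the NVRTC_CHECK macro are placeholders added for this example, not part of the original header.

```
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Abort on any NVRTC error (illustrative error handling only).
#define NVRTC_CHECK(x) do { nvrtcResult r = (x); if (r != NVRTC_SUCCESS) { \
  std::fprintf(stderr, "NVRTC error: %s\n", nvrtcGetErrorString(r)); std::exit(1); } } while (0)

int main() {
  const char* src =
      "extern \"C\" __global__ void axpy(float a, float* x, float* y, int n) {\n"
      "  int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
      "  if (i < n) y[i] = a * x[i] + y[i];\n"
      "}\n";

  nvrtcProgram prog;
  NVRTC_CHECK(nvrtcCreateProgram(&prog, src, "axpy.cu", 0, nullptr, nullptr));

  // Long options take '=', a name and its argument may also be split across
  // two strings, and single-character short options such as -D need no '='.
  const char* opts[] = {
    "--gpu-architecture", "compute_50",  // name and argument as separate strings
    "--fmad=false",                      // disable FMA contraction
    "--std=c++11",
    "-DBLOCK=128"                        // short -D option, no assignment operator
  };
  nvrtcResult compile =
      nvrtcCompileProgram(prog, (int)(sizeof(opts) / sizeof(opts[0])), opts);

  // The log is worth printing whether or not compilation succeeded.
  size_t log_size;
  NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
  std::string log(log_size, '\0');
  NVRTC_CHECK(nvrtcGetProgramLog(prog, &log[0]));
  if (log_size > 1) std::fprintf(stderr, "%s\n", log.c_str());
  NVRTC_CHECK(compile);

  // Retrieve the generated PTX.
  size_t ptx_size;
  NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
  std::vector<char> ptx(ptx_size);
  NVRTC_CHECK(nvrtcGetPTX(prog, ptx.data()));
  NVRTC_CHECK(nvrtcDestroyProgram(&prog));
  return 0;
}
```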
/* The utility function 'nvrtcGetTypeName' is not available by default. Define
the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
*/
#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
#include <cxxabi.h>
#include <cstdlib>
#elif defined(_WIN32)
#include <Windows.h>
#include <DbgHelp.h>
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
#include <string>
#include <typeinfo>
/*************************************************************************//**
*
* \defgroup hosthelper Host Helper
*
* NVRTC defines the following functions for easier interaction with host code.
*
****************************************************************************/
/**
* \ingroup hosthelper
* \brief nvrtcGetTypeName stores the source level name of the template type argument
* T in the given std::string location.
*
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
* respectively. If the name extraction fails, it will return
* NVRTC_ERROR_INTERNAL_ERROR; otherwise, *result is initialized with the
* extracted name.
*
* \param [in] result: pointer to std::string in which to store the type name.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
*
*/
template <typename T>
nvrtcResult nvrtcGetTypeName(std::string *result)
{
const char *name = typeid(T).name();
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__
int status;
char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
if (status == 0) {
*result = undecorated_name;
free(undecorated_name);
return NVRTC_SUCCESS;
}
#elif defined(_WIN32)
char undecorated_name[4096];
if(UnDecorateSymbolName(name, undecorated_name,
sizeof(undecorated_name) / sizeof(*undecorated_name),
UNDNAME_COMPLETE) ) {
*result = undecorated_name;
return NVRTC_SUCCESS;
}
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
return NVRTC_ERROR_INTERNAL_ERROR;
}
#endif /* NVRTC_GET_TYPE_NAME */
#endif /* __NVRTC_H__ */
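A short usage sketch for the helper above: define NVRTC_GET_TYPE_NAME to a non-zero value before including nvrtc.h, then call the template on any host-visible type, typically to build a name expression for nvrtcGetLoweredName. The demo namespace and Wrapper type below are hypothetical.

```
#define NVRTC_GET_TYPE_NAME 1  // must be non-zero before including the header
#include <nvrtc.h>
#include <iostream>
#include <string>

namespace demo { template <typename T> struct Wrapper {}; }  // hypothetical user type

int main() {
  std::string name;
  // Demangle the source-level name of the template instantiation on the host.
  if (nvrtcGetTypeName<demo::Wrapper<float>>(&name) == NVRTC_SUCCESS)
    std::cout << name << std::endl;   // e.g. "demo::Wrapper<float>"
  else
    std::cerr << "name extraction failed" << std::endl;
  return 0;
}
```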

View File

@@ -1,119 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__SURFACE_TYPES_H__)
#define __SURFACE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaSurfaceType1D 0x01
#define cudaSurfaceType2D 0x02
#define cudaSurfaceType3D 0x03
#define cudaSurfaceTypeCubemap 0x0C
#define cudaSurfaceType1DLayered 0xF1
#define cudaSurfaceType2DLayered 0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
/**
* CUDA Surface boundary modes
*/
enum __device_builtin__ cudaSurfaceBoundaryMode
{
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
};
/**
* CUDA Surface format modes
*/
enum __device_builtin__ cudaSurfaceFormatMode
{
cudaFormatModeForced = 0, /**< Forced format mode */
cudaFormatModeAuto = 1 /**< Auto format mode */
};
/**
* CUDA Surface reference
*/
struct __device_builtin__ surfaceReference
{
/**
* Channel descriptor for surface reference
*/
struct cudaChannelFormatDesc channelDesc;
};
/**
* An opaque value that represents a CUDA Surface object
*/
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__SURFACE_TYPES_H__ */
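For context, the boundary modes above are consumed by the surface intrinsics at each read or write. A minimal device-side sketch, assuming a cudaSurfaceObject_t already bound to a 2D float array (the kernel name and clamp choice are illustrative):

```
#include <cuda_runtime.h>

// Scales every element of a 2D float surface in place.
// Out-of-range accesses are clamped rather than trapping (cudaBoundaryModeClamp).
__global__ void scale_surface(cudaSurfaceObject_t surf, int width, int height, float factor)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x < width && y < height) {
    // Surface x-coordinates are expressed in bytes, hence the sizeof(float) scaling.
    float v = surf2Dread<float>(surf, x * (int)sizeof(float), y, cudaBoundaryModeClamp);
    surf2Dwrite(v * factor, surf, x * (int)sizeof(float), y, cudaBoundaryModeClamp);
  }
}
```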

View File

@@ -1,217 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaTextureType1D 0x01
#define cudaTextureType2D 0x02
#define cudaTextureType3D 0x03
#define cudaTextureTypeCubemap 0x0C
#define cudaTextureType1DLayered 0xF1
#define cudaTextureType2DLayered 0xF2
#define cudaTextureTypeCubemapLayered 0xFC
/**
* CUDA texture address modes
*/
enum __device_builtin__ cudaTextureAddressMode
{
cudaAddressModeWrap = 0, /**< Wrapping address mode */
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
cudaAddressModeMirror = 2, /**< Mirror address mode */
cudaAddressModeBorder = 3 /**< Border address mode */
};
/**
* CUDA texture filter modes
*/
enum __device_builtin__ cudaTextureFilterMode
{
cudaFilterModePoint = 0, /**< Point filter mode */
cudaFilterModeLinear = 1 /**< Linear filter mode */
};
/**
* CUDA texture read modes
*/
enum __device_builtin__ cudaTextureReadMode
{
cudaReadModeElementType = 0, /**< Read texture as specified element type */
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
};
/**
* CUDA texture reference
*/
struct __device_builtin__ textureReference
{
/**
* Indicates whether texture reads are normalized or not
*/
int normalized;
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Channel descriptor for the texture reference
*/
struct cudaChannelFormatDesc channelDesc;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
int __cudaReserved[15];
};
/**
* CUDA texture descriptor
*/
struct __device_builtin__ cudaTextureDesc
{
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture read mode
*/
enum cudaTextureReadMode readMode;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Texture Border Color
*/
float borderColor[4];
/**
* Indicates whether texture reads are normalized or not
*/
int normalizedCoords;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
};
/**
* An opaque value that represents a CUDA texture object
*/
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__TEXTURE_TYPES_H__ */
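To show how the descriptor fields above fit together, here is a hedged host-side sketch that builds a texture object over a linear device buffer; the clamp/point/element-type settings are illustrative choices, not requirements.

```
#include <cuda_runtime.h>

// Creates a 1D texture object over `n` floats already resident at `d_data`.
// Error handling is omitted for brevity.
cudaTextureObject_t make_float_texture(float* d_data, size_t n)
{
  cudaResourceDesc res{};
  res.resType = cudaResourceTypeLinear;
  res.res.linear.devPtr = d_data;
  res.res.linear.desc = cudaCreateChannelDesc<float>();
  res.res.linear.sizeInBytes = n * sizeof(float);

  cudaTextureDesc tex{};
  tex.addressMode[0] = cudaAddressModeClamp;   // clamp out-of-range coordinates
  tex.filterMode = cudaFilterModePoint;        // no interpolation
  tex.readMode = cudaReadModeElementType;      // return the raw float
  tex.normalizedCoords = 0;                    // integer indexing

  cudaTextureObject_t obj = 0;
  cudaCreateTextureObject(&obj, &res, &tex, nullptr);
  return obj;
}
```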

View File

@@ -1,177 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
#undef __VECTOR_FUNCTIONS_DECL__
#if !defined(__CUDACC_RTC__)
#include "vector_functions.hpp"
#endif /* !__CUDACC_RTC__ */
#endif /* !__VECTOR_FUNCTIONS_H__ */

View File

@@ -1,318 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_HPP__)
#define __VECTOR_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
{
char1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
{
uchar1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
{
char2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
{
uchar2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
{
char3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
{
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
{
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
{
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
{
short1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
{
ushort1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
{
short2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
{
ushort2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
{
short3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
{
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
{
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
{
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
{
int1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
{
uint1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
{
int2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
{
uint2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
{
int3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
{
uint3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
{
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
{
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
{
long1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
{
ulong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
{
long2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{
ulong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
{
long3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
{
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
{
float1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
{
float2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
{
float3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
{
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
{
longlong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
{
ulonglong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
{
longlong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
{
ulonglong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
{
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
{
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
{
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
{
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
{
double1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
{
double2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
{
double3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
{
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#undef __VECTOR_FUNCTIONS_DECL__
#endif /* !__VECTOR_FUNCTIONS_HPP__ */

View File

@@ -1,425 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(push)
#pragma warning(disable: 4201 4408)
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ tag \
{ \
union \
{ \
struct { members }; \
struct { long long int :1,:0; }; \
}; \
}
#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ __align__(8) tag \
{ \
members \
}
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
struct __device_builtin__ char1
{
signed char x;
};
struct __device_builtin__ uchar1
{
unsigned char x;
};
struct __device_builtin__ __align__(2) char2
{
signed char x, y;
};
struct __device_builtin__ __align__(2) uchar2
{
unsigned char x, y;
};
struct __device_builtin__ char3
{
signed char x, y, z;
};
struct __device_builtin__ uchar3
{
unsigned char x, y, z;
};
struct __device_builtin__ __align__(4) char4
{
signed char x, y, z, w;
};
struct __device_builtin__ __align__(4) uchar4
{
unsigned char x, y, z, w;
};
struct __device_builtin__ short1
{
short x;
};
struct __device_builtin__ ushort1
{
unsigned short x;
};
struct __device_builtin__ __align__(4) short2
{
short x, y;
};
struct __device_builtin__ __align__(4) ushort2
{
unsigned short x, y;
};
struct __device_builtin__ short3
{
short x, y, z;
};
struct __device_builtin__ ushort3
{
unsigned short x, y, z;
};
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
struct __device_builtin__ int1
{
int x;
};
struct __device_builtin__ uint1
{
unsigned int x;
};
__cuda_builtin_vector_align8(int2, int x; int y;);
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
struct __device_builtin__ int3
{
int x, y, z;
};
struct __device_builtin__ uint3
{
unsigned int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) int4
{
int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) uint4
{
unsigned int x, y, z, w;
};
struct __device_builtin__ long1
{
long int x;
};
struct __device_builtin__ ulong1
{
unsigned long x;
};
#if defined(_WIN32)
__cuda_builtin_vector_align8(long2, long int x; long int y;);
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* !_WIN32 */
struct __device_builtin__ __align__(2*sizeof(long int)) long2
{
long int x, y;
};
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
{
unsigned long int x, y;
};
#endif /* _WIN32 */
struct __device_builtin__ long3
{
long int x, y, z;
};
struct __device_builtin__ ulong3
{
unsigned long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) long4
{
long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
struct __device_builtin__ float1
{
float x;
};
#if !defined(__CUDACC__) && defined(__arm__) && \
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-pedantic"
struct __device_builtin__ __attribute__((aligned(8))) float2
{
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
};
#pragma GCC poison __cuda_gnu_arm_ice_workaround
#pragma GCC diagnostic pop
#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
__cuda_builtin_vector_align8(float2, float x; float y;);
#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
struct __device_builtin__ float3
{
float x, y, z;
};
struct __device_builtin__ __builtin_align__(16) float4
{
float x, y, z, w;
};
struct __device_builtin__ longlong1
{
long long int x;
};
struct __device_builtin__ ulonglong1
{
unsigned long long int x;
};
struct __device_builtin__ __builtin_align__(16) longlong2
{
long long int x, y;
};
struct __device_builtin__ __builtin_align__(16) ulonglong2
{
unsigned long long int x, y;
};
struct __device_builtin__ longlong3
{
long long int x, y, z;
};
struct __device_builtin__ ulonglong3
{
unsigned long long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) longlong4
{
long long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
};
struct __device_builtin__ double1
{
double x;
};
struct __device_builtin__ __builtin_align__(16) double2
{
double x, y;
};
struct __device_builtin__ double3
{
double x, y, z;
};
struct __device_builtin__ __builtin_align__(16) double4
{
double x, y, z, w;
};
#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
#pragma warning(pop)
#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
typedef __device_builtin__ struct char1 char1;
typedef __device_builtin__ struct uchar1 uchar1;
typedef __device_builtin__ struct char2 char2;
typedef __device_builtin__ struct uchar2 uchar2;
typedef __device_builtin__ struct char3 char3;
typedef __device_builtin__ struct uchar3 uchar3;
typedef __device_builtin__ struct char4 char4;
typedef __device_builtin__ struct uchar4 uchar4;
typedef __device_builtin__ struct short1 short1;
typedef __device_builtin__ struct ushort1 ushort1;
typedef __device_builtin__ struct short2 short2;
typedef __device_builtin__ struct ushort2 ushort2;
typedef __device_builtin__ struct short3 short3;
typedef __device_builtin__ struct ushort3 ushort3;
typedef __device_builtin__ struct short4 short4;
typedef __device_builtin__ struct ushort4 ushort4;
typedef __device_builtin__ struct int1 int1;
typedef __device_builtin__ struct uint1 uint1;
typedef __device_builtin__ struct int2 int2;
typedef __device_builtin__ struct uint2 uint2;
typedef __device_builtin__ struct int3 int3;
typedef __device_builtin__ struct uint3 uint3;
typedef __device_builtin__ struct int4 int4;
typedef __device_builtin__ struct uint4 uint4;
typedef __device_builtin__ struct long1 long1;
typedef __device_builtin__ struct ulong1 ulong1;
typedef __device_builtin__ struct long2 long2;
typedef __device_builtin__ struct ulong2 ulong2;
typedef __device_builtin__ struct long3 long3;
typedef __device_builtin__ struct ulong3 ulong3;
typedef __device_builtin__ struct long4 long4;
typedef __device_builtin__ struct ulong4 ulong4;
typedef __device_builtin__ struct float1 float1;
typedef __device_builtin__ struct float2 float2;
typedef __device_builtin__ struct float3 float3;
typedef __device_builtin__ struct float4 float4;
typedef __device_builtin__ struct longlong1 longlong1;
typedef __device_builtin__ struct ulonglong1 ulonglong1;
typedef __device_builtin__ struct longlong2 longlong2;
typedef __device_builtin__ struct ulonglong2 ulonglong2;
typedef __device_builtin__ struct longlong3 longlong3;
typedef __device_builtin__ struct ulonglong3 ulonglong3;
typedef __device_builtin__ struct longlong4 longlong4;
typedef __device_builtin__ struct ulonglong4 ulonglong4;
typedef __device_builtin__ struct double1 double1;
typedef __device_builtin__ struct double2 double2;
typedef __device_builtin__ struct double3 double3;
typedef __device_builtin__ struct double4 double4;
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
struct __device_builtin__ dim3
{
unsigned int x, y, z;
#if defined(__cplusplus)
__host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus */
};
typedef __device_builtin__ struct dim3 dim3;
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */
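As a small usage note for the dim3 type declared above: its defaulted constructor arguments make launch configurations concise, and it converts to uint3 inside kernels. The fill kernel below is a placeholder added for illustration.

```
#include <cuda_runtime.h>

__global__ void fill(float* out, float value, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

void launch_fill(float* d_out, float value, int n)
{
  dim3 block(256);                              // y and z default to 1
  dim3 grid((n + block.x - 1) / block.x);       // one thread per element
  fill<<<grid, block>>>(d_out, value, n);
}
```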

File diff suppressed because it is too large

View File

@@ -1,148 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_RUNTIME_PREDICT_H_
#define ISAAC_RUNTIME_PREDICT_H_
#include <fstream>
#include <vector>
#include <memory>
#include <iostream>
#include <cstring>
#include <algorithm>
#include "isaac/tools/matrix.hpp"
#include "isaac/driver/device.h"
#include "isaac/templates/common.hpp"
#include "isaac/templates/pool.h"
#include "isaac/templates/conv.h"
#include "isaac/templates/gemm.h"
#include <map>
namespace isaac{
namespace runtime{
// Layers
class Layer{
public:
static Layer* read(u_char*& current);
virtual void forward(matrix<float> const & X, matrix<float> & Y) = 0;
virtual size_t n_outs(size_t n_outs_prev) = 0;
};
class Activation: public Layer{
public:
static const int BINARY_CODE = 0;
size_t n_outs(size_t n_outs_prev);
private:
};
class ReLU: public Activation{
public:
static const int BINARY_CODE = 0;
void forward(matrix<float> const & X, matrix<float> & Y);
};
class Linear: public Activation{
public:
static const int BINARY_CODE = 1;
void forward(matrix<float> const & X, matrix<float> & Y);
};
// Dense
class Dense: public Layer{
public:
static const int BINARY_CODE = 1;
Dense(u_char*& data);
size_t n_outs(size_t);
void forward(matrix<float> const & X, matrix<float> & Y);
private:
matrix<float> W_;
std::vector<float> b_;
};
// Network
class Network{
public:
Network(u_char* data);
void predict(const matrix<float>& X, matrix<float>& Y);
private:
std::vector<std::shared_ptr<Layer>> layers_;
};
enum OperationType{
GEMM,
CONV,
POOL
};
//Profile
class Profile{
protected:
typedef void (&validator_t)(driver::Device const &, size_t, param_t*, uint8_t*);
typedef std::function<double(std::vector<param_t> const&)> benchmark_t;
public:
Profile(u_char* data, size_t nshapes);
std::vector<param_t> predict(driver::Device const & device, std::vector<param_t> const & shapes, validator_t const & validator, benchmark_t const & benchmark, size_t num_re_evaluate);
matrix<param_t> const & kernels() const;
private:
matrix<param_t> kernels_;
driver::Device device_;
Network predictor_;
};
class ConvProfile: public Profile{
public:
ConvProfile(u_char* data);
templates::Conv predict(driver::Stream& stream, DType in_dtype, DType out_dtype, param_t C, param_t D, param_t H, param_t W, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
param_t pad_d, param_t pad_h, param_t pad_w,
param_t stride_d, param_t stride_h, param_t stride_w,
param_t upsample_d, param_t upsample_h, param_t upsample_w,
ActivationType activation, size_t num_outputs,
ResidualType residual, param_t Zk, param_t crop_z_m0, param_t crop_z_m1, param_t crop_z_p0, param_t crop_z_p1, param_t crop_z_q0, param_t crop_z_q1, size_t num_re_evaluate = 1);
};
class PoolProfile: public Profile{
public:
PoolProfile(u_char* data);
templates::Pool predict(driver::Stream& stream, DType in_dtype, DType out_dtype, PoolType pool_type, param_t C, param_t D, param_t H, param_t W, param_t N, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
param_t pad_d, param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w, size_t num_re_evaluate = 1);
};
class GEMMProfile: public Profile{
public:
GEMMProfile(u_char* data);
templates::GEMM predict(driver::Stream& stream, DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc, size_t num_re_evaluate = 1);
};
//Database
extern const std::map<std::pair<driver::Device::Architecture, OperationType>, std::shared_ptr<Profile> > database;
}
}
#endif

View File

@@ -1,95 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_SCALAR_H
#define ISAAC_SCALAR_H
#include "isaac/external/half.hpp"
namespace isaac{
enum DType{
INT8X4_TYPE = 1,
INT32_TYPE,
FLOAT_TYPE,
DOUBLE_TYPE,
};
inline size_t size_of(DType dtype){
switch (dtype) {
case INT8X4_TYPE: return 4;
case INT32_TYPE: return 4;
case FLOAT_TYPE: return 4;
case DOUBLE_TYPE: return 8;
default: throw;
}
}
template<class T> struct to_DType;
template<> struct to_DType<int32_t>{ static const DType value = INT8X4_TYPE; };
template<> struct to_DType<float>{ static const DType value = FLOAT_TYPE; };
template<> struct to_DType<double>{ static const DType value = DOUBLE_TYPE; };
class scalar{
private:
template<class T>
void init(T const & x){
switch(dtype_){
case INT32_TYPE: value_.int32 = (int32_t)x; break;
case FLOAT_TYPE: value_.float32 = (float)x; break;
case DOUBLE_TYPE: value_.float64 = (double)x; break;
default: throw;
}
}
public:
#define ISAAC_INSTANTIATE(TYPE) scalar(TYPE value, DType dtype = to_DType<TYPE>::value) : dtype_(dtype) { init(value); }
ISAAC_INSTANTIATE(float)
ISAAC_INSTANTIATE(double)
#undef ISAAC_INSTANTIATE
void* data() const{
switch(dtype_){
case INT32_TYPE: return (void*)&value_.int32;
case FLOAT_TYPE: return (void*)&value_.float32;
case DOUBLE_TYPE: return (void*)&value_.float64;
default: throw;
}
}
DType dtype() const{
return dtype_;
}
private:
DType dtype_;
union{
int32_t int32;
float float32;
double float64;
}value_;
};
}
#endif

View File

@@ -1,89 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_COMMON_HPP_
#define ISAAC_TEMPLATES_COMMON_HPP_
#include <cstddef>
#include <cstdint>
#include <vector>
#include "isaac/scalar.h"
namespace isaac{
inline int32_t ceil(int32_t num, int32_t div){
return (num + div - 1)/div;
}
inline size_t log2(size_t x){
size_t res = 0;
while((x>>=1)>0) res++;
return res;
}
inline size_t next_pow2(size_t N){
size_t res = 1;
while(res < N)
res*=2;
return res;
}
inline std::string arith_str(DType dtype){
switch (dtype) {
case INT8X4_TYPE: return "s32";
case FLOAT_TYPE: return "f32";
case DOUBLE_TYPE: return "f64";
default: throw;
}
}
inline std::string io_str(DType dtype){
switch (dtype) {
case INT8X4_TYPE: return "b32";
case FLOAT_TYPE: return "b32";
case DOUBLE_TYPE: return "b64";
default: throw;
}
}
typedef uint32_t param_t;
namespace driver{
class Device;
class Stream;
class Kernel;
class Buffer;
}
namespace templates{
class Generator{
public:
Generator(){}
virtual std::string dump(driver::Device const & device, std::string const & name) = 0;
virtual std::vector<param_t> tuning_params() const = 0;
};
}
}
#endif

View File

@@ -1,155 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_CONV_H_
#define ISAAC_TEMPLATES_CONV_H_
#include <cstddef>
#include <string>
#include "isaac/templates/common.hpp"
namespace isaac{
enum ActivationType{
Linear,
ReLU,
ELU,
Sigmoid
};
enum ResidualType{
NoResidual,
CatResidual,
AddResidual
};
namespace templates{
class Conv: public Generator{
public:
static const std::string id;
static const size_t Nshapes;
static const size_t Ntune;
static const size_t Nparams;
private:
void init_constant_memory(std::vector<int32_t>& delta, std::vector<uint32_t> &masks, size_t nlut, int32_t strideIc, int32_t strideIw, int32_t strideIh, int32_t strideId);
public:
Conv(DType in_dtype, DType out_dtype, param_t C, param_t D, param_t H, param_t W, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
param_t pad_h, param_t pad_w, param_t pad_d, param_t stride_h, param_t stride_w, param_t stride_d, param_t upsample_d, param_t upsample_h, param_t upsample_w,
ActivationType activation, size_t num_outputs,
ResidualType residual_type, param_t Zk, param_t z_crop_m0, param_t z_crop_m1, param_t z_crop_p0, param_t z_crop_p1, param_t z_crop_q0, param_t z_crop_q1,
param_t vec, param_t bpqn, param_t bk, param_t pqns, param_t ks, param_t crs_l, param_t cs, param_t bc, param_t gridc);
// Execution
std::string dump(driver::Device const & device, std::string const & name);
std::vector<param_t> tuning_params() const;
void enqueue(driver::Kernel& kernel, driver::Stream& queue, driver::Buffer const & I, driver::Buffer const & F, driver::Buffer *O, driver::Buffer const * bias = NULL, float alpha = 0, float iscale = 1, float fscale = 1, std::vector<float> oscale = {1}, float z_scale = 1, driver::Buffer const *Z = NULL);
// Validity
static void output_shapes(param_t D, param_t H, param_t W, param_t T, param_t R, param_t S, param_t pad_d,
param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w,
param_t upsample_d, param_t upsample_h, param_t upsample_w,
param_t& M, param_t& P, param_t& Q);
static void check_valid(driver::Device const & device, size_t M, param_t* params, uint8_t* valid);
// Benchmark
static double tflops(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double time);
private:
// data types
DType in_dtype_;
DType out_dtype_;
// activation type
ActivationType activation_;
size_t num_outputs_;
// residual
ResidualType residual_type_;
param_t Zk_;
param_t z_crop_m0_;
param_t z_crop_m1_;
param_t z_crop_p0_;
param_t z_crop_p1_;
param_t z_crop_q0_;
param_t z_crop_q1_;
param_t Zm_;
param_t Zp_;
param_t Zq_;
//input shapes
param_t C_;
param_t N_;
param_t K_;
param_t Kout_;
// Input dimensions
param_t D_;
param_t H_;
param_t W_;
// Output Dimensions
param_t M_;
param_t P_;
param_t Q_;
// Filter Dimensions
param_t T_;
param_t R_;
param_t S_;
// Pad
param_t pad_d_;
param_t pad_h_;
param_t pad_w_;
// stride
param_t stride_d_;
param_t stride_h_;
param_t stride_w_;
// upsample
param_t upsample_d_;
param_t upsample_h_;
param_t upsample_w_;
//parameters
param_t vec_;
param_t bc0_;
param_t bc1_;
param_t cs0_;
param_t cs1_;
param_t bf_n_;
param_t u_;
param_t us_;
param_t zs_;
param_t bz_;
param_t gridz_;
// constant memory
std::vector<int32_t> cLUT;
std::vector<uint32_t> masks_;
};
}
}
#endif

View File

@@ -1,39 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_ERROR_HPP_
#define ISAAC_TEMPLATES_ERROR_HPP_
#include <exception>
namespace isaac{
namespace templates{
class invalid_parameters: public std::exception {
public:
const char * what() const throw(){ return "Invalid parameters";}
};
}
}
#endif

View File

@@ -1,102 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_GEMM_H_
#define ISAAC_TEMPLATES_GEMM_H_
#include <cstddef>
#include <string>
#include "isaac/templates/common.hpp"
#include "isaac/scalar.h"
namespace isaac{
namespace driver{
class Device;
class Stream;
class Kernel;
class Buffer;
}
enum IsaacOperation_t{
ISAAC_OP_N = 1,
ISAAC_OP_T = 2
};
namespace templates{
class GEMM: public Generator{
public:
static const std::string id;
static const size_t Nshapes;
static const size_t Ntune;
static const size_t Nparams;
public:
GEMM(DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K, param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc,
param_t vec, param_t bm, param_t u, param_t bn, param_t ms, param_t us, param_t ns, param_t ba0, param_t ba1, param_t bb0, param_t bb1,
param_t ks, param_t bk, param_t kg);
std::string dump(driver::Device const & device, std::string const & name);
std::vector<param_t> tuning_params() const;
void enqueue(driver::Kernel& kernel, driver::Stream& queue, scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C, float a_scale = 1, float b_scale = 1, float c_scale = 1, const driver::Buffer *bias = NULL);
static void check_valid(driver::Device const & device, size_t M, param_t* params, uint8_t* valid);
static double tflops(param_t M, param_t N, param_t K, double time);
private:
DType in_dtype_;
DType out_dtype_;
//transposition
IsaacOperation_t AT_;
IsaacOperation_t BT_;
//input shapes
param_t M_;
param_t N_;
param_t K_;
param_t offa_;
param_t lda_;
param_t offb_;
param_t ldb_;
param_t offc_;
param_t ldc_;
//parameters
param_t vec_;
param_t bc0_;
param_t bc1_;
param_t cs0_;
param_t cs1_;
param_t u_;
param_t us_;
param_t ba0_;
param_t ba1_;
param_t bb0_;
param_t bb1_;
param_t zs_;
param_t bz_;
param_t gridz_;
param_t stn_;
};
}
}
#endif
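A hedged sketch of how this template class is driven: shapes plus tuning parameters in, PTX out. The include path and the `FLOAT_TYPE` enumerator name are assumptions, and the fourteen trailing tuning values are illustrative placeholders only; real ones come from the auto-tuned profile and should be screened with `GEMM::check_valid`.
```
#include "isaac/templates/gemm.h"     // include path assumed
#include "isaac/driver/backend.h"
#include <iostream>

int main(){
  namespace drv = isaac::driver;
  drv::Device const & device = drv::backend::contexts::get_default().device();
  isaac::param_t M = 2048, N = 2048, K = 2048;
  // fp32 "NT" GEMM; FLOAT_TYPE is an assumed DType enumerator name, and the
  // tuning parameters (vec, bm, u, bn, ms, us, ns, ba0, ba1, bb0, bb1, ks, bk, kg)
  // are placeholders that would normally come from the auto-tuner.
  isaac::templates::GEMM gemm(isaac::FLOAT_TYPE, isaac::FLOAT_TYPE,
                              isaac::ISAAC_OP_N, isaac::ISAAC_OP_T,
                              M, N, K, 0, M, 0, N, 0, M,
                              4, 16, 8, 16, 4, 1, 4, 8, 4, 8, 4, 1, 1, 1);
  std::cout << gemm.dump(device, "gemm") << std::endl;   // generated PTX source
}
```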

View File

@@ -1,100 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_POOL_H_
#define ISAAC_TEMPLATES_POOL_H_
#include <cstddef>
#include <string>
#include "isaac/templates/common.hpp"
namespace isaac{
enum PoolType{
MaxPool,
AvgPool
};
namespace templates{
class Pool: public Generator{
private:
void init_constant_memory(std::vector<int32_t>& delta, std::vector<uint32_t> &masks, size_t nlut, int32_t strideIc, int32_t strideIw, int32_t strideIh, int32_t strideId);
public:
static const std::string id;
static const size_t Nshapes;
static const size_t Ntune;
static const size_t Nparams;
public:
Pool(DType in_dtype, DType out_dtype, PoolType pool_type,
param_t C, param_t D, param_t H, param_t W, param_t N, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
param_t pad_d, param_t pad_h, param_t pad_w,
param_t stride_d, param_t stride_h, param_t stride_w,
param_t vec = 1, param_t bc0 = 32, param_t cs0 = 4, param_t u = 1);
// Execution
std::string dump(driver::Device const & device, std::string const & name);
static void check_valid(driver::Device const & device, size_t M, param_t* params, uint8_t* valid);
void enqueue(driver::Kernel& kernel, driver::Stream& queue, driver::Buffer const & I, driver::Buffer &O, float i_scale = 1, float o_scale = 1);
std::vector<unsigned int> tuning_params() const;
static double tflops(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double time);
private:
DType in_dtype_;
DType out_dtype_;
PoolType pool_type_;
// Shapes
param_t Cin_;
param_t Cout_;
param_t D_;
param_t H_;
param_t W_;
param_t N_;
param_t M_;
param_t P_;
param_t Q_;
param_t T_;
param_t R_;
param_t S_;
param_t pad_d_;
param_t pad_h_;
param_t pad_w_;
param_t stride_d_;
param_t stride_h_;
param_t stride_w_;
// Tuning params
param_t vec_;
param_t bc0_;
param_t cs0_;
param_t u_;
// Constant buffer
std::vector<int32_t> cLUT;
std::vector<uint32_t> masks_;
};
}
}
#endif

View File

@@ -1,80 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BENCH_HPP
#define BENCH_HPP
#include <chrono>
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>
#include <isaac/driver/device.h>
#include <iostream>
#include <iomanip>
#include <iterator>
class Timer
{
typedef std::chrono::high_resolution_clock high_resolution_clock;
typedef std::chrono::nanoseconds nanoseconds;
public:
explicit Timer(bool run = false)
{ if (run) start(); }
void start()
{ _start = high_resolution_clock::now(); }
nanoseconds get() const
{ return std::chrono::duration_cast<nanoseconds>(high_resolution_clock::now() - _start); }
private:
high_resolution_clock::time_point _start;
};
template<class T>
T min(std::vector<T> x)
{ return *std::min_element(x.begin(), x.end()); }
template<class OP, class SYNC>
double bench(OP const & op, SYNC const & sync, isaac::driver::Device const & device)
{
Timer tmr;
std::vector<size_t> times;
double total_time = 0;
op();
sync();
while(total_time*1e-9 < 1e-1){
float norm = (float)device.current_sm_clock()/device.max_sm_clock();
tmr.start();
op();
sync();
times.push_back(norm*tmr.get().count());
total_time+=times.back();
}
return min(times);
}
template<class T>
std::string str(T const & x){ return std::to_string(x); }
#endif
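A usage sketch of `bench()`: the first callable launches the work, the second synchronizes, and the result is the clock-normalized minimum duration in nanoseconds. The local `bench.hpp` include path is assumed.
```
#include "bench.hpp"                    // this header; local include path assumed
#include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include <iostream>

int main(){
  namespace drv = isaac::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();
  drv::Buffer x(ctx, (size_t)1e8);
  // Warm up once, then repeat for ~0.1 s and report the clock-normalized
  // minimum time (ns) of a 100 MB asynchronous memset.
  double ns = bench([&](){ x.set_zero(stream, (size_t)1e8); },
                    [&](){ stream.synchronize(); },
                    ctx.device());
  std::cout << ns * 1e-9 << " s" << std::endl;
}
```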

View File

@@ -1,286 +0,0 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef ISAAC_CPP_COLLECTIONS_HPP
#define ISAAC_CPP_COLLECTIONS_HPP
#include <vector>
#include <iostream>
#include <sstream>
#include <iterator>
#include <algorithm>
#include <numeric>
#include <functional>
#include <limits>
#include <cstddef>
#include <memory>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <type_traits>
#include <deque>
namespace isaac
{
namespace cpp
{
/* ---- Cached Map ----- */
template<class K, class V>
class CachedMap{
public:
CachedMap(std::function<V(K const &)> value_maker) : value_maker_(value_maker)
{ }
V const & get(K const & key){
auto it = cache_.find(key);
if(it==cache_.end())
return cache_.insert(std::make_pair(key, value_maker_(key))).first->second;
return it->second;
}
private:
std::map<K, V> cache_;
std::function<V(K const &)> value_maker_;
};
/* ---- Cartesian ---- */
inline std::vector<std::vector<int>> cartesian(const std::vector<std::vector<int>>& v) {
std::vector<std::vector<int>> res = {{}};
for (const auto& u : v){
std::vector<std::vector<int>> current;
for (const auto& x : res)
for (const auto y : u){
current.push_back(x);
current.back().push_back(y);
}
res = std::move(current);
}
return res;
}
/* ---- Tuple ----- */
template<class T>
class tuple
{
template<class U>
friend std::ostream& operator<<(std::ostream & oss, tuple<U> const &);
public:
tuple() {}
tuple(std::vector<T> const & list): data_(list){}
tuple(std::initializer_list<T> const & list) : data_(list){}
tuple(T a) : data_{a} {}
tuple(T a, T b) : data_{a, b} {}
tuple(tuple const & other) = default;
tuple(tuple&& other) = default;
tuple& operator=(tuple const & other) = default;
tuple& operator=(tuple && other) = default;
typename std::vector<T>::iterator begin() { return data_.begin(); }
typename std::vector<T>::const_iterator begin() const { return data_.begin(); }
typename std::vector<T>::iterator end() { return data_.end(); }
typename std::vector<T>::const_iterator end() const { return data_.end(); }
size_t size() const { return data_.size(); }
T front() const { return data_.front(); }
T back() const { return data_.back(); }
void remove_index(size_t i) { data_.erase(std::next(data_.begin(), i)); }
T& operator[](size_t i) { return data_[i]; }
T operator[](size_t i) const { return data_[i]; }
bool operator==(tuple const & other) const { return data_==other.data_; }
operator std::vector<T>() const { return data_; }
private:
std::vector<T> data_;
};
template<class T>
inline std::ostream& operator<<(std::ostream & oss, tuple<T> const &tp)
{
oss << "(";
std::copy(tp.data_.begin(), tp.data_.end() - 1, std::ostream_iterator<T>(oss, ","));
oss << tp.data_.back();
if(tp.size()==1)
oss << ",";
oss << ")";
return oss;
}
template<class T>
inline std::string to_string(tuple<T> const & tp)
{
std::ostringstream oss;
oss << tp;
return oss.str();
}
template<class T>
inline void remove_index(std::vector<T>& tp, size_t i)
{ tp.erase(std::next(tp.begin(), i)); }
template<class T>
inline T max(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), std::numeric_limits<T>::min(), [](T a, T b){ return std::max(a, b); }); }
template<class T>
inline T min(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), std::numeric_limits<T>::max(), [](T a, T b){ return std::min(a, b); }); }
template<class T>
inline T prod(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), 1, std::multiplies<T>()); }
template<class T>
inline size_t numgt1(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), 0, [](size_t a, size_t b){ return a + (b>1); }); }
/* ----- Set/Map ----- */
template<class T>
struct deref_hash
{ size_t operator()(T const & x) const { return x.hash();} };
template<class T>
struct deref_hash<T*>
{ size_t operator()(T const * x) const { return x->hash();} };
template<class T>
struct deref_hash<std::shared_ptr<T>>
{ size_t operator()(std::shared_ptr<T> const & x) const { return x->hash();} };
template<class T>
struct deref_eq
{ size_t operator()(T const & x, T const & y) const { return x == y;} };
template<class T>
struct deref_eq<T*>
{ size_t operator()(T const * x, T const * y) const { return *x == *y;} };
template<class T>
struct deref_eq<std::shared_ptr<T>>
{ size_t operator()(std::shared_ptr<T> const & x, std::shared_ptr<T> const & y) const { return *x == *y;} };
template<class KEY>
using deref_unordered_set = std::unordered_set<KEY, deref_hash<KEY>, deref_eq<KEY>>;
template<class U>
using set_map = std::map<U, std::set<U>>;
template<class U, class H = std::hash<U>, class E = std::equal_to<U>>
using unordered_set_map = std::unordered_map<U, std::unordered_set<U,H,E>, H, E>;
template<class T>
struct is_set_map
{ static const bool value = false; };
template<class U>
struct is_set_map<set_map<U>> { static const bool value = true; };
template<class U, class H, class E>
struct is_set_map<unordered_set_map<U,H,E>> { static const bool value = true; };
/* ---- Transformations ---- */
//Pairs
template<class T, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
std::deque<std::pair<typename T::key_type, typename T::key_type>> pairs(T const & map)
{
typedef typename T::key_type K;
std::deque<std::pair<K,K>> result;
for(auto const& x: map)
for(auto const & y: x.second)
result.push_back({x.first, y});
return result;
}
//Invert
template<class T, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
static T invert(T const & in)
{
T result;
typedef typename T::key_type U;
typedef typename T::mapped_type V;
for(auto const & x: in){
U u = x.first;
result.insert({u, V()});
for(U v: x.second)
result[v].insert(u);
}
return result;
}
//Intersect
template<class T, class H, class E>
std::unordered_set<T,H,E> intersection(std::unordered_set<T,H,E> const & x,
std::unordered_set<T,H,E> const & y)
{
if(y.size() < x.size())
return intersection(y, x);
std::unordered_set<T,H,E> result;
for(auto const & u: x)
if(y.find(u)!=y.end())
result.insert(u);
return result;
}
//Merge
template<class T>
typename std::enable_if<!is_set_map<T>::value, T&>::type merge(T& x, T const & y)
{
std::merge(x.begin(), x.end(), y.begin(), y.end(), std::inserter(x, x.end()));
return x;
}
template<class T>
typename std::enable_if<is_set_map<T>::value, T&>::type merge(T& x, T const & y)
{
for(auto const & p: y) merge(x[p.first], p.second);
return x;
}
//Transfer
template<class T, class U, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
void transfer(T& map, U u, U v, typename T::mapped_type const & exclude)
{
for(auto const & x: exclude)
map[v].erase(x);
merge(map[u], map[v]);
for(auto& x: map)
x.second.erase(v);
map.erase(v);
}
//subset
template<class T, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
T subset(T& map, typename T::mapped_type const & include)
{
T result;
for(auto const & e: map)
if(include.find(e.first)!=include.end())
result[e.first] = cpp::intersection(e.second, include);
return result;
}
}
}
#endif
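Two small, self-contained examples of the utilities above (include path assumed): `cartesian()` expands a parameter grid, and `CachedMap` memoizes an arbitrary key-to-value construction.
```
#include "isaac/tools/collections.hpp"   // include path assumed
#include <cassert>
#include <vector>

int main(){
  // Cartesian product of {1,2} x {3,4}: four candidate configurations.
  std::vector<std::vector<int>> grid = isaac::cpp::cartesian({{1, 2}, {3, 4}});
  assert(grid.size() == 4);
  assert((grid[0] == std::vector<int>{1, 3}));

  // CachedMap memoizes a key -> value construction; the lambda runs once per key.
  isaac::cpp::CachedMap<int, int> squares([](int const & x){ return x * x; });
  assert(squares.get(7) == 49);
  return 0;
}
```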

View File

@@ -1,84 +0,0 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef ISAAC_CPP_FUNCTIONAL_HPP
#define ISAAC_CPP_FUNCTIONAL_HPP
#include <type_traits>
#include <tuple>
namespace isaac
{
namespace cpp
{
template <typename T>
struct function_traits
: public function_traits<decltype(&T::operator())>
{};
// For generic types, directly use the result of the signature of its 'operator()'
template <typename ClassType, typename ReturnType, typename... Args>
struct function_traits<ReturnType(ClassType::*)(Args...) const>
// we specialize for pointers to member function
{
enum { arity = sizeof...(Args) };
// arity is the number of arguments.
typedef ReturnType result_type;
template <size_t i>
struct arg
{
typedef typename std::tuple_element<i, std::tuple<Args...>>::type type;
// the i-th argument is equivalent to the i-th tuple element of a tuple
// composed of those arguments.
};
};
template<class U, class FN, class V>
V forward_dyncast(U const & x, FN const & fn, V const &backup)
{
typedef typename function_traits<FN>::template arg<0>::type RT;
typedef typename std::remove_reference<RT>::type T;
if(T const * p = dynamic_cast<T const *>(&x))
return fn(*p);
return backup;
}
template<class U, class FN>
void forward_dyncast(U const & x, FN const & fn)
{
typedef typename function_traits<FN>::template arg<0>::type RT;
typedef typename std::remove_reference<RT>::type T;
if(T const * p = dynamic_cast<T const *>(&x))
fn(*p);
}
template<class U, class FN>
bool compare_if_same(U const & base, FN const & f)
{ return cpp::forward_dyncast(base, f, false); }
}
}
#endif
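A minimal sketch of `forward_dyncast` (include path assumed): the lambda's first argument type is the `dynamic_cast` target, and the backup value is returned when the cast fails.
```
#include "isaac/tools/functional.hpp"   // include path assumed
#include <cassert>

struct Node { virtual ~Node() = default; };
struct Constant: Node { int value = 42; };

int main(){
  Constant c;
  Node const & node = c;
  // The lambda runs only if node is dynamically a Constant; otherwise -1 is returned.
  int v = isaac::cpp::forward_dyncast(node, [](Constant const & x){ return x.value; }, -1);
  assert(v == 42);
}
```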

View File

@@ -1,92 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TOOLS_MATRIX_HPP_
#define ISAAC_TOOLS_MATRIX_HPP_
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <array>
#include <vector>
#include <sys/types.h>   // u_char
inline void read_inc(void* dst, u_char*& data, size_t nbytes){
std::memcpy(dst, (void*)data, nbytes);
data += nbytes;
}
template<class T>
void gemm(uint32_t M, uint32_t N, uint32_t K, T alpha, T* A, uint32_t lda, T* B, uint32_t ldb, T /*beta, unused*/, T* C, uint32_t ldc, T* bias){
for(uint32_t i = 0; i < M ; ++i)
for(uint32_t j = 0; j < N ; ++j){
T acc = 0;
for(uint32_t k = 0; k < K; ++k)
acc += A[i*lda + k] * B[k*ldb + j];
C[i*ldc + j] = alpha*acc + bias[j];
}
}
template<class T>
class matrix{
typedef std::array<uint32_t, 2> shapes_t;
public:
matrix(u_char*& data){
read_inc((void*)shapes_.data(), data, 8);
values_.resize(shapes_[0]*shapes_[1]);
ld_ = shapes_[1];
read_inc((void*)values_.data(), data, values_.size()*4);
data_ = values_.data();
}
matrix(shapes_t const & shapes, size_t ld, T* data): shapes_(shapes), ld_(ld), data_(data){}
matrix(shapes_t const & shapes): shapes_(shapes), ld_(shapes.back()), values_(shapes[0]*shapes[1]), data_(values_.data()){}
shapes_t const & shapes() const
{ return shapes_; }
T const & operator()(size_t i, size_t j) const
{ return data_[i*ld_ + j]; }
T & operator ()(size_t i, size_t j)
{ return data_[i*ld_ + j]; }
T* data() const
{ return data_; }
T* data()
{ return data_; }
uint32_t ld() const
{ return ld_; }
private:
shapes_t shapes_;
size_t ld_;
std::vector<T> values_;
T* data_;
};
template<class T>
matrix<T> pad_left(matrix<T> const & in, uint32_t npad){
uint32_t M = in.shapes()[0], N = in.shapes()[1];
matrix<T> result({M, N + npad});
for(size_t i = 0; i < M; ++i)
for(size_t j = 0; j < N; ++j)
result(i, npad + j) = in(i, j);
return result;
}
#endif
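A worked example of the reference `gemm` above on a 2x3 by 3x2 product (the unnamed template parameter sits in beta's slot and is unused); the include path is assumed.
```
#include "isaac/tools/matrix.hpp"   // include path assumed
#include <cassert>
#include <vector>

int main(){
  uint32_t M = 2, N = 2, K = 3;
  std::vector<float> A = {1, 2, 3,
                          4, 5, 6};      // 2x3, row-major, lda = 3
  std::vector<float> B = {1, 0,
                          0, 1,
                          1, 1};         // 3x2, row-major, ldb = 2
  std::vector<float> C(M * N), bias(N, 0.f);
  gemm<float>(M, N, K, 1.f, A.data(), K, B.data(), N, 0.f, C.data(), N, bias.data());
  assert(C[0] == 4 && C[1] == 5 && C[2] == 10 && C[3] == 11);
}
```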

View File

@@ -1,56 +0,0 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef ISAAC_TOOLS_GETENV
#define ISAAC_TOOLS_GETENV
#include <string>
#include <cstdlib>
namespace isaac
{
namespace tools
{
inline std::string getenv(const char * name)
{
#ifdef _MSC_VER
char* cache_path = 0;
std::size_t sz = 0;
_dupenv_s(&cache_path, &sz, name);
#else
const char * cache_path = std::getenv(name);
#endif
if(!cache_path)
return "";
std::string result(cache_path);
#ifdef _MSC_VER
free(cache_path);
#endif
return result;
}
}
}
#endif

View File

@@ -1,67 +0,0 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef ISAAC_TOOLS_MKDIR
#define ISAAC_TOOLS_MKDIR
#include <cstring>
#include <string>
#include <cstdlib>
#include <sys/stat.h>
#include <errno.h>
#if defined(_WIN32)
#include <direct.h>
#endif
namespace isaac
{
namespace tools
{
inline int mkdir(std::string const & path)
{
#if defined(_WIN32)
return _mkdir(path.c_str());
#else
return ::mkdir(path.c_str(), 0777);
#endif
}
inline int mkpath(std::string const & path)
{
int status = 0;
size_t pp = 0;
size_t sp;
while ((sp = path.find('/', pp)) != std::string::npos)
{
if (sp != pp){
status = mkdir(path.substr(0, sp));
}
pp = sp + 1;
}
return (status==0 || errno==EEXIST)?0:-1;
}
}
}
#endif
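Usage sketch: `mkpath` creates each intermediate directory delimited by '/', so the path must end with a slash for the final component to be created, exactly as `Context::get_cache_path()` does further below.
```
#include "isaac/tools/sys/mkdir.hpp"    // paths as included by context.cpp below
#include "isaac/tools/sys/getenv.hpp"

int main(){
  // Only components up to the last '/' are created, hence the trailing slash.
  std::string cache = isaac::tools::getenv("HOME") + "/.isaac/cache/";
  return isaac::tools::mkpath(cache);   // 0 on success or if it already exists
}
```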

View File

@@ -1,142 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/api.h"
namespace isaac{
inline size_t num_re_evaluate(size_t optimization_level){
if(optimization_level <= 1)
return 1;
return 5*optimization_level;
}
void GEMM(driver::Device const &, driver::Stream & stream,
DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc,
scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C,
float a_scale, float b_scale, float c_scale,
const driver::Buffer *bias,
templates::GEMM* generator, size_t optimization_level)
{
typedef std::tuple<driver::Stream, DType, DType, IsaacOperation_t, IsaacOperation_t, std::vector<param_t>> key_type;
// Build the generator if necessary
static cpp::CachedMap<key_type, std::shared_ptr<templates::GEMM>> inference([optimization_level](key_type const & key){
driver::Stream & stream = (driver::Stream&)std::get<0>(key);
DType in_dtype = std::get<1>(key);
DType out_dtype = std::get<2>(key);
IsaacOperation_t AT = std::get<3>(key), BT = std::get<4>(key);
runtime::GEMMProfile* profile = (runtime::GEMMProfile*)runtime::database.at({stream.context().device().architecture(), runtime::GEMM}).get();
std::vector<param_t> const & x = std::get<5>(key);
templates::GEMM result = profile->predict(stream, in_dtype, out_dtype, AT, BT, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], num_re_evaluate(optimization_level));
return std::make_shared<templates::GEMM>(result);
});
// Build the kernel
static cpp::CachedMap<std::pair<driver::Stream, templates::GEMM*>, std::shared_ptr<driver::Kernel>> kernels([](std::pair<driver::Stream, templates::GEMM*> key){
driver::Context const & context = key.first.context();
driver::Module module(context, key.second->dump(context.device(), "gemm"));
return std::make_shared<driver::Kernel>(module, "gemm");
});
//Retrieve profile/kernel and execute
if(generator == NULL)
generator = inference.get(key_type(stream, in_dtype, out_dtype, AT, BT, {M, N, K, offa, lda, offb, ldb, offc, ldc})).get();
generator->enqueue(*kernels.get(std::make_pair(stream, generator)), stream, alpha, A, B, beta, C, a_scale, b_scale, c_scale, bias);
}
void CONV(driver::Device const &, driver::Stream & stream,
DType in_dtype, DType out_dtype, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t C, param_t T, param_t R, param_t S,
param_t D, param_t H, param_t W,
param_t pad_d, param_t pad_h, param_t pad_w,
param_t stride_d, param_t stride_h, param_t stride_w,
param_t upsample_d, param_t upsample_h, param_t upsample_w,
driver::Buffer const & I, driver::Buffer const & F, driver::Buffer* O, param_t num_outputs,
driver::Buffer const * bias,
ActivationType activation, float alpha,
float iscale, float fscale, std::vector<float> const & oscale, float z_scale,
ResidualType residual, param_t Zk, param_t crop_z_m0, param_t crop_z_m1, param_t crop_z_p0, param_t crop_z_p1, param_t crop_z_q0, param_t crop_z_q1, driver::Buffer const *Z,
templates::Conv* generator, size_t optimization_level)
{
typedef std::tuple<driver::Stream, DType, DType, std::vector<param_t>> key_type;
// Build the generator if necessary
static cpp::CachedMap<key_type, std::shared_ptr<templates::Conv>> inference([optimization_level](key_type const & key){
driver::Stream & stream = (driver::Stream&)std::get<0>(key);
DType in_dtype = std::get<1>(key);
DType out_dtype = std::get<2>(key);
std::vector<param_t> const & x = std::get<3>(key);
runtime::ConvProfile* profile = (runtime::ConvProfile*)runtime::database.at({stream.context().device().architecture(), runtime::CONV}).get();
templates::Conv result = profile->predict(stream, in_dtype, out_dtype, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[19], x[20], (ActivationType)x[21], x[22], (ResidualType)x[23], x[24], x[25], x[26], x[27], x[28], x[29], x[30], num_re_evaluate(optimization_level));
return std::make_shared<templates::Conv>(result);
});
// Build the kernel
static cpp::CachedMap<std::pair<driver::Stream, templates::Conv*>, std::shared_ptr<driver::Kernel>> kernels([](std::pair<driver::Stream, templates::Conv*> const & key){
driver::Context const & context = key.first.context();
driver::Module module(context, key.second->dump(context.device(), "conv"));
return std::make_shared<driver::Kernel>(module, "conv");
});
//Retrieve profile/kernel and execute
if(generator == NULL)
generator = inference.get(key_type(stream, in_dtype, out_dtype, {C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, num_outputs, residual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1})).get();
generator->enqueue(*kernels.get(std::make_pair(stream, generator)), stream, I, F, O, bias, alpha, iscale, fscale, oscale, z_scale, Z);
}
void POOL(driver::Device const &, driver::Stream & stream,
DType in_dtype, DType out_dtype, PoolType pool_type, param_t C, param_t M, param_t P, param_t Q, param_t N, param_t T, param_t R, param_t S,
param_t D, param_t H, param_t W, param_t pad_d, param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w,
driver::Buffer const & I, driver::Buffer& O,
float iscale, float oscale,
templates::Pool* generator, size_t optimization_level)
{
typedef std::tuple<driver::Stream, DType, DType, std::vector<param_t>> key_type;
// Build the generator if necessary
static cpp::CachedMap<key_type, std::shared_ptr<templates::Pool>> inference([optimization_level](key_type const & key){
driver::Stream & stream = (driver::Stream&)std::get<0>(key);
runtime::PoolProfile* profile = (runtime::PoolProfile*)runtime::database.at({stream.context().device().architecture(), runtime::POOL}).get();
DType in_dtype = std::get<1>(key);
DType out_dtype = std::get<2>(key);
std::vector<param_t> const & x = std::get<3>(key);
templates::Pool result = profile->predict(stream, in_dtype, out_dtype, (PoolType)x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15], x[16], x[17], num_re_evaluate(optimization_level));
return std::make_shared<templates::Pool>(result);
});
// Build the kernel
static cpp::CachedMap<std::pair<driver::Stream, templates::Pool*>, std::shared_ptr<driver::Kernel>> kernels([](std::pair<driver::Stream, templates::Pool*> const & key){
driver::Context const & context = key.first.context();
driver::Module module(context, key.second->dump(context.device(), "pool"));
return std::make_shared<driver::Kernel>(module, "pool");
});
//Retrieve profile/kernel and execute
if(generator == NULL)
generator = inference.get(key_type(stream, in_dtype, out_dtype, {pool_type, C, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w})).get();
generator->enqueue(*kernels.get(std::make_pair(stream, generator)), stream, I, O, iscale, oscale);
}
}

View File

@@ -1,196 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/driver/dispatch.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/context.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/kernel.h"
#include <assert.h>
#include <stdexcept>
#include <vector>
namespace isaac
{
namespace driver
{
/*-----------------------------------*/
/*---------- Modules ----------------*/
/*-----------------------------------*/
void backend::modules::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
Module& backend::modules::get(Stream const & stream, std::string const & name, std::string const & src){
std::tuple<Stream, std::string> key(stream, name);
if(cache_.find(key)==cache_.end())
return *cache_.insert(std::make_pair(key, new Module(stream.context(), src))).first->second;
return *cache_.at(key);
}
std::map<std::tuple<Stream, std::string>, Module * > backend::modules::cache_;
/*-----------------------------------*/
/*----------- Kernels --------------*/
/*-----------------------------------*/
void backend::kernels::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
Kernel & backend::kernels::get(Module const & program, std::string const & name){
std::tuple<Module, std::string> key(program, name);
if(cache_.find(key)==cache_.end())
return *cache_.insert(std::make_pair(key, new Kernel(program, name.c_str()))).first->second;
return *cache_.at(key);
}
std::map<std::tuple<Module, std::string>, Kernel * > backend::kernels::cache_;
/*-----------------------------------*/
/*------------ Queues --------------*/
/*-----------------------------------*/
void backend::streams::init(std::list<const Context *> const & contexts){
for(Context const * ctx : contexts)
if(cache_.find(*ctx)==cache_.end())
cache_.insert(std::make_pair(*ctx, std::vector<Stream*>{new Stream(*ctx)}));
}
void backend::streams::release(){
for(auto & x: cache_)
for(auto & y: x.second)
delete y;
cache_.clear();
}
Stream & backend::streams::get_default()
{ return get(contexts::get_default(), 0); }
Stream & backend::streams::get(Context const & context, unsigned int id){
init(std::list<Context const *>(1,&context));
for(auto & x : cache_)
if(x.first==context)
return *x.second[id];
throw std::runtime_error("ISAAC: no stream registered for the requested context");
}
void backend::streams::get(Context const & context, std::vector<Stream*> & queues){
init(std::list<Context const *>(1,&context));
queues = cache_.at(context);
}
std::map<Context, std::vector<Stream*> > backend::streams::cache_;
/*-----------------------------------*/
/*------------ Contexts ------------*/
/*-----------------------------------*/
void backend::contexts::init(std::vector<Platform> const & platforms){
for(Platform const & platform: platforms){
for(Device const & device: platform.devices())
cache_.push_back(new Context(device));
}
}
void backend::contexts::release(){
for(auto & x: cache_)
delete x;
cache_.clear();
}
Context const & backend::contexts::get_default(){
backend::init();
std::list<Context const *>::const_iterator it = cache_.begin();
std::advance(it, default_device);
return **it;
}
void backend::contexts::get(std::list<Context const *> & contexts){
backend::init();
contexts = cache_;
}
std::list<Context const *> backend::contexts::cache_;
/*-----------------------------------*/
/*------------ General -------------*/
/*-----------------------------------*/
std::vector<Device> backend::devices(){
std::vector<Platform> platforms = backend::platforms();
std::vector<Device> result;
for(Platform const & platform: platforms){
auto devices = platform.devices();
result.insert(result.end(), devices.begin(), devices.end());
}
return result;
}
std::vector<Platform> backend::platforms(){
std::vector<Platform> platforms;
//if CUDA is here
if(dispatch::cuinit())
platforms.push_back(Platform());
if(platforms.empty())
throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path");
return platforms;
}
void backend::synchronize(Context const & context){
for(Stream * queue: streams::cache_.at(context))
queue->synchronize();
}
void backend::release(){
backend::kernels::release();
// backend::programs::release();
backend::streams::release();
backend::contexts::release();
}
void backend::init(){
if(!contexts::cache_.empty())
return;
std::vector<Platform> platforms = backend::platforms();
contexts::init(platforms);
streams::init(contexts::cache_);
}
unsigned int backend::default_device = 0;
}
}
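A short sketch of the backend singletons defined above: device enumeration, the lazily created default context, and context-wide synchronization.
```
#include "isaac/driver/backend.h"
#include <iostream>

int main(){
  namespace drv = isaac::driver;
  // Enumerate every CUDA device visible through the dispatcher.
  for(drv::Device const & device: drv::backend::devices())
    std::cout << device.name() << std::endl;
  // Default context and stream are created lazily on first use.
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();
  (void)stream;
  drv::backend::synchronize(ctx);   // waits on every stream attached to ctx
}
```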

View File

@@ -1,60 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include "isaac/driver/stream.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/context.h"
#include "isaac/driver/dispatch.h"
namespace isaac
{
namespace driver
{
Buffer::Buffer(Context const & context, size_t size) : context_(context)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuMemAlloc(&*cu_, size);
}
Buffer::Buffer(Context const & context, CUdeviceptr cu, bool take_ownership):
context_(context), cu_(cu, take_ownership)
{ }
void Buffer::set_zero(Stream const & queue, size_t size)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
}
Handle<CUdeviceptr> const & Buffer::cu() const
{ return cu_; }
Handle<CUdeviceptr> & Buffer::cu()
{ return cu_; }
}
}
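A sketch of the `Buffer` API above: allocate device memory on the default context and zero it asynchronously on the default stream.
```
#include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"

int main(){
  namespace drv = isaac::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();
  drv::Buffer x(ctx, 1024 * sizeof(float));   // cuMemAlloc of 4 KB
  x.set_zero(stream, 1024 * sizeof(float));   // asynchronous cuMemsetD8
  stream.synchronize();
}
```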

View File

@@ -1,99 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cassert>
#include "isaac/driver/context.h"
#include "isaac/driver/module.h"
#include "isaac/tools/sys/getenv.hpp"
#include "isaac/tools/sys/mkdir.hpp"
namespace isaac
{
namespace driver
{
std::string Context::get_cache_path(){
//user-specified cache path
std::string result = tools::getenv("ISAAC_CACHE_PATH");
if(!result.empty()){
if(tools::mkpath(result)==0)
return result;
}
//create in home
result = tools::getenv("HOME");
if(!result.empty())
{
result = result + "/.isaac/cache/";
if(tools::mkpath(result)==0)
return result;
}
//couldn't find a directory
return "";
}
CUdevice Context::device(CUcontext context){
dispatch::cuCtxPushCurrent_v2(context);
CUdevice res;
dispatch::cuCtxGetDevice(&res);
dispatch::cuCtxPopCurrent_v2(NULL);
return res;
}
Context::Context(CUcontext context, bool take_ownership): cu_(context, take_ownership), device_(device(context), false), cache_path_(get_cache_path())
{ }
Context::Context(Device const & device): device_(device), cache_path_(get_cache_path())
{
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device);
dispatch::cuCtxPopCurrent_v2(NULL);
}
Device const & Context::device() const
{ return device_; }
std::string const & Context::cache_path() const
{ return cache_path_; }
Handle<CUcontext> const & Context::cu() const
{ return cu_; }
/* Context Switcher */
ContextSwitcher::ContextSwitcher(Context const & ctx): ctx_(ctx)
{
dispatch::cuCtxPushCurrent_v2(ctx_);
}
ContextSwitcher::~ContextSwitcher()
{
CUcontext tmp;
dispatch::cuCtxPopCurrent_v2(&tmp);
assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!");
}
}
}

View File

@@ -1,197 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include <algorithm>
#include <sstream>
#include <cstring>
#include <memory>
#include "isaac/driver/device.h"
namespace isaac
{
namespace driver
{
/* Architecture [NVidia] */
Device::Architecture Device::nv_arch(std::pair<unsigned int, unsigned int> sm) const{
switch(sm.first)
{
case 7:
switch(sm.second)
{
case 0: return Architecture::SM_7_0;
default: return Architecture::UNKNOWN;
}
case 6:
switch(sm.second)
{
case 0: return Architecture::SM_6_0;
case 1: return Architecture::SM_6_1;
default: return Architecture::UNKNOWN;
}
case 5:
switch(sm.second)
{
case 0: return Architecture::SM_5_0;
case 2: return Architecture::SM_5_2;
default: return Architecture::UNKNOWN;
}
case 3:
switch(sm.second)
{
case 0: return Architecture::SM_3_0;
case 5: return Architecture::SM_3_5;
case 7: return Architecture::SM_3_7;
default: return Architecture::UNKNOWN;
}
case 2:
switch(sm.second)
{
case 0: return Architecture::SM_2_0;
case 1: return Architecture::SM_2_1;
default: return Architecture::UNKNOWN;
}
default: return Architecture::UNKNOWN;
}
}
template<CUdevice_attribute attr>
int Device::cuGetInfo() const{
int res;
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
return res;
}
nvmlDevice_t Device::nvml_device() const{
// cache NVML handles across calls (static, otherwise the map is rebuilt every time)
static std::map<std::string, nvmlDevice_t> map;
std::string key = pci_bus_id();
if(map.find(key)==map.end()){
nvmlDevice_t device;
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
return map.insert(std::make_pair(key, device)).first->second;
}
return map.at(key);
}
/* Architecture */
Device::Architecture Device::architecture() const
{ return nv_arch(compute_capability()); }
/* Attributes */
size_t Device::address_bits() const
{ return sizeof(size_t)*8; }
driver::Platform Device::platform() const
{ return Platform(); }
std::string Device::name() const{
char tmp[128];
dispatch::cuDeviceGetName(tmp, 128, *cu_);
return std::string(tmp);
}
std::string Device::pci_bus_id() const{
char tmp[128];
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
return std::string(tmp);
}
void Device::interpret_as(std::pair<size_t, size_t> cc){
interpreted_as_ = std::make_shared<std::pair<size_t, size_t>>(cc);
}
std::pair<size_t, size_t> Device::compute_capability() const{
if(interpreted_as_)
return *interpreted_as_;
size_t _major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
size_t _minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
return std::make_pair(_major, _minor);
}
size_t Device::max_threads_per_block() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>(); }
size_t Device::max_shared_memory() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK>(); }
size_t Device::warp_size() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>(); }
std::vector<size_t> Device::max_block_dim() const{
std::vector<size_t> result(3);
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
return result;
}
size_t Device::current_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
size_t Device::max_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
size_t Device::current_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
size_t Device::max_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
/* Infos */
std::string Device::infos() const{
std::ostringstream oss;
std::vector<size_t> max_wi_sizes = max_block_dim();
oss << "Platform: " << platform().name() << std::endl;
oss << "Name: " << name() << std::endl;
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
oss << "Local memory size: " << max_shared_memory() << std::endl;
return oss.str();
}
Handle<CUdevice> const & Device::cu() const
{ return cu_; }
}
}
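A sketch of the `Device` queries above; clock values come from NVML (reported in MHz), and `interpret_as` overrides the compute capability used for code generation.
```
#include "isaac/driver/backend.h"
#include "isaac/driver/device.h"
#include <iostream>

int main(){
  namespace drv = isaac::driver;
  drv::Device device = drv::backend::devices().front();
  std::cout << device.infos();
  std::cout << "SM clock: " << device.current_sm_clock() << " / "
            << device.max_sm_clock() << " MHz" << std::endl;
  // Pretend the device is an sm_52 part, e.g. to generate older PTX.
  device.interpret_as({5, 2});
}
```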

View File

@@ -1,363 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include <dlfcn.h>
#include "isaac/driver/dispatch.h"
#include "isaac/driver/context.h"
namespace isaac
{
namespace driver
{
//Helpers for function definition
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
//Specialized helpers for CUDA
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVRTC_DEFINE1(ret, fname, t1) DEFINE1(nvrtcinit, nvrtc_, ret, fname, t1)
#define NVRTC_DEFINE2(ret, fname, t1, t2) DEFINE2(nvrtcinit, nvrtc_, ret, fname, t1, t2)
#define NVRTC_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3)
#define NVRTC_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4)
#define NVRTC_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5)
#define NVRTC_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6)
#define NVRTC_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define NVRTC_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define NVRTC_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1)
#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
#define CUBLAS_DEFINE19(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) DEFINE19(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)
#define CUDNN_DEFINE1(ret, fname, t1) DEFINE1(cudnninit, cudnn_, ret, fname, t1)
#define CUDNN_DEFINE2(ret, fname, t1, t2) DEFINE2(cudnninit, cudnn_, ret, fname, t1, t2)
#define CUDNN_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cudnninit, cudnn_, ret, fname, t1, t2, t3)
#define CUDNN_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5)
#define CUDNN_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDNN_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDNN_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDNN_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
bool dispatch::cuinit(){
if(cuda_==nullptr)
cuda_ = dlopen("libcuda.so", RTLD_LAZY);
if(cuda_==nullptr)
return false;
//Resolve cuInit by hand and call it once so that later driver calls are legal
CUresult (*fptr)(unsigned int);
cuInit_ = dlsym(cuda_, "cuInit");
*reinterpret_cast<void **>(&fptr) = cuInit_;
CUresult res = (*fptr)(0);
check(res);
return true;
}
bool dispatch::nvrtcinit(){
if(nvrtc_==nullptr)
nvrtc_ = dlopen("libnvrtc.so", RTLD_LAZY);
return nvrtc_ != nullptr;
}
bool dispatch::nvmlinit(){
if(nvml_==nullptr)
nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
if(nvml_==nullptr)
return false;
//NVML requires explicit initialization before any query
nvmlReturn_t (*fptr)();
nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
*reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
nvmlReturn_t res = (*fptr)();
check(res);
return nvml_ != nullptr;
}
bool dispatch::cublasinit(){
if(cublas_==nullptr)
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
return cublas_ != nullptr;
}
bool dispatch::cudnninit(){
if(cudnn_==nullptr)
cudnn_ = dlopen("libcudnn.so", RTLD_LAZY);
return cudnn_ != nullptr;
}
//CUDA
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
NVRTC_DEFINE3(nvrtcResult, nvrtcCompileProgram, nvrtcProgram, int, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLogSize, nvrtcProgram, size_t *)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTX, nvrtcProgram, char *)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
//One cuBLAS/cuDNN handle per context, created lazily and kept for the lifetime of the process
cublasHandle_t dispatch::cublasHandle(Context const & ctx){
static std::map<Context, cublasHandle_t> handles;
auto pr = handles.insert({ctx, cublasHandle_t()});
if(pr.second)
cublasCreate_v2(&pr.first->second);
return pr.first->second;
}
cudnnHandle_t dispatch::cudnnHandle(Context const & ctx){
static std::map<Context, cudnnHandle_t> handles;
auto pr = handles.insert({ctx, cudnnHandle_t()});
if(pr.second)
cudnnCreate(&pr.first->second);
return pr.first->second;
}
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
cublasStatus_t dispatch::cublasGetStream_v2(cublasHandle_t h, cudaStream_t *a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_v2_, "cublasGetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSetStream_v2(cublasHandle_t h, cudaStream_t a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_v2_, "cublasSetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_v2_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasDgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_v2_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasHgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasHgemm, cublasHgemm_, "cublasHgemm", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
CUBLAS_DEFINE19(cublasStatus_t, cublasGemmEx, cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType, int, const void*, cudaDataType, int, const void*, void*, cudaDataType, int, cudaDataType, cublasGemmAlgo_t)
//cuDNN
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateConvolutionDescriptor, cudnnConvolutionDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreate, cudnnHandle_t*)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int)
CUDNN_DEFINE5(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int*)
CUDNN_DEFINE5(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreatePoolingDescriptor, cudnnPoolingDescriptor_t*)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int*, const int*, const int*)
CUDNN_DEFINE8(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, const cudnnPoolingDescriptor_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*)
CUDNN_DEFINE8(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetConvolutionNdDescriptor, cudnnConvolutionDescriptor_t, int, const int*, const int*, const int*, cudnnConvolutionMode_t, cudnnDataType_t)
CUDNN_DEFINE8(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithm, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *)
CUDNN_DEFINE7(cudnnStatus_t, cudnnGetConvolutionForwardWorkspaceSize, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t*)
CUDNN_DEFINE13(cudnnStatus_t, cudnnConvolutionForward, cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *)
CUDNN_DEFINE2(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, cudaStream_t)
CUDNN_DEFINE7(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*)
void dispatch::release(){
if(cuda_){
dlclose(cuda_);
cuda_ = nullptr;
}
if(nvrtc_){
dlclose(nvrtc_);
nvrtc_ = nullptr;
}
if(nvml_){
dlclose(nvml_);
nvml_ = nullptr;
}
if(cublas_){
dlclose(cublas_);
cublas_ = nullptr;
}
if(cudnn_){
dlclose(cudnn_);
cudnn_ = nullptr;
}
}
void* dispatch::cuda_;
void* dispatch::nvrtc_;
void* dispatch::nvml_;
void* dispatch::cublas_;
void* dispatch::cudnn_;
//CUDA
void* dispatch::cuCtxGetCurrent_;
void* dispatch::cuCtxSetCurrent_;
void* dispatch::cuCtxDestroy_v2_;
void* dispatch::cuEventCreate_;
void* dispatch::cuDeviceGet_;
void* dispatch::cuMemcpyDtoH_v2_;
void* dispatch::cuStreamCreate_;
void* dispatch::cuEventElapsedTime_;
void* dispatch::cuMemFree_v2_;
void* dispatch::cuMemcpyDtoHAsync_v2_;
void* dispatch::cuDriverGetVersion_;
void* dispatch::cuDeviceGetName_;
void* dispatch::cuDeviceGetPCIBusId_;
void* dispatch::cuModuleGetGlobal_v2_;
void* dispatch::cuMemcpyHtoDAsync_v2_;
void* dispatch::cuModuleLoad_;
void* dispatch::cuLaunchKernel_;
void* dispatch::cuModuleUnload_;
void* dispatch::cuModuleLoadDataEx_;
void* dispatch::cuDeviceGetAttribute_;
void* dispatch::cuDeviceGetCount_;
void* dispatch::cuMemcpyHtoD_v2_;
void* dispatch::cuInit_;
void* dispatch::cuEventRecord_;
void* dispatch::cuCtxCreate_v2_;
void* dispatch::cuModuleGetFunction_;
void* dispatch::cuStreamSynchronize_;
void* dispatch::cuStreamDestroy_v2_;
void* dispatch::cuEventDestroy_v2_;
void* dispatch::cuMemAlloc_v2_;
void* dispatch::cuPointerGetAttribute_;
void* dispatch::cuCtxGetDevice_;
void* dispatch::cuMemsetD8Async_;
void* dispatch::cuCtxPushCurrent_v2_;
void* dispatch::cuCtxPopCurrent_v2_;
void* dispatch::nvrtcCompileProgram_;
void* dispatch::nvrtcGetProgramLogSize_;
void* dispatch::nvrtcGetPTX_;
void* dispatch::nvrtcGetPTXSize_;
void* dispatch::nvrtcCreateProgram_;
void* dispatch::nvrtcGetProgramLog_;
void* dispatch::nvmlInit_v2_;
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
void* dispatch::nvmlDeviceGetClockInfo_;
void* dispatch::nvmlDeviceGetMaxClockInfo_;
void* dispatch::cublasCreate_v2_;
void* dispatch::cublasGetStream_v2_;
void* dispatch::cublasSetStream_v2_;
void* dispatch::cublasHgemm_;
void* dispatch::cublasSgemm_v2_;
void* dispatch::cublasDgemm_v2_;
void* dispatch::cublasGemmEx_;
void* dispatch::cudnnCreateConvolutionDescriptor_;
void* dispatch::cudnnCreatePoolingDescriptor_;
void* dispatch::cudnnCreateTensorDescriptor_;
void* dispatch::cudnnCreateFilterDescriptor_;
void* dispatch::cudnnCreate_;
void* dispatch::cudnnSetTensor4dDescriptor_;
void* dispatch::cudnnSetFilter4dDescriptor_;
void* dispatch::cudnnSetTensorNdDescriptorEx_;
void* dispatch::cudnnSetFilterNdDescriptor_;
void* dispatch::cudnnSetPoolingNdDescriptor_;
void* dispatch::cudnnSetConvolution2dDescriptor_;
void* dispatch::cudnnSetConvolutionNdDescriptor_;
void* dispatch::cudnnGetConvolutionForwardAlgorithm_;
void* dispatch::cudnnGetConvolutionForwardWorkspaceSize_;
void* dispatch::cudnnConvolutionForward_;
void* dispatch::cudnnPoolingForward_;
void* dispatch::cudnnSetStream_;
void* dispatch::cudnnTransformTensor_;
}
}
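Every wrapper in this file follows the same lazy-binding pattern: `dlopen` the library on first use, `dlsym` the entry point once, cache the raw pointer in the corresponding `void* dispatch::<name>_` member, and forward the call. A minimal self-contained sketch of that pattern, with hypothetical names (`lazy_call`, `libm.so.6`, `cos`) standing in for the generated wrappers:

```
// Sketch of the lazy dlopen/dlsym dispatch pattern (build with: g++ sketch.cpp -ldl)
#include <dlfcn.h>
#include <cstdio>

template<class Ret, class... Args>
Ret lazy_call(void*& lib, const char* libname, void*& sym, const char* symname,
              Ret fallback, Args... args){
  if(!lib) lib = dlopen(libname, RTLD_LAZY);   // load the shared library on first use
  if(!lib) return fallback;                    // library not installed on this system
  if(!sym) sym = dlsym(lib, symname);          // resolve the entry point once and cache it
  if(!sym) return fallback;
  return reinterpret_cast<Ret(*)(Args...)>(sym)(args...);
}

int main(){
  static void* libm = nullptr;                 // plays the role of dispatch::cuda_
  static void* cos_sym = nullptr;              // plays the role of e.g. dispatch::cuInit_
  double c = lazy_call<double>(libm, "libm.so.6", cos_sym, "cos", -1.0, 0.0);
  std::printf("cos(0) = %f\n", c);             // prints 1.000000 when libm resolves
  return 0;
}
```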

View File

@@ -1,155 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/driver/error.h"
namespace isaac
{
namespace driver
{
void check(CUresult err)
{
using namespace exception::cuda;
switch(err)
{
case CUDA_SUCCESS : break;
case CUDA_ERROR_INVALID_VALUE : throw invalid_value();
case CUDA_ERROR_OUT_OF_MEMORY : throw out_of_memory();
case CUDA_ERROR_NOT_INITIALIZED : throw not_initialized();
case CUDA_ERROR_DEINITIALIZED : throw deinitialized();
case CUDA_ERROR_PROFILER_DISABLED : throw profiler_disabled();
case CUDA_ERROR_PROFILER_NOT_INITIALIZED : throw profiler_not_initialized();
case CUDA_ERROR_PROFILER_ALREADY_STARTED : throw profiler_already_started();
case CUDA_ERROR_PROFILER_ALREADY_STOPPED : throw profiler_already_stopped();
case CUDA_ERROR_NO_DEVICE : throw no_device();
case CUDA_ERROR_INVALID_DEVICE : throw invalid_device();
case CUDA_ERROR_INVALID_IMAGE : throw invalid_image();
case CUDA_ERROR_INVALID_CONTEXT : throw invalid_context();
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT : throw context_already_current();
case CUDA_ERROR_MAP_FAILED : throw map_failed();
case CUDA_ERROR_UNMAP_FAILED : throw unmap_failed();
case CUDA_ERROR_ARRAY_IS_MAPPED : throw array_is_mapped();
case CUDA_ERROR_ALREADY_MAPPED : throw already_mapped();
case CUDA_ERROR_NO_BINARY_FOR_GPU : throw no_binary_for_gpu();
case CUDA_ERROR_ALREADY_ACQUIRED : throw already_acquired();
case CUDA_ERROR_NOT_MAPPED : throw not_mapped();
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY : throw not_mapped_as_array();
case CUDA_ERROR_NOT_MAPPED_AS_POINTER : throw not_mapped_as_pointer();
case CUDA_ERROR_ECC_UNCORRECTABLE : throw ecc_uncorrectable();
case CUDA_ERROR_UNSUPPORTED_LIMIT : throw unsupported_limit();
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE : throw context_already_in_use();
case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED : throw peer_access_unsupported();
case CUDA_ERROR_INVALID_PTX : throw invalid_ptx();
case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT : throw invalid_graphics_context();
case CUDA_ERROR_INVALID_SOURCE : throw invalid_source();
case CUDA_ERROR_FILE_NOT_FOUND : throw file_not_found();
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND : throw shared_object_symbol_not_found();
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED : throw shared_object_init_failed();
case CUDA_ERROR_OPERATING_SYSTEM : throw operating_system();
case CUDA_ERROR_INVALID_HANDLE : throw invalid_handle();
case CUDA_ERROR_NOT_FOUND : throw not_found();
case CUDA_ERROR_NOT_READY : throw not_ready();
case CUDA_ERROR_ILLEGAL_ADDRESS : throw illegal_address();
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES : throw launch_out_of_resources();
case CUDA_ERROR_LAUNCH_TIMEOUT : throw launch_timeout();
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING : throw launch_incompatible_texturing();
case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED : throw peer_access_already_enabled();
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED : throw peer_access_not_enabled();
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE : throw primary_context_active();
case CUDA_ERROR_CONTEXT_IS_DESTROYED : throw context_is_destroyed();
case CUDA_ERROR_ASSERT : throw assert_error();
case CUDA_ERROR_TOO_MANY_PEERS : throw too_many_peers();
case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED : throw host_memory_already_registered();
case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED : throw host_memory_not_registered();
case CUDA_ERROR_HARDWARE_STACK_ERROR : throw hardware_stack_error();
case CUDA_ERROR_ILLEGAL_INSTRUCTION : throw illegal_instruction();
case CUDA_ERROR_MISALIGNED_ADDRESS : throw misaligned_address();
case CUDA_ERROR_INVALID_ADDRESS_SPACE : throw invalid_address_space();
case CUDA_ERROR_INVALID_PC : throw invalid_pc();
case CUDA_ERROR_LAUNCH_FAILED : throw launch_failed();
case CUDA_ERROR_NOT_PERMITTED : throw not_permitted();
case CUDA_ERROR_NOT_SUPPORTED : throw not_supported();
case CUDA_ERROR_UNKNOWN : throw unknown();
default : throw unknown();
}
}
void check(nvrtcResult err){
using namespace exception::nvrtc;
switch(err)
{
case NVRTC_SUCCESS: break;
case NVRTC_ERROR_OUT_OF_MEMORY: throw out_of_memory();
case NVRTC_ERROR_PROGRAM_CREATION_FAILURE: throw program_creation_failure();
case NVRTC_ERROR_INVALID_INPUT: throw invalid_input();
case NVRTC_ERROR_INVALID_PROGRAM: throw invalid_program();
case NVRTC_ERROR_INVALID_OPTION: throw invalid_option();
case NVRTC_ERROR_COMPILATION: throw compilation();
case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: throw builtin_operation_failure();
default: throw unknown_error();
}
}
void check(cublasStatus_t err){
using namespace exception::cublas;
switch(err)
{
case CUBLAS_STATUS_SUCCESS : break;
case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized();
case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed();
case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value();
case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch();
case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error();
case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error();
case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported();
case CUBLAS_STATUS_LICENSE_ERROR : throw license_error();
default : throw unknown();
}
}
void check(cudnnStatus_t err){
using namespace exception::cudnn;
switch(err)
{
case CUDNN_STATUS_SUCCESS: break;
case CUDNN_STATUS_NOT_INITIALIZED: throw not_initialized();
case CUDNN_STATUS_ALLOC_FAILED: throw alloc_failed();
case CUDNN_STATUS_BAD_PARAM: throw bad_param();
case CUDNN_STATUS_INTERNAL_ERROR: throw internal_error();
case CUDNN_STATUS_INVALID_VALUE: throw invalid_value();
case CUDNN_STATUS_ARCH_MISMATCH: throw arch_mismatch();
case CUDNN_STATUS_MAPPING_ERROR: throw mapping_error();
case CUDNN_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUDNN_STATUS_NOT_SUPPORTED: throw not_supported();
case CUDNN_STATUS_LICENSE_ERROR: throw license_error();
case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: throw runtime_prerequisite_missing();
case CUDNN_STATUS_RUNTIME_IN_PROGRESS: throw runtime_in_progress();
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: throw runtime_fp_overflow();
}
}
}
}
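`check()` turns raw status codes into typed exceptions, one per CUDA/NVRTC/cuBLAS/cuDNN error value, so callers can branch on the kind of failure instead of inspecting integers. A sketch of the intended use, assuming the isaac driver headers (the header path `isaac/driver/dispatch.h`, the allocation size, and the retry policy are illustrative):

```
#include <cstddef>
#include "isaac/driver/dispatch.h"
#include "isaac/driver/error.h"

// Allocate device memory, falling back to half the size when the driver reports OOM.
CUdeviceptr allocate_with_fallback(std::size_t bytes){
  using namespace isaac::driver;
  CUdeviceptr ptr = 0;
  try{
    check(dispatch::cuMemAlloc_v2(&ptr, bytes));        // throws on any CUDA_ERROR_*
  }
  catch(exception::cuda::out_of_memory const &){
    check(dispatch::cuMemAlloc_v2(&ptr, bytes / 2));    // retry with a smaller request
  }
  return ptr;
}
```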

View File

@@ -1,40 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/driver/event.h"
namespace isaac
{
namespace driver
{
float Event::elapsed_time() const{
float time;
dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second);
return time;
}
Handle<cu_event_t> const & Event::cu() const
{ return cu_; }
}
}
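An `Event` wraps a (start, stop) pair of `CUevent`s; `Stream::enqueue` records the first before and the second after the kernel launch, so `elapsed_time()` reports the GPU time of that launch in milliseconds. A sketch, assuming the isaac driver headers and that `Event` default-constructs a fresh event pair (the launch geometry is hypothetical):

```
#include "isaac/driver/event.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/stream.h"

// Time a single kernel launch on the given stream.
float time_launch(isaac::driver::Stream & stream, isaac::driver::Kernel const & kernel){
  using namespace isaac::driver;
  Event evt;                                                   // (start, stop) CUevent pair
  stream.enqueue(kernel, {1024, 1, 1}, {128, 1, 1}, nullptr, &evt);
  stream.synchronize();                                        // wait for the stop event
  return evt.elapsed_time();                                   // milliseconds
}
```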

View File

@@ -1,66 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include <memory>
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
//CUDA
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
inline void _delete(CUdevice) { }
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
inline void _delete(CUfunction) { }
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
inline void _delete(cu_platform){}
//Constructor
template<class CUType>
Handle<CUType>::Handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership)
{ }
template<class CUType>
Handle<CUType>::~Handle(){
if(has_ownership_ && h_ && h_.unique() && *h_)
_delete(*h_);
}
template class Handle<CUdeviceptr>;
template class Handle<CUstream>;
template class Handle<CUcontext>;
template class Handle<CUdevice>;
template class Handle<cu_event_t>;
template class Handle<CUfunction>;
template class Handle<CUmodule>;
template class Handle<cu_platform>;
}
}
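`Handle<T>` reference-counts a raw CUDA object and only calls the matching `cuXxxDestroy`/`cuMemFree`/`cuModuleUnload` when it owns the object and drops the last reference. A sketch of the two ownership modes, assuming the isaac driver headers (the stream is created directly through `dispatch` purely for illustration):

```
#include "isaac/driver/dispatch.h"
#include "isaac/driver/handle.h"

void handle_ownership_example(){
  using namespace isaac::driver;
  CUstream raw;
  dispatch::cuStreamCreate(&raw, 0);
  {
    Handle<CUstream> borrowed(raw, false);   // take_ownership=false: never destroys the stream
  }                                          // raw is still valid here
  {
    Handle<CUstream> owned(raw, true);       // take_ownership=true: last copy destroys it
  }                                          // raw has been released by ~Handle
}
```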

View File

@@ -1,67 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cstring>
#include "isaac/driver/kernel.h"
#include "isaac/driver/buffer.h"
namespace isaac
{
namespace driver
{
Kernel::Kernel(Module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){
cu_params_store_.reserve(64);
cu_params_.reserve(64);
dispatch::cuModuleGetFunction(&*cu_, program, name);
}
void Kernel::setArg(unsigned int index, std::size_t size, void* ptr){
if(index + 1 > cu_params_store_.size()){
cu_params_store_.resize(index+1);
cu_params_.resize(index+1);
}
//Keep a private copy of the argument bytes; cu_params_ holds the raw pointers later passed to cuLaunchKernel
cu_params_store_[index].reset(malloc(size), free);
memcpy(cu_params_store_[index].get(), ptr, size);
cu_params_[index] = cu_params_store_[index].get();
}
void Kernel::setArg(unsigned int index, Buffer const & data)
{ return setArg(index, (CUdeviceptr)data);}
void* const* Kernel::cu_params() const
{ return cu_params_.data(); }
Handle<CUfunction> const & Kernel::cu() const
{ return cu_; }
Module const & Kernel::module() const
{ return program_; }
}
}
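Arguments are staged per index: scalars are copied byte-for-byte into `cu_params_store_`, buffers are forwarded as `CUdeviceptr`, and `cu_params()` exposes the pointer array that `Stream::enqueue` hands to `cuLaunchKernel`. A sketch of the calling sequence, assuming the isaac driver headers (the kernel name `saxpy`, its argument order, and the launch shape are hypothetical):

```
#include <cstddef>
#include "isaac/driver/buffer.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/module.h"
#include "isaac/driver/stream.h"

void launch_saxpy(isaac::driver::Stream & stream, isaac::driver::Module const & module,
                  isaac::driver::Buffer const & x, isaac::driver::Buffer const & y){
  using namespace isaac::driver;
  Kernel kernel(module, "saxpy");
  float alpha = 2.f;
  int n = 1 << 20;
  kernel.setArg(0, x);                        // Buffer overload: passed as a CUdeviceptr
  kernel.setArg(1, y);
  kernel.setArg(2, sizeof(alpha), &alpha);    // scalar: copied into the argument store
  kernel.setArg(3, sizeof(n), &n);
  stream.enqueue(kernel, {std::size_t((n + 127) / 128), 1, 1}, {128, 1, 1}, nullptr, nullptr);
}
```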

View File

@@ -1,118 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <fstream>
#include <stdexcept>
#include "isaac/driver/module.h"
#include "isaac/driver/context.h"
#include "isaac/driver/error.h"
#include "isaac/tools/sys/getenv.hpp"
namespace isaac
{
namespace driver
{
CUjit_target_enum cutarget(Device::Architecture arch){
switch(arch){
case Device::Architecture::SM_2_0: return CU_TARGET_COMPUTE_20;
case Device::Architecture::SM_2_1: return CU_TARGET_COMPUTE_21;
case Device::Architecture::SM_3_0: return CU_TARGET_COMPUTE_30;
case Device::Architecture::SM_3_5: return CU_TARGET_COMPUTE_35;
case Device::Architecture::SM_3_7: return CU_TARGET_COMPUTE_37;
case Device::Architecture::SM_5_0: return CU_TARGET_COMPUTE_50;
case Device::Architecture::SM_5_2: return CU_TARGET_COMPUTE_52;
case Device::Architecture::SM_6_0: return CU_TARGET_COMPUTE_60;
case Device::Architecture::SM_6_1: return CU_TARGET_COMPUTE_61;
default: throw std::runtime_error("unsupported device architecture");
}
}
inline std::pair<int, int> ptx(std::pair<int, int> sm){
//Highest PTX ISA version known to this code for a given compute capability
if(sm.first == 7) return {6, 0};
if(sm.first == 6) return {5, 0};
if(sm.first == 5) return {4, 3};
throw std::runtime_error("unsupported compute capability for PTX generation");
}
std::string Module::header(Device const & device){
auto cc = device.compute_capability();
auto vptx = ptx(cc);
std::string header;
header += ".version " + std::to_string(vptx.first) + "." + std::to_string(vptx.second) + "\n";
header += ".target sm_" + std::to_string(cc.first) + std::to_string(cc.second) + "\n";
header += ".address_size 64\n";
return header;
}
Module::Module(Context const & context, std::string const & source) : context_(context), source_(header(context.device()) + source){
ContextSwitcher ctx_switch(context_);
//Path to custom PTX compiler
std::string compiler = tools::getenv("ISAAC_PTXAS");
if(compiler.size()){
auto cc = context.device().compute_capability();
std::string out = context.cache_path() + "tmp.o";
std::string opt = " --gpu-name sm_" + std::to_string(cc.first) + std::to_string(cc.second)
+ " -o " + out
+ " -ias \"" + source_ + "\"";
std::string cmd = compiler + opt;
if(std::system(cmd.c_str()) != 0)
throw std::runtime_error("ISAAC_PTXAS: external ptxas invocation failed");
dispatch::cuModuleLoad(&*cu_, out.c_str());
}
//JIT Compilation
else{
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
unsigned int errbufsize = 8096;
std::string errbuf(errbufsize, 0);
//CUjit_target_enum target = cutarget(context.device().architecture());
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()};
try{
dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval);
}catch(exception::cuda::base const &){
std::cerr << "Compilation Failed! Log: " << std::endl;
std::cerr << errbuf << std::endl;
throw;
}
}
}
Context const & Module::context() const
{ return context_; }
Handle<CUmodule> const & Module::cu() const
{ return cu_; }
Buffer Module::symbol(const char *name) const{
CUdeviceptr handle;
size_t size;
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
return Buffer(context_, handle, false);
}
}
}
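`Module` prepends the `.version`/`.target`/`.address_size` header derived from the device, then either JIT-compiles the PTX through `cuModuleLoadDataEx` or, when `ISAAC_PTXAS` points at an external `ptxas` binary, assembles it offline and loads the resulting object. A sketch, assuming the isaac driver headers (the PTX body is a minimal hypothetical no-op kernel):

```
#include <string>
#include "isaac/driver/context.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/module.h"
#include "isaac/driver/stream.h"

void run_noop(isaac::driver::Context const & ctx, isaac::driver::Stream & stream){
  using namespace isaac::driver;
  // Module::header() supplies .version/.target/.address_size, so only the body is needed.
  std::string src = ".visible .entry noop()\n{\n  ret;\n}\n";
  Module module(ctx, src);                    // JIT-compiled, or built via ISAAC_PTXAS if set
  Kernel kernel(module, "noop");
  stream.enqueue(kernel, {1, 1, 1}, {1, 1, 1}, nullptr, nullptr);
}
```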

View File

@@ -1,56 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/driver/platform.h"
#include "isaac/driver/device.h"
#include <string>
namespace isaac
{
namespace driver
{
std::string Platform::version() const{
int version;
dispatch::cuDriverGetVersion(&version);
return std::to_string(version);
}
std::string Platform::name() const
{ return (std::string)"CUDA"; }
std::vector<Device> Platform::devices() const{
std::vector<Device> devices;
int N;
dispatch::cuDeviceGetCount(&N);
for(int i = 0 ; i < N ; ++i){
CUdevice device;
dispatch::cuDeviceGet(&device, i);
devices.push_back(Device(device));
}
return devices;
}
}
}
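`Platform` is a thin façade over the driver API: `version()` wraps `cuDriverGetVersion` and `devices()` enumerates every `CUdevice` through `cuDeviceGetCount`/`cuDeviceGet`. A sketch, assuming the isaac driver headers and a default-constructible `Platform`:

```
#include <iostream>
#include <vector>
#include "isaac/driver/device.h"
#include "isaac/driver/platform.h"

void list_devices(){
  using namespace isaac::driver;
  Platform platform;                               // the single "CUDA" platform
  std::vector<Device> devices = platform.devices();
  std::cout << platform.name() << " " << platform.version()
            << ": " << devices.size() << " device(s)" << std::endl;
}
```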

View File

@@ -1,95 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cassert>
#include <array>
#include "isaac/driver/backend.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/context.h"
#include "isaac/driver/device.h"
#include "isaac/driver/event.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/buffer.h"
namespace isaac
{
namespace driver
{
inline CUcontext cucontext(){
CUcontext result;
dispatch::cuCtxGetCurrent(&result);
return result;
}
Stream::Stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership)
{}
Stream::Stream(Context const & context): context_(context), cu_(CUstream(), true)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuStreamCreate(&*cu_, 0);
}
void Stream::synchronize()
{
ContextSwitcher ctx_switch(context_);
dispatch::cuStreamSynchronize(*cu_);
}
Context const & Stream::context() const
{ return context_; }
void Stream::enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const *, Event* event){
ContextSwitcher ctx_switch(context_);
if(event)
dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_);
dispatch::cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL);
if(event)
dispatch::cuEventRecord(((cu_event_t)*event).second, *cu_);
}
void Stream::write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){
ContextSwitcher ctx_switch(context_);
if(blocking)
dispatch::cuMemcpyHtoD(buffer + offset, ptr, size);
else
dispatch::cuMemcpyHtoDAsync(buffer + offset, ptr, size, *cu_);
}
void Stream::read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){
ContextSwitcher ctx_switch(context_);
if(blocking)
dispatch::cuMemcpyDtoH(ptr, buffer + offset, size);
else
dispatch::cuMemcpyDtoHAsync(ptr, buffer + offset, size, *cu_);
}
Handle<CUstream> const & Stream::cu() const
{ return cu_; }
}
}
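`Stream::write` and `Stream::read` pick between the synchronous copies (`cuMemcpyHtoD`/`cuMemcpyDtoH`) and their `Async` counterparts depending on the `blocking` flag, always after switching to the stream's context. A sketch of a blocking round trip, assuming the isaac driver headers and a `Buffer(Context, size)` constructor (not shown in this file):

```
#include <cstddef>
#include <vector>
#include "isaac/driver/buffer.h"
#include "isaac/driver/context.h"
#include "isaac/driver/stream.h"

void roundtrip(isaac::driver::Context const & ctx, isaac::driver::Stream & stream){
  using namespace isaac::driver;
  std::vector<float> host(256, 1.f);
  std::size_t bytes = host.size() * sizeof(float);
  Buffer device(ctx, bytes);                           // device allocation (assumed constructor)
  stream.write(device, true, 0, bytes, host.data());   // blocking: cuMemcpyHtoD
  stream.read(device, true, 0, bytes, host.data());    // blocking: cuMemcpyDtoH
  // blocking=false would use cuMemcpyHtoDAsync / cuMemcpyDtoHAsync on this stream instead.
}
```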

View File

@@ -1,94 +0,0 @@
# Define the fmt library, its includes and the needed defines.
# format.cc is added to FMT_HEADERS for the header-only configuration.
set(FMT_HEADERS format.h format.cc ostream.h ostream.cc printf.h
string.h time.h)
if (HAVE_OPEN)
set(FMT_HEADERS ${FMT_HEADERS} posix.h)
set(FMT_SOURCES ${FMT_SOURCES} posix.cc)
endif ()
add_library(fmt ${FMT_SOURCES} ${FMT_HEADERS} ../README.rst ../ChangeLog.rst)
option(FMT_CPPFORMAT "Build cppformat library for backward compatibility." OFF)
if (FMT_CPPFORMAT)
message(WARNING "The cppformat library is deprecated, use fmt instead.")
add_library(cppformat ${FMT_SOURCES} ${FMT_HEADERS})
endif ()
# Starting with cmake 3.1 the CXX_STANDARD property can be used instead.
target_compile_options(fmt PUBLIC ${CPP11_FLAG})
if (FMT_PEDANTIC)
target_compile_options(fmt PRIVATE ${PEDANTIC_COMPILE_FLAGS})
endif ()
target_include_directories(fmt PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
$<INSTALL_INTERFACE:include>)
set_target_properties(fmt PROPERTIES
VERSION ${FMT_VERSION} SOVERSION ${CPACK_PACKAGE_VERSION_MAJOR})
if (BUILD_SHARED_LIBS)
if (UNIX AND NOT APPLE)
# Fix rpmlint warning:
# unused-direct-shlib-dependency /usr/lib/libformat.so.1.1.0 /lib/libm.so.6.
target_link_libraries(fmt -Wl,--as-needed)
endif ()
target_compile_definitions(fmt PRIVATE FMT_EXPORT INTERFACE FMT_SHARED)
endif ()
#------------------------------------------------------------------------------
# additionally define a header only library when cmake is new enough
if (CMAKE_VERSION VERSION_GREATER 3.1.0 OR CMAKE_VERSION VERSION_EQUAL 3.1.0)
add_library(fmt-header-only INTERFACE)
target_compile_definitions(fmt-header-only INTERFACE FMT_HEADER_ONLY=1)
target_include_directories(fmt-header-only INTERFACE
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
$<INSTALL_INTERFACE:include>)
endif ()
# Install targets.
if (FMT_INSTALL)
include(CMakePackageConfigHelpers)
set(FMT_CMAKE_DIR lib/cmake/fmt CACHE STRING
"Installation directory for cmake files, relative to ${CMAKE_INSTALL_PREFIX}.")
set(version_config ${PROJECT_BINARY_DIR}/fmt-config-version.cmake)
set(project_config ${PROJECT_BINARY_DIR}/fmt-config.cmake)
set(targets_export_name fmt-targets)
set (INSTALL_TARGETS fmt)
if (TARGET fmt-header-only)
set(INSTALL_TARGETS ${INSTALL_TARGETS} fmt-header-only)
endif ()
set(FMT_LIB_DIR lib CACHE STRING
"Installation directory for libraries, relative to ${CMAKE_INSTALL_PREFIX}.")
# Generate the version, config and target files into the build directory.
write_basic_package_version_file(
${version_config}
VERSION ${FMT_VERSION}
COMPATIBILITY AnyNewerVersion)
configure_package_config_file(
${PROJECT_SOURCE_DIR}/support/cmake/fmt-config.cmake.in
${project_config}
INSTALL_DESTINATION ${FMT_CMAKE_DIR})
export(TARGETS ${INSTALL_TARGETS}
FILE ${PROJECT_BINARY_DIR}/${targets_export_name}.cmake)
# Install version, config and target files.
install(
FILES ${project_config} ${version_config}
DESTINATION ${FMT_CMAKE_DIR})
install(EXPORT ${targets_export_name} DESTINATION ${FMT_CMAKE_DIR})
# Install the library and headers.
install(TARGETS ${INSTALL_TARGETS} EXPORT ${targets_export_name}
DESTINATION ${FMT_LIB_DIR})
install(FILES ${FMT_HEADERS} DESTINATION include/fmt)
if (FMT_CPPFORMAT)
install(TARGETS cppformat DESTINATION ${FMT_LIB_DIR})
endif ()
endif ()

View File

@@ -1,556 +0,0 @@
/*
Formatting library for C++
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "format.h"
#include "printf.h"
#include <string.h>
#include <cctype>
#include <cerrno>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef> // for std::ptrdiff_t
#if defined(_WIN32) && defined(__MINGW32__)
# include <cstring>
#endif
#if FMT_USE_WINDOWS_H
# if defined(NOMINMAX) || defined(FMT_WIN_MINMAX)
# include <windows.h>
# else
# define NOMINMAX
# include <windows.h>
# undef NOMINMAX
# endif
#endif
using fmt::internal::Arg;
#if FMT_EXCEPTIONS
# define FMT_TRY try
# define FMT_CATCH(x) catch (x)
#else
# define FMT_TRY if (true)
# define FMT_CATCH(x) if (false)
#endif
#ifdef _MSC_VER
# pragma warning(push)
# pragma warning(disable: 4127) // conditional expression is constant
# pragma warning(disable: 4702) // unreachable code
// Disable deprecation warning for strerror. The latter is not called but
// MSVC fails to detect it.
# pragma warning(disable: 4996)
#endif
// Dummy implementations of strerror_r and strerror_s called if corresponding
// system functions are not available.
static inline fmt::internal::Null<> strerror_r(int, char *, ...) {
return fmt::internal::Null<>();
}
static inline fmt::internal::Null<> strerror_s(char *, std::size_t, ...) {
return fmt::internal::Null<>();
}
namespace fmt {
FMT_FUNC internal::RuntimeError::~RuntimeError() throw() {}
FMT_FUNC FormatError::~FormatError() throw() {}
FMT_FUNC SystemError::~SystemError() throw() {}
namespace {
#ifndef _MSC_VER
# define FMT_SNPRINTF snprintf
#else // _MSC_VER
inline int fmt_snprintf(char *buffer, size_t size, const char *format, ...) {
va_list args;
va_start(args, format);
int result = vsnprintf_s(buffer, size, _TRUNCATE, format, args);
va_end(args);
return result;
}
# define FMT_SNPRINTF fmt_snprintf
#endif // _MSC_VER
#if defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
# define FMT_SWPRINTF snwprintf
#else
# define FMT_SWPRINTF swprintf
#endif // defined(_WIN32) && defined(__MINGW32__) && !defined(__NO_ISOCEXT)
const char RESET_COLOR[] = "\x1b[0m";
typedef void (*FormatFunc)(Writer &, int, StringRef);
// Portable thread-safe version of strerror.
// Sets buffer to point to a string describing the error code.
// This can be either a pointer to a string stored in buffer,
// or a pointer to some static immutable string.
// Returns one of the following values:
// 0 - success
// ERANGE - buffer is not large enough to store the error message
// other - failure
// Buffer should be at least of size 1.
int safe_strerror(
int error_code, char *&buffer, std::size_t buffer_size) FMT_NOEXCEPT {
FMT_ASSERT(buffer != 0 && buffer_size != 0, "invalid buffer");
class StrError {
private:
int error_code_;
char *&buffer_;
std::size_t buffer_size_;
// A noop assignment operator to avoid bogus warnings.
void operator=(const StrError &) {}
// Handle the result of XSI-compliant version of strerror_r.
int handle(int result) {
// glibc versions before 2.13 return result in errno.
return result == -1 ? errno : result;
}
// Handle the result of GNU-specific version of strerror_r.
int handle(char *message) {
// If the buffer is full then the message is probably truncated.
if (message == buffer_ && strlen(buffer_) == buffer_size_ - 1)
return ERANGE;
buffer_ = message;
return 0;
}
// Handle the case when strerror_r is not available.
int handle(internal::Null<>) {
return fallback(strerror_s(buffer_, buffer_size_, error_code_));
}
// Fallback to strerror_s when strerror_r is not available.
int fallback(int result) {
// If the buffer is full then the message is probably truncated.
return result == 0 && strlen(buffer_) == buffer_size_ - 1 ?
ERANGE : result;
}
// Fallback to strerror if strerror_r and strerror_s are not available.
int fallback(internal::Null<>) {
errno = 0;
buffer_ = strerror(error_code_);
return errno;
}
public:
StrError(int err_code, char *&buf, std::size_t buf_size)
: error_code_(err_code), buffer_(buf), buffer_size_(buf_size) {}
int run() {
strerror_r(0, 0, ""); // Suppress a warning about unused strerror_r.
return handle(strerror_r(error_code_, buffer_, buffer_size_));
}
};
return StrError(error_code, buffer, buffer_size).run();
}
void format_error_code(Writer &out, int error_code,
StringRef message) FMT_NOEXCEPT {
// Report error code making sure that the output fits into
// INLINE_BUFFER_SIZE to avoid dynamic memory allocation and potential
// bad_alloc.
out.clear();
static const char SEP[] = ": ";
static const char ERROR_STR[] = "error ";
// Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
std::size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
typedef internal::IntTraits<int>::MainType MainType;
MainType abs_value = static_cast<MainType>(error_code);
if (internal::is_negative(error_code)) {
abs_value = 0 - abs_value;
++error_code_size;
}
error_code_size += internal::count_digits(abs_value);
if (message.size() <= internal::INLINE_BUFFER_SIZE - error_code_size)
out << message << SEP;
out << ERROR_STR << error_code;
assert(out.size() <= internal::INLINE_BUFFER_SIZE);
}
void report_error(FormatFunc func, int error_code,
StringRef message) FMT_NOEXCEPT {
MemoryWriter full_message;
func(full_message, error_code, message);
// Use Writer::data instead of Writer::c_str to avoid potential memory
// allocation.
std::fwrite(full_message.data(), full_message.size(), 1, stderr);
std::fputc('\n', stderr);
}
} // namespace
namespace internal {
// This method is used to preserve binary compatibility with fmt 3.0.
// It can be removed in 4.0.
FMT_FUNC void format_system_error(
Writer &out, int error_code, StringRef message) FMT_NOEXCEPT {
fmt::format_system_error(out, error_code, message);
}
} // namespace internal
FMT_FUNC void SystemError::init(
int err_code, CStringRef format_str, ArgList args) {
error_code_ = err_code;
MemoryWriter w;
format_system_error(w, err_code, format(format_str, args));
std::runtime_error &base = *this;
base = std::runtime_error(w.str());
}
template <typename T>
int internal::CharTraits<char>::format_float(
char *buffer, std::size_t size, const char *format,
unsigned width, int precision, T value) {
if (width == 0) {
return precision < 0 ?
FMT_SNPRINTF(buffer, size, format, value) :
FMT_SNPRINTF(buffer, size, format, precision, value);
}
return precision < 0 ?
FMT_SNPRINTF(buffer, size, format, width, value) :
FMT_SNPRINTF(buffer, size, format, width, precision, value);
}
template <typename T>
int internal::CharTraits<wchar_t>::format_float(
wchar_t *buffer, std::size_t size, const wchar_t *format,
unsigned width, int precision, T value) {
if (width == 0) {
return precision < 0 ?
FMT_SWPRINTF(buffer, size, format, value) :
FMT_SWPRINTF(buffer, size, format, precision, value);
}
return precision < 0 ?
FMT_SWPRINTF(buffer, size, format, width, value) :
FMT_SWPRINTF(buffer, size, format, width, precision, value);
}
template <typename T>
const char internal::BasicData<T>::DIGITS[] =
"0001020304050607080910111213141516171819"
"2021222324252627282930313233343536373839"
"4041424344454647484950515253545556575859"
"6061626364656667686970717273747576777879"
"8081828384858687888990919293949596979899";
#define FMT_POWERS_OF_10(factor) \
factor * 10, \
factor * 100, \
factor * 1000, \
factor * 10000, \
factor * 100000, \
factor * 1000000, \
factor * 10000000, \
factor * 100000000, \
factor * 1000000000
template <typename T>
const uint32_t internal::BasicData<T>::POWERS_OF_10_32[] = {
0, FMT_POWERS_OF_10(1)
};
template <typename T>
const uint64_t internal::BasicData<T>::POWERS_OF_10_64[] = {
0,
FMT_POWERS_OF_10(1),
FMT_POWERS_OF_10(ULongLong(1000000000)),
// Multiply several constants instead of using a single long long constant
// to avoid warnings about C++98 not supporting long long.
ULongLong(1000000000) * ULongLong(1000000000) * 10
};
FMT_FUNC void internal::report_unknown_type(char code, const char *type) {
(void)type;
if (std::isprint(static_cast<unsigned char>(code))) {
FMT_THROW(FormatError(
format("unknown format code '{}' for {}", code, type)));
}
FMT_THROW(FormatError(
format("unknown format code '\\x{:02x}' for {}",
static_cast<unsigned>(code), type)));
}
#if FMT_USE_WINDOWS_H
FMT_FUNC internal::UTF8ToUTF16::UTF8ToUTF16(StringRef s) {
static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
if (s.size() > INT_MAX)
FMT_THROW(WindowsError(ERROR_INVALID_PARAMETER, ERROR_MSG));
int s_size = static_cast<int>(s.size());
int length = MultiByteToWideChar(
CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, 0, 0);
if (length == 0)
FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
buffer_.resize(length + 1);
length = MultiByteToWideChar(
CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size, &buffer_[0], length);
if (length == 0)
FMT_THROW(WindowsError(GetLastError(), ERROR_MSG));
buffer_[length] = 0;
}
FMT_FUNC internal::UTF16ToUTF8::UTF16ToUTF8(WStringRef s) {
if (int error_code = convert(s)) {
FMT_THROW(WindowsError(error_code,
"cannot convert string from UTF-16 to UTF-8"));
}
}
FMT_FUNC int internal::UTF16ToUTF8::convert(WStringRef s) {
if (s.size() > INT_MAX)
return ERROR_INVALID_PARAMETER;
int s_size = static_cast<int>(s.size());
int length = WideCharToMultiByte(CP_UTF8, 0, s.data(), s_size, 0, 0, 0, 0);
if (length == 0)
return GetLastError();
buffer_.resize(length + 1);
length = WideCharToMultiByte(
CP_UTF8, 0, s.data(), s_size, &buffer_[0], length, 0, 0);
if (length == 0)
return GetLastError();
buffer_[length] = 0;
return 0;
}
FMT_FUNC void WindowsError::init(
int err_code, CStringRef format_str, ArgList args) {
error_code_ = err_code;
MemoryWriter w;
internal::format_windows_error(w, err_code, format(format_str, args));
std::runtime_error &base = *this;
base = std::runtime_error(w.str());
}
FMT_FUNC void internal::format_windows_error(
Writer &out, int error_code, StringRef message) FMT_NOEXCEPT {
FMT_TRY {
MemoryBuffer<wchar_t, INLINE_BUFFER_SIZE> buffer;
buffer.resize(INLINE_BUFFER_SIZE);
for (;;) {
wchar_t *system_message = &buffer[0];
int result = FormatMessageW(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
0, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
system_message, static_cast<uint32_t>(buffer.size()), 0);
if (result != 0) {
UTF16ToUTF8 utf8_message;
if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
out << message << ": " << utf8_message;
return;
}
break;
}
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER)
break; // Can't get error message, report error code instead.
buffer.resize(buffer.size() * 2);
}
} FMT_CATCH(...) {}
fmt::format_error_code(out, error_code, message); // 'fmt::' is for bcc32.
}
#endif // FMT_USE_WINDOWS_H
FMT_FUNC void format_system_error(
Writer &out, int error_code, StringRef message) FMT_NOEXCEPT {
FMT_TRY {
internal::MemoryBuffer<char, internal::INLINE_BUFFER_SIZE> buffer;
buffer.resize(internal::INLINE_BUFFER_SIZE);
for (;;) {
char *system_message = &buffer[0];
int result = safe_strerror(error_code, system_message, buffer.size());
if (result == 0) {
out << message << ": " << system_message;
return;
}
if (result != ERANGE)
break; // Can't get error message, report error code instead.
buffer.resize(buffer.size() * 2);
}
} FMT_CATCH(...) {}
fmt::format_error_code(out, error_code, message); // 'fmt::' is for bcc32.
}
template <typename Char>
void internal::ArgMap<Char>::init(const ArgList &args) {
if (!map_.empty())
return;
typedef internal::NamedArg<Char> NamedArg;
const NamedArg *named_arg = 0;
bool use_values =
args.type(ArgList::MAX_PACKED_ARGS - 1) == internal::Arg::NONE;
if (use_values) {
for (unsigned i = 0;/*nothing*/; ++i) {
internal::Arg::Type arg_type = args.type(i);
switch (arg_type) {
case internal::Arg::NONE:
return;
case internal::Arg::NAMED_ARG:
named_arg = static_cast<const NamedArg*>(args.values_[i].pointer);
map_.push_back(Pair(named_arg->name, *named_arg));
break;
default:
/*nothing*/;
}
}
return;
}
for (unsigned i = 0; i != ArgList::MAX_PACKED_ARGS; ++i) {
internal::Arg::Type arg_type = args.type(i);
if (arg_type == internal::Arg::NAMED_ARG) {
named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
map_.push_back(Pair(named_arg->name, *named_arg));
}
}
for (unsigned i = ArgList::MAX_PACKED_ARGS;/*nothing*/; ++i) {
switch (args.args_[i].type) {
case internal::Arg::NONE:
return;
case internal::Arg::NAMED_ARG:
named_arg = static_cast<const NamedArg*>(args.args_[i].pointer);
map_.push_back(Pair(named_arg->name, *named_arg));
break;
default:
/*nothing*/;
}
}
}
template <typename Char>
void internal::FixedBuffer<Char>::grow(std::size_t) {
FMT_THROW(std::runtime_error("buffer overflow"));
}
FMT_FUNC Arg internal::FormatterBase::do_get_arg(
unsigned arg_index, const char *&error) {
Arg arg = args_[arg_index];
switch (arg.type) {
case Arg::NONE:
error = "argument index out of range";
break;
case Arg::NAMED_ARG:
arg = *static_cast<const internal::Arg*>(arg.pointer);
break;
default:
/*nothing*/;
}
return arg;
}
FMT_FUNC void report_system_error(
int error_code, fmt::StringRef message) FMT_NOEXCEPT {
// 'fmt::' is for bcc32.
report_error(format_system_error, error_code, message);
}
#if FMT_USE_WINDOWS_H
FMT_FUNC void report_windows_error(
int error_code, fmt::StringRef message) FMT_NOEXCEPT {
// 'fmt::' is for bcc32.
report_error(internal::format_windows_error, error_code, message);
}
#endif
FMT_FUNC void print(std::FILE *f, CStringRef format_str, ArgList args) {
MemoryWriter w;
w.write(format_str, args);
std::fwrite(w.data(), 1, w.size(), f);
}
FMT_FUNC void print(CStringRef format_str, ArgList args) {
print(stdout, format_str, args);
}
FMT_FUNC void print_colored(Color c, CStringRef format, ArgList args) {
char escape[] = "\x1b[30m";
escape[3] = static_cast<char>('0' + c);
std::fputs(escape, stdout);
print(format, args);
std::fputs(RESET_COLOR, stdout);
}
template <typename Char>
void printf(BasicWriter<Char> &w, BasicCStringRef<Char> format, ArgList args);
FMT_FUNC int fprintf(std::FILE *f, CStringRef format, ArgList args) {
MemoryWriter w;
printf(w, format, args);
std::size_t size = w.size();
return std::fwrite(w.data(), 1, size, f) < size ? -1 : static_cast<int>(size);
}
#ifndef FMT_HEADER_ONLY
template struct internal::BasicData<void>;
// Explicit instantiations for char.
template void internal::FixedBuffer<char>::grow(std::size_t);
template void internal::ArgMap<char>::init(const ArgList &args);
template void PrintfFormatter<char>::format(CStringRef format);
template int internal::CharTraits<char>::format_float(
char *buffer, std::size_t size, const char *format,
unsigned width, int precision, double value);
template int internal::CharTraits<char>::format_float(
char *buffer, std::size_t size, const char *format,
unsigned width, int precision, long double value);
// Explicit instantiations for wchar_t.
template void internal::FixedBuffer<wchar_t>::grow(std::size_t);
template void internal::ArgMap<wchar_t>::init(const ArgList &args);
template void PrintfFormatter<wchar_t>::format(WCStringRef format);
template int internal::CharTraits<wchar_t>::format_float(
wchar_t *buffer, std::size_t size, const wchar_t *format,
unsigned width, int precision, double value);
template int internal::CharTraits<wchar_t>::format_float(
wchar_t *buffer, std::size_t size, const wchar_t *format,
unsigned width, int precision, long double value);
#endif // FMT_HEADER_ONLY
} // namespace fmt
#ifdef _MSC_VER
# pragma warning(pop)
#endif

File diff suppressed because it is too large

View File

@@ -1,35 +0,0 @@
/*
Formatting library for C++ - std::ostream support
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
#include "ostream.h"
namespace fmt {
namespace internal {
FMT_FUNC void write(std::ostream &os, Writer &w) {
const char *data = w.data();
typedef internal::MakeUnsigned<std::streamsize>::Type UnsignedStreamSize;
UnsignedStreamSize size = w.size();
UnsignedStreamSize max_size =
internal::to_unsigned((std::numeric_limits<std::streamsize>::max)());
do {
UnsignedStreamSize n = size <= max_size ? size : max_size;
os.write(data, static_cast<std::streamsize>(n));
data += n;
size -= n;
} while (size != 0);
}
}
FMT_FUNC void print(std::ostream &os, CStringRef format_str, ArgList args) {
MemoryWriter w;
w.write(format_str, args);
internal::write(os, w);
}
} // namespace fmt

View File

@@ -1,106 +0,0 @@
/*
Formatting library for C++ - std::ostream support
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
#ifndef FMT_OSTREAM_H_
#define FMT_OSTREAM_H_
#include "format.h"
#include <ostream>
namespace fmt {
namespace internal {
template <class Char>
class FormatBuf : public std::basic_streambuf<Char> {
private:
typedef typename std::basic_streambuf<Char>::int_type int_type;
typedef typename std::basic_streambuf<Char>::traits_type traits_type;
Buffer<Char> &buffer_;
Char *start_;
public:
FormatBuf(Buffer<Char> &buffer) : buffer_(buffer), start_(&buffer[0]) {
this->setp(start_, start_ + buffer_.capacity());
}
int_type overflow(int_type ch = traits_type::eof()) {
if (!traits_type::eq_int_type(ch, traits_type::eof())) {
size_t buf_size = size();
buffer_.resize(buf_size);
buffer_.reserve(buf_size * 2);
start_ = &buffer_[0];
start_[buf_size] = traits_type::to_char_type(ch);
this->setp(start_+ buf_size + 1, start_ + buf_size * 2);
}
return ch;
}
size_t size() const {
return to_unsigned(this->pptr() - start_);
}
};
Yes &convert(std::ostream &);
struct DummyStream : std::ostream {
DummyStream(); // Suppress a bogus warning in MSVC.
// Hide all operator<< overloads from std::ostream.
void operator<<(Null<>);
};
No &operator<<(std::ostream &, int);
template<typename T>
struct ConvertToIntImpl<T, true> {
// Convert to int only if T doesn't have an overloaded operator<<.
enum {
value = sizeof(convert(get<DummyStream>() << get<T>())) == sizeof(No)
};
};
// Write the content of w to os.
void write(std::ostream &os, Writer &w);
} // namespace internal
// Formats a value.
template <typename Char, typename ArgFormatter, typename T>
void format_arg(BasicFormatter<Char, ArgFormatter> &f,
const Char *&format_str, const T &value) {
internal::MemoryBuffer<Char, internal::INLINE_BUFFER_SIZE> buffer;
internal::FormatBuf<Char> format_buf(buffer);
std::basic_ostream<Char> output(&format_buf);
output << value;
BasicStringRef<Char> str(&buffer[0], format_buf.size());
typedef internal::MakeArg< BasicFormatter<Char> > MakeArg;
format_str = f.format(format_str, MakeArg(str));
}
/**
\rst
Prints formatted data to the stream *os*.
**Example**::
print(cerr, "Don't {}!", "panic");
\endrst
*/
FMT_API void print(std::ostream &os, CStringRef format_str, ArgList args);
FMT_VARIADIC(void, print, std::ostream &, CStringRef)
} // namespace fmt
#ifdef FMT_HEADER_ONLY
# include "ostream.cc"
#endif
#endif // FMT_OSTREAM_H_
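The `format_arg` overload above is what makes any type with an `operator<<` formattable: the value is streamed into a `FormatBuf`, and the resulting characters are handed back to the formatter as a string argument, while the `ConvertToIntImpl` specialization stops such types from being misinterpreted as integers. A minimal sketch, assuming the bundled fmt headers:
```
#include <ostream>
#include "format.h"
#include "ostream.h"  // include paths are an assumption

struct Point { int x, y; };

// Any type with an operator<< is routed through the format_arg overload above.
std::ostream &operator<<(std::ostream &os, const Point &p) {
  return os << "(" << p.x << ", " << p.y << ")";
}

int main() {
  Point p = {3, 4};
  fmt::print("p = {}\n", p);  // prints "p = (3, 4)"
  return 0;
}
```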

View File

@@ -1,238 +0,0 @@
/*
A C++ interface to POSIX functions.
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
// Disable bogus MSVC warnings.
#ifndef _CRT_SECURE_NO_WARNINGS
# define _CRT_SECURE_NO_WARNINGS
#endif
#include "posix.h"
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifndef _WIN32
# include <unistd.h>
#else
# include <windows.h>
# include <io.h>
# define O_CREAT _O_CREAT
# define O_TRUNC _O_TRUNC
# ifndef S_IRUSR
# define S_IRUSR _S_IREAD
# endif
# ifndef S_IWUSR
# define S_IWUSR _S_IWRITE
# endif
# ifdef __MINGW32__
# define _SH_DENYNO 0x40
# endif
#endif // _WIN32
#ifdef fileno
# undef fileno
#endif
namespace {
#ifdef _WIN32
// Return type of read and write functions.
typedef int RWResult;
// On Windows the count argument to read and write is unsigned, so convert
// it from size_t to prevent integer overflow.
inline unsigned convert_rwcount(std::size_t count) {
return count <= UINT_MAX ? static_cast<unsigned>(count) : UINT_MAX;
}
#else
// Return type of read and write functions.
typedef ssize_t RWResult;
inline std::size_t convert_rwcount(std::size_t count) { return count; }
#endif
}
fmt::BufferedFile::~BufferedFile() FMT_NOEXCEPT {
if (file_ && FMT_SYSTEM(fclose(file_)) != 0)
fmt::report_system_error(errno, "cannot close file");
}
fmt::BufferedFile::BufferedFile(
fmt::CStringRef filename, fmt::CStringRef mode) {
FMT_RETRY_VAL(file_, FMT_SYSTEM(fopen(filename.c_str(), mode.c_str())), 0);
if (!file_)
throw SystemError(errno, "cannot open file {}", filename);
}
void fmt::BufferedFile::close() {
if (!file_)
return;
int result = FMT_SYSTEM(fclose(file_));
file_ = 0;
if (result != 0)
throw SystemError(errno, "cannot close file");
}
// A macro used to prevent expansion of fileno on broken versions of MinGW.
#define FMT_ARGS
int fmt::BufferedFile::fileno() const {
int fd = FMT_POSIX_CALL(fileno FMT_ARGS(file_));
if (fd == -1)
throw SystemError(errno, "cannot get file descriptor");
return fd;
}
fmt::File::File(fmt::CStringRef path, int oflag) {
int mode = S_IRUSR | S_IWUSR;
#if defined(_WIN32) && !defined(__MINGW32__)
fd_ = -1;
FMT_POSIX_CALL(sopen_s(&fd_, path.c_str(), oflag, _SH_DENYNO, mode));
#else
FMT_RETRY(fd_, FMT_POSIX_CALL(open(path.c_str(), oflag, mode)));
#endif
if (fd_ == -1)
throw SystemError(errno, "cannot open file {}", path);
}
fmt::File::~File() FMT_NOEXCEPT {
// Don't retry close in case of EINTR!
// See http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-09/3000.html
if (fd_ != -1 && FMT_POSIX_CALL(close(fd_)) != 0)
fmt::report_system_error(errno, "cannot close file");
}
void fmt::File::close() {
if (fd_ == -1)
return;
// Don't retry close in case of EINTR!
// See http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-09/3000.html
int result = FMT_POSIX_CALL(close(fd_));
fd_ = -1;
if (result != 0)
throw SystemError(errno, "cannot close file");
}
fmt::LongLong fmt::File::size() const {
#ifdef _WIN32
// Use GetFileSize instead of GetFileSizeEx for the case when _WIN32_WINNT
// is less than 0x0500 as is the case with some default MinGW builds.
// Both functions support large file sizes.
DWORD size_upper = 0;
HANDLE handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd_));
DWORD size_lower = FMT_SYSTEM(GetFileSize(handle, &size_upper));
if (size_lower == INVALID_FILE_SIZE) {
DWORD error = GetLastError();
if (error != NO_ERROR)
throw WindowsError(GetLastError(), "cannot get file size");
}
fmt::ULongLong long_size = size_upper;
return (long_size << sizeof(DWORD) * CHAR_BIT) | size_lower;
#else
typedef struct stat Stat;
Stat file_stat = Stat();
if (FMT_POSIX_CALL(fstat(fd_, &file_stat)) == -1)
throw SystemError(errno, "cannot get file attributes");
FMT_STATIC_ASSERT(sizeof(fmt::LongLong) >= sizeof(file_stat.st_size),
"return type of File::size is not large enough");
return file_stat.st_size;
#endif
}
std::size_t fmt::File::read(void *buffer, std::size_t count) {
RWResult result = 0;
FMT_RETRY(result, FMT_POSIX_CALL(read(fd_, buffer, convert_rwcount(count))));
if (result < 0)
throw SystemError(errno, "cannot read from file");
return internal::to_unsigned(result);
}
std::size_t fmt::File::write(const void *buffer, std::size_t count) {
RWResult result = 0;
FMT_RETRY(result, FMT_POSIX_CALL(write(fd_, buffer, convert_rwcount(count))));
if (result < 0)
throw SystemError(errno, "cannot write to file");
return internal::to_unsigned(result);
}
fmt::File fmt::File::dup(int fd) {
// Don't retry as dup doesn't return EINTR.
// http://pubs.opengroup.org/onlinepubs/009695399/functions/dup.html
int new_fd = FMT_POSIX_CALL(dup(fd));
if (new_fd == -1)
throw SystemError(errno, "cannot duplicate file descriptor {}", fd);
return File(new_fd);
}
void fmt::File::dup2(int fd) {
int result = 0;
FMT_RETRY(result, FMT_POSIX_CALL(dup2(fd_, fd)));
if (result == -1) {
throw SystemError(errno,
"cannot duplicate file descriptor {} to {}", fd_, fd);
}
}
void fmt::File::dup2(int fd, ErrorCode &ec) FMT_NOEXCEPT {
int result = 0;
FMT_RETRY(result, FMT_POSIX_CALL(dup2(fd_, fd)));
if (result == -1)
ec = ErrorCode(errno);
}
void fmt::File::pipe(File &read_end, File &write_end) {
// Close the descriptors first to make sure that assignments don't throw
// and there are no leaks.
read_end.close();
write_end.close();
int fds[2] = {};
#ifdef _WIN32
// Make the default pipe capacity the same as on Linux 2.6.11+.
enum { DEFAULT_CAPACITY = 65536 };
int result = FMT_POSIX_CALL(pipe(fds, DEFAULT_CAPACITY, _O_BINARY));
#else
// Don't retry as the pipe function doesn't return EINTR.
// http://pubs.opengroup.org/onlinepubs/009696799/functions/pipe.html
int result = FMT_POSIX_CALL(pipe(fds));
#endif
if (result != 0)
throw SystemError(errno, "cannot create pipe");
// The following assignments don't throw because read_end and write_end
// are closed.
read_end = File(fds[0]);
write_end = File(fds[1]);
}
fmt::BufferedFile fmt::File::fdopen(const char *mode) {
// Don't retry as fdopen doesn't return EINTR.
FILE *f = FMT_POSIX_CALL(fdopen(fd_, mode));
if (!f)
throw SystemError(errno, "cannot associate stream with file descriptor");
BufferedFile file(f);
fd_ = -1;
return file;
}
long fmt::getpagesize() {
#ifdef _WIN32
SYSTEM_INFO si;
GetSystemInfo(&si);
return si.dwPageSize;
#else
long size = FMT_POSIX_CALL(sysconf(_SC_PAGESIZE));
if (size < 0)
throw SystemError(errno, "cannot get memory page size");
return size;
#endif
}
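As a sketch of how these wrappers compose: `File::pipe` creates both ends (closing any previously held descriptors), `fdopen` detaches the write end into a stdio-backed `BufferedFile`, and the EINTR-retrying `read` pulls the data back. The include path and buffer size below are assumptions:
```
#include "posix.h"  // bundled fmt header; the include path is an assumption

int main() {
  fmt::File read_end, write_end;
  fmt::File::pipe(read_end, write_end);           // create both ends of a pipe

  fmt::BufferedFile out = write_end.fdopen("w");  // detach the fd into a FILE*
  out.print("answer = {}\n", 42);
  out.close();                                    // flush and close the write end

  char buf[64] = {0};
  read_end.read(buf, sizeof(buf) - 1);            // EINTR-safe read
  fmt::print("got: {}", buf);
  return 0;
}
```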

View File

@@ -1,367 +0,0 @@
/*
A C++ interface to POSIX functions.
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
#ifndef FMT_POSIX_H_
#define FMT_POSIX_H_
#if defined(__MINGW32__) || defined(__CYGWIN__)
// Work around MinGW bug https://sourceforge.net/p/mingw/bugs/2024/.
# undef __STRICT_ANSI__
#endif
#include <errno.h>
#include <fcntl.h> // for O_RDONLY
#include <locale.h> // for locale_t
#include <stdio.h>
#include <stdlib.h> // for strtod_l
#include <cstddef>
#if defined __APPLE__ || defined(__FreeBSD__)
# include <xlocale.h> // for LC_NUMERIC_MASK on OS X
#endif
#include "format.h"
#ifndef FMT_POSIX
# if defined(_WIN32) && !defined(__MINGW32__)
// Fix warnings about deprecated symbols.
# define FMT_POSIX(call) _##call
# else
# define FMT_POSIX(call) call
# endif
#endif
// Calls to system functions are wrapped in FMT_SYSTEM for testability.
#ifdef FMT_SYSTEM
# define FMT_POSIX_CALL(call) FMT_SYSTEM(call)
#else
# define FMT_SYSTEM(call) call
# ifdef _WIN32
// Fix warnings about deprecated symbols.
# define FMT_POSIX_CALL(call) ::_##call
# else
# define FMT_POSIX_CALL(call) ::call
# endif
#endif
// Retries the expression while it evaluates to error_result and errno
// equals EINTR.
#ifndef _WIN32
# define FMT_RETRY_VAL(result, expression, error_result) \
do { \
result = (expression); \
} while (result == error_result && errno == EINTR)
#else
# define FMT_RETRY_VAL(result, expression, error_result) result = (expression)
#endif
#define FMT_RETRY(result, expression) FMT_RETRY_VAL(result, expression, -1)
namespace fmt {
// An error code.
class ErrorCode {
private:
int value_;
public:
explicit ErrorCode(int value = 0) FMT_NOEXCEPT : value_(value) {}
int get() const FMT_NOEXCEPT { return value_; }
};
// A buffered file.
class BufferedFile {
private:
FILE *file_;
friend class File;
explicit BufferedFile(FILE *f) : file_(f) {}
public:
// Constructs a BufferedFile object which doesn't represent any file.
BufferedFile() FMT_NOEXCEPT : file_(0) {}
// Destroys the object closing the file it represents if any.
~BufferedFile() FMT_NOEXCEPT;
#if !FMT_USE_RVALUE_REFERENCES
// Emulate a move constructor and a move assignment operator if rvalue
// references are not supported.
private:
// A proxy object to emulate a move constructor.
// It is private to make it impossible to call operator Proxy directly.
struct Proxy {
FILE *file;
};
public:
// A "move constructor" for moving from a temporary.
BufferedFile(Proxy p) FMT_NOEXCEPT : file_(p.file) {}
// A "move constructor" for moving from an lvalue.
BufferedFile(BufferedFile &f) FMT_NOEXCEPT : file_(f.file_) {
f.file_ = 0;
}
// A "move assignment operator" for moving from a temporary.
BufferedFile &operator=(Proxy p) {
close();
file_ = p.file;
return *this;
}
// A "move assignment operator" for moving from an lvalue.
BufferedFile &operator=(BufferedFile &other) {
close();
file_ = other.file_;
other.file_ = 0;
return *this;
}
// Returns a proxy object for moving from a temporary:
// BufferedFile file = BufferedFile(...);
operator Proxy() FMT_NOEXCEPT {
Proxy p = {file_};
file_ = 0;
return p;
}
#else
private:
FMT_DISALLOW_COPY_AND_ASSIGN(BufferedFile);
public:
BufferedFile(BufferedFile &&other) FMT_NOEXCEPT : file_(other.file_) {
other.file_ = 0;
}
BufferedFile& operator=(BufferedFile &&other) {
close();
file_ = other.file_;
other.file_ = 0;
return *this;
}
#endif
// Opens a file.
BufferedFile(CStringRef filename, CStringRef mode);
// Closes the file.
void close();
// Returns the pointer to a FILE object representing this file.
FILE *get() const FMT_NOEXCEPT { return file_; }
// We place parentheses around fileno to work around a bug in some versions
// of MinGW that define fileno as a macro.
int (fileno)() const;
void print(CStringRef format_str, const ArgList &args) {
fmt::print(file_, format_str, args);
}
FMT_VARIADIC(void, print, CStringRef)
};
// A file. Closed file is represented by a File object with descriptor -1.
// Methods that are not declared with FMT_NOEXCEPT may throw
// fmt::SystemError in case of failure. Note that some errors such as
// closing the file multiple times will cause a crash on Windows rather
// than an exception. You can get standard behavior by overriding the
// invalid parameter handler with _set_invalid_parameter_handler.
class File {
private:
int fd_; // File descriptor.
// Constructs a File object with a given descriptor.
explicit File(int fd) : fd_(fd) {}
public:
// Possible values for the oflag argument to the constructor.
enum {
RDONLY = FMT_POSIX(O_RDONLY), // Open for reading only.
WRONLY = FMT_POSIX(O_WRONLY), // Open for writing only.
RDWR = FMT_POSIX(O_RDWR) // Open for reading and writing.
};
// Constructs a File object which doesn't represent any file.
File() FMT_NOEXCEPT : fd_(-1) {}
// Opens a file and constructs a File object representing this file.
File(CStringRef path, int oflag);
#if !FMT_USE_RVALUE_REFERENCES
// Emulate a move constructor and a move assignment operator if rvalue
// references are not supported.
private:
// A proxy object to emulate a move constructor.
// It is private to make it impossible to call operator Proxy directly.
struct Proxy {
int fd;
};
public:
// A "move constructor" for moving from a temporary.
File(Proxy p) FMT_NOEXCEPT : fd_(p.fd) {}
// A "move constructor" for moving from an lvalue.
File(File &other) FMT_NOEXCEPT : fd_(other.fd_) {
other.fd_ = -1;
}
// A "move assignment operator" for moving from a temporary.
File &operator=(Proxy p) {
close();
fd_ = p.fd;
return *this;
}
// A "move assignment operator" for moving from an lvalue.
File &operator=(File &other) {
close();
fd_ = other.fd_;
other.fd_ = -1;
return *this;
}
// Returns a proxy object for moving from a temporary:
// File file = File(...);
operator Proxy() FMT_NOEXCEPT {
Proxy p = {fd_};
fd_ = -1;
return p;
}
#else
private:
FMT_DISALLOW_COPY_AND_ASSIGN(File);
public:
File(File &&other) FMT_NOEXCEPT : fd_(other.fd_) {
other.fd_ = -1;
}
File& operator=(File &&other) {
close();
fd_ = other.fd_;
other.fd_ = -1;
return *this;
}
#endif
// Destroys the object closing the file it represents if any.
~File() FMT_NOEXCEPT;
// Returns the file descriptor.
int descriptor() const FMT_NOEXCEPT { return fd_; }
// Closes the file.
void close();
// Returns the file size. The size has signed type for consistency with
// stat::st_size.
LongLong size() const;
// Attempts to read count bytes from the file into the specified buffer.
std::size_t read(void *buffer, std::size_t count);
// Attempts to write count bytes from the specified buffer to the file.
std::size_t write(const void *buffer, std::size_t count);
// Duplicates a file descriptor with the dup function and returns
// the duplicate as a file object.
static File dup(int fd);
// Makes fd be a copy of this file descriptor, closing fd first if
// necessary.
void dup2(int fd);
// Makes fd be a copy of this file descriptor, closing fd first if
// necessary.
void dup2(int fd, ErrorCode &ec) FMT_NOEXCEPT;
// Creates a pipe setting up read_end and write_end file objects for reading
// and writing respectively.
static void pipe(File &read_end, File &write_end);
// Creates a BufferedFile object associated with this file and detaches
// this File object from the file.
BufferedFile fdopen(const char *mode);
};
// Returns the memory page size.
long getpagesize();
#if (defined(LC_NUMERIC_MASK) || defined(_MSC_VER)) && \
!defined(__ANDROID__) && !defined(__CYGWIN__)
# define FMT_LOCALE
#endif
#ifdef FMT_LOCALE
// A "C" numeric locale.
class Locale {
private:
# ifdef _MSC_VER
typedef _locale_t locale_t;
enum { LC_NUMERIC_MASK = LC_NUMERIC };
static locale_t newlocale(int category_mask, const char *locale, locale_t) {
return _create_locale(category_mask, locale);
}
static void freelocale(locale_t locale) {
_free_locale(locale);
}
static double strtod_l(const char *nptr, char **endptr, _locale_t locale) {
return _strtod_l(nptr, endptr, locale);
}
# endif
locale_t locale_;
FMT_DISALLOW_COPY_AND_ASSIGN(Locale);
public:
typedef locale_t Type;
Locale() : locale_(newlocale(LC_NUMERIC_MASK, "C", NULL)) {
if (!locale_)
throw fmt::SystemError(errno, "cannot create locale");
}
~Locale() { freelocale(locale_); }
Type get() const { return locale_; }
// Converts string to floating-point number and advances str past the end
// of the parsed input.
double strtod(const char *&str) const {
char *end = 0;
double result = strtod_l(str, &end, locale_);
str = end;
return result;
}
};
#endif // FMT_LOCALE
} // namespace fmt
#if !FMT_USE_RVALUE_REFERENCES
namespace std {
// For compatibility with C++98.
inline fmt::BufferedFile &move(fmt::BufferedFile &f) { return f; }
inline fmt::File &move(fmt::File &f) { return f; }
}
#endif
#endif // FMT_POSIX_H_
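The `Proxy` machinery above emulates move semantics in C++98, which is why a `BufferedFile` or `File` can be returned by value even without rvalue references (the same idiom `File::dup` relies on). A minimal sketch, assuming a writable working directory and the bundled header path:
```
#include "posix.h"  // bundled fmt header; the include path is an assumption

// Returning by value works even pre-C++11: the temporary converts to the
// private Proxy type, whose constructor transfers ownership of the FILE*.
fmt::BufferedFile open_log(const char *path) {
  return fmt::BufferedFile(path, "w");
}

int main() {
  fmt::BufferedFile log = open_log("test.log");
  log.print("page size is {}\n", fmt::getpagesize());
  return 0;  // ~BufferedFile closes the file
}
```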

View File

@@ -1,558 +0,0 @@
/*
Formatting library for C++
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
#ifndef FMT_PRINTF_H_
#define FMT_PRINTF_H_
#include <algorithm> // std::fill_n
#include <limits> // std::numeric_limits
#include "ostream.h"
namespace fmt {
namespace internal {
// Checks if a value fits in int - used to avoid warnings about comparing
// signed and unsigned integers.
template <bool IsSigned>
struct IntChecker {
template <typename T>
static bool fits_in_int(T value) {
unsigned max = std::numeric_limits<int>::max();
return value <= max;
}
static bool fits_in_int(bool) { return true; }
};
template <>
struct IntChecker<true> {
template <typename T>
static bool fits_in_int(T value) {
return value >= std::numeric_limits<int>::min() &&
value <= std::numeric_limits<int>::max();
}
static bool fits_in_int(int) { return true; }
};
class PrecisionHandler : public ArgVisitor<PrecisionHandler, int> {
public:
void report_unhandled_arg() {
FMT_THROW(FormatError("precision is not integer"));
}
template <typename T>
int visit_any_int(T value) {
if (!IntChecker<std::numeric_limits<T>::is_signed>::fits_in_int(value))
FMT_THROW(FormatError("number is too big"));
return static_cast<int>(value);
}
};
// IsZeroInt::visit(arg) returns true iff arg is a zero integer.
class IsZeroInt : public ArgVisitor<IsZeroInt, bool> {
public:
template <typename T>
bool visit_any_int(T value) { return value == 0; }
};
template <typename T, typename U>
struct is_same {
enum { value = 0 };
};
template <typename T>
struct is_same<T, T> {
enum { value = 1 };
};
// An argument visitor that converts an integer argument to T for printf,
// if T is an integral type. If T is void, the argument is converted to
// the corresponding signed or unsigned type depending on the type specifier:
// 'd' and 'i' - signed, other - unsigned.
template <typename T = void>
class ArgConverter : public ArgVisitor<ArgConverter<T>, void> {
private:
internal::Arg &arg_;
wchar_t type_;
FMT_DISALLOW_COPY_AND_ASSIGN(ArgConverter);
public:
ArgConverter(internal::Arg &arg, wchar_t type)
: arg_(arg), type_(type) {}
void visit_bool(bool value) {
if (type_ != 's')
visit_any_int(value);
}
template <typename U>
void visit_any_int(U value) {
bool is_signed = type_ == 'd' || type_ == 'i';
using internal::Arg;
typedef typename internal::Conditional<
is_same<T, void>::value, U, T>::type TargetType;
if (sizeof(TargetType) <= sizeof(int)) {
// Extra casts are used to silence warnings.
if (is_signed) {
arg_.type = Arg::INT;
arg_.int_value = static_cast<int>(static_cast<TargetType>(value));
} else {
arg_.type = Arg::UINT;
typedef typename internal::MakeUnsigned<TargetType>::Type Unsigned;
arg_.uint_value = static_cast<unsigned>(static_cast<Unsigned>(value));
}
} else {
if (is_signed) {
arg_.type = Arg::LONG_LONG;
// glibc's printf doesn't sign extend arguments of smaller types:
// std::printf("%lld", -42); // prints "4294967254"
// but we don't have to do the same because it's UB.
arg_.long_long_value = static_cast<LongLong>(value);
} else {
arg_.type = Arg::ULONG_LONG;
arg_.ulong_long_value =
static_cast<typename internal::MakeUnsigned<U>::Type>(value);
}
}
}
};
// Converts an integer argument to char for printf.
class CharConverter : public ArgVisitor<CharConverter, void> {
private:
internal::Arg &arg_;
FMT_DISALLOW_COPY_AND_ASSIGN(CharConverter);
public:
explicit CharConverter(internal::Arg &arg) : arg_(arg) {}
template <typename T>
void visit_any_int(T value) {
arg_.type = internal::Arg::CHAR;
arg_.int_value = static_cast<char>(value);
}
};
// Checks if an argument is a valid printf width specifier and sets
// left alignment if it is negative.
class WidthHandler : public ArgVisitor<WidthHandler, unsigned> {
private:
FormatSpec &spec_;
FMT_DISALLOW_COPY_AND_ASSIGN(WidthHandler);
public:
explicit WidthHandler(FormatSpec &spec) : spec_(spec) {}
void report_unhandled_arg() {
FMT_THROW(FormatError("width is not integer"));
}
template <typename T>
unsigned visit_any_int(T value) {
typedef typename internal::IntTraits<T>::MainType UnsignedType;
UnsignedType width = static_cast<UnsignedType>(value);
if (internal::is_negative(value)) {
spec_.align_ = ALIGN_LEFT;
width = 0 - width;
}
unsigned int_max = std::numeric_limits<int>::max();
if (width > int_max)
FMT_THROW(FormatError("number is too big"));
return static_cast<unsigned>(width);
}
};
} // namespace internal
/**
\rst
A ``printf`` argument formatter based on the `curiously recurring template
pattern <http://en.wikipedia.org/wiki/Curiously_recurring_template_pattern>`_.
To use `~fmt::BasicPrintfArgFormatter` define a subclass that implements some
or all of the visit methods with the same signatures as the methods in
`~fmt::ArgVisitor`, for example, `~fmt::ArgVisitor::visit_int()`.
Pass the subclass as the *Impl* template parameter. When a formatting
function processes an argument, it will dispatch to a visit method
specific to the argument type. For example, if the argument type is
``double`` then the `~fmt::ArgVisitor::visit_double()` method of a subclass
will be called. If the subclass doesn't contain a method with this signature,
then a corresponding method of `~fmt::BasicPrintfArgFormatter` or its
superclass will be called.
\endrst
*/
template <typename Impl, typename Char>
class BasicPrintfArgFormatter : public internal::ArgFormatterBase<Impl, Char> {
private:
void write_null_pointer() {
this->spec().type_ = 0;
this->write("(nil)");
}
typedef internal::ArgFormatterBase<Impl, Char> Base;
public:
/**
\rst
Constructs an argument formatter object.
*writer* is a reference to the output writer and *spec* contains format
specifier information for standard argument types.
\endrst
*/
BasicPrintfArgFormatter(BasicWriter<Char> &writer, FormatSpec &spec)
: internal::ArgFormatterBase<Impl, Char>(writer, spec) {}
/** Formats an argument of type ``bool``. */
void visit_bool(bool value) {
FormatSpec &fmt_spec = this->spec();
if (fmt_spec.type_ != 's')
return this->visit_any_int(value);
fmt_spec.type_ = 0;
this->write(value);
}
/** Formats a character. */
void visit_char(int value) {
const FormatSpec &fmt_spec = this->spec();
BasicWriter<Char> &w = this->writer();
if (fmt_spec.type_ && fmt_spec.type_ != 'c')
w.write_int(value, fmt_spec);
typedef typename BasicWriter<Char>::CharPtr CharPtr;
CharPtr out = CharPtr();
if (fmt_spec.width_ > 1) {
Char fill = ' ';
out = w.grow_buffer(fmt_spec.width_);
if (fmt_spec.align_ != ALIGN_LEFT) {
std::fill_n(out, fmt_spec.width_ - 1, fill);
out += fmt_spec.width_ - 1;
} else {
std::fill_n(out + 1, fmt_spec.width_ - 1, fill);
}
} else {
out = w.grow_buffer(1);
}
*out = static_cast<Char>(value);
}
/** Formats a null-terminated C string. */
void visit_cstring(const char *value) {
if (value)
Base::visit_cstring(value);
else if (this->spec().type_ == 'p')
write_null_pointer();
else
this->write("(null)");
}
/** Formats a pointer. */
void visit_pointer(const void *value) {
if (value)
return Base::visit_pointer(value);
this->spec().type_ = 0;
write_null_pointer();
}
/** Formats an argument of a custom (user-defined) type. */
void visit_custom(internal::Arg::CustomValue c) {
BasicFormatter<Char> formatter(ArgList(), this->writer());
const Char format_str[] = {'}', 0};
const Char *format = format_str;
c.format(&formatter, c.value, &format);
}
};
/** The default printf argument formatter. */
template <typename Char>
class PrintfArgFormatter
: public BasicPrintfArgFormatter<PrintfArgFormatter<Char>, Char> {
public:
/** Constructs an argument formatter object. */
PrintfArgFormatter(BasicWriter<Char> &w, FormatSpec &s)
: BasicPrintfArgFormatter<PrintfArgFormatter<Char>, Char>(w, s) {}
};
/** This template formats data and writes the output to a writer. */
template <typename Char, typename ArgFormatter = PrintfArgFormatter<Char> >
class PrintfFormatter : private internal::FormatterBase {
private:
BasicWriter<Char> &writer_;
void parse_flags(FormatSpec &spec, const Char *&s);
// Returns the argument with specified index or, if arg_index is equal
// to the maximum unsigned value, the next argument.
internal::Arg get_arg(
const Char *s,
unsigned arg_index = (std::numeric_limits<unsigned>::max)());
// Parses argument index, flags and width and returns the argument index.
unsigned parse_header(const Char *&s, FormatSpec &spec);
public:
/**
\rst
Constructs a ``PrintfFormatter`` object. References to the arguments and
the writer are stored in the formatter object so make sure they have
appropriate lifetimes.
\endrst
*/
explicit PrintfFormatter(const ArgList &args, BasicWriter<Char> &w)
: FormatterBase(args), writer_(w) {}
/** Formats stored arguments and writes the output to the writer. */
FMT_API void format(BasicCStringRef<Char> format_str);
};
template <typename Char, typename AF>
void PrintfFormatter<Char, AF>::parse_flags(FormatSpec &spec, const Char *&s) {
for (;;) {
switch (*s++) {
case '-':
spec.align_ = ALIGN_LEFT;
break;
case '+':
spec.flags_ |= SIGN_FLAG | PLUS_FLAG;
break;
case '0':
spec.fill_ = '0';
break;
case ' ':
spec.flags_ |= SIGN_FLAG;
break;
case '#':
spec.flags_ |= HASH_FLAG;
break;
default:
--s;
return;
}
}
}
template <typename Char, typename AF>
internal::Arg PrintfFormatter<Char, AF>::get_arg(const Char *s,
unsigned arg_index) {
(void)s;
const char *error = 0;
internal::Arg arg = arg_index == std::numeric_limits<unsigned>::max() ?
next_arg(error) : FormatterBase::get_arg(arg_index - 1, error);
if (error)
FMT_THROW(FormatError(!*s ? "invalid format string" : error));
return arg;
}
template <typename Char, typename AF>
unsigned PrintfFormatter<Char, AF>::parse_header(
const Char *&s, FormatSpec &spec) {
unsigned arg_index = std::numeric_limits<unsigned>::max();
Char c = *s;
if (c >= '0' && c <= '9') {
// Parse an argument index (if followed by '$') or a width possibly
// preceded with '0' flag(s).
unsigned value = internal::parse_nonnegative_int(s);
if (*s == '$') { // value is an argument index
++s;
arg_index = value;
} else {
if (c == '0')
spec.fill_ = '0';
if (value != 0) {
// Nonzero value means that we parsed width and don't need to
// parse it or flags again, so return now.
spec.width_ = value;
return arg_index;
}
}
}
parse_flags(spec, s);
// Parse width.
if (*s >= '0' && *s <= '9') {
spec.width_ = internal::parse_nonnegative_int(s);
} else if (*s == '*') {
++s;
spec.width_ = internal::WidthHandler(spec).visit(get_arg(s));
}
return arg_index;
}
template <typename Char, typename AF>
void PrintfFormatter<Char, AF>::format(BasicCStringRef<Char> format_str) {
const Char *start = format_str.c_str();
const Char *s = start;
while (*s) {
Char c = *s++;
if (c != '%') continue;
if (*s == c) {
write(writer_, start, s);
start = ++s;
continue;
}
write(writer_, start, s - 1);
FormatSpec spec;
spec.align_ = ALIGN_RIGHT;
// Parse argument index, flags and width.
unsigned arg_index = parse_header(s, spec);
// Parse precision.
if (*s == '.') {
++s;
if ('0' <= *s && *s <= '9') {
spec.precision_ = static_cast<int>(internal::parse_nonnegative_int(s));
} else if (*s == '*') {
++s;
spec.precision_ = internal::PrecisionHandler().visit(get_arg(s));
}
}
using internal::Arg;
Arg arg = get_arg(s, arg_index);
if (spec.flag(HASH_FLAG) && internal::IsZeroInt().visit(arg))
spec.flags_ &= ~internal::to_unsigned<int>(HASH_FLAG);
if (spec.fill_ == '0') {
if (arg.type <= Arg::LAST_NUMERIC_TYPE)
spec.align_ = ALIGN_NUMERIC;
else
spec.fill_ = ' '; // Ignore '0' flag for non-numeric types.
}
// Parse length and convert the argument to the required type.
using internal::ArgConverter;
switch (*s++) {
case 'h':
if (*s == 'h')
ArgConverter<signed char>(arg, *++s).visit(arg);
else
ArgConverter<short>(arg, *s).visit(arg);
break;
case 'l':
if (*s == 'l')
ArgConverter<fmt::LongLong>(arg, *++s).visit(arg);
else
ArgConverter<long>(arg, *s).visit(arg);
break;
case 'j':
ArgConverter<intmax_t>(arg, *s).visit(arg);
break;
case 'z':
ArgConverter<std::size_t>(arg, *s).visit(arg);
break;
case 't':
ArgConverter<std::ptrdiff_t>(arg, *s).visit(arg);
break;
case 'L':
// printf produces garbage when 'L' is omitted for long double, no
// need to do the same.
break;
default:
--s;
ArgConverter<void>(arg, *s).visit(arg);
}
// Parse type.
if (!*s)
FMT_THROW(FormatError("invalid format string"));
spec.type_ = static_cast<char>(*s++);
if (arg.type <= Arg::LAST_INTEGER_TYPE) {
// Normalize type.
switch (spec.type_) {
case 'i': case 'u':
spec.type_ = 'd';
break;
case 'c':
// TODO: handle wchar_t
internal::CharConverter(arg).visit(arg);
break;
}
}
start = s;
// Format argument.
AF(writer_, spec).visit(arg);
}
write(writer_, start, s);
}
template <typename Char>
void printf(BasicWriter<Char> &w, BasicCStringRef<Char> format, ArgList args) {
PrintfFormatter<Char>(args, w).format(format);
}
/**
\rst
Formats arguments and returns the result as a string.
**Example**::
std::string message = fmt::sprintf("The answer is %d", 42);
\endrst
*/
inline std::string sprintf(CStringRef format, ArgList args) {
MemoryWriter w;
printf(w, format, args);
return w.str();
}
FMT_VARIADIC(std::string, sprintf, CStringRef)
inline std::wstring sprintf(WCStringRef format, ArgList args) {
WMemoryWriter w;
printf(w, format, args);
return w.str();
}
FMT_VARIADIC_W(std::wstring, sprintf, WCStringRef)
/**
\rst
Prints formatted data to the file *f*.
**Example**::
fmt::fprintf(stderr, "Don't %s!", "panic");
\endrst
*/
FMT_API int fprintf(std::FILE *f, CStringRef format, ArgList args);
FMT_VARIADIC(int, fprintf, std::FILE *, CStringRef)
/**
\rst
Prints formatted data to ``stdout``.
**Example**::
fmt::printf("Elapsed time: %.2f seconds", 1.23);
\endrst
*/
inline int printf(CStringRef format, ArgList args) {
return fprintf(stdout, format, args);
}
FMT_VARIADIC(int, printf, CStringRef)
/**
\rst
Prints formatted data to the stream *os*.
**Example**::
fprintf(cerr, "Don't %s!", "panic");
\endrst
*/
inline int fprintf(std::ostream &os, CStringRef format_str, ArgList args) {
MemoryWriter w;
printf(w, format_str, args);
internal::write(os, w);
return static_cast<int>(w.size());
}
FMT_VARIADIC(int, fprintf, std::ostream &, CStringRef)
} // namespace fmt
#endif // FMT_PRINTF_H_
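The flag, width and precision handling above follows C `printf` semantics, including `*` arguments (handled by `WidthHandler`/`PrecisionHandler`) and the rule that `#` is ignored for zero integers (`IsZeroInt`). A short sketch of the resulting behaviour, assuming the bundled headers:
```
#include <string>
#include "printf.h"  // bundled fmt header; the include path is an assumption

int main() {
  // Width, precision and flags go through parse_header/parse_flags above.
  std::string a = fmt::sprintf("[%08.3f]", 3.14159);  // "[0003.142]"
  // '*' pulls the width from the argument list via WidthHandler.
  std::string b = fmt::sprintf("[%*d]", 6, 42);       // "[    42]"
  // The '#' flag is dropped for zero integers by the IsZeroInt check.
  std::string c = fmt::sprintf("%#x vs %#x", 0, 255); // "0 vs 0xff"
  fmt::printf("%s %s %s\n", a, b, c);
  return 0;
}
```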

View File

@@ -1,119 +0,0 @@
/*
Formatting library for C++ - string utilities
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
#ifndef FMT_STRING_H_
#define FMT_STRING_H_
#include "format.h"
namespace fmt {
namespace internal {
// A buffer that stores data in ``std::string``.
template <typename Char>
class StringBuffer : public Buffer<Char> {
private:
std::basic_string<Char> data_;
protected:
virtual void grow(std::size_t size) {
data_.resize(size);
this->ptr_ = &data_[0];
this->capacity_ = size;
}
public:
// Moves the data to ``str`` clearing the buffer.
void move_to(std::basic_string<Char> &str) {
data_.resize(this->size_);
str.swap(data_);
this->capacity_ = this->size_ = 0;
this->ptr_ = 0;
}
};
} // namespace internal
/**
\rst
This class template provides operations for formatting and writing data
into a character stream. The output is stored in ``std::string`` that grows
dynamically.
You can use one of the following typedefs for common character types
and the standard allocator:
+---------------+----------------------------+
| Type | Definition |
+===============+============================+
| StringWriter | BasicStringWriter<char> |
+---------------+----------------------------+
| WStringWriter | BasicStringWriter<wchar_t> |
+---------------+----------------------------+
**Example**::
StringWriter out;
out << "The answer is " << 42 << "\n";
This will write the following output to the ``out`` object:
.. code-block:: none
The answer is 42
The output can be moved to an ``std::string`` with ``out.move_to()``.
\endrst
*/
template <typename Char>
class BasicStringWriter : public BasicWriter<Char> {
private:
internal::StringBuffer<Char> buffer_;
public:
/**
\rst
Constructs a :class:`fmt::BasicStringWriter` object.
\endrst
*/
BasicStringWriter() : BasicWriter<Char>(buffer_) {}
/**
\rst
Moves the buffer content to *str* clearing the buffer.
\endrst
*/
void move_to(std::basic_string<Char> &str) {
buffer_.move_to(str);
}
};
typedef BasicStringWriter<char> StringWriter;
typedef BasicStringWriter<wchar_t> WStringWriter;
/**
\rst
Converts *value* to ``std::string`` using the default format for type *T*.
**Example**::
#include "fmt/string.h"
std::string answer = fmt::to_string(42);
\endrst
*/
template <typename T>
std::string to_string(const T &value) {
fmt::MemoryWriter w;
w << value;
return w.str();
}
}
#endif // FMT_STRING_H_
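A minimal sketch of the writer and `to_string` defined here, mirroring the docstring example above (the include path is an assumption):
```
#include <string>
#include "string.h"  // bundled fmt header; the include path is an assumption

int main() {
  fmt::StringWriter out;
  out << "The answer is " << 42 << "\n";
  std::string text;
  out.move_to(text);                   // the buffer is moved, not copied

  std::string n = fmt::to_string(42);  // "42", the default format for int
  return 0;
}
```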

View File

@@ -1,143 +0,0 @@
/*
Formatting library for C++ - time formatting
Copyright (c) 2012 - 2016, Victor Zverovich
All rights reserved.
For the license information refer to format.h.
*/
#ifndef FMT_TIME_H_
#define FMT_TIME_H_
#include "format.h"
#include <ctime>
#ifdef _MSC_VER
# pragma warning(push)
# pragma warning(disable: 4702) // unreachable code
# pragma warning(disable: 4996) // "deprecated" functions
#endif
namespace fmt {
template <typename ArgFormatter>
void format_arg(BasicFormatter<char, ArgFormatter> &f,
const char *&format_str, const std::tm &tm) {
if (*format_str == ':')
++format_str;
const char *end = format_str;
while (*end && *end != '}')
++end;
if (*end != '}')
FMT_THROW(FormatError("missing '}' in format string"));
internal::MemoryBuffer<char, internal::INLINE_BUFFER_SIZE> format;
format.append(format_str, end + 1);
format[format.size() - 1] = '\0';
Buffer<char> &buffer = f.writer().buffer();
std::size_t start = buffer.size();
for (;;) {
std::size_t size = buffer.capacity() - start;
std::size_t count = std::strftime(&buffer[start], size, &format[0], &tm);
if (count != 0) {
buffer.resize(start + count);
break;
}
if (size >= format.size() * 256) {
// If the buffer is 256 times larger than the format string, assume
// that `strftime` gives an empty result. There doesn't seem to be a
// better way to distinguish the two cases:
// https://github.com/fmtlib/fmt/issues/367
break;
}
const std::size_t MIN_GROWTH = 10;
buffer.reserve(buffer.capacity() + (size > MIN_GROWTH ? size : MIN_GROWTH));
}
format_str = end + 1;
}
namespace internal{
inline Null<> localtime_r(...) { return Null<>(); }
inline Null<> localtime_s(...) { return Null<>(); }
inline Null<> gmtime_r(...) { return Null<>(); }
inline Null<> gmtime_s(...) { return Null<>(); }
}
// Thread-safe replacement for std::localtime
inline std::tm localtime(std::time_t time) {
struct LocalTime {
std::time_t time_;
std::tm tm_;
LocalTime(std::time_t t): time_(t) {}
bool run() {
using namespace fmt::internal;
return handle(localtime_r(&time_, &tm_));
}
bool handle(std::tm* tm) { return tm != 0; }
bool handle(internal::Null<>) {
using namespace fmt::internal;
return fallback(localtime_s(&tm_, &time_));
}
bool fallback(int res) { return res == 0; }
bool fallback(internal::Null<>) {
using namespace fmt::internal;
std::tm* tm = std::localtime(&time_);
if (tm != 0) tm_ = *tm;
return tm != 0;
}
};
LocalTime lt(time);
if (lt.run())
return lt.tm_;
// Very large time values may be unsupported.
FMT_THROW(fmt::FormatError("time_t value out of range"));
return std::tm();
}
// Thread-safe replacement for std::gmtime
inline std::tm gmtime(std::time_t time) {
struct GMTime {
std::time_t time_;
std::tm tm_;
GMTime(std::time_t t): time_(t) {}
bool run() {
using namespace fmt::internal;
return handle(gmtime_r(&time_, &tm_));
}
bool handle(std::tm* tm) { return tm != 0; }
bool handle(internal::Null<>) {
using namespace fmt::internal;
return fallback(gmtime_s(&tm_, &time_));
}
bool fallback(int res) { return res == 0; }
bool fallback(internal::Null<>) {
std::tm* tm = std::gmtime(&time_);
if (tm != 0) tm_ = *tm;
return tm != 0;
}
};
GMTime gt(time);
if (gt.run())
return gt.tm_;
// Very large time values may be unsupported.
FMT_THROW(fmt::FormatError("time_t value out of range"));
return std::tm();
}
} //namespace fmt
#ifdef _MSC_VER
# pragma warning(pop)
#endif
#endif // FMT_TIME_H_
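A minimal sketch of the `std::tm` support defined here: everything after the `':'` in a replacement field is forwarded to `std::strftime`, and `fmt::localtime` is the thread-safe wrapper above (the include path is an assumption):
```
#include <ctime>
#include "time.h"  // bundled fmt header; the include path is an assumption

int main() {
  std::time_t now = std::time(0);
  // "{:%Y-%m-%d}" hands "%Y-%m-%d" to std::strftime via format_arg above.
  fmt::print("The date is {:%Y-%m-%d}.\n", fmt::localtime(now));
  return 0;
}
```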

View File

@@ -1,67 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/runtime/predict.h"
#include "database/sm_5_2/pool.hpp"
#include "database/sm_5_2/conv.hpp"
#include "database/sm_5_2/gemm.hpp"
#include "database/sm_6_0/conv.hpp"
#include "database/sm_6_0/gemm.hpp"
#include "database/sm_6_1/conv.hpp"
#include "database/sm_6_1/gemm.hpp"
#include "database/sm_7_0/gemm.hpp"
#include "database/sm_7_0/conv.hpp"
namespace isaac{
namespace runtime{
typedef driver::Device::Architecture Architecture;
const std::map<std::pair<driver::Device::Architecture, OperationType>, std::shared_ptr<Profile> > database =
{
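// Architectures without a dedicated training set reuse the closest available
// profiles (e.g. SM_5_0 and SM_6_0 fall back to the sm_5_2 data below).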
{{Architecture::SM_5_0, CONV}, std::make_shared<ConvProfile>((u_char*)sm_5_2::conv)},
{{Architecture::SM_5_0, GEMM}, std::make_shared<GEMMProfile>((u_char*)sm_5_2::gemm)},
{{Architecture::SM_5_2, POOL}, std::make_shared<PoolProfile>((u_char*)sm_5_2::pool)},
{{Architecture::SM_5_2, CONV}, std::make_shared<ConvProfile>((u_char*)sm_5_2::conv)},
{{Architecture::SM_5_2, GEMM}, std::make_shared<GEMMProfile>((u_char*)sm_5_2::gemm)},
{{Architecture::SM_6_0, POOL}, std::make_shared<PoolProfile>((u_char*)sm_5_2::pool)},
{{Architecture::SM_6_0, CONV}, std::make_shared<ConvProfile>((u_char*)sm_5_2::conv)},
{{Architecture::SM_6_0, GEMM}, std::make_shared<GEMMProfile>((u_char*)sm_5_2::gemm)},
{{Architecture::SM_6_1, POOL}, std::make_shared<PoolProfile>((u_char*)sm_5_2::pool)},
{{Architecture::SM_6_1, CONV}, std::make_shared<ConvProfile>((u_char*)sm_6_1::conv)},
{{Architecture::SM_6_1, GEMM}, std::make_shared<GEMMProfile>((u_char*)sm_6_1::gemm)},
{{Architecture::SM_7_0, POOL}, std::make_shared<PoolProfile>((u_char*)sm_5_2::pool)},
{{Architecture::SM_7_0, CONV}, std::make_shared<ConvProfile>((u_char*)sm_7_0::conv)},
{{Architecture::SM_7_0, GEMM}, std::make_shared<GEMMProfile>((u_char*)sm_7_0::gemm)}
};
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff