[driver] added driver source code from isaac repository

Philippe Tillet
2019-02-08 14:47:56 -05:00
parent 937bc464a3
commit a9d219cdf5
56 changed files with 51329 additions and 0 deletions

include/driver/backend.h Executable file

@@ -0,0 +1,116 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_BACKEND_H
#define TDL_INCLUDE_DRIVER_BACKEND_H
#include <map>
#include <list>
#include <vector>
namespace tdl
{
namespace driver
{
class Buffer;
class Stream;
class Device;
class Context;
class Platform;
class Module;
class Kernel;
struct backend
{
class modules
{
friend class backend;
public:
static void release();
static Module& get(Stream const & stream, std::string const & name, std::string const &src);
private:
static std::map<std::tuple<Stream, std::string>, Module * > cache_;
};
class kernels
{
friend class backend;
public:
static void release();
static Kernel & get(Module const & program, std::string const & name);
private:
static std::map<std::tuple<Module, std::string>, Kernel * > cache_;
};
class contexts
{
friend class backend;
private:
static void init(std::vector<Platform> const &);
static void release();
public:
static Context const & get_default();
template<class T>
static Context const & import(T context)
{
for(driver::Context const * x: cache_)
if((T)*x==context)
return *x;
cache_.emplace_back(new Context(context, false));
return *cache_.back();
}
static void get(std::list<Context const *> &);
private:
static std::list<Context const *> cache_;
};
class streams
{
friend class backend;
private:
static void init(std::list<Context const *> const &);
static void release();
public:
static void get(Context const &, std::vector<Stream *> &streams);
static Stream & get(Context const &, unsigned int id = 0);
static Stream & get_default();
private:
static std::map< Context, std::vector<Stream*> > cache_;
};
static void init();
static void release();
static std::vector<Device> devices();
static std::vector<Platform> platforms();
static void synchronize(Context const &);
static unsigned int default_device;
};
}
}
#endif
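
A minimal usage sketch for the backend caches declared above, assuming the driver sources from this commit are compiled and linked and a CUDA device is present; the module name and kernel source are placeholders.

#include <iostream>
#include <string>
#include "driver/backend.h"
#include "driver/stream.h"
#include "driver/kernel.h"

int main(){
  namespace drv = tdl::driver;
  // Enumerate the devices known to the backend.
  for(drv::Device const & dev: drv::backend::devices())
    std::cout << dev.name() << std::endl;
  // Fetch the default context and a stream from the static caches.
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get(ctx, 0);
  // Modules are cached by (stream, name); kernels by (module, name).
  std::string src = "extern \"C\" __global__ void noop(){ }";
  drv::Module & module = drv::backend::modules::get(stream, "noop_module", src);
  drv::Kernel & kernel = drv::backend::kernels::get(module, "noop");
  (void)kernel;
  drv::backend::release();
}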

include/driver/buffer.h Executable file

@@ -0,0 +1,54 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_BUFFER_H
#define TDL_INCLUDE_DRIVER_BUFFER_H
#include "driver/handle.h"
#include "driver/context.h"
namespace tdl
{
namespace driver
{
class Stream;
// Buffer
class Buffer: public HandleInterface<Buffer, CUdeviceptr>
{
public:
Buffer(Context const & context, size_t size);
Buffer(Context const & context, CUdeviceptr cu, bool take_ownership);
void set_zero(Stream const & queue, size_t size);
Handle<CUdeviceptr> const & cu() const;
Handle<CUdeviceptr> & cu();
private:
Context context_;
Handle<CUdeviceptr> cu_;
};
}
}
#endif
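
A minimal sketch of Buffer usage, assuming the driver sources are linked and a default CUDA context is available through the backend.

#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/stream.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();
  // Allocate 1 MB of device memory tied to this context.
  drv::Buffer buf(ctx, 1024*1024);
  // Zero the first 1024 bytes on the given stream.
  buf.set_zero(stream, 1024);
  stream.synchronize();
}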

include/driver/context.h Executable file

@@ -0,0 +1,66 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_CONTEXT_H
#define TDL_INCLUDE_DRIVER_CONTEXT_H
#include "driver/device.h"
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
class Context: public HandleInterface<Context, CUcontext>
{
private:
static std::string get_cache_path();
static CUdevice device(CUcontext);
public:
//Constructors
explicit Context(CUcontext context, bool take_ownership = true);
explicit Context(Device const & device);
//Accessors
Device const & device() const;
std::string const & cache_path() const;
Handle<CUcontext> const & cu() const;
private:
Handle<CUcontext> cu_;
Device device_;
std::string cache_path_;
};
class ContextSwitcher{
public:
ContextSwitcher(Context const & ctx);
~ContextSwitcher();
private:
Context const & ctx_;
};
}
}
#endif
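
A sketch of constructing a Context from a Device and scoping it with the RAII ContextSwitcher, assuming the driver sources are linked and at least one CUDA device is visible.

#include <iostream>
#include "driver/backend.h"
#include "driver/context.h"

int main(){
  namespace drv = tdl::driver;
  drv::Device dev = drv::backend::devices().at(0);
  drv::Context ctx(dev);                  // creates and owns a new CUcontext
  {
    drv::ContextSwitcher guard(ctx);      // makes ctx current for this scope
    std::cout << ctx.device().name() << " -> " << ctx.cache_path() << std::endl;
  }                                       // guard's destructor restores the previous context
}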

include/driver/cublas.h Executable file

@@ -0,0 +1,229 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_CUBLAS_H
#define TDL_INCLUDE_DRIVER_CUBLAS_H
#include "isaac/templates/common.hpp"
#include "driver/dispatch.h"
#include "driver/buffer.h"
#include "driver/stream.h"
#include "driver/backend.h"
#include "driver/error.h"
#include "tools/bench.hpp"
#include "tools/collections.hpp"
namespace tdl
{
namespace driver
{
enum cublasStrategy_t{
CUBLAS_PREFER_FASTEST,
CUBLAS_HEURISTICS
};
static const std::vector<cublasGemmAlgo_t> cublasAlgorithms = {
CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3,
CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7
};
static const std::map<DType, cudaDataType> cudtype = {{FLOAT_TYPE, CUDA_R_32F}, {DOUBLE_TYPE,CUDA_R_64F}};
static const std::map<char, cublasOperation_t> cuop = {{'N', CUBLAS_OP_N}, {'T', CUBLAS_OP_T}};
inline cublasGemmAlgo_t cublasGemmFastest(Stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc){
typedef std::tuple<cudaDataType_t, cublasOperation_t, cublasOperation_t, int32_t, int32_t, int32_t> key_t;
// Benchmark fastest algorithm in cublasGemmEx
auto benchmark_fastest = [&](key_t const &){
std::vector<double> times;
for(cublasGemmAlgo_t a: cublasAlgorithms){
try{
times.push_back(bench([&](){ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, a); },
[&](){ stream.synchronize(); },
stream.context().device()));
}catch(driver::exception::cublas::base const &){
times.push_back(INFINITY);
}
}
size_t argmin = std::min_element(times.begin(), times.end()) - times.begin();
return cublasAlgorithms[argmin];
};
// Cache result
static cpp::CachedMap<key_t, cublasGemmAlgo_t> cache(benchmark_fastest);
return cache.get(std::make_tuple(cudt, AT, BT, M, N, K));
}
/* Wrapper for cublasGemmEx */
inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo)
{ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, algo); }
/* Simplified API for default GEMM */
inline void cublasGemm(DType dtype, Stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){
ContextSwitcher ctx_switch(stream.context());
cublasHandle_t handle = dispatch::cublasHandle(stream.context());
dispatch::cublasSetStream_v2(handle, (CUstream)stream);
if(fastest)
*fastest = cublasGemmFastest(stream, handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
else
cublasGemmEx(handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc, algo);
}
inline cudnnDataType_t cudnnDtype(DType dtype){
switch(dtype){
case INT8X4_TYPE: return CUDNN_DATA_INT8x4;
case INT32_TYPE: return CUDNN_DATA_INT32;
case FLOAT_TYPE: return CUDNN_DATA_FLOAT;
case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE;
}
throw;
}
inline cudnnTensorFormat_t format(cudnnDataType_t cutype){
switch(cutype){
case CUDNN_DATA_INT8x4: return CUDNN_TENSOR_NCHW_VECT_C;
default: return CUDNN_TENSOR_NCHW;
}
}
inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
std::vector<int> stride = {stride_d, stride_h, stride_w};
std::vector<int> upscale = {1, 1, 1};
std::vector<int> Oshapes = {N, K, M, P, Q};
std::vector<int> Fshapes = {K, C, T, R, S};
std::vector<int> Ishapes = {N, C, D, H, W};
if(M == 1 && T == 1 && D == 1){
pad.erase(pad.begin());
stride.erase(stride.begin());
upscale.erase(upscale.begin());
Oshapes.erase(Oshapes.begin() + 2);
Ishapes.erase(Ishapes.begin() + 2);
Fshapes.erase(Fshapes.begin() + 2);
}
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t in_cutype = cudnnDtype(dtype);
cudnnDataType_t conv_cutype = (dtype == INT8X4_TYPE)?CUDNN_DATA_INT32:in_cutype;
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
cudnnFilterDescriptor_t tF;
cudnnConvolutionDescriptor_t conv;
cudnnConvolutionFwdAlgo_t algo;
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnCreateFilterDescriptor(&tF);
dispatch::cudnnSetTensorNdDescriptorEx(tO, format(in_cutype), in_cutype, Oshapes.size(), Oshapes.data());
dispatch::cudnnSetFilterNdDescriptor(tF, in_cutype, format(in_cutype), Fshapes.size(), Fshapes.data());
dispatch::cudnnSetTensorNdDescriptorEx(tI, format(in_cutype), in_cutype, Ishapes.size(), Ishapes.data());
dispatch::cudnnCreateConvolutionDescriptor(&conv);
dispatch::cudnnSetConvolutionNdDescriptor(conv, pad.size(), pad.data(), stride.data(), upscale.data(), CUDNN_CROSS_CORRELATION, conv_cutype);
dispatch::cudnnGetConvolutionForwardAlgorithm(handle, tI, tF, conv, tO, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 1024*1024*64, &algo);
size_t workspace_size;
dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size);
static Buffer work(ctx, 1024*1024*64);
CUdeviceptr twork = work;
CUdeviceptr pI = I, pF = F, pO = O;
dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO);
}
inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
std::vector<int> stride = {stride_d, stride_h, stride_w};
std::vector<int> upscale = {1, 1, 1};
std::vector<int> Oshapes = {N, K, M, P, Q};
std::vector<int> Ishapes = {N, K, D, H, W};
std::vector<int> window = {T, R, S};
if(M == 1 && T == 1 && D == 1){
window.erase(window.begin());
pad.erase(pad.begin());
stride.erase(stride.begin());
upscale.erase(upscale.begin());
Oshapes.erase(Oshapes.begin() + 2);
Ishapes.erase(Ishapes.begin() + 2);
}
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t cutype = cudnnDtype(dtype);
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
cudnnPoolingDescriptor_t desc;
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnSetTensorNdDescriptorEx(tO, CUDNN_TENSOR_NCHW, cutype, Oshapes.size(), Oshapes.data());
dispatch::cudnnSetTensorNdDescriptorEx(tI, CUDNN_TENSOR_NCHW, cutype, Ishapes.size(), Ishapes.data());
dispatch::cudnnCreatePoolingDescriptor(&desc);
dispatch::cudnnSetPoolingNdDescriptor(desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, window.size(), window.data(), pad.data(), stride.data());
CUdeviceptr pI = I, pO = O;
dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
inline void cudnnTransformTensor(driver::Stream & stream,
DType in_dtype, DType out_dtype,
cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout,
int32_t N, int32_t C, int32_t D, int32_t H, int32_t W,
scalar alpha, driver::Buffer const & I, scalar beta, driver::Buffer& O)
{
cudnnHandle_t handle = dispatch::cudnnHandle(stream.context());
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
std::vector<int> shapes = {N, C, D, H, W};
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnSetTensorNdDescriptorEx(tI, in_layout, cudnnDtype(in_dtype), shapes.size(), shapes.data());
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnSetTensorNdDescriptorEx(tO, out_layout, cudnnDtype(out_dtype), shapes.size(), shapes.data());
CUdeviceptr pI = I, pO = O;
dispatch::cudnnTransformTensor(handle, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
}
}
#endif
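
A sketch of the cuBLAS wrappers above: pick the fastest GEMM algorithm for a shape with cublasGemmFastest, then run it through the thin cublasGemmEx wrapper. It assumes the driver sources are linked, cuBLAS is installed, and single-precision data; the matrix sizes are placeholders and the buffers are left uninitialized for brevity.

#include <cstdint>
#include "driver/backend.h"
#include "driver/cublas.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get(ctx);
  drv::ContextSwitcher scope(ctx);
  int32_t M = 512, N = 512, K = 512;
  drv::Buffer A(ctx, M*K*sizeof(float)), B(ctx, K*N*sizeof(float)), C(ctx, M*N*sizeof(float));
  float alpha = 1.f, beta = 0.f;
  cublasHandle_t handle = drv::dispatch::cublasHandle(ctx);
  drv::dispatch::cublasSetStream_v2(handle, (CUstream)stream);
  // Times every cublasGemmAlgo_t once and caches the winner for this key.
  cublasGemmAlgo_t algo = drv::cublasGemmFastest(stream, handle, CUDA_R_32F,
                                                 CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                                                 &alpha, A, M, B, K, &beta, C, M);
  // Re-run the GEMM with the selected algorithm.
  drv::cublasGemmEx(handle, CUDA_R_32F, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                    &alpha, A, M, B, K, &beta, C, M, algo);
  stream.synchronize();
}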

include/driver/device.h Executable file

@@ -0,0 +1,98 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_DEVICE_H
#define TDL_INCLUDE_DRIVER_DEVICE_H
#include "driver/platform.h"
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
// Device
class Device: public HandleInterface<Device, CUdevice>
{
public:
//Supported architectures
enum class Architecture{
//NVidia
SM_2_0,
SM_2_1,
SM_3_0,
SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
SM_7_0,
UNKNOWN
};
private:
//Metaprogramming helper to get CUDA info from attribute
template<CUdevice_attribute attr>
int cuGetInfo() const;
inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
inline nvmlDevice_t nvml_device() const;
public:
Device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){}
//Accessors
Architecture architecture() const;
Handle<CUdevice> const & cu() const;
//Information
std::string infos() const;
size_t address_bits() const;
driver::Platform platform() const;
std::vector<size_t> max_block_dim() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t warp_size() const;
//Compute Capability
void interpret_as(std::pair<size_t, size_t> cc);
std::pair<size_t, size_t> compute_capability() const;
//Identifier
std::string name() const;
std::string pci_bus_id() const;
//Clocks
size_t current_sm_clock() const;
size_t current_mem_clock() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
private:
Handle<CUdevice> cu_;
std::shared_ptr<std::pair<size_t, size_t>> interpreted_as_;
};
}
}
#endif
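
A sketch that prints a few of the Device properties declared above, assuming the driver sources are linked and at least one CUDA device (plus NVML for the clock queries) is available.

#include <iostream>
#include "driver/backend.h"
#include "driver/device.h"

int main(){
  namespace drv = tdl::driver;
  for(drv::Device const & dev: drv::backend::devices()){
    std::cout << dev.name() << " (" << dev.pci_bus_id() << ")\n"
              << "  max threads per block: " << dev.max_threads_per_block() << "\n"
              << "  shared memory:         " << dev.max_shared_memory() << " bytes\n"
              << "  warp size:             " << dev.warp_size() << "\n"
              << "  max SM clock:          " << dev.max_sm_clock() << std::endl;
  }
}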

include/driver/dispatch.h Executable file

@@ -0,0 +1,258 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_DISPATCHER_H
#define TDL_INCLUDE_DRIVER_DISPATCHER_H
#include <type_traits>
#include <dlfcn.h>
//CUDA Backend
#include "external/CUDA/cuda.h"
#include "external/CUDA/nvrtc.h"
#include "external/CUDA/cublas_v2.h"
#include "external/CUDA/cudnn.h"
#include "external/CUDA/nvml.h"
//Exceptions
#include <iostream>
#include <stdexcept>
namespace tdl
{
namespace driver
{
class Context;
template<class T> void check(T){}
void check(nvrtcResult err);
void check(CUresult err);
void check(cublasStatus_t err);
void check(cudnnStatus_t err);
class dispatch
{
private:
template <class F>
struct return_type;
template <class R, class... A>
struct return_type<R (*)(A...)>
{ typedef R type; };
typedef bool (*f_init_t)();
template<f_init_t initializer, typename FunPtrT, typename... Args>
static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
{
initializer();
if(cache == nullptr){
cache = dlsym(lib_h, name);
if(cache == 0)
throw std::runtime_error("dlsym unable to load function");
}
FunPtrT fptr;
*reinterpret_cast<void **>(&fptr) = cache;
typename return_type<FunPtrT>::type res = (*fptr)(args...);
check(res);
return res;
}
public:
static bool nvrtcinit();
static bool nvmlinit();
static bool cuinit();
static bool cublasinit();
static bool cudnninit();
static void release();
//CUDA
static CUresult cuCtxGetCurrent(CUcontext *pctx);
static CUresult cuCtxSetCurrent(CUcontext ctx);
static CUresult cuCtxDestroy_v2(CUcontext ctx);
static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
static CUresult cuDeviceGet(CUdevice *device, int ordinal);
static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
static CUresult cuMemFree_v2(CUdeviceptr dptr);
static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
static CUresult cuDriverGetVersion(int *driverVersion);
static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
static CUresult cuModuleLoad(CUmodule *module, const char *fname);
static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
static CUresult cuModuleUnload(CUmodule hmod);
static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
static CUresult cuDeviceGetCount(int *count);
static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
static CUresult cuInit(unsigned int Flags);
static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
static CUresult cuStreamSynchronize(CUstream hStream);
static CUresult cuStreamDestroy_v2(CUstream hStream);
static CUresult cuEventDestroy_v2(CUevent hEvent);
static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
static CUresult cuCtxGetDevice(CUdevice* result);
static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static cublasHandle_t cublasHandle(Context const & ctx);
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc);
static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo);
static cudnnHandle_t cudnnHandle(Context const & ctx);
static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc);
static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w);
static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w);
static cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int dimA[]);
static cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int filterDimA[]);
static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode);
static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType);
static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]);
static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo);
static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes);
static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
private:
static void* cuda_;
static void* nvrtc_;
static void* nvml_;
static void* cublas_;
static void* cudnn_;
//CUDA
static void* cuCtxGetCurrent_;
static void* cuCtxSetCurrent_;
static void* cuCtxDestroy_v2_;
static void* cuEventCreate_;
static void* cuDeviceGet_;
static void* cuMemcpyDtoH_v2_;
static void* cuStreamCreate_;
static void* cuEventElapsedTime_;
static void* cuMemFree_v2_;
static void* cuMemcpyDtoHAsync_v2_;
static void* cuDriverGetVersion_;
static void* cuDeviceGetName_;
static void* cuDeviceGetPCIBusId_;
static void* cuModuleGetGlobal_v2_;
static void* cuMemcpyHtoDAsync_v2_;
static void* cuModuleLoad_;
static void* cuLaunchKernel_;
static void* cuModuleUnload_;
static void* cuModuleLoadDataEx_;
static void* cuDeviceGetAttribute_;
static void* cuDeviceGetCount_;
static void* cuMemcpyHtoD_v2_;
static void* cuInit_;
static void* cuEventRecord_;
static void* cuCtxCreate_v2_;
static void* cuModuleGetFunction_;
static void* cuStreamSynchronize_;
static void* cuStreamDestroy_v2_;
static void* cuEventDestroy_v2_;
static void* cuMemAlloc_v2_;
static void* cuPointerGetAttribute_;
static void* cuCtxGetDevice_;
static void* cuMemsetD8Async_;
static void* cuCtxPushCurrent_v2_;
static void* cuCtxPopCurrent_v2_;
static void* nvmlInit_v2_;
static void* nvmlDeviceGetHandleByPciBusId_v2_;
static void* nvmlDeviceGetClockInfo_;
static void* nvmlDeviceGetMaxClockInfo_;
static void* nvrtcCompileProgram_;
static void* nvrtcGetProgramLogSize_;
static void* nvrtcGetPTX_;
static void* nvrtcGetPTXSize_;
static void* nvrtcCreateProgram_;
static void* nvrtcGetProgramLog_;
static void* cublasCreate_v2_;
static void* cublasGetStream_v2_;
static void* cublasSetStream_v2_;
static void* cublasHgemm_;
static void* cublasSgemm_v2_;
static void* cublasDgemm_v2_;
static void* cublasGemmEx_;
static void* cudnnCreateConvolutionDescriptor_;
static void* cudnnCreatePoolingDescriptor_;
static void* cudnnCreateTensorDescriptor_;
static void* cudnnCreateFilterDescriptor_;
static void* cudnnCreate_;
static void* cudnnSetTensor4dDescriptor_;
static void* cudnnSetFilter4dDescriptor_;
static void* cudnnSetTensorNdDescriptorEx_;
static void* cudnnSetFilterNdDescriptor_;
static void* cudnnSetConvolution2dDescriptor_;
static void* cudnnSetConvolutionNdDescriptor_;
static void* cudnnSetPoolingNdDescriptor_;
static void* cudnnGetConvolutionForwardAlgorithm_;
static void* cudnnGetConvolutionForwardWorkspaceSize_;
static void* cudnnConvolutionForward_;
static void* cudnnPoolingForward_;
static void* cudnnSetStream_;
static void* cudnnTransformTensor_;
};
}
}
#endif
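
A sketch of calling the CUDA driver API through the lazy dispatch table above; each static method resolves its entry point with dlsym on first use, so none of the CUDA libraries are needed at link time. It assumes the driver sources are linked and libcuda is installed on the system.

#include <iostream>
#include "driver/dispatch.h"

int main(){
  namespace drv = tdl::driver;
  int version = 0, count = 0;
  drv::dispatch::cuInit(0);                     // loads and initializes the driver lazily
  drv::dispatch::cuDriverGetVersion(&version);
  drv::dispatch::cuDeviceGetCount(&count);
  std::cout << "driver " << version << ", " << count << " device(s)" << std::endl;
}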

include/driver/error.h Executable file

@@ -0,0 +1,228 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_ERROR_H
#define TDL_INCLUDE_DRIVER_ERROR_H
#include <exception>
#include "driver/dispatch.h"
namespace tdl
{
namespace driver
{
namespace exception
{
namespace nvrtc
{
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option ,"invalid option");
ISAAC_CREATE_NVRTC_EXCEPTION(compilation ,"compilation");
ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure ,"builtin operation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
#undef ISAAC_CREATE_NVRTC_EXCEPTION
}
namespace cuda
{
class base: public std::exception{};
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_CUDA_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(deinitialized ,"deinitialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled ,"profiler disabled");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized ,"profiler not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started ,"profiler already started");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped ,"profiler already stopped");
ISAAC_CREATE_CUDA_EXCEPTION(no_device ,"no device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_device ,"invalid device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_image ,"invalid image");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_context ,"invalid context");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_current ,"context already current");
ISAAC_CREATE_CUDA_EXCEPTION(map_failed ,"map failed");
ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed ,"unmap failed");
ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped ,"array is mapped");
ISAAC_CREATE_CUDA_EXCEPTION(already_mapped ,"already mapped");
ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu ,"no binary for gpu");
ISAAC_CREATE_CUDA_EXCEPTION(already_acquired ,"already acquired");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped ,"not mapped");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array ,"not mapped as array");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer");
ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable");
ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit ,"unsupported limit");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use ,"context already in use");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported ,"peer access unsupported");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx ,"invalid ptx");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context ,"invalid graphics context");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_source ,"invalid source");
ISAAC_CREATE_CUDA_EXCEPTION(file_not_found ,"file not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed ,"shared object init failed");
ISAAC_CREATE_CUDA_EXCEPTION(operating_system ,"operating system");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle ,"invalid handle");
ISAAC_CREATE_CUDA_EXCEPTION(not_found ,"not found");
ISAAC_CREATE_CUDA_EXCEPTION(not_ready ,"not ready");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_address ,"illegal address");
ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources ,"launch out of resources");
ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout ,"launch timeout");
ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled ,"peer access already enabled");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled ,"peer access not enabled");
ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active ,"primary context active");
ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed ,"context is destroyed");
ISAAC_CREATE_CUDA_EXCEPTION(assert_error ,"assert");
ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers ,"too many peers");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered ,"host memory already registered");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered ,"host memory not registered");
ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error ,"hardware stack error");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction ,"illegal instruction");
ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address ,"misaligned address");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space ,"invalid address space");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc ,"invalid pc");
ISAAC_CREATE_CUDA_EXCEPTION(launch_failed ,"launch failed");
ISAAC_CREATE_CUDA_EXCEPTION(not_permitted ,"not permitted");
ISAAC_CREATE_CUDA_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUDA_EXCEPTION
}
namespace cublas
{
class base: public std::exception{};
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
}
namespace cudnn
{
#define ISAAC_CREATE_CUDNN_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "CUDNN: Error- " msg; } }
ISAAC_CREATE_CUDNN_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUDNN_EXCEPTION(alloc_failed ,"allocation failed");
ISAAC_CREATE_CUDNN_EXCEPTION(bad_param ,"bad param");
ISAAC_CREATE_CUDNN_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUDNN_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUDNN_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUDNN_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUDNN_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUDNN_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUDNN_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_in_progress ,"runtime in progress");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
#undef ISAAC_CREATE_CUDNN_EXCEPTION
}
namespace ocl
{
class base: public std::exception{};
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
ISAAC_CREATE_CL_EXCEPTION(device_not_found, "device not found");
ISAAC_CREATE_CL_EXCEPTION(device_not_available, "device not available");
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available, "compiler not available");
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure, "object allocation failure");
ISAAC_CREATE_CL_EXCEPTION(out_of_resources, "launch out of resources");
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory, "out of host memory");
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available, "profiling info not available");
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap, "mem copy overlap");
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch, "image format mismatch");
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported, "image format not supported");
ISAAC_CREATE_CL_EXCEPTION(build_program_failure, "build program failure");
ISAAC_CREATE_CL_EXCEPTION(map_failure, "map failure");
ISAAC_CREATE_CL_EXCEPTION(invalid_value, "invalid value");
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type, "invalid device type");
ISAAC_CREATE_CL_EXCEPTION(invalid_platform, "invalid platform");
ISAAC_CREATE_CL_EXCEPTION(invalid_device, "invalid device");
ISAAC_CREATE_CL_EXCEPTION(invalid_context, "invalid context");
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties, "invalid queue properties");
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue, "invalid command queue");
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr, "invalid host pointer");
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object, "invalid mem object");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size, "invalid image size");
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler, "invalid sampler");
ISAAC_CREATE_CL_EXCEPTION(invalid_binary, "invalid binary");
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options, "invalid build options");
ISAAC_CREATE_CL_EXCEPTION(invalid_program, "invalid program");
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable, "invalid program executable");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name, "invalid kernel name");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition, "invalid kernel definition");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel, "invalid kernel");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index, "invalid arg index");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value, "invalid arg value");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size, "invalid arg size");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args, "invalid kernel args");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension, "invalid work dimension");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size, "invalid work group size");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size, "invalid work item size");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset, "invalid global offset");
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list, "invalid event wait list");
ISAAC_CREATE_CL_EXCEPTION(invalid_event, "invalid event");
ISAAC_CREATE_CL_EXCEPTION(invalid_operation, "invalid operation");
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object, "invalid GL object");
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size, "invalid buffer size");
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level, "invalid MIP level");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size, "invalid global work size");
#ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
#endif
#undef ISAAC_CREATE_CL_EXCEPTION
}
}
}
}
#endif
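
A sketch of the exception hierarchy above: CUDA, cuBLAS and OpenCL errors each derive from a per-library base class, so a caller can catch a whole family at once. The throw below stands in for a failing driver call.

#include <iostream>
#include "driver/error.h"

int main(){
  namespace exc = tdl::driver::exception;
  try{
    throw exc::cuda::out_of_memory();           // stand-in for a failed allocation
  }catch(exc::cuda::base const & e){
    std::cerr << "CUDA failure: " << e.what() << std::endl;
  }catch(exc::cublas::base const & e){
    std::cerr << "cuBLAS failure: " << e.what() << std::endl;
  }
}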

include/driver/event.h Executable file

@@ -0,0 +1,49 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_EVENT_H
#define TDL_INCLUDE_DRIVER_EVENT_H
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
// Event
class Event: public HandleInterface<Event, cu_event_t>
{
public:
float elapsed_time() const;
Handle<cu_event_t> const & cu() const;
private:
Handle<cu_event_t> cu_;
};
}
}
#endif

include/driver/handle.h Executable file

@@ -0,0 +1,82 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_HANDLE_H
#define TDL_INCLUDE_DRIVER_HANDLE_H
#include <memory>
#include <iostream>
#include <functional>
#include <type_traits>
#include "driver/dispatch.h"
namespace tdl
{
namespace driver
{
struct cu_event_t{
operator bool() const { return first && second; }
CUevent first;
CUevent second;
};
struct cu_platform{
cu_platform() : status_(dispatch::cuInit(0)) { }
operator bool() const { return status_; }
private:
CUresult status_;
};
template<class T, class CUType>
class HandleInterface{
public:
//Accessors
operator CUType() const { return *(((T*)this)->cu().h_); }
//Comparison
bool operator==(HandleInterface const & y) { return (CUType)(*this) == (CUType)(y); }
bool operator!=(HandleInterface const & y) { return (CUType)(*this) != (CUType)(y); }
bool operator<(HandleInterface const & y) { return (CUType)(*this) < (CUType)(y); }
};
template<class CUType>
class Handle{
public:
template<class, class> friend class HandleInterface;
public:
//Constructors
Handle(CUType cu = CUType(), bool take_ownership = true);
~Handle();
CUType& operator*() { return *h_; }
CUType const & operator*() const { return *h_; }
CUType* operator->() const { return h_.get(); }
protected:
std::shared_ptr<CUType> h_;
bool has_ownership_;
};
}
}
#endif
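
A sketch of the CRTP pattern that Handle and HandleInterface implement: a wrapper exposes a shared, optionally-owning handle through cu() and inherits comparisons plus an implicit conversion to the raw CUDA type. The MyStream class is hypothetical; it assumes Handle<CUstream> is instantiated in the accompanying sources (it already backs driver::Stream).

#include "driver/handle.h"

// Hypothetical wrapper, for illustration only.
class MyStream: public tdl::driver::HandleInterface<MyStream, CUstream>{
public:
  MyStream(CUstream s, bool take_ownership): cu_(s, take_ownership){}
  tdl::driver::Handle<CUstream> const & cu() const { return cu_; }  // accessor required by HandleInterface
private:
  tdl::driver::Handle<CUstream> cu_;        // shared_ptr-backed, reference-counted handle
};

int main(){
  CUstream raw = nullptr;
  MyStream a(raw, false), b(raw, false);    // two non-owning views of the same raw handle
  return (a == b) ? 0 : 1;                  // compares the underlying CUstream values
}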

include/driver/kernel.h Executable file

@@ -0,0 +1,68 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_KERNEL_H
#define TDL_INCLUDE_DRIVER_KERNEL_H
#include "driver/module.h"
#include "driver/handle.h"
#include <memory>
namespace tdl
{
namespace driver
{
class Buffer;
// Kernel
class Kernel: public HandleInterface<Kernel, CUfunction>
{
public:
//Constructors
Kernel(Module const & program, const char * name);
//Accessors
Handle<CUfunction> const & cu() const;
Module const & module() const;
//Arguments setters
void setArg(unsigned int index, std::size_t size, void* ptr);
void setArg(unsigned int index, Buffer const &);
template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
//Arguments getters
void* const* cu_params() const;
private:
Handle<CUfunction> cu_;
Module program_;
unsigned int address_bits_;
std::vector<std::shared_ptr<void> > cu_params_store_;
std::vector<void*> cu_params_;
};
}
}
#endif
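
A sketch of binding kernel arguments with setArg: scalars go through the template overload (copied by value), buffers through the Buffer overload. The kernel source is a placeholder and compilation is assumed to happen at runtime; the driver sources must be linked.

#include <string>
#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/kernel.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  std::string src =
    "extern \"C\" __global__ void scale(int n, float a, float* x)"
    "{ if(threadIdx.x < n) x[threadIdx.x] *= a; }";
  drv::Module module(ctx, src);
  drv::Kernel kernel(module, "scale");
  drv::Buffer x(ctx, 1024*sizeof(float));
  kernel.setArg(0, 1024);    // int scalar
  kernel.setArg(1, 2.0f);    // float scalar
  kernel.setArg(2, x);       // device buffer
}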

include/driver/module.h Executable file

@@ -0,0 +1,61 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_MODULE_H
#define TDL_INCLUDE_DRIVER_MODULE_H
#include <map>
#include "driver/handle.h"
#include "driver/context.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
class Context;
class Device;
class Module: public HandleInterface<Module, CUmodule>
{
static std::string header(Device const & device);
public:
Module(Context const & context, std::string const & source);
Context const & context() const;
Handle<CUmodule> const & cu() const;
Buffer symbol(const char * name) const;
private:
Handle<CUmodule> cu_;
Context context_;
std::string source_;
};
}
}
#endif
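
A sketch of compiling a Module on the default context and looking up a module-level __device__ variable with symbol(). The source string is a placeholder; compilation is assumed to go through NVRTC at runtime via the dispatch table, and the symbol name is assumed to be unmangled.

#include <string>
#include "driver/backend.h"
#include "driver/module.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  std::string src = "__device__ int counter; extern \"C\" __global__ void noop(){ }";
  drv::Module module(ctx, src);
  // symbol() returns a Buffer wrapping the device address of the global.
  drv::Buffer counter = module.symbol("counter");
  (void)counter;
}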

include/driver/platform.h Executable file

@@ -0,0 +1,54 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_PLATFORM_H
#define TDL_INCLUDE_DRIVER_PLATFORM_H
#include <vector>
#include <string>
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
class Device;
class Platform
{
public:
//Accessors
std::string name() const;
std::string version() const;
std::vector<Device> devices() const;
private:
Handle<cu_platform> cu_;
};
}
}
#endif
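
A sketch that lists the available platforms and their devices, assuming the driver sources are linked.

#include <iostream>
#include "driver/backend.h"
#include "driver/device.h"
#include "driver/platform.h"

int main(){
  namespace drv = tdl::driver;
  for(drv::Platform const & pf: drv::backend::platforms()){
    std::cout << pf.name() << " (" << pf.version() << ")" << std::endl;
    for(drv::Device const & dev: pf.devices())
      std::cout << "  " << dev.name() << std::endl;
  }
}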

include/driver/stream.h Executable file

@@ -0,0 +1,82 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_STREAM_H
#define TDL_INCLUDE_DRIVER_STREAM_H
#include <map>
#include "driver/context.h"
#include "driver/device.h"
#include "driver/handle.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
class Kernel;
class Event;
class Range;
class Buffer;
// Command Queue
class Stream: public HandleInterface<Stream, CUstream>
{
public:
//Constructors
Stream(CUstream stream, bool take_ownership);
Stream(Context const & context);
//Accessors
Handle<CUstream> const & cu() const;
Context const & context() const;
//Synchronize
void synchronize();
//Enqueue
void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
// Write
void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
template<class T> void write(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T> const & x)
{ write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
// Read
void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
template<class T> void read(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T>& x)
{ read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
private:
Context context_;
Handle<CUstream> cu_;
};
}
}
#endif
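
An end-to-end sketch tying the classes above together: compile a module, bind a kernel argument, launch it on a Stream, and read the result back into host memory. The kernel source is a placeholder; it assumes the driver sources are linked and a CUDA device plus runtime compilation are available.

#include <iostream>
#include <string>
#include <vector>
#include "driver/backend.h"
#include "driver/kernel.h"
#include "driver/stream.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream stream(ctx);
  // Kernel that writes its thread index into a buffer (placeholder source).
  std::string src =
    "extern \"C\" __global__ void iota(int* x){ x[threadIdx.x] = threadIdx.x; }";
  drv::Module module(ctx, src);
  drv::Kernel kernel(module, "iota");
  size_t n = 128;
  drv::Buffer x(ctx, n*sizeof(int));
  kernel.setArg(0, x);
  stream.enqueue(kernel, {1, 1, 1}, {n, 1, 1});   // grid of 1 block, n threads per block
  std::vector<int> host(n);
  stream.read(x, true, 0, host);                  // blocking read of the whole buffer
  std::cout << host.back() << std::endl;          // expected output: 127
}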

include/external/CUDA/builtin_types.h vendored Executable file

@@ -0,0 +1,64 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"

412
include/external/CUDA/channel_descriptor.h vendored Executable file
View File

@@ -0,0 +1,412 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
#include "cuda_runtime_api.h"
#include "host_defines.h"
#include "vector_types.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
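/* Illustrative usage sketch (not part of the original header): the templated
 * form above, together with the specializations below, lets the element type
 * drive the descriptor. For example, with the CUDA runtime (array name and
 * dimensions are hypothetical):
 *
 *   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
 *   // desc.x == desc.y == desc.z == desc.w == 32, desc.f == cudaChannelFormatKindFloat
 *   cudaArray_t arr;
 *   cudaMallocArray(&arr, &desc, width, height);
 */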
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
int e = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */

266
include/external/CUDA/crt/host_config.h vendored Normal file
View File

@@ -0,0 +1,266 @@
/*
* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
#if defined(__ICC)
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, and ICC 17.0 on Linux x86_64 are supported!
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__PGIC__)
#if (!(__PGIC__ == 17) && \
!(__PGIC__ == 99 && __PGIC_MINOR__ == 99)) || \
!defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported pgc++ configuration! Only pgc++ 17 on Linux x86_64 is supported!
#endif /* (!(__PGIC__ == 17) &&
!(__PGIC__ == 99 && __PGIC_MINOR__ == 99 )) ||
!__GNUC__ || !__LP64__ */
#endif /* __PGIC__ */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
#error -- unsupported xlC version! only xlC 13.1 is supported
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 6
#error -- unsupported GNU version! gcc versions later than 6 are not supported!
#endif /* __GNUC__ > 6 */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1600 || _MSC_VER > 1911
#error -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!
#elif _MSC_VER == 1600 /* _MSC_VER == 1600 */
#pragma message("support for Microsoft Visual Studio 2010 has been deprecated!")
#endif /* _MSC_VER < 1600 || _MSC_VER > 1911 || _MSC_VER == 1600 */
#endif /* _WIN32 */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__HORIZON__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#define _ACRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#if _MSC_VER >= 1900
#include <corecrt.h> /* for _ACRTIMP */
#endif /* _MSC_VER >= 1900 */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
#if __CUDACC_RTC__
typedef char *va_list;
#else /* !__CUDACC_RTC__ */
#include <cstdarg>
#endif /* __CUDACC_RTC__ */
#undef va_start
#undef va_end
#undef va_arg
#ifdef __PGIC__
#undef __builtin_va_end
#define va_start(v,l) __builtin_alt_va_start(v,l)
#define va_end(v) __builtin_va_end(v)
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
#if (__cplusplus >= 201103L)
#undef va_copy
#define va_copy(d,s) __builtin_va_copy(d,s)
#endif
#else /* !__PGIC__ */
#define va_start(ap, x) (__cu_va_start(&ap, x))
#define va_end(ap) (__cu_va_end(&ap))
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
#undef va_copy
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
#endif /* __PGIC__ */
#endif /* defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */

216
include/external/CUDA/crt/host_defines.h vendored Normal file
View File

@@ -0,0 +1,216 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
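/* Illustrative note (not part of the original header): under a GNU-style host
 * compiler with __CUDACC__ defined, the qualifiers defined above expand to
 * plain GCC attributes, e.g.
 *
 *   __noinline__    void f();   // __attribute__((noinline)) void f();
 *   __forceinline__ int  g();   // __inline__ __attribute__((always_inline)) int g();
 *   struct __align__(16) vec4 { float x, y, z, w; };   // __attribute__((aligned(16)))
 */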
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __host__ \
__location__(host)
#define __device__ \
__location__(device)
#define __global__ \
__location__(global)
#define __shared__ \
__location__(shared)
#define __constant__ \
__location__(constant)
#define __managed__ \
__location__(managed)
#if !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* defined(__CUDACC__) */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* !defined(__CUDACC__) */
#endif /* !__HOST_DEFINES_H__ */

338
include/external/CUDA/cuComplex.h vendored Executable file
View File

@@ -0,0 +1,338 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
/* When trying to include C header file in C++ Code extern "C" is required
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
* extern "C" cannot be nested
* Hence keep the header out of extern "C" block
*/
#include <math.h> /* import fabsf, sqrt */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
typedef float2 cuFloatComplex;
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
cuCimagf(x) + cuCimagf(y));
}
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex prod;
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
(cuCimagf(x) * cuCimagf(y)),
(cuCrealf(x) * cuCimagf(y)) +
(cuCimagf(x) * cuCrealf(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex quot;
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
float oos = 1.0f / s;
float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now we choose the
* simplest and fastest solution; however, this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
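/* Illustrative usage sketch (not part of the original header): the scaling in
 * cuCdivf and cuCabsf above keeps intermediates in range even when a naive
 * formula would overflow in single precision, e.g. on the host:
 *
 *   cuFloatComplex a = make_cuFloatComplex(3.0e19f, 4.0e19f);
 *   float m = cuCabsf(a);          // 5.0e19f; naive sqrtf(x*x + y*y) would overflow
 *   cuFloatComplex q = cuCdivf(a, make_cuFloatComplex(2.0e19f, 0.0f));
 *                                  // (1.5f, 2.0f); naive c*c + d*d would overflow
 */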
/* Double precision */
typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
(double r, double i)
{
cuDoubleComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
cuCimag(x) + cuCimag(y));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex prod;
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
(cuCimag(x) * cuCimag(y)),
(cuCreal(x) * cuCimag(y)) +
(cuCimag(x) * cuCreal(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex quot;
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
double oos = 1.0 / s;
double ars = cuCreal(x) * oos;
double ais = cuCimag(x) * oos;
double brs = cuCreal(y) * oos;
double bis = cuCimag(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0 / s;
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now we choose the simplest
* and fastest solution; however, this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
double a = cuCreal(x);
double b = cuCimag(x);
double v, w, t;
a = fabs(a);
b = fabs(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0 + t * t;
t = v * sqrt(t);
if ((v == 0.0) ||
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
t = v + w;
}
return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
(cuFloatComplex c)
{
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */

565
include/external/CUDA/cublas.h vendored Executable file
View File

@@ -0,0 +1,565 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subprograms)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
cublasStatus CUBLASWINAPI cublasInit (void);
cublasStatus CUBLASWINAPI cublasShutdown (void);
cublasStatus CUBLASWINAPI cublasGetError (void);
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
int incy);
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
float *y, int incy);
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
int incx, double *y, int incy);
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
int incy);
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
int incy);
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss);
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss);
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, cuComplex s);
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, float s);
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
cuComplex *cs);
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
cuDoubleComplex *cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
const float *sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
const double *sy1, double* sparam);
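/* Illustrative usage sketch (not part of the original header): a minimal
 * host-side SAXPY (y = alpha*x + y) using the legacy BLAS1 API declared above;
 * the host arrays h_x/h_y and the size n are hypothetical, and error checking
 * is omitted.
 *
 *   cublasInit();
 *   float *d_x, *d_y;
 *   cublasAlloc(n, sizeof(float), (void**)&d_x);
 *   cublasAlloc(n, sizeof(float), (void**)&d_y);
 *   cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
 *   cudaMemcpy(d_y, h_y, n * sizeof(float), cudaMemcpyHostToDevice);
 *   cublasSaxpy(n, 2.0f, d_x, 1, d_y, 1);        // y <- 2*x + y
 *   cudaMemcpy(h_y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
 *   cublasFree(d_x); cublasFree(d_y);
 *   cublasShutdown();
 */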
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
float alpha, const float *A, int lda,
const float *x, int incx, float beta, float *y,
int incy);
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
double alpha, const double *A, int lda,
const double *x, int incx, double beta, double *y,
int incy);
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
float *x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
char diag, int n, int k, const float *A,
int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
char diag, int n, int k, const double *A,
int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
char diag, int n, int k, const cuComplex *A,
int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
char diag, int n, int k, const cuDoubleComplex *A,
int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
int lda, const float *x, int incx, float beta,
float *y, int incy);
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
int lda, const double *x, int incx, double beta,
double *y, int incy);
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
int lda, const cuComplex *x, int incx, cuComplex beta,
cuComplex *y, int incy);
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
const float *AP, const float *x,
int incx, float beta, float *y, int incy);
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
const double *AP, const double *x,
int incx, double beta, double *y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
const cuComplex *AP, const cuComplex *x,
int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *AP, const cuDoubleComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
const float *y, int incy, float *A, int lda);
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
const double *y, int incy, double *A, int lda);
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
int incx, float *A, int lda);
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
int incx, double *A, int lda);
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
const cuComplex *x, int incx, cuComplex *A, int lda);
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
int incx, float *AP);
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
int incx, double *AP);
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
int incx, cuComplex *AP);
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *A,
int lda);
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
int incx, const double *y, int incy, double *A,
int lda);
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy, cuComplex *A,
int lda);
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *AP);
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *AP);
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
const cuComplex *x, int incx, const cuComplex *y,
int incy, cuComplex *AP);
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy, cuDoubleComplex *AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc);
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
float alpha, const cuComplex *A, int lda,
float beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc);
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, double beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *B, int ldb,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */
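A minimal host-side sketch of how the legacy, handle-free BLAS3 entry points declared above are typically driven. This is an editor's illustration, not part of the vendored header: the sizes and data are made up, and the cublasInit/cublasAlloc/cublasSetMatrix/cublasGetMatrix/cublasFree/cublasShutdown helpers are assumed from the earlier sections of this header.

/* Illustration only -- not part of cublas.h. Computes C = A * B with the
 * legacy cublasSgemm declared above ('N' = no transpose, column-major). */
#include <vector>
#include "cublas.h"

void legacy_sgemm_example() {
  const int n = 4;                                    // illustrative matrix size
  std::vector<float> A(n * n, 1.0f), B(n * n, 2.0f), C(n * n, 0.0f);
  float *dA = 0, *dB = 0, *dC = 0;
  cublasInit();                                       // legacy API: one implicit context
  cublasAlloc(n * n, sizeof(float), (void **)&dA);
  cublasAlloc(n * n, sizeof(float), (void **)&dB);
  cublasAlloc(n * n, sizeof(float), (void **)&dC);
  cublasSetMatrix(n, n, sizeof(float), A.data(), n, dA, n);
  cublasSetMatrix(n, n, sizeof(float), B.data(), n, dB, n);
  cublasSgemm('N', 'N', n, n, n, 1.0f, dA, n, dB, n, 0.0f, dC, n);
  cublasGetMatrix(n, n, sizeof(float), dC, n, C.data(), n);
  cublasFree(dA); cublasFree(dB); cublasFree(dC);
  cublasShutdown();
}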

2977
include/external/CUDA/cublas_api.h vendored Executable file

File diff suppressed because it is too large

274
include/external/CUDA/cublas_v2.h vendored Normal file
View File

@@ -0,0 +1,274 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
 * This is the public header file for the new CUBLAS library API; it maps the generic
 * cuBLAS function names to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas1 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */
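For comparison with the legacy header above, a sketch of the same SGEMM call through the _v2 interface these macros redirect to: handle-based, enums instead of transpose characters, and alpha/beta passed by pointer. Editor's illustration only; the device pointers are assumed to have been set up as in the previous sketch.

/* Illustration only -- not part of cublas_v2.h. */
#include "cublas_v2.h"

void v2_sgemm_example(const float *dA, const float *dB, float *dC, int n) {
  cublasHandle_t handle;
  cublasCreate(&handle);                    // expands to cublasCreate_v2 via the macros above
  const float alpha = 1.0f, beta = 0.0f;    // v2 takes scalars by pointer
  cublasSgemm(handle,                       // expands to cublasSgemm_v2
              CUBLAS_OP_N, CUBLAS_OP_N,     // enums replace the 'N'/'T' characters
              n, n, n,
              &alpha, dA, n, dB, n,
              &beta, dC, n);
  cublasDestroy(handle);                    // expands to cublasDestroy_v2
}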

12185
include/external/CUDA/cuda.h vendored Executable file

File diff suppressed because it is too large

248
include/external/CUDA/cuda_device_runtime_api.h vendored Executable file
View File

@@ -0,0 +1,248 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
#if defined(__cplusplus)
extern "C" {
#endif
struct cudaFuncAttributes;
#if defined(_WIN32)
#define __NV_WEAK__ __declspec(nv_weak)
#else
#define __NV_WEAK__ __attribute__((nv_weak))
#endif
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#undef __NV_WEAK__
#if defined(__cplusplus)
}
#endif
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#endif /* !defined(__CUDACC_RTC__) */
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
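The device-side declarations above (cudaGetParameterBuffer, cudaLaunchDevice, the device-visible cudaDeviceSynchronize and friends) are what CUDA dynamic parallelism lowers to. Below is a sketch of how user code normally reaches them through the <<< >>> syntax the comments recommend; this is an editor's illustration with made-up kernel names, compiled along the lines of nvcc -arch=sm_35 -rdc=true example.cu -lcudadevrt.

// Illustration only -- not part of cuda_device_runtime_api.h.
#include <cuda_runtime.h>

__global__ void child(int *data) {
  data[threadIdx.x] += 1;                   // trivial per-element work
}

__global__ void parent(int *data) {
  if (threadIdx.x == 0) {
    child<<<1, 32>>>(data);                 // device-side launch; the compiler lowers this
                                            // to cudaGetParameterBuffer + cudaLaunchDevice
    cudaDeviceSynchronize();                // device-side variant declared above
  }
}

int main() {
  int *data;
  cudaMalloc(&data, 32 * sizeof(int));
  cudaMemset(data, 0, 32 * sizeof(int));
  parent<<<1, 32>>>(data);
  cudaDeviceSynchronize();                  // host-side wait for parent and child
  cudaFree(data);
  return 0;
}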

1969
include/external/CUDA/cuda_fp16.h vendored Executable file

File diff suppressed because it is too large

1797
include/external/CUDA/cuda_fp16.hpp vendored Executable file

File diff suppressed because it is too large

2040
include/external/CUDA/cuda_runtime.h vendored Executable file

File diff suppressed because it is too large

7422
include/external/CUDA/cuda_runtime_api.h vendored Executable file

File diff suppressed because it is too large

1805
include/external/CUDA/cudnn.h vendored Executable file

File diff suppressed because it is too large

6257
include/external/CUDA/cusparse.h vendored Normal file

File diff suppressed because it is too large

69
include/external/CUDA/device_types.h vendored Executable file
View File

@@ -0,0 +1,69 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DEVICE_TYPES_H__)
#define __DEVICE_TYPES_H__
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
enum __device_builtin__ cudaRoundMode
{
cudaRoundNearest,
cudaRoundZero,
cudaRoundPosInf,
cudaRoundMinInf
};
#endif /* !__DEVICE_TYPES_H__ */

145
include/external/CUDA/driver_functions.h vendored Executable file
View File

@@ -0,0 +1,145 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__
#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"
/**
* \addtogroup CUDART_MEMORY
*
* @{
*/
/**
* \brief Returns a cudaPitchedPtr based on input parameters
*
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
* \p p, \p xsz, and \p ysz.
*
* \param d - Pointer to allocated memory
* \param p - Pitch of allocated memory in bytes
* \param xsz - Logical width of allocation in elements
* \param ysz - Logical height of allocation in elements
*
* \return
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
*
* \sa make_cudaExtent, make_cudaPos
*/
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
{
struct cudaPitchedPtr s;
s.ptr = d;
s.pitch = p;
s.xsize = xsz;
s.ysize = ysz;
return s;
}
/**
* \brief Returns a cudaPos based on input parameters
*
* Returns a ::cudaPos based on the specified input parameters \p x,
* \p y, and \p z.
*
* \param x - X position
* \param y - Y position
* \param z - Z position
*
* \return
* ::cudaPos specified by \p x, \p y, and \p z
*
* \sa make_cudaExtent, make_cudaPitchedPtr
*/
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
{
struct cudaPos p;
p.x = x;
p.y = y;
p.z = z;
return p;
}
/**
* \brief Returns a cudaExtent based on input parameters
*
* Returns a ::cudaExtent based on the specified input parameters \p w,
* \p h, and \p d.
*
* \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
* \param h - Height in elements
* \param d - Depth in elements
*
* \return
* ::cudaExtent specified by \p w, \p h, and \p d
*
* \sa make_cudaPitchedPtr, make_cudaPos
*/
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
{
struct cudaExtent e;
e.width = w;
e.height = h;
e.depth = d;
return e;
}
/** @} */ /* END CUDART_MEMORY */
#endif /* !__DRIVER_FUNCTIONS_H__ */
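A short sketch of how the three helpers above are usually paired with the 3D allocation and memset entry points of the runtime (cudaMalloc3D / cudaMemset3D, declared in cuda_runtime_api.h). Editor's illustration; the extents are made up, and note that the width of a cudaExtent is given in bytes for linear/pitched memory, as the make_cudaExtent comment states.

/* Illustration only -- not part of driver_functions.h. */
#include <cuda_runtime.h>

void alloc_3d_example() {
  const size_t w = 64, h = 32, d = 16;                        // illustrative volume size
  struct cudaExtent extent = make_cudaExtent(w * sizeof(float), h, d);
  struct cudaPitchedPtr vol;
  cudaMalloc3D(&vol, extent);                                 // pitched 3D allocation
  cudaMemset3D(vol, 0, extent);                               // clear the whole volume
  // make_cudaPitchedPtr wraps an existing linear buffer in the same struct:
  float *linear = 0;
  cudaMalloc((void **)&linear, w * sizeof(float) * h * d);
  struct cudaPitchedPtr view = make_cudaPitchedPtr(linear, w * sizeof(float), w, h);
  (void)view;                                                 // would feed e.g. a cudaMemcpy3D
  cudaFree(linear);
  cudaFree(vol.ptr);
}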

1610
include/external/CUDA/driver_types.h vendored Executable file

File diff suppressed because it is too large

50
include/external/CUDA/host_config.h vendored Executable file
View File

@@ -0,0 +1,50 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_config.h"

50
include/external/CUDA/host_defines.h vendored Executable file
View File

@@ -0,0 +1,50 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_defines.h"

80
include/external/CUDA/library_types.h vendored Executable file
View File

@@ -0,0 +1,80 @@
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__LIBRARY_TYPES_H__)
#define __LIBRARY_TYPES_H__
typedef enum cudaDataType_t
{
CUDA_R_16F= 2, /* real as a half */
CUDA_C_16F= 6, /* complex as a pair of half numbers */
CUDA_R_32F= 0, /* real as a float */
CUDA_C_32F= 4, /* complex as a pair of float numbers */
CUDA_R_64F= 1, /* real as a double */
CUDA_C_64F= 5, /* complex as a pair of double numbers */
CUDA_R_8I = 3, /* real as a signed char */
CUDA_C_8I = 7, /* complex as a pair of signed char numbers */
CUDA_R_8U = 8, /* real as an unsigned char */
CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */
CUDA_R_32I= 10, /* real as a signed int */
CUDA_C_32I= 11, /* complex as a pair of signed int numbers */
CUDA_R_32U= 12, /* real as an unsigned int */
CUDA_C_32U= 13 /* complex as a pair of unsigned int numbers */
} cudaDataType;
typedef enum libraryPropertyType_t
{
MAJOR_VERSION,
MINOR_VERSION,
PATCH_LEVEL
} libraryPropertyType;
#endif /* !__LIBRARY_TYPES_H__ */
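The per-value comments above encode both the element width and the real/complex layout. Below is a small helper of the kind one might write when sizing buffers for mixed-precision library calls; it is an editor's illustration, not an API provided by any CUDA header.

/* Illustration only -- storage size in bytes of one element of each cudaDataType. */
#include <stddef.h>
#include "library_types.h"

static size_t cuda_data_type_size(cudaDataType t) {
  switch (t) {
    case CUDA_R_8I:  case CUDA_R_8U:                    return 1;
    case CUDA_C_8I:  case CUDA_C_8U:                    return 2;  /* pair of 8-bit values */
    case CUDA_R_16F:                                    return 2;
    case CUDA_C_16F:                                    return 4;  /* pair of halves */
    case CUDA_R_32F: case CUDA_R_32I: case CUDA_R_32U:  return 4;
    case CUDA_C_32F: case CUDA_C_32I: case CUDA_C_32U:  return 8;
    case CUDA_R_64F:                                    return 8;
    case CUDA_C_64F:                                    return 16; /* pair of doubles */
    default:                                            return 0;  /* unrecognized value */
  }
}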

5628
include/external/CUDA/nvml.h vendored Executable file

File diff suppressed because it is too large

525
include/external/CUDA/nvrtc.h vendored Executable file
View File

@@ -0,0 +1,525 @@
//
// NVIDIA_COPYRIGHT_BEGIN
//
// Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
// NVIDIA_COPYRIGHT_END
//
#ifndef __NVRTC_H__
#define __NVRTC_H__
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
/*************************************************************************//**
*
* \defgroup error Error Handling
*
* NVRTC defines the following enumeration type and function for API call
* error handling.
*
****************************************************************************/
/**
* \ingroup error
* \brief The enumerated type nvrtcResult defines API call result codes.
* NVRTC API functions return nvrtcResult to indicate the call
* result.
*/
typedef enum {
NVRTC_SUCCESS = 0,
NVRTC_ERROR_OUT_OF_MEMORY = 1,
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
NVRTC_ERROR_INVALID_INPUT = 3,
NVRTC_ERROR_INVALID_PROGRAM = 4,
NVRTC_ERROR_INVALID_OPTION = 5,
NVRTC_ERROR_COMPILATION = 6,
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
NVRTC_ERROR_INTERNAL_ERROR = 11
} nvrtcResult;
/**
* \ingroup error
* \brief nvrtcGetErrorString is a helper function that returns a string
* describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
* \c "NVRTC_SUCCESS".
* For unrecognized enumeration values, it returns
* \c "NVRTC_ERROR unknown".
*
* \param [in] result CUDA Runtime Compilation API result code.
* \return Message string for the given #nvrtcResult code.
*/
const char *nvrtcGetErrorString(nvrtcResult result);
/*************************************************************************//**
*
* \defgroup query General Information Query
*
* NVRTC defines the following function for general information query.
*
****************************************************************************/
/**
* \ingroup query
* \brief nvrtcVersion sets the output parameters \p major and \p minor
* with the CUDA Runtime Compilation version number.
*
* \param [out] major CUDA Runtime Compilation major version number.
* \param [out] minor CUDA Runtime Compilation minor version number.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
*
*/
nvrtcResult nvrtcVersion(int *major, int *minor);
/*************************************************************************//**
*
* \defgroup compilation Compilation
*
* NVRTC defines the following type and functions for actual compilation.
*
****************************************************************************/
/**
* \ingroup compilation
* \brief nvrtcProgram is the unit of compilation, and an opaque handle for
* a program.
*
* To compile a CUDA program string, an instance of nvrtcProgram must be
* created first with ::nvrtcCreateProgram, then compiled with
* ::nvrtcCompileProgram.
*/
typedef struct _nvrtcProgram *nvrtcProgram;
/**
* \ingroup compilation
* \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
* given input parameters, and sets the output parameter \p prog with
* it.
*
* \param [out] prog CUDA Runtime Compilation program.
* \param [in] src CUDA program source.
* \param [in] name CUDA program name.\n
* \p name can be \c NULL; \c "default_program" is
* used when \p name is \c NULL.
* \param [in] numHeaders Number of headers used.\n
* \p numHeaders must be greater than or equal to 0.
* \param [in] headers Sources of the headers.\n
* \p headers can be \c NULL when \p numHeaders is
* 0.
* \param [in] includeNames Name of each header by which they can be
* included in the CUDA program source.\n
* \p includeNames can be \c NULL when \p numHeaders
* is 0.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcDestroyProgram
*/
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
const char *src,
const char *name,
int numHeaders,
const char * const *headers,
const char * const *includeNames);
/**
* \ingroup compilation
* \brief nvrtcDestroyProgram destroys the given program.
*
* \param [in] prog CUDA Runtime Compilation program.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcCreateProgram
*/
nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
/**
* \ingroup compilation
* \brief nvrtcCompileProgram compiles the given program.
*
* It supports compile options listed in \ref options.
*/
nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
int numOptions, const char * const *options);
/**
* \ingroup compilation
* \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
* generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptxSizeRet Size of the generated PTX (including the trailing
* \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTX
*/
nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetPTX stores the PTX generated by the previous compilation
* of \p prog in the memory pointed by \p ptx.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptx Compiled result.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTXSize
*/
nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
* log generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
 * Note that a compilation log may be generated with warnings and informative
 * messages, even when the compilation of \p prog succeeds.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] logSizeRet Size of the compilation log
* (including the trailing \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLog
*/
nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLog stores the log generated by the previous
* compilation of \p prog in the memory pointed by \p log.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] log Compilation log.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLogSize
*/
nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
/**
* \ingroup compilation
* \brief nvrtcAddNameExpression notes the given name expression
* denoting a __global__ function or function template
* instantiation.
*
* The identical name expression string must be provided on a subsequent
* call to nvrtcGetLoweredName to extract the lowered name.
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
*
* \see ::nvrtcGetLoweredName
*/
nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
const char * const name_expression);
/**
* \ingroup compilation
* \brief nvrtcGetLoweredName extracts the lowered (mangled) name
* for a __global__ function or function template instantiation,
* and updates *lowered_name to point to it. The memory containing
* the name is released when the NVRTC program is destroyed by
* nvrtcDestroyProgram.
* The identical name expression must have been previously
* provided to nvrtcAddNameExpression.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \param [out] lowered_name initialized by the function to point to a
* C string containing the lowered (mangled)
* name corresponding to the provided name expression.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
*
* \see ::nvrtcAddNameExpression
*/
nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
const char *const name_expression,
const char** lowered_name);
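/*
 * Illustrative usage sketch (not part of the original header): the typical
 * NVRTC flow with the functions declared above. Error checking is omitted;
 * `kernel_src` and "my_kernel<float>" are placeholders, not symbols defined
 * by this header.
 *
 *   nvrtcProgram prog;
 *   nvrtcCreateProgram(&prog, kernel_src, "prog.cu", 0, NULL, NULL);
 *   nvrtcAddNameExpression(prog, "my_kernel<float>");        // before compiling
 *   const char *opts[] = {"--gpu-architecture=compute_30", "--std=c++11"};
 *   nvrtcCompileProgram(prog, 2, opts);
 *   size_t log_size, ptx_size;
 *   nvrtcGetProgramLogSize(prog, &log_size);
 *   char *log = (char *)malloc(log_size);
 *   nvrtcGetProgramLog(prog, log);                            // warnings, if any
 *   nvrtcGetPTXSize(prog, &ptx_size);
 *   char *ptx = (char *)malloc(ptx_size);
 *   nvrtcGetPTX(prog, ptx);                                   // PTX to load with the CUDA driver API
 *   const char *lowered;
 *   nvrtcGetLoweredName(prog, "my_kernel<float>", &lowered);  // valid until destroy
 *   nvrtcDestroyProgram(&prog);
 */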
/**
* \defgroup options Supported Compile Options
*
* NVRTC supports the compile options below.
 * Option names with two preceding dashes (\c --) are long option names and
* option names with one preceding dash (\c -) are short option names.
* Short option names can be used instead of long option names.
* When a compile option takes an argument, an assignment operator (\c =)
* is used to separate the compile option argument from the compile option
* name, e.g., \c "--gpu-architecture=compute_30".
* Alternatively, the compile option name and the argument can be specified in
 * separate strings without an assignment operator, e.g.,
* \c "--gpu-architecture" \c "compute_30".
* Single-character short option names, such as \c -D, \c -U, and \c -I, do
* not require an assignment operator, and the compile option name and the
* argument can be present in the same string with or without spaces between
* them.
* For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
* supported.
*
* The valid compiler options are:
*
* - Compilation targets
* - \c --gpu-architecture=\<arch\> (\c -arch)\n
* Specify the name of the class of GPU architectures for which the
* input must be compiled.\n
* - Valid <c>\<arch\></c>s:
* - \c compute_30
* - \c compute_32
* - \c compute_35
* - \c compute_37
* - \c compute_50
* - \c compute_52
* - \c compute_53
* - \c compute_60
* - \c compute_61
* - \c compute_62
* - \c compute_70
* - \c compute_72
* - Default: \c compute_30
* - Separate compilation / whole-program compilation
* - \c --device-c (\c -dc)\n
* Generate relocatable code that can be linked with other relocatable
* device code. It is equivalent to --relocatable-device-code=true.
* - \c --device-w (\c -dw)\n
* Generate non-relocatable code. It is equivalent to
* \c --relocatable-device-code=false.
* - \c --relocatable-device-code={true|false} (\c -rdc)\n
* Enable (disable) the generation of relocatable device code.
* - Default: \c false
* - Debugging support
* - \c --device-debug (\c -G)\n
* Generate debug information.
* - \c --generate-line-info (\c -lineinfo)\n
* Generate line-number information.
* - Code generation
* - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
* Specify the maximum amount of registers that GPU functions can use.
 *     Up to a function-specific limit, a higher value will generally
* increase the performance of individual GPU threads that execute this
* function. However, because thread registers are allocated from a
* global register pool on each GPU, a higher value of this option will
* also reduce the maximum thread block size, thereby reducing the amount
* of thread parallelism. Hence, a good maxrregcount value is the result
* of a trade-off. If this option is not specified, then no maximum is
 *     assumed. A value less than the minimum registers required by the ABI
 *     will be bumped up by the compiler to the ABI minimum limit.
* - \c --ftz={true|false} (\c -ftz)\n
* When performing single-precision floating-point operations, flush
* denormal values to zero or preserve denormal values.
* \c --use_fast_math implies \c --ftz=true.
* - Default: \c false
* - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
* For single-precision floating-point square root, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-sqrt=false.
* - Default: \c true
* - \c --prec-div={true|false} (\c -prec-div)\n
* For single-precision floating-point division and reciprocals, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-div=false.
* - Default: \c true
* - \c --fmad={true|false} (\c -fmad)\n
* Enables (disables) the contraction of floating-point multiplies and
* adds/subtracts into floating-point multiply-add operations (FMAD,
* FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
* - Default: \c true
* - \c --use_fast_math (\c -use_fast_math)\n
* Make use of fast math operations.
* \c --use_fast_math implies \c --ftz=true \c --prec-div=false
* \c --prec-sqrt=false \c --fmad=true.
* - Preprocessing
* - \c --define-macro=\<def\> (\c -D)\n
 *     \c \<def\> can be either \c \<name\> or \c \<name\>=\<definition\>.
* - \c \<name\> \n
* Predefine \c \<name\> as a macro with definition \c 1.
* - \c \<name\>=\<definition\> \n
* The contents of \c \<definition\> are tokenized and preprocessed
* as if they appeared during translation phase three in a \c \#define
* directive. In particular, the definition will be truncated by
* embedded new line characters.
* - \c --undefine-macro=\<def\> (\c -U)\n
* Cancel any previous definition of \c \<def\>.
* - \c --include-path=\<dir\> (\c -I)\n
* Add the directory \c \<dir\> to the list of directories to be
* searched for headers. These paths are searched after the list of
* headers given to ::nvrtcCreateProgram.
* - \c --pre-include=\<header\> (\c -include)\n
* Preinclude \c \<header\> during preprocessing.
* - Language Dialect
* - \c --std={c++11|c++14} (\c -std={c++11|c++14})\n
* Set language dialect to C++11 or C++14.
* - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
* Provide builtin definitions of \c std::move and \c std::forward,
* when C++11 language dialect is selected.
* - Default: \c true
* - \c --builtin-initializer-list={true|false}
* (\c -builtin-initializer-list)\n
* Provide builtin definitions of \c std::initializer_list class and
* member functions when C++11 language dialect is selected.
* - Default: \c true
* - Misc.
* - \c --disable-warnings (\c -w)\n
* Inhibit all warning messages.
* - \c --restrict (\c -restrict)\n
* Programmer assertion that all kernel pointer parameters are restrict
* pointers.
* - \c --device-as-default-execution-space
* (\c -default-device)\n
* Treat entities with no execution space annotation as \c __device__
* entities.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] numOptions Number of compiler options passed.
* \param [in] options Compiler options in the form of C string array.\n
* \p options can be \c NULL when \p numOptions is 0.
*
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
* - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
/* The utility function 'nvrtcGetTypeName' is not available by default. Define
the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
*/
#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
#include <cxxabi.h>
#include <cstdlib>
#elif defined(_WIN32)
#include <Windows.h>
#include <DbgHelp.h>
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
#include <string>
#include <typeinfo>
/*************************************************************************//**
*
* \defgroup hosthelper Host Helper
*
* NVRTC defines the following functions for easier interaction with host code.
*
****************************************************************************/
/**
* \ingroup hosthelper
* \brief nvrtcGetTypeName stores the source level name of the template type argument
* T in the given std::string location.
*
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
* respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
* otherwise *result is initialized with the extracted name.
*
* \param [in] result: pointer to std::string in which to store the type name.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
*
*/
template <typename T>
nvrtcResult nvrtcGetTypeName(std::string *result)
{
const char *name = typeid(T).name();
#if USE_CXXABI || __clang__ || __GNUC__
int status;
char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
if (status == 0) {
*result = undecorated_name;
free(undecorated_name);
return NVRTC_SUCCESS;
}
#elif defined(_WIN32)
char undecorated_name[4096];
if(UnDecorateSymbolName(name, undecorated_name,
sizeof(undecorated_name) / sizeof(*undecorated_name),
UNDNAME_COMPLETE) ) {
*result = undecorated_name;
return NVRTC_SUCCESS;
}
#endif /* USE_CXXABI || __clang__ || __GNUC__ */
return NVRTC_ERROR_INTERNAL_ERROR;
}
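/*
 * Illustrative usage sketch (assumes NVRTC_GET_TYPE_NAME was defined to a
 * non-zero value before including this header; `MyType` is a placeholder):
 *
 *   std::string name;
 *   nvrtcResult res = nvrtcGetTypeName<MyType>(&name);
 *   // On NVRTC_SUCCESS, `name` holds the demangled type name, e.g. for
 *   // building a name expression passed to nvrtcAddNameExpression.
 */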
#endif /* NVRTC_GET_TYPE_NAME */
#endif /* __NVRTC_H__ */

119
include/external/CUDA/surface_types.h vendored Executable file
View File

@@ -0,0 +1,119 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__SURFACE_TYPES_H__)
#define __SURFACE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaSurfaceType1D 0x01
#define cudaSurfaceType2D 0x02
#define cudaSurfaceType3D 0x03
#define cudaSurfaceTypeCubemap 0x0C
#define cudaSurfaceType1DLayered 0xF1
#define cudaSurfaceType2DLayered 0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
/**
* CUDA Surface boundary modes
*/
enum __device_builtin__ cudaSurfaceBoundaryMode
{
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
};
/**
* CUDA Surface format modes
*/
enum __device_builtin__ cudaSurfaceFormatMode
{
cudaFormatModeForced = 0, /**< Forced format mode */
cudaFormatModeAuto = 1 /**< Auto format mode */
};
/**
* CUDA Surface reference
*/
struct __device_builtin__ surfaceReference
{
/**
* Channel descriptor for surface reference
*/
struct cudaChannelFormatDesc channelDesc;
};
/**
* An opaque value that represents a CUDA Surface object
*/
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__SURFACE_TYPES_H__ */

217
include/external/CUDA/texture_types.h vendored Executable file
View File

@@ -0,0 +1,217 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaTextureType1D 0x01
#define cudaTextureType2D 0x02
#define cudaTextureType3D 0x03
#define cudaTextureTypeCubemap 0x0C
#define cudaTextureType1DLayered 0xF1
#define cudaTextureType2DLayered 0xF2
#define cudaTextureTypeCubemapLayered 0xFC
/**
* CUDA texture address modes
*/
enum __device_builtin__ cudaTextureAddressMode
{
cudaAddressModeWrap = 0, /**< Wrapping address mode */
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
cudaAddressModeMirror = 2, /**< Mirror address mode */
cudaAddressModeBorder = 3 /**< Border address mode */
};
/**
* CUDA texture filter modes
*/
enum __device_builtin__ cudaTextureFilterMode
{
cudaFilterModePoint = 0, /**< Point filter mode */
cudaFilterModeLinear = 1 /**< Linear filter mode */
};
/**
* CUDA texture read modes
*/
enum __device_builtin__ cudaTextureReadMode
{
cudaReadModeElementType = 0, /**< Read texture as specified element type */
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
};
/**
* CUDA texture reference
*/
struct __device_builtin__ textureReference
{
/**
* Indicates whether texture reads are normalized or not
*/
int normalized;
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Channel descriptor for the texture reference
*/
struct cudaChannelFormatDesc channelDesc;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
int __cudaReserved[15];
};
/**
* CUDA texture descriptor
*/
struct __device_builtin__ cudaTextureDesc
{
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture read mode
*/
enum cudaTextureReadMode readMode;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Texture Border Color
*/
float borderColor[4];
/**
* Indicates whether texture reads are normalized or not
*/
int normalizedCoords;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
};
/**
* An opaque value that represents a CUDA texture object
*/
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__TEXTURE_TYPES_H__ */

177
include/external/CUDA/vector_functions.h vendored Executable file
View File

@@ -0,0 +1,177 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z);
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
#undef __VECTOR_FUNCTIONS_DECL__
#if !defined(__CUDACC_RTC__)
#include "vector_functions.hpp"
#endif /* !__CUDACC_RTC__ */
#endif /* !__VECTOR_FUNCTIONS_H__ */

318
include/external/CUDA/vector_functions.hpp vendored Executable file
View File

@@ -0,0 +1,318 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_HPP__)
#define __VECTOR_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
{
char1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
{
uchar1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
{
char2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
{
uchar2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
{
char3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
{
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
{
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
{
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
{
short1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
{
ushort1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
{
short2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
{
ushort2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z)
{
short3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
{
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
{
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
{
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
{
int1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
{
uint1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
{
int2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
{
uint2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
{
int3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
{
uint3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
{
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
{
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
{
long1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
{
ulong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
{
long2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{
ulong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
{
long3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
{
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
{
float1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
{
float2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
{
float3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
{
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
{
longlong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
{
ulonglong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
{
longlong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
{
ulonglong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
{
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
{
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
{
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
{
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
{
double1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
{
double2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
{
double3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
{
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#undef __VECTOR_FUNCTIONS_DECL__
#endif /* !__VECTOR_FUNCTIONS_HPP__ */

425
include/external/CUDA/vector_types.h vendored Executable file
View File

@@ -0,0 +1,425 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(push)
#pragma warning(disable: 4201 4408)
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ tag \
{ \
union \
{ \
struct { members }; \
struct { long long int :1,:0; }; \
}; \
}
#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ __align__(8) tag \
{ \
members \
}
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
struct __device_builtin__ char1
{
signed char x;
};
struct __device_builtin__ uchar1
{
unsigned char x;
};
struct __device_builtin__ __align__(2) char2
{
signed char x, y;
};
struct __device_builtin__ __align__(2) uchar2
{
unsigned char x, y;
};
struct __device_builtin__ char3
{
signed char x, y, z;
};
struct __device_builtin__ uchar3
{
unsigned char x, y, z;
};
struct __device_builtin__ __align__(4) char4
{
signed char x, y, z, w;
};
struct __device_builtin__ __align__(4) uchar4
{
unsigned char x, y, z, w;
};
struct __device_builtin__ short1
{
short x;
};
struct __device_builtin__ ushort1
{
unsigned short x;
};
struct __device_builtin__ __align__(4) short2
{
short x, y;
};
struct __device_builtin__ __align__(4) ushort2
{
unsigned short x, y;
};
struct __device_builtin__ short3
{
short x, y, z;
};
struct __device_builtin__ ushort3
{
unsigned short x, y, z;
};
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
struct __device_builtin__ int1
{
int x;
};
struct __device_builtin__ uint1
{
unsigned int x;
};
__cuda_builtin_vector_align8(int2, int x; int y;);
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
struct __device_builtin__ int3
{
int x, y, z;
};
struct __device_builtin__ uint3
{
unsigned int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) int4
{
int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) uint4
{
unsigned int x, y, z, w;
};
struct __device_builtin__ long1
{
long int x;
};
struct __device_builtin__ ulong1
{
unsigned long x;
};
#if defined(_WIN32)
__cuda_builtin_vector_align8(long2, long int x; long int y;);
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* !_WIN32 */
struct __device_builtin__ __align__(2*sizeof(long int)) long2
{
long int x, y;
};
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
{
unsigned long int x, y;
};
#endif /* _WIN32 */
struct __device_builtin__ long3
{
long int x, y, z;
};
struct __device_builtin__ ulong3
{
unsigned long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) long4
{
long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
struct __device_builtin__ float1
{
float x;
};
#if !defined(__CUDACC__) && defined(__arm__) && \
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-pedantic"
struct __device_builtin__ __attribute__((aligned(8))) float2
{
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
};
#pragma GCC poison __cuda_gnu_arm_ice_workaround
#pragma GCC diagnostic pop
#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
__cuda_builtin_vector_align8(float2, float x; float y;);
#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
struct __device_builtin__ float3
{
float x, y, z;
};
struct __device_builtin__ __builtin_align__(16) float4
{
float x, y, z, w;
};
struct __device_builtin__ longlong1
{
long long int x;
};
struct __device_builtin__ ulonglong1
{
unsigned long long int x;
};
struct __device_builtin__ __builtin_align__(16) longlong2
{
long long int x, y;
};
struct __device_builtin__ __builtin_align__(16) ulonglong2
{
unsigned long long int x, y;
};
struct __device_builtin__ longlong3
{
long long int x, y, z;
};
struct __device_builtin__ ulonglong3
{
unsigned long long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) longlong4
{
    long long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
};
struct __device_builtin__ double1
{
double x;
};
struct __device_builtin__ __builtin_align__(16) double2
{
double x, y;
};
struct __device_builtin__ double3
{
double x, y, z;
};
struct __device_builtin__ __builtin_align__(16) double4
{
double x, y, z, w;
};
#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
#pragma warning(pop)
#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
typedef __device_builtin__ struct char1 char1;
typedef __device_builtin__ struct uchar1 uchar1;
typedef __device_builtin__ struct char2 char2;
typedef __device_builtin__ struct uchar2 uchar2;
typedef __device_builtin__ struct char3 char3;
typedef __device_builtin__ struct uchar3 uchar3;
typedef __device_builtin__ struct char4 char4;
typedef __device_builtin__ struct uchar4 uchar4;
typedef __device_builtin__ struct short1 short1;
typedef __device_builtin__ struct ushort1 ushort1;
typedef __device_builtin__ struct short2 short2;
typedef __device_builtin__ struct ushort2 ushort2;
typedef __device_builtin__ struct short3 short3;
typedef __device_builtin__ struct ushort3 ushort3;
typedef __device_builtin__ struct short4 short4;
typedef __device_builtin__ struct ushort4 ushort4;
typedef __device_builtin__ struct int1 int1;
typedef __device_builtin__ struct uint1 uint1;
typedef __device_builtin__ struct int2 int2;
typedef __device_builtin__ struct uint2 uint2;
typedef __device_builtin__ struct int3 int3;
typedef __device_builtin__ struct uint3 uint3;
typedef __device_builtin__ struct int4 int4;
typedef __device_builtin__ struct uint4 uint4;
typedef __device_builtin__ struct long1 long1;
typedef __device_builtin__ struct ulong1 ulong1;
typedef __device_builtin__ struct long2 long2;
typedef __device_builtin__ struct ulong2 ulong2;
typedef __device_builtin__ struct long3 long3;
typedef __device_builtin__ struct ulong3 ulong3;
typedef __device_builtin__ struct long4 long4;
typedef __device_builtin__ struct ulong4 ulong4;
typedef __device_builtin__ struct float1 float1;
typedef __device_builtin__ struct float2 float2;
typedef __device_builtin__ struct float3 float3;
typedef __device_builtin__ struct float4 float4;
typedef __device_builtin__ struct longlong1 longlong1;
typedef __device_builtin__ struct ulonglong1 ulonglong1;
typedef __device_builtin__ struct longlong2 longlong2;
typedef __device_builtin__ struct ulonglong2 ulonglong2;
typedef __device_builtin__ struct longlong3 longlong3;
typedef __device_builtin__ struct ulonglong3 ulonglong3;
typedef __device_builtin__ struct longlong4 longlong4;
typedef __device_builtin__ struct ulonglong4 ulonglong4;
typedef __device_builtin__ struct double1 double1;
typedef __device_builtin__ struct double2 double2;
typedef __device_builtin__ struct double3 double3;
typedef __device_builtin__ struct double4 double4;
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
struct __device_builtin__ dim3
{
unsigned int x, y, z;
#if defined(__cplusplus)
__host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus */
};
typedef __device_builtin__ struct dim3 dim3;
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */

56
include/tools/sys/getenv.hpp Executable file
View File

@@ -0,0 +1,56 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef TDL_TOOLS_SYS_GETENV_HPP
#define TDL_TOOLS_SYS_GETENV_HPP
#include <string>
#include <cstdlib>
namespace tdl
{
namespace tools
{
inline std::string getenv(const char * name)
{
#ifdef _MSC_VER
char* cache_path = 0;
std::size_t sz = 0;
_dupenv_s(&cache_path, &sz, name);
#else
const char * cache_path = std::getenv(name);
#endif
if(!cache_path)
return "";
std::string result(cache_path);
#ifdef _MSC_VER
free(cache_path);
#endif
return result;
}
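// Illustrative usage (sketch): reads an environment variable portably on both
// MSVC and POSIX toolchains, returning "" when the variable is unset.
//   std::string home = tdl::tools::getenv("HOME");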
}
}
#endif

68
include/tools/sys/mkdir.hpp Executable file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef TDL_TOOLS_SYS_MKDIR_HPP
#define TDL_TOOLS_SYS_MKDIR_HPP
#include <cstring>
#include <string>
#include <cstdlib>
#include <sys/stat.h>
#include <errno.h>
#if defined(_WIN32)
#include <direct.h>
#endif
namespace tdl
{
namespace tools
{
inline int mkdir(std::string const & path)
{
#if defined(_WIN32)
return _mkdir(path.c_str());
#else
return ::mkdir(path.c_str(), 0777);
#endif
}
inline int mkpath(std::string const & path)
{
int status = 0;
size_t pp = 0;
size_t sp;
while ((sp = path.find('/', pp)) != std::string::npos)
{
if (sp != pp){
status = mkdir(path.substr(0, sp));
}
pp = sp + 1;
}
return (status==0 || errno==EEXIST)?0:-1;
}
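// Illustrative usage (sketch): creates every missing directory along the path
// prefix. Only components followed by a '/' are created, so end the string
// with '/' if the last component should be created as well. The path below is
// hypothetical.
//   tdl::tools::mkpath("/tmp/isaac_cache/kernels/");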
}
}
#endif

196
lib/driver/backend.cpp Executable file
View File

@@ -0,0 +1,196 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/dispatch.h"
#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/context.h"
#include "driver/stream.h"
#include "driver/kernel.h"
#include <assert.h>
#include <stdexcept>
#include <vector>
namespace tdl
{
namespace driver
{
/*-----------------------------------*/
//---------- Modules ----------------*/
/*-----------------------------------*/
void backend::modules::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
Module& backend::modules::get(Stream const & stream, std::string const & name, std::string const & src){
std::tuple<Stream, std::string> key(stream, name);
if(cache_.find(key)==cache_.end())
return *cache_.insert(std::make_pair(key, new Module(stream.context(), src))).first->second;
return *cache_.at(key);
}
std::map<std::tuple<Stream, std::string>, Module * > backend::modules::cache_;
/*-----------------------------------*/
//----------- Kernels --------------*/
/*-----------------------------------*/
void backend::kernels::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
Kernel & backend::kernels::get(Module const & program, std::string const & name){
std::tuple<Module, std::string> key(program, name);
if(cache_.find(key)==cache_.end())
return *cache_.insert(std::make_pair(key, new Kernel(program, name.c_str()))).first->second;
return *cache_.at(key);
}
std::map<std::tuple<Module, std::string>, Kernel * > backend::kernels::cache_;
/*-----------------------------------*/
//------------ Queues --------------*/
/*-----------------------------------*/
void backend::streams::init(std::list<const Context *> const & contexts){
for(Context const * ctx : contexts)
if(cache_.find(*ctx)==cache_.end())
cache_.insert(std::make_pair(*ctx, std::vector<Stream*>{new Stream(*ctx)}));
}
void backend::streams::release(){
for(auto & x: cache_)
for(auto & y: x.second)
delete y;
cache_.clear();
}
Stream & backend::streams::get_default()
{ return get(contexts::get_default(), 0); }
Stream & backend::streams::get(Context const & context, unsigned int id){
init(std::list<Context const *>(1,&context));
for(auto & x : cache_)
if(x.first==context)
return *x.second[id];
throw;
}
void backend::streams::get(Context const & context, std::vector<Stream*> & queues){
init(std::list<Context const *>(1,&context));
queues = cache_.at(context);
}
std::map<Context, std::vector<Stream*> > backend::streams::cache_;
/*-----------------------------------*/
//------------ Contexts ------------*/
/*-----------------------------------*/
void backend::contexts::init(std::vector<Platform> const & platforms){
for(Platform const & platform: platforms){
for(Device const & device: platform.devices())
cache_.push_back(new Context(device));
}
}
void backend::contexts::release(){
for(auto & x: cache_)
delete x;
cache_.clear();
}
Context const & backend::contexts::get_default(){
backend::init();
std::list<Context const *>::const_iterator it = cache_.begin();
std::advance(it, default_device);
return **it;
}
void backend::contexts::get(std::list<Context const *> & contexts){
backend::init();
contexts = cache_;
}
std::list<Context const *> backend::contexts::cache_;
/*-----------------------------------*/
//------------ General -------------*/
/*-----------------------------------*/
std::vector<Device> backend::devices(){
std::vector<Platform> platforms = backend::platforms();
std::vector<Device> result;
for(Platform const & platform: platforms){
auto devices = platform.devices();
result.insert(result.end(), devices.begin(), devices.end());
}
return result;
}
std::vector<Platform> backend::platforms(){
std::vector<Platform> platforms;
//if CUDA is here
if(dispatch::cuinit())
platforms.push_back(Platform());
if(platforms.empty())
throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path");
return platforms;
}
void backend::synchronize(Context const & context){
for(Stream * queue: streams::cache_.at(context))
queue->synchronize();
}
void backend::release(){
backend::kernels::release();
backend::modules::release();
backend::streams::release();
backend::contexts::release();
}
void backend::init(){
if(!contexts::cache_.empty())
return;
std::vector<Platform> platforms = backend::platforms();
contexts::init(platforms);
streams::init(contexts::cache_);
}
unsigned int backend::default_device = 0;
}
}
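As a rough usage sketch (not part of this commit), the backend singletons above could be driven as follows; it assumes the declarations in include/driver/backend.h from this same change and a working CUDA driver:

#include "driver/backend.h"
#include "driver/context.h"
#include "driver/stream.h"

int main(){
  namespace drv = tdl::driver;
  drv::backend::init();                                         // one context per visible device
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();  // lazily created for ctx
  /* ... enqueue work on `stream` ... */
  drv::backend::synchronize(ctx);                               // waits on every cached stream of ctx
  drv::backend::release();
  return 0;
}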

60
lib/driver/buffer.cpp Executable file
View File

@@ -0,0 +1,60 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include "driver/stream.h"
#include "driver/buffer.h"
#include "driver/context.h"
#include "driver/dispatch.h"
namespace tdl
{
namespace driver
{
Buffer::Buffer(Context const & context, size_t size) : context_(context)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuMemAlloc(&*cu_, size);
}
Buffer::Buffer(Context const & context, CUdeviceptr cu, bool take_ownership):
context_(context), cu_(cu, take_ownership)
{ }
void Buffer::set_zero(Stream const & queue, size_t size)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
}
Handle<CUdeviceptr> const & Buffer::cu() const
{ return cu_; }
Handle<CUdeviceptr> & Buffer::cu()
{ return cu_; }
}
}
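A minimal sketch of the Buffer interface above in use (hypothetical helper, arbitrary size; assumes include/driver/buffer.h):

#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/stream.h"

void zero_scratch(){
  using namespace tdl::driver;
  Stream & stream = backend::streams::get_default();
  Buffer scratch(stream.context(), 1024);  // cuMemAlloc of 1024 bytes in the stream's context
  scratch.set_zero(stream, 1024);          // cuMemsetD8Async over the whole allocation
  stream.synchronize();
}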

99
lib/driver/context.cpp Executable file
View File

@@ -0,0 +1,99 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cassert>
#include "driver/context.h"
#include "driver/module.h"
#include "tools/sys/getenv.hpp"
#include "tools/sys/mkdir.hpp"
namespace tdl
{
namespace driver
{
std::string Context::get_cache_path(){
//user-specified cache path
std::string result = tools::getenv("ISAAC_CACHE_PATH");
if(!result.empty()){
if(tools::mkpath(result)==0)
return result;
}
//create in home
result = tools::getenv("HOME");
if(!result.empty())
{
result = result + "/.isaac/cache/";
if(tools::mkpath(result)==0)
return result;
}
//couldn't find a directory
return "";
}
CUdevice Context::device(CUcontext context){
dispatch::cuCtxPushCurrent_v2(context);
CUdevice res;
dispatch::cuCtxGetDevice(&res);
dispatch::cuCtxPopCurrent_v2(NULL);
return res;
}
Context::Context(CUcontext context, bool take_ownership): cu_(context, take_ownership), device_(device(context), false), cache_path_(get_cache_path())
{ }
Context::Context(Device const & device): device_(device), cache_path_(get_cache_path())
{
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device);
dispatch::cuCtxPopCurrent_v2(NULL);
}
Device const & Context::device() const
{ return device_; }
std::string const & Context::cache_path() const
{ return cache_path_; }
Handle<CUcontext> const & Context::cu() const
{ return cu_; }
/* Context Switcher */
ContextSwitcher::ContextSwitcher(Context const & ctx): ctx_(ctx)
{
dispatch::cuCtxPushCurrent_v2(ctx_);
}
ContextSwitcher::~ContextSwitcher()
{
CUcontext tmp;
dispatch::cuCtxPopCurrent_v2(&tmp);
assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!");
}
}
}
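For illustration only: a context can be built from a device and made current through the RAII switcher defined above (a sketch assuming include/driver/context.h and include/driver/device.h):

#include "driver/backend.h"
#include "driver/context.h"
#include "driver/device.h"

void with_context(){
  using namespace tdl::driver;
  Device device = backend::devices().front();  // first CUDA device found
  Context ctx(device);                         // cuCtxCreate then pop: nothing stays current
  {
    ContextSwitcher guard(ctx);                // pushes ctx for the lifetime of `guard`
    /* ... driver calls that require a current context ... */
  }                                            // destructor pops and asserts it was still ctx
}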

197
lib/driver/device.cpp Executable file
View File

@@ -0,0 +1,197 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include <algorithm>
#include <sstream>
#include <cstring>
#include <memory>
#include "driver/device.h"
namespace tdl
{
namespace driver
{
/* Architecture [NVidia] */
Device::Architecture Device::nv_arch(std::pair<unsigned int, unsigned int> sm) const{
switch(sm.first)
{
case 7:
switch(sm.second)
{
case 0: return Architecture::SM_7_0;
default: return Architecture::UNKNOWN;
}
case 6:
switch(sm.second)
{
case 0: return Architecture::SM_6_0;
case 1: return Architecture::SM_6_1;
default: return Architecture::UNKNOWN;
}
case 5:
switch(sm.second)
{
case 0: return Architecture::SM_5_0;
case 2: return Architecture::SM_5_2;
default: return Architecture::UNKNOWN;
}
case 3:
switch(sm.second)
{
case 0: return Architecture::SM_3_0;
case 5: return Architecture::SM_3_5;
case 7: return Architecture::SM_3_7;
default: return Architecture::UNKNOWN;
}
case 2:
switch(sm.second)
{
case 0: return Architecture::SM_2_0;
case 1: return Architecture::SM_2_1;
default: return Architecture::UNKNOWN;
}
default: return Architecture::UNKNOWN;
}
}
template<CUdevice_attribute attr>
int Device::cuGetInfo() const{
int res;
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
return res;
}
nvmlDevice_t Device::nvml_device() const{
//cache handles across calls: NVML lookups by PCI bus id are comparatively expensive
static std::map<std::string, nvmlDevice_t> map;
std::string key = pci_bus_id();
if(map.find(key)==map.end()){
nvmlDevice_t device;
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
return map.insert(std::make_pair(key, device)).first->second;
}
return map.at(key);
}
/* Architecture */
Device::Architecture Device::architecture() const
{ return nv_arch(compute_capability()); }
/* Attributes */
size_t Device::address_bits() const
{ return sizeof(size_t)*8; }
driver::Platform Device::platform() const
{ return Platform(); }
std::string Device::name() const{
char tmp[128];
dispatch::cuDeviceGetName(tmp, 128, *cu_);
return std::string(tmp);
}
std::string Device::pci_bus_id() const{
char tmp[128];
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
return std::string(tmp);
}
void Device::interpret_as(std::pair<size_t, size_t> cc){
interpreted_as_ = std::make_shared<std::pair<size_t, size_t>>(cc);
}
std::pair<size_t, size_t> Device::compute_capability() const{
if(interpreted_as_)
return *interpreted_as_;
size_t _major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
size_t _minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
return std::make_pair(_major, _minor);
}
size_t Device::max_threads_per_block() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>(); }
size_t Device::max_shared_memory() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK>(); }
size_t Device::warp_size() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>(); }
std::vector<size_t> Device::max_block_dim() const{
std::vector<size_t> result(3);
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
return result;
}
size_t Device::current_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
size_t Device::max_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
size_t Device::current_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
size_t Device::max_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
/* Infos */
std::string Device::infos() const{
std::ostringstream oss;
std::vector<size_t> max_wi_sizes = max_block_dim();
oss << "Platform: " << platform().name() << std::endl;
oss << "Name: " << name() << std::endl;
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
oss << "Local memory size: " << max_shared_memory() << std::endl;
return oss.str();
}
Handle<CUdevice> const & Device::cu() const
{ return cu_; }
}
}
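A hypothetical query loop over the attributes exposed above; the printed fields mirror what Device::infos() reports:

#include <iostream>
#include "driver/backend.h"
#include "driver/device.h"

void print_devices(){
  for(tdl::driver::Device const & device: tdl::driver::backend::devices()){
    auto cc = device.compute_capability();
    std::cout << device.name() << " (sm_" << cc.first << cc.second << ")\n";
    std::cout << "  max threads/block: " << device.max_threads_per_block() << "\n";
    std::cout << "  shared memory:     " << device.max_shared_memory() << " bytes\n";
    std::cout << "  warp size:         " << device.warp_size() << std::endl;
  }
}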

363
lib/driver/dispatch.cpp Executable file
View File

@@ -0,0 +1,363 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include "driver/dispatch.h"
#include "driver/context.h"
namespace tdl
{
namespace driver
{
//Helpers for function definition
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
//Specialized helpers for CUDA
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVRTC_DEFINE1(ret, fname, t1) DEFINE1(nvrtcinit, nvrtc_, ret, fname, t1)
#define NVRTC_DEFINE2(ret, fname, t1, t2) DEFINE2(nvrtcinit, nvrtc_, ret, fname, t1, t2)
#define NVRTC_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3)
#define NVRTC_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4)
#define NVRTC_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5)
#define NVRTC_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6)
#define NVRTC_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define NVRTC_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define NVRTC_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1)
#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
#define CUBLAS_DEFINE19(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) DEFINE19(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)
#define CUDNN_DEFINE1(ret, fname, t1) DEFINE1(cudnninit, cudnn_, ret, fname, t1)
#define CUDNN_DEFINE2(ret, fname, t1, t2) DEFINE2(cudnninit, cudnn_, ret, fname, t1, t2)
#define CUDNN_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cudnninit, cudnn_, ret, fname, t1, t2, t3)
#define CUDNN_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5)
#define CUDNN_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDNN_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDNN_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDNN_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
bool dispatch::cuinit(){
if(cuda_==nullptr)
cuda_ = dlopen("libcuda.so", RTLD_LAZY);
if(cuda_==nullptr)
return false;
CUresult (*fptr)(unsigned int);
cuInit_ = dlsym(cuda_, "cuInit");
*reinterpret_cast<void **>(&fptr) = cuInit_;
CUresult res = (*fptr)(0);
check(res);
return true;
}
bool dispatch::nvrtcinit(){
if(nvrtc_==nullptr)
nvrtc_ = dlopen("libnvrtc.so", RTLD_LAZY);
return nvrtc_ != nullptr;
}
bool dispatch::nvmlinit(){
if(nvml_==nullptr)
nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
if(nvml_==nullptr)
return false;
nvmlReturn_t (*fptr)();
nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
*reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
nvmlReturn_t res = (*fptr)();
check(res);
return true;
}
bool dispatch::cublasinit(){
if(cublas_==nullptr)
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
return cublas_ != nullptr;
}
bool dispatch::cudnninit(){
if(cudnn_==nullptr)
cudnn_ = dlopen("libcudnn.so", RTLD_LAZY);
return cudnn_ != nullptr;
}
//CUDA
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
NVRTC_DEFINE3(nvrtcResult, nvrtcCompileProgram, nvrtcProgram, int, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLogSize, nvrtcProgram, size_t *)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTX, nvrtcProgram, char *)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
cublasHandle_t dispatch::cublasHandle(Context const & ctx){
static std::map<Context, cublasHandle_t> handles;
auto pr = handles.insert({ctx, cublasHandle_t()});
if(pr.second)
cublasCreate_v2(&pr.first->second);
return pr.first->second;
}
cudnnHandle_t dispatch::cudnnHandle(Context const & ctx){
static std::map<Context, cudnnHandle_t> handles;
auto pr = handles.insert({ctx, cudnnHandle_t()});
if(pr.second)
cudnnCreate(&pr.first->second);
return pr.first->second;
}
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
cublasStatus_t dispatch::cublasGetStream_v2(cublasHandle_t h, cudaStream_t *a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_v2_, "cublasGetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSetStream_v2(cublasHandle_t h, cudaStream_t a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_v2_, "cublasSetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_v2_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasDgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_v2_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasHgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasHgemm, cublasHgemm_, "cublasHgemm", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
CUBLAS_DEFINE19(cublasStatus_t, cublasGemmEx, cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType, int, const void*, cudaDataType, int, const void*, void*, cudaDataType, int, cudaDataType, cublasGemmAlgo_t)
//cuDNN
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateConvolutionDescriptor, cudnnConvolutionDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreate, cudnnHandle_t*)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int)
CUDNN_DEFINE5(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int*)
CUDNN_DEFINE5(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreatePoolingDescriptor, cudnnPoolingDescriptor_t*)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int*, const int*, const int*)
CUDNN_DEFINE8(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, const cudnnPoolingDescriptor_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*)
CUDNN_DEFINE8(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetConvolutionNdDescriptor, cudnnConvolutionDescriptor_t, int, const int*, const int*, const int*, cudnnConvolutionMode_t, cudnnDataType_t)
CUDNN_DEFINE8(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithm, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *)
CUDNN_DEFINE7(cudnnStatus_t, cudnnGetConvolutionForwardWorkspaceSize, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t*)
CUDNN_DEFINE13(cudnnStatus_t, cudnnConvolutionForward, cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *)
CUDNN_DEFINE2(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, cudaStream_t)
CUDNN_DEFINE7(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*)
void dispatch::release(){
if(cuda_){
dlclose(cuda_);
cuda_ = nullptr;
}
if(nvrtc_){
dlclose(nvrtc_);
nvrtc_ = nullptr;
}
if(cublas_){
dlclose(cublas_);
cublas_ = nullptr;
}
if(cudnn_){
dlclose(cudnn_);
cudnn_ = nullptr;
}
}
void* dispatch::cuda_;
void* dispatch::nvrtc_;
void* dispatch::nvml_;
void* dispatch::cublas_;
void* dispatch::cudnn_;
//CUDA
void* dispatch::cuCtxGetCurrent_;
void* dispatch::cuCtxSetCurrent_;
void* dispatch::cuCtxDestroy_v2_;
void* dispatch::cuEventCreate_;
void* dispatch::cuDeviceGet_;
void* dispatch::cuMemcpyDtoH_v2_;
void* dispatch::cuStreamCreate_;
void* dispatch::cuEventElapsedTime_;
void* dispatch::cuMemFree_v2_;
void* dispatch::cuMemcpyDtoHAsync_v2_;
void* dispatch::cuDriverGetVersion_;
void* dispatch::cuDeviceGetName_;
void* dispatch::cuDeviceGetPCIBusId_;
void* dispatch::cuModuleGetGlobal_v2_;
void* dispatch::cuMemcpyHtoDAsync_v2_;
void* dispatch::cuModuleLoad_;
void* dispatch::cuLaunchKernel_;
void* dispatch::cuModuleUnload_;
void* dispatch::cuModuleLoadDataEx_;
void* dispatch::cuDeviceGetAttribute_;
void* dispatch::cuDeviceGetCount_;
void* dispatch::cuMemcpyHtoD_v2_;
void* dispatch::cuInit_;
void* dispatch::cuEventRecord_;
void* dispatch::cuCtxCreate_v2_;
void* dispatch::cuModuleGetFunction_;
void* dispatch::cuStreamSynchronize_;
void* dispatch::cuStreamDestroy_v2_;
void* dispatch::cuEventDestroy_v2_;
void* dispatch::cuMemAlloc_v2_;
void* dispatch::cuPointerGetAttribute_;
void* dispatch::cuCtxGetDevice_;
void* dispatch::cuMemsetD8Async_;
void* dispatch::cuCtxPushCurrent_v2_;
void* dispatch::cuCtxPopCurrent_v2_;
void* dispatch::nvrtcCompileProgram_;
void* dispatch::nvrtcGetProgramLogSize_;
void* dispatch::nvrtcGetPTX_;
void* dispatch::nvrtcGetPTXSize_;
void* dispatch::nvrtcCreateProgram_;
void* dispatch::nvrtcGetProgramLog_;
void* dispatch::nvmlInit_v2_;
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
void* dispatch::nvmlDeviceGetClockInfo_;
void* dispatch::nvmlDeviceGetMaxClockInfo_;
void* dispatch::cublasCreate_v2_;
void* dispatch::cublasGetStream_v2_;
void* dispatch::cublasSetStream_v2_;
void* dispatch::cublasHgemm_;
void* dispatch::cublasSgemm_v2_;
void* dispatch::cublasDgemm_v2_;
void* dispatch::cublasGemmEx_;
void* dispatch::cudnnCreateConvolutionDescriptor_;
void* dispatch::cudnnCreatePoolingDescriptor_;
void* dispatch::cudnnCreateTensorDescriptor_;
void* dispatch::cudnnCreateFilterDescriptor_;
void* dispatch::cudnnCreate_;
void* dispatch::cudnnSetTensor4dDescriptor_;
void* dispatch::cudnnSetFilter4dDescriptor_;
void* dispatch::cudnnSetTensorNdDescriptorEx_;
void* dispatch::cudnnSetFilterNdDescriptor_;
void* dispatch::cudnnSetPoolingNdDescriptor_;
void* dispatch::cudnnSetConvolution2dDescriptor_;
void* dispatch::cudnnSetConvolutionNdDescriptor_;
void* dispatch::cudnnGetConvolutionForwardAlgorithm_;
void* dispatch::cudnnGetConvolutionForwardWorkspaceSize_;
void* dispatch::cudnnConvolutionForward_;
void* dispatch::cudnnPoolingForward_;
void* dispatch::cudnnSetStream_;
void* dispatch::cudnnTransformTensor_;
}
}
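The wrappers above resolve their symbols lazily: each library is dlopen'd on first use and the function pointer is cached in the static table below the definitions. A sketch of probing for CUDA before touching any other entry point (assumes include/driver/dispatch.h):

#include <iostream>
#include "driver/dispatch.h"

bool have_cuda(){
  if(!tdl::driver::dispatch::cuinit())          // dlopen("libcuda.so") + cuInit(0)
    return false;
  int n = 0;
  tdl::driver::dispatch::cuDeviceGetCount(&n);  // resolved through the same lazy table
  std::cout << n << " CUDA device(s) visible" << std::endl;
  return n > 0;
}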

155
lib/driver/error.cpp Executable file
View File

@@ -0,0 +1,155 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/error.h"
namespace tdl
{
namespace driver
{
void check(CUresult err)
{
using namespace exception::cuda;
switch(err)
{
case CUDA_SUCCESS : break;
case CUDA_ERROR_INVALID_VALUE : throw invalid_value();
case CUDA_ERROR_OUT_OF_MEMORY : throw out_of_memory();
case CUDA_ERROR_NOT_INITIALIZED : throw not_initialized();
case CUDA_ERROR_DEINITIALIZED : throw deinitialized();
case CUDA_ERROR_PROFILER_DISABLED : throw profiler_disabled();
case CUDA_ERROR_PROFILER_NOT_INITIALIZED : throw profiler_not_initialized();
case CUDA_ERROR_PROFILER_ALREADY_STARTED : throw profiler_already_started();
case CUDA_ERROR_PROFILER_ALREADY_STOPPED : throw profiler_already_stopped();
case CUDA_ERROR_NO_DEVICE : throw no_device();
case CUDA_ERROR_INVALID_DEVICE : throw invalid_device();
case CUDA_ERROR_INVALID_IMAGE : throw invalid_image();
case CUDA_ERROR_INVALID_CONTEXT : throw invalid_context();
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT : throw context_already_current();
case CUDA_ERROR_MAP_FAILED : throw map_failed();
case CUDA_ERROR_UNMAP_FAILED : throw unmap_failed();
case CUDA_ERROR_ARRAY_IS_MAPPED : throw array_is_mapped();
case CUDA_ERROR_ALREADY_MAPPED : throw already_mapped();
case CUDA_ERROR_NO_BINARY_FOR_GPU : throw no_binary_for_gpu();
case CUDA_ERROR_ALREADY_ACQUIRED : throw already_acquired();
case CUDA_ERROR_NOT_MAPPED : throw not_mapped();
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY : throw not_mapped_as_array();
case CUDA_ERROR_NOT_MAPPED_AS_POINTER : throw not_mapped_as_pointer();
case CUDA_ERROR_ECC_UNCORRECTABLE : throw ecc_uncorrectable();
case CUDA_ERROR_UNSUPPORTED_LIMIT : throw unsupported_limit();
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE : throw context_already_in_use();
case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED : throw peer_access_unsupported();
case CUDA_ERROR_INVALID_PTX : throw invalid_ptx();
case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT : throw invalid_graphics_context();
case CUDA_ERROR_INVALID_SOURCE : throw invalid_source();
case CUDA_ERROR_FILE_NOT_FOUND : throw file_not_found();
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND : throw shared_object_symbol_not_found();
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED : throw shared_object_init_failed();
case CUDA_ERROR_OPERATING_SYSTEM : throw operating_system();
case CUDA_ERROR_INVALID_HANDLE : throw invalid_handle();
case CUDA_ERROR_NOT_FOUND : throw not_found();
case CUDA_ERROR_NOT_READY : throw not_ready();
case CUDA_ERROR_ILLEGAL_ADDRESS : throw illegal_address();
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES : throw launch_out_of_resources();
case CUDA_ERROR_LAUNCH_TIMEOUT : throw launch_timeout();
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING : throw launch_incompatible_texturing();
case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED : throw peer_access_already_enabled();
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED : throw peer_access_not_enabled();
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE : throw primary_context_active();
case CUDA_ERROR_CONTEXT_IS_DESTROYED : throw context_is_destroyed();
case CUDA_ERROR_ASSERT : throw assert_error();
case CUDA_ERROR_TOO_MANY_PEERS : throw too_many_peers();
case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED : throw host_memory_already_registered();
case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED : throw host_memory_not_registered();
case CUDA_ERROR_HARDWARE_STACK_ERROR : throw hardware_stack_error();
case CUDA_ERROR_ILLEGAL_INSTRUCTION : throw illegal_instruction();
case CUDA_ERROR_MISALIGNED_ADDRESS : throw misaligned_address();
case CUDA_ERROR_INVALID_ADDRESS_SPACE : throw invalid_address_space();
case CUDA_ERROR_INVALID_PC : throw invalid_pc();
case CUDA_ERROR_LAUNCH_FAILED : throw launch_failed();
case CUDA_ERROR_NOT_PERMITTED : throw not_permitted();
case CUDA_ERROR_NOT_SUPPORTED : throw not_supported();
case CUDA_ERROR_UNKNOWN : throw unknown();
default : throw unknown();
}
}
void check(nvrtcResult err){
using namespace exception::nvrtc;
switch(err)
{
case NVRTC_SUCCESS: break;
case NVRTC_ERROR_OUT_OF_MEMORY: throw out_of_memory();
case NVRTC_ERROR_PROGRAM_CREATION_FAILURE: throw program_creation_failure();
case NVRTC_ERROR_INVALID_INPUT: throw invalid_input();
case NVRTC_ERROR_INVALID_PROGRAM: throw invalid_program();
case NVRTC_ERROR_INVALID_OPTION: throw invalid_option();
case NVRTC_ERROR_COMPILATION: throw compilation();
case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: throw builtin_operation_failure();
default: throw unknown_error();
}
}
void check(cublasStatus_t err){
using namespace exception::cublas;
switch(err)
{
case CUBLAS_STATUS_SUCCESS : break;
case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized();
case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed();
case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value();
case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch();
case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error();
case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error();
case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported();
case CUBLAS_STATUS_LICENSE_ERROR : throw license_error();
default : throw unknown();
}
}
void check(cudnnStatus_t err){
using namespace exception::cudnn;
switch(err)
{
case CUDNN_STATUS_SUCCESS: break;
case CUDNN_STATUS_NOT_INITIALIZED: throw not_initialized();
case CUDNN_STATUS_ALLOC_FAILED: throw alloc_failed();
case CUDNN_STATUS_BAD_PARAM: throw bad_param();
case CUDNN_STATUS_INTERNAL_ERROR: throw internal_error();
case CUDNN_STATUS_INVALID_VALUE: throw invalid_value();
case CUDNN_STATUS_ARCH_MISMATCH: throw arch_mismatch();
case CUDNN_STATUS_MAPPING_ERROR: throw mapping_error();
case CUDNN_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUDNN_STATUS_NOT_SUPPORTED: throw not_supported();
case CUDNN_STATUS_LICENSE_ERROR: throw license_error();
case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: throw runtime_prerequisite_missing();
case CUDNN_STATUS_RUNTIME_IN_PROGRESS: throw runtime_in_progress();
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: throw runtime_fp_overflow();
}
}
}
}
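These checkers map raw status codes to typed exceptions, so callers can catch a per-API base class instead of inspecting integers. A sketch assuming the hierarchy declared in include/driver/error.h:

#include <iostream>
#include <string>
#include "driver/context.h"
#include "driver/module.h"
#include "driver/error.h"

void try_compile(tdl::driver::Context const & ctx, std::string const & src){
  try{
    tdl::driver::Module module(ctx, src);  // may throw exception::cuda::* on invalid PTX
    (void)module;
  }
  catch(tdl::driver::exception::cuda::base const &){
    std::cerr << "CUDA driver call failed while building the module" << std::endl;
  }
}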

40
lib/driver/event.cpp Executable file
View File

@@ -0,0 +1,40 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/event.h"
namespace tdl
{
namespace driver
{
float Event::elapsed_time() const{
float time;
dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second);
return time;
}
Handle<cu_event_t> const & Event::cu() const
{ return cu_; }
}
}

66
lib/driver/handle.cpp Executable file
View File

@@ -0,0 +1,66 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include <memory>
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
//CUDA
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
inline void _delete(CUdevice) { }
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
inline void _delete(CUfunction) { }
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
inline void _delete(cu_platform){}
//Constructor
template<class CUType>
Handle<CUType>::Handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership)
{ }
template<class CUType>
Handle<CUType>::~Handle(){
if(has_ownership_ && h_ && h_.unique() && *h_)
_delete(*h_);
}
template class Handle<CUdeviceptr>;
template class Handle<CUstream>;
template class Handle<CUcontext>;
template class Handle<CUdevice>;
template class Handle<cu_event_t>;
template class Handle<CUfunction>;
template class Handle<CUmodule>;
template class Handle<cu_platform>;
}
}

67
lib/driver/kernel.cpp Executable file
View File

@@ -0,0 +1,67 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cstring>
#include "driver/kernel.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
Kernel::Kernel(Module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){
cu_params_store_.reserve(64);
cu_params_.reserve(64);
dispatch::cuModuleGetFunction(&*cu_, program, name);
}
void Kernel::setArg(unsigned int index, std::size_t size, void* ptr){
if(index + 1> cu_params_store_.size()){
cu_params_store_.resize(index+1);
cu_params_.resize(index+1);
}
cu_params_store_[index].reset(malloc(size), free);
memcpy(cu_params_store_[index].get(), ptr, size);
cu_params_[index] = cu_params_store_[index].get();
}
void Kernel::setArg(unsigned int index, Buffer const & data)
{ return setArg(index, (CUdeviceptr)data);}
void* const* Kernel::cu_params() const
{ return cu_params_.data(); }
Handle<CUfunction> const & Kernel::cu() const
{ return cu_; }
Module const & Kernel::module() const
{ return program_; }
}
}
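Arguments are staged host-side in cu_params_store_ and handed to cuLaunchKernel as an array of pointers. A small sketch of filling them for a hypothetical kernel taking (float*, int):

#include "driver/kernel.h"
#include "driver/buffer.h"

void set_args(tdl::driver::Kernel & kernel, tdl::driver::Buffer const & x, int n){
  kernel.setArg(0, x);                // device pointer, forwarded as a CUdeviceptr
  kernel.setArg(1, sizeof(int), &n);  // plain value, copied into the staging store
}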

118
lib/driver/module.cpp Executable file
View File

@@ -0,0 +1,118 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <stdexcept>
#include "driver/module.h"
#include "driver/context.h"
#include "driver/error.h"
#include "tools/sys/getenv.hpp"
namespace tdl
{
namespace driver
{
CUjit_target_enum cutarget(Device::Architecture arch){
switch(arch){
case Device::Architecture::SM_2_0: return CU_TARGET_COMPUTE_20;
case Device::Architecture::SM_2_1: return CU_TARGET_COMPUTE_21;
case Device::Architecture::SM_3_0: return CU_TARGET_COMPUTE_30;
case Device::Architecture::SM_3_5: return CU_TARGET_COMPUTE_35;
case Device::Architecture::SM_3_7: return CU_TARGET_COMPUTE_37;
case Device::Architecture::SM_5_0: return CU_TARGET_COMPUTE_50;
case Device::Architecture::SM_5_2: return CU_TARGET_COMPUTE_52;
case Device::Architecture::SM_6_0: return CU_TARGET_COMPUTE_60;
case Device::Architecture::SM_6_1: return CU_TARGET_COMPUTE_61;
default: throw std::runtime_error("ISAAC: unsupported target architecture");
}
}
inline std::pair<int, int> ptx(std::pair<int, int> sm){
if(sm.first == 7) return {6, 0};
if(sm.first == 6) return {5, 0};
if(sm.first == 5) return {4, 3};
throw std::runtime_error("ISAAC: no PTX ISA known for this compute capability");
}
std::string Module::header(Device const & device){
auto cc = device.compute_capability();
auto vptx = ptx(cc);
std::string header;
header += ".version " + std::to_string(vptx.first) + "." + std::to_string(vptx.second) + "\n";
header += ".target sm_" + std::to_string(cc.first) + std::to_string(cc.second) + "\n";
header += ".address_size 64\n";
return header;
}
Module::Module(Context const & context, std::string const & source) : context_(context), source_(header(context.device()) + source){
ContextSwitcher ctx_switch(context_);
//Path to custom PTX compiler
std::string compiler = tools::getenv("ISAAC_PTXAS");
if(compiler.size()){
auto cc = context.device().compute_capability();
std::string out = context.cache_path() + "tmp.o";
std::string opt = " --gpu-name sm_" + std::to_string(cc.first) + std::to_string(cc.second)
+ " -o " + out
+ " -ias \"" + source_ + "\"";
std::string cmd = compiler + opt;
if(std::system(cmd.c_str()) != 0)
throw std::runtime_error("ISAAC: external ptxas invocation failed");
dispatch::cuModuleLoad(&*cu_, out.c_str());
}
//JIT Compilation
else{
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
unsigned int errbufsize = 8096;
std::string errbuf(errbufsize, 0);
//CUjit_target_enum target = cutarget(context.device().architecture());
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()};
try{
dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval);
}catch(exception::cuda::base const &){
std::cerr << "Compilation Failed! Log: " << std::endl;
std::cerr << errbuf << std::endl;
throw;
}
}
}
Context const & Module::context() const
{ return context_; }
Handle<CUmodule> const & Module::cu() const
{ return cu_; }
Buffer Module::symbol(const char *name) const{
CUdeviceptr handle;
size_t size;
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
return Buffer(context_, handle, false);
}
}
}
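Module prepends the PTX header (.version/.target/.address_size) to the user source, then either shells out to a user-specified ptxas (ISAAC_PTXAS) or JIT-compiles through cuModuleLoadDataEx. A sketch of compiling a trivial kernel; the PTX body is illustrative only:

#include <string>
#include "driver/backend.h"
#include "driver/context.h"
#include "driver/module.h"
#include "driver/kernel.h"

void compile_demo(){
  using namespace tdl::driver;
  Context const & ctx = backend::contexts::get_default();
  std::string src = ".visible .entry noop(){ ret; }";  // Module adds the PTX header itself
  Module module(ctx, src);                             // JIT-compiled unless ISAAC_PTXAS is set
  Kernel kernel(module, "noop");
  (void)kernel;
}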

56
lib/driver/platform.cpp Executable file
View File

@@ -0,0 +1,56 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/platform.h"
#include "driver/device.h"
#include <string>
namespace tdl
{
namespace driver
{
std::string Platform::version() const{
int version;
dispatch::cuDriverGetVersion(&version);
return std::to_string(version);
}
std::string Platform::name() const
{ return (std::string)"CUDA"; }
std::vector<Device> Platform::devices() const{
std::vector<Device> devices;
int N;
dispatch::cuDeviceGetCount(&N);
for(int i = 0 ; i < N ; ++i){
CUdevice device;
dispatch::cuDeviceGet(&device, i);
devices.push_back(Device(device));
}
return devices;
}
}
}
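Only a CUDA platform is modeled; devices() enumerates through cuDeviceGetCount/cuDeviceGet. A small illustrative loop:

#include <iostream>
#include "driver/platform.h"
#include "driver/device.h"

void list_platform(){
  tdl::driver::Platform platform;
  std::cout << platform.name() << " " << platform.version() << std::endl;
  for(tdl::driver::Device const & device: platform.devices())
    std::cout << "  " << device.name() << std::endl;
}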

95
lib/driver/stream.cpp Executable file
View File

@@ -0,0 +1,95 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cassert>
#include <array>
#include "driver/backend.h"
#include "driver/stream.h"
#include "driver/context.h"
#include "driver/device.h"
#include "driver/event.h"
#include "driver/kernel.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
inline CUcontext cucontext(){
CUcontext result;
dispatch::cuCtxGetCurrent(&result);
return result;
}
Stream::Stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership)
{}
Stream::Stream(Context const & context): context_(context), cu_(CUstream(), true)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuStreamCreate(&*cu_, 0);
}
void Stream::synchronize()
{
ContextSwitcher ctx_switch(context_);
dispatch::cuStreamSynchronize(*cu_);
}
Context const & Stream::context() const
{ return context_; }
void Stream::enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const *, Event* event){
ContextSwitcher ctx_switch(context_);
if(event)
dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_);
dispatch::cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL);
if(event)
dispatch::cuEventRecord(((cu_event_t)*event).second, *cu_);
}
void Stream::write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){
ContextSwitcher ctx_switch(context_);
if(blocking)
dispatch::cuMemcpyHtoD(buffer + offset, ptr, size);
else
dispatch::cuMemcpyHtoDAsync(buffer + offset, ptr, size, *cu_);
}
void Stream::read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){
ContextSwitcher ctx_switch(context_);
if(blocking)
dispatch::cuMemcpyDtoH(ptr, buffer + offset, size);
else
dispatch::cuMemcpyDtoHAsync(ptr, buffer + offset, size, *cu_);
}
Handle<CUstream> const & Stream::cu() const
{ return cu_; }
}
}
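Putting the pieces together, a stream can launch a cached kernel and copy the result back. A hedged end-to-end sketch; the module/kernel names, PTX source and launch sizes are placeholders, not part of this commit:

#include <string>
#include <vector>
#include "driver/backend.h"
#include "driver/stream.h"
#include "driver/module.h"
#include "driver/kernel.h"
#include "driver/buffer.h"

void run(std::string const & ptx_src, size_t n){
  using namespace tdl::driver;
  Stream & stream = backend::streams::get_default();
  Buffer out(stream.context(), n * sizeof(float));
  Module & module = backend::modules::get(stream, "demo", ptx_src);  // cached per (stream, name)
  Kernel & kernel = backend::kernels::get(module, "demo_kernel");    // cached per (module, name)
  kernel.setArg(0, out);
  kernel.setArg(1, sizeof(n), &n);
  stream.enqueue(kernel, {(n + 127) / 128, 1, 1}, {128, 1, 1}, nullptr, nullptr);
  std::vector<float> host(n);
  stream.read(out, true, 0, n * sizeof(float), host.data());         // blocking cuMemcpyDtoH
}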