GEMM: Added skeleton for cuBLAS GEMM calls

2016-10-03 21:26:05 -04:00
parent 889b4cffdf
commit f1a636f83f
27 changed files with 18566 additions and 667 deletions
--- a/include/isaac/driver/dispatch.h
+++ b/include/isaac/driver/dispatch.h
@@ -31,6 +31,7 @@
 //CUDA Backend
 #include "isaac/driver/external/CUDA/cuda.h"
 #include "isaac/driver/external/CUDA/nvrtc.h"
+#include "isaac/driver/external/CUDA/cublas.h"

 #include <iostream>

@@ -48,9 +49,7 @@ private:

    template <class R, class... A>
    struct return_type<R (*)(A...)>
-    {
-      typedef R type;
-    };
+    { typedef R type; }; // ::type is R, the return type of the function-pointer type R (*)(A...)

    typedef bool (*f_init_t)();

@@ -65,10 +64,13 @@ private:
        return (*fptr)(args...);
    }

+    static void cublasCreate(cublasHandle_t* h); // private wrapper (declared ahead of the public: label); NOTE(review): presumably calls through cublasCreate_ - confirm in the implementation file
+
 public:
    static bool clinit();
-    static bool cuinit();
+    static bool cublasinit(); // NOTE(review): presumably loads cuBLAS and fills the cublas*_ slots, returning success - confirm in the implementation file
    static bool nvrtcinit();
+    static bool cuinit(); // same bool() shape as f_init_t; NOTE(review): presumably the CUDA driver loader - confirm in the implementation file

    static void release();

@@ -144,10 +146,17 @@ public:
    static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
    static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);

+    static void cublasGetStream(cudaStream_t *streamId); // takes no cublasHandle_t; NOTE(review): presumably uses cublas_handle_ - confirm
+    static void cublasSetStream(cudaStream_t streamId); // takes no cublasHandle_t; NOTE(review): presumably uses cublas_handle_ - confirm
+    static void cublasSgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc); // float GEMM; alpha/beta are passed by pointer; returns void, so no cublasStatus_t reaches the caller
+    static void cublasDgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc); // double GEMM; alpha/beta are passed by pointer; returns void, so no cublasStatus_t reaches the caller
+
 private:
    static void* opencl_;
    static void* cuda_;
    static void* nvrtc_;
+    static void* cublas_; // NOTE(review): presumably the loaded cuBLAS library, like opencl_/cuda_/nvrtc_ above - confirm
+    static cublasHandle_t cublas_handle_; // single static handle for the class; NOTE(review): presumably filled in via cublasCreate() - confirm

    //OpenCL
    static void* clBuildProgram_;
@@ -220,6 +229,12 @@ private:
    static void* nvrtcGetPTXSize_;
    static void* nvrtcCreateProgram_;
    static void* nvrtcGetProgramLog_;
+
+    static void* cublasCreate_; // cuBLAS slots, untyped void* like the nvrtc ones above; NOTE(review): presumably resolved when the library is loaded - confirm
+    static void* cublasGetStream_;
+    static void* cublasSetStream_;
+    static void* cublasSgemm_;
+    static void* cublasDgemm_;
 };

 }
--- a/include/isaac/driver/external/CUDA/builtin_types.h
+++ b/include/isaac/driver/external/CUDA/builtin_types.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "device_types.h"
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "driver_types.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "surface_types.h"
+#include "texture_types.h"
+#include "vector_types.h"
--- a/include/isaac/driver/external/CUDA/channel_descriptor.h
+++ b/include/isaac/driver/external/CUDA/channel_descriptor.h
@@ -0,0 +1,412 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CHANNEL_DESCRIPTOR_H__)
+#define __CHANNEL_DESCRIPTOR_H__
+
+#if defined(__cplusplus)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+#include "cuda_runtime_api.h"
+#include "host_defines.h"
+#include "vector_types.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ *
+ * @{
+ */
+
+/**
+ * \brief \hl Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor whose format \p f and per-component bit
+ * widths \p x, \p y, \p z, and \p w are derived from the template type \p T.
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
+ */
+template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{ /* Primary template, used when T has no explicit specialization: all widths 0, kind None. */
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
+{
+  /* One float-kind component (x) of sizeof(unsigned short) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
+{
+  /* One float-kind component (x) of sizeof(unsigned short) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
+{
+  /* Two float-kind components (x, y) of sizeof(unsigned short) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
+{
+  /* Four float-kind components (x, y, z, w) of sizeof(unsigned short) * 8 bits each. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
+{
+  /* Plain char: one component (x); the kind follows the compiler's char-signedness macros. */
+#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
+  const enum cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned;
+#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  const enum cudaChannelFormatKind kind = cudaChannelFormatKindSigned;
+#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  return cudaCreateChannelDesc((int)sizeof(char) * 8, 0, 0, 0, kind);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
+{
+  /* One signed component (x) of sizeof(signed char) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned char) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
+{
+  /* One signed component (x) of sizeof(signed char) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned char) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
+{
+  /* Two signed components (x, y) of sizeof(signed char) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
+{
+  /* Two unsigned components (x, y) of sizeof(unsigned char) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
+{
+  /* Four signed components (x, y, z, w) of sizeof(signed char) * 8 bits each. */
+  const int bits = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
+{
+  /* Four unsigned components (x, y, z, w) of sizeof(unsigned char) * 8 bits each. */
+  const int bits = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
+{
+  /* One signed component (x) of sizeof(short) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned short) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
+{
+  /* One signed component (x) of sizeof(short) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned short) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
+{
+  /* Two signed components (x, y) of sizeof(short) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
+{
+  /* Two unsigned components (x, y) of sizeof(unsigned short) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
+{
+  /* Four signed components (x, y, z, w) of sizeof(short) * 8 bits each. */
+  const int bits = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
+{
+  /* Four unsigned components (x, y, z, w) of sizeof(unsigned short) * 8 bits each. */
+  const int bits = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
+{
+  /* One signed component (x) of sizeof(int) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned int) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
+{
+  /* One signed component (x) of sizeof(int) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned int) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
+{
+  /* Two signed components (x, y) of sizeof(int) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
+{
+  /* Two unsigned components (x, y) of sizeof(unsigned int) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
+{
+  /* Four signed components (x, y, z, w) of sizeof(int) * 8 bits each. */
+  const int bits = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
+{
+  /* Four unsigned components (x, y, z, w) of sizeof(unsigned int) * 8 bits each. */
+  const int bits = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+#if !defined(__LP64__)
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
+{
+  /* One signed component (x) of sizeof(long) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(long) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned long) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned long) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
+{
+  /* One signed component (x) of sizeof(long) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(long) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
+{
+  /* One unsigned component (x) of sizeof(unsigned long) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(unsigned long) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
+{
+  /* Two signed components (x, y) of sizeof(long) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(long) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
+{
+  /* Two unsigned components (x, y) of sizeof(unsigned long) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(unsigned long) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
+{
+  /* Four signed components (x, y, z, w) of sizeof(long) * 8 bits each. */
+  const int bits = (int)sizeof(long) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
+{
+  /* Four unsigned components (x, y, z, w) of sizeof(unsigned long) * 8 bits each. */
+  const int bits = (int)sizeof(unsigned long) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
+}
+
+#endif /* !__LP64__ */
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
+{
+  /* One float-kind component (x) of sizeof(float) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
+{
+  /* One float-kind component (x) of sizeof(float) * 8 bits; y, z and w are 0. */
+  const int bits = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
+{
+  /* Two float-kind components (x, y) of sizeof(float) * 8 bits each; z and w are 0. */
+  const int bits = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
+{
+  /* Four float-kind components (x, y, z, w) of sizeof(float) * 8 bits each. */
+  const int bits = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+#endif /* __cplusplus */
+
+
+/** @} */ /* END CUDART_HIGHLEVEL */
+
+#endif /* !__CHANNEL_DESCRIPTOR_H__ */
--- a/include/isaac/driver/external/CUDA/cuComplex.h
+++ b/include/isaac/driver/external/CUDA/cuComplex.h
@@ -0,0 +1,338 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CU_COMPLEX_H_)
+#define CU_COMPLEX_H_
+
+/* Including a C header from C++ code normally requires an extern "C" block.
+ * The standard QNX headers, however, already wrap themselves in
+ * #ifdef __cplusplus / extern "C" when compiled as C++, and extern "C" cannot
+ * be nested, so <math.h> is kept outside this header's extern "C" block.
+ */
+
+#include <math.h>       /* import fabsf, sqrt */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#include "vector_types.h"
+
+typedef float2 cuFloatComplex;
+
+/* Real part of a single-precision complex value; cuFloatComplex keeps it
+ * in the .x member. */
+__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
+{ return x.x; }
+
+/* Imaginary part of a single-precision complex value; cuFloatComplex keeps
+ * it in the .y member. */
+__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
+{ return x.y; }
+
+/* Builds a cuFloatComplex with real part r in .x and imaginary part i in .y. */
+__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex (float r, float i)
+{
+    cuFloatComplex value;
+    value.y = i;
+    value.x = r;
+    return value;
+}
+
+/* Complex conjugate of x: the real part is kept and the imaginary part
+ * is negated. */
+__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
+{ return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x)); }
+/* Component-wise sum: (re(x) + re(y), im(x) + im(y)). */
+__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x, cuFloatComplex y)
+{
+    const float re = cuCrealf(x) + cuCrealf(y), im = cuCimagf(x) + cuCimagf(y);
+    return make_cuFloatComplex (re, im);
+}
+
+/* Component-wise difference: (re(x) - re(y), im(x) - im(y)). */
+__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x, cuFloatComplex y)
+{
+    const float re = cuCrealf(x) - cuCrealf(y), im = cuCimagf(x) - cuCimagf(y);
+    return make_cuFloatComplex (re, im);
+}
+
+/* Complex product: (re(x)*re(y) - im(x)*im(y), re(x)*im(y) + im(x)*re(y)).
+ * The partial products are not scaled, so an intermediate can overflow even
+ * when the final result would be in range. Other implementations skip this
+ * guard too (presumably to avoid losing performance), so none is applied here.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    cuFloatComplex prod;
+    prod = make_cuFloatComplex  ((cuCrealf(x) * cuCrealf(y)) - 
+                                 (cuCimagf(x) * cuCimagf(y)),
+                                 (cuCrealf(x) * cuCimagf(y)) + 
+                                 (cuCimagf(x) * cuCrealf(y)));
+    return prod;
+}
+
+/* Complex quotient x / y. Both operands are first scaled by
+ * 1 / (|re(y)| + |im(y)|) to guard against intermediate underflow and
+ * overflow. Guarded division is usually the default in complex libraries,
+ * with some also offering an unguarded, faster version.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    cuFloatComplex quot;
+    float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
+    float oos = 1.0f / s;
+    float ars = cuCrealf(x) * oos;
+    float ais = cuCimagf(x) * oos;
+    float brs = cuCrealf(y) * oos;
+    float bis = cuCimagf(y) * oos;
+    s = (brs * brs) + (bis * bis);
+    oos = 1.0f / s;
+    quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
+                                ((ais * brs) - (ars * bis)) * oos);
+    return quot;
+}
+
+/*
+ * Magnitude of x, computed as v * sqrtf(1 + (w/v)^2) with v the larger and
+ * w the smaller of |re(x)| and |im(x)|. hypotf() would be preferable, but it
+ * is not available on all platforms. Scaling by v guards against intermediate
+ * underflow and overflow, which would otherwise cost half the exponent range.
+ * This is the simplest and fastest of the guarded approaches; it may be
+ * inaccurate if sqrt and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
+{
+    float a = cuCrealf(x);
+    float b = cuCimagf(x);
+    float v, w, t;
+    a = fabsf(a);
+    b = fabsf(b);
+    if (a > b) {
+        v = a;
+        w = b; 
+    } else {
+        v = b;
+        w = a;
+    }
+    t = w / v;
+    t = 1.0f + t * t;
+    t = v * sqrtf(t);
+    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
+        t = v + w; /* v is zero or a magnitude exceeds 3.402823466e38f: use the plain sum instead */
+    }
+    return t;
+}
+
+/* Double precision */
+typedef double2 cuDoubleComplex;
+
+/* Real part of a double-precision complex value; cuDoubleComplex keeps it
+ * in the .x member. */
+__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
+{ return x.x; }
+
+/* Imaginary part of a double-precision complex value; cuDoubleComplex keeps
+ * it in the .y member. */
+__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
+{ return x.y; }
+
+/* Builds a cuDoubleComplex with real part r in .x and imaginary part i in .y. */
+__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex (double r, double i)
+{
+    cuDoubleComplex value;
+    value.y = i;
+    value.x = r;
+    return value;
+}
+
+/* Complex conjugate of x: the real part is kept and the imaginary part
+ * is negated. */
+__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
+{ return make_cuDoubleComplex (cuCreal(x), -cuCimag(x)); }
+
+/* Component-wise sum: (re(x) + re(y), im(x) + im(y)). */
+__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x, cuDoubleComplex y)
+{
+    const double re = cuCreal(x) + cuCreal(y), im = cuCimag(x) + cuCimag(y);
+    return make_cuDoubleComplex (re, im);
+}
+
+/* Component-wise difference: (re(x) - re(y), im(x) - im(y)). */
+__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x, cuDoubleComplex y)
+{
+    const double re = cuCreal(x) - cuCreal(y), im = cuCimag(x) - cuCimag(y);
+    return make_cuDoubleComplex (re, im);
+}
+
+/* Complex product: (re(x)*re(y) - im(x)*im(y), re(x)*im(y) + im(x)*re(y)).
+ * The partial products are not scaled, so an intermediate can overflow even
+ * when the final result would be in range. Other implementations skip this
+ * guard too (presumably to avoid losing performance), so none is applied here.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    cuDoubleComplex prod;
+    prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - 
+                                 (cuCimag(x) * cuCimag(y)),
+                                 (cuCreal(x) * cuCimag(y)) + 
+                                 (cuCimag(x) * cuCreal(y)));
+    return prod;
+}
+
+/* Complex quotient x / y. Both operands are first scaled by
+ * 1 / (|re(y)| + |im(y)|) to guard against intermediate underflow and
+ * overflow. Guarded division is usually the default in complex libraries,
+ * with some also offering an unguarded, faster version.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    cuDoubleComplex quot;
+    double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
+    double oos = 1.0 / s;
+    double ars = cuCreal(x) * oos;
+    double ais = cuCimag(x) * oos;
+    double brs = cuCreal(y) * oos;
+    double bis = cuCimag(y) * oos;
+    s = (brs * brs) + (bis * bis);
+    oos = 1.0 / s;
+    quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
+                                 ((ais * brs) - (ars * bis)) * oos);
+    return quot;
+}
+
+/* Magnitude of x, computed as v * sqrt(1 + (w/v)^2) with v the larger and
+ * w the smaller of |re(x)| and |im(x)|. Scaling by v guards against
+ * intermediate underflow and overflow, which would otherwise cost half the
+ * exponent range. This is the simplest and fastest of the guarded approaches;
+ * it may be inaccurate if sqrt and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
+{
+    double a = cuCreal(x);
+    double b = cuCimag(x);
+    double v, w, t;
+    a = fabs(a);
+    b = fabs(b);
+    if (a > b) {
+        v = a;
+        w = b; 
+    } else {
+        v = b;
+        w = a;
+    }
+    t = w / v;
+    t = 1.0 + t * t;
+    t = v * sqrt(t);
+    if ((v == 0.0) || 
+        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
+        t = v + w; /* v is zero or a magnitude exceeds 1.79769313486231570e+308: use the plain sum instead */
+    }
+    return t;
+}
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+/* aliases */
+typedef cuFloatComplex cuComplex;
+/* cuComplex spelling of make_cuFloatComplex: x becomes the real part and
+ * y the imaginary part. */
+__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, float y)
+{
+    return make_cuFloatComplex (x, y); }
+
+/* Float-to-double promotion: widens both components of c to double. */
+__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble (cuFloatComplex c)
+{
+    const double re = (double)cuCrealf(c), im = (double)cuCimagf(c);
+    return make_cuDoubleComplex (re, im);
+}
+
+/* Double-to-float demotion: narrows both components of c to float. */
+__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat (cuDoubleComplex c)
+{
+    return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
+}
+
+/* Complex multiply-add in single precision: returns (x * y) + d. */
+__host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
+{
+    float real_res;
+    float imag_res;
+    /* First pass: the re(x) products, with d folded in. */
+    real_res = (cuCrealf(x) *  cuCrealf(y)) + cuCrealf(d);
+    imag_res = (cuCrealf(x) *  cuCimagf(y)) + cuCimagf(d);
+    /* Second pass: add the im(x) products; the real one enters negated. */
+    real_res = -(cuCimagf(x) * cuCimagf(y))  + real_res;  
+    imag_res =  (cuCimagf(x) *  cuCrealf(y)) + imag_res;          
+     
+    return make_cuComplex(real_res, imag_res);
+}
+/* Complex multiply-add in double precision: returns (x * y) + d. */
+__host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
+{
+    double real_res;
+    double imag_res;
+    /* First pass: the re(x) products, with d folded in. */
+    real_res = (cuCreal(x) *  cuCreal(y)) + cuCreal(d);
+    imag_res = (cuCreal(x) *  cuCimag(y)) + cuCimag(d);
+    /* Second pass: add the im(x) products; the real one enters negated. */
+    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
+    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
+     
+    return make_cuDoubleComplex(real_res, imag_res);
+}
+
+#endif /* !defined(CU_COMPLEX_H_) */
--- a/include/isaac/driver/external/CUDA/cublas.h
+++ b/include/isaac/driver/external/CUDA/cublas.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+ 
+/*
+ * This is the public header file for the CUBLAS library, defining the API
+ *
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) 
+ * on top of the CUDA runtime. 
+ */
+
+#if !defined(CUBLAS_H_)
+#define CUBLAS_H_
+
+#include <cuda_runtime.h>
+
+#ifndef CUBLASWINAPI
+#ifdef _WIN32
+#define CUBLASWINAPI __stdcall
+#else
+#define CUBLASWINAPI 
+#endif
+#endif
+
+#undef CUBLASAPI
+#ifdef __CUDACC__
+#define CUBLASAPI __host__
+#else
+#define CUBLASAPI
+#endif
+
+#include "cublas_api.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* CUBLAS data types: cublasStatus is a macro alias for cublasStatus_t */
+#define cublasStatus cublasStatus_t
+/* The entry points below take no handle argument and all return cublasStatus. */
+cublasStatus CUBLASWINAPI cublasInit (void);
+cublasStatus CUBLASWINAPI cublasShutdown (void);
+cublasStatus CUBLASWINAPI cublasGetError (void);
+/* Version query and allocation helper; presumably n elements of elemSize bytes each -- confirm against the cuBLAS docs. */
+cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
+cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
+
+cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
+
+/* NOTE(review): judging by the name, selects the stream used by later calls -- confirm against the cuBLAS docs. */
+cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
+
+
+
+/* ---------------- CUBLAS BLAS1 functions ---------------- */
+/* NRM2 */
+float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
+double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
+float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
+double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* DOT */
+float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, 
+                               int incy);
+double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, 
+                               int incy);
+cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, 
+                               int incy);
+cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, 
+                               int incy);
+cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
+                               int incy);
+cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, 
+                               int incy);
+/*------------------------------------------------------------------------*/
+/* SCAL */
+void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
+void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
+void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
+void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
+
+void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
+void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* AXPY */
+void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, 
+                               float *y, int incy);
+void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, 
+                               int incx, double *y, int incy);
+void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, 
+                               int incx, cuComplex *y, int incy);
+void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
+                               int incx, cuDoubleComplex *y, int incy);
+/*------------------------------------------------------------------------*/
+/* COPY */
+void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, 
+                               int incy);
+void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, 
+                               int incy);
+void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
+                               int incy);
+void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
+                               int incy);
+/*------------------------------------------------------------------------*/
+/* SWAP */
+void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
+void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
+void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
+void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);           
+/*------------------------------------------------------------------------*/
+/* AMAX */
+int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
+int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
+int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
+int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* AMIN */
+int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
+int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
+
+int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
+int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* ASUM */
+float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
+double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
+float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
+double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* ROT */
+void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, 
+                              float sc, float ss);
+void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, 
+                              double sc, double ss);
+void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, 
+                              int incy, float c, cuComplex s);
+void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, 
+                              cuDoubleComplex *y, int incy, double sc, 
+                              cuDoubleComplex cs);
+void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
+                               int incy, float c, float s);
+void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, 
+                               cuDoubleComplex *y, int incy, double c, double s);
+/*------------------------------------------------------------------------*/
+/* ROTG */
+void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
+void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
+void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
+                               cuComplex *cs);                                     
+void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
+                               cuDoubleComplex *cs);                                                               
+/*------------------------------------------------------------------------*/
+/* ROTM */
+void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, 
+                              const float* sparam);
+void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, 
+                              const double* sparam);
+/*------------------------------------------------------------------------*/
+/* ROTMG */
+void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, 
+                                const float *sy1, float* sparam);
+void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, 
+                                const double *sy1, double* sparam);
+                           
+/* --------------- CUBLAS BLAS2 functions  ---------------- */
+/* GEMV */
+void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
+                               const float *A, int lda, const float *x, int incx,
+                               float beta, float *y, int incy);
+void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
+                               const double *A, int lda, const double *x, int incx,
+                               double beta, double *y, int incy);
+void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
+                               const cuComplex *A, int lda, const cuComplex *x, int incx,
+                               cuComplex beta, cuComplex *y, int incy);
+void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
+                               cuDoubleComplex beta, cuDoubleComplex *y, int incy);
+/*------------------------------------------------------------------------*/
+/* GBMV */
+void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, 
+                               float alpha, const float *A, int lda, 
+                               const float *x, int incx, float beta, float *y, 
+                               int incy);
+void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, 
+                               double alpha, const double *A, int lda, 
+                               const double *x, int incx, double beta, double *y, 
+                               int incy);
+void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, 
+                               cuComplex alpha, const cuComplex *A, int lda, 
+                               const cuComplex *x, int incx, cuComplex beta, cuComplex *y, 
+                               int incy);
+void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, 
+                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, 
+                               const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, 
+                               int incy);                  
+/*------------------------------------------------------------------------*/
+/* TRMV */
+void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, 
+                               const float *A, int lda, float *x, int incx);
+void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, 
+                               const double *A, int lda, double *x, int incx);
+void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, 
+                               const cuComplex *A, int lda, cuComplex *x, int incx);
+void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, 
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* TBMV */
+void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, 
+                               const float *A, int lda, float *x, int incx);
+void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const double *A, int lda, double *x, int incx);
+void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const cuComplex *A, int lda, cuComplex *x, int incx);
+void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, 
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* TPMV */                                                    
+void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
+
+void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
+
+void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
+                                         
+void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/
+/* TRSV */
+void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
+
+void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
+
+void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
+
+void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, 
+                              cuDoubleComplex *x, int incx);       
+/*------------------------------------------------------------------------*/
+/* TPSV */
+void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, 
+                              float *x, int incx);
+                                                                                                            
+void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
+
+void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
+
+void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, 
+                              cuDoubleComplex *x, int incx);
+/*------------------------------------------------------------------------*/                                         
+/* TBSV */                                         
+void CUBLASWINAPI cublasStbsv(char uplo, char trans, 
+                              char diag, int n, int k, const float *A, 
+                              int lda, float *x, int incx);
+    
+void CUBLASWINAPI cublasDtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const double *A, 
+                              int lda, double *x, int incx);
+void CUBLASWINAPI cublasCtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const cuComplex *A, 
+                              int lda, cuComplex *x, int incx);      
+                                         
+void CUBLASWINAPI cublasZtbsv(char uplo, char trans, 
+                              char diag, int n, int k, const cuDoubleComplex *A, 
+                              int lda, cuDoubleComplex *x, int incx);  
+/*------------------------------------------------------------------------*/                                         
+/* SYMV/HEMV */
+void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
+                               int lda, const float *x, int incx, float beta, 
+                               float *y, int incy);
+void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
+                               int lda, const double *x, int incx, double beta, 
+                               double *y, int incy);
+void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, const cuComplex *x, int incx, cuComplex beta, 
+                               cuComplex *y, int incy);
+void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
+                               int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, 
+                               cuDoubleComplex *y, int incy);
+/*------------------------------------------------------------------------*/       
+/* SBMV/HBMV */
+void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, 
+                               const float *A, int lda, const float *x, int incx, 
+                               float beta, float *y, int incy);
+void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, 
+                               const double *A, int lda, const double *x, int incx, 
+                               double beta, double *y, int incy);
+void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, 
+                               const cuComplex *A, int lda, const cuComplex *x, int incx, 
+                               cuComplex beta, cuComplex *y, int incy);
+void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, 
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, 
+                               cuDoubleComplex beta, cuDoubleComplex *y, int incy);
+/*------------------------------------------------------------------------*/       
+/* SPMV/HPMV */
+void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
+                              const float *AP, const float *x,
+                              int incx, float beta, float *y, int incy);
+void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
+                              const double *AP, const double *x,
+                              int incx, double beta, double *y, int incy);
+void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
+                              const cuComplex *AP, const cuComplex *x,
+                              int incx, cuComplex beta, cuComplex *y, int incy);
+void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
+                              const cuDoubleComplex *AP, const cuDoubleComplex *x,
+                              int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
+
+/*------------------------------------------------------------------------*/       
+/* GER */
+void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
+                              const float *y, int incy, float *A, int lda);
+void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
+                              const double *y, int incy, double *A, int lda);
+
+void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
+                               int incx, const cuComplex *y, int incy,
+                               cuComplex *A, int lda);
+void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
+                               int incx, const cuComplex *y, int incy,
+                               cuComplex *A, int lda);
+void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
+                               int incx, const cuDoubleComplex *y, int incy,
+                               cuDoubleComplex *A, int lda);
+void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
+                               int incx, const cuDoubleComplex *y, int incy,
+                               cuDoubleComplex *A, int lda);
+/*------------------------------------------------------------------------*/       
+/* SYR/HER */
+void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
+                              int incx, float *A, int lda);
+void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
+                              int incx, double *A, int lda);
+
+void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, 
+                              const cuComplex *x, int incx, cuComplex *A, int lda);
+void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, 
+                              const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
+
+/*------------------------------------------------------------------------*/       
+/* SPR/HPR */
+void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
+                              int incx, float *AP);
+void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
+                              int incx, double *AP);
+void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
+                              int incx, cuComplex *AP);
+void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
+                              int incx, cuDoubleComplex *AP);
+/*------------------------------------------------------------------------*/       
+/* SYR2/HER2 */
+void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, 
+                               int incx, const float *y, int incy, float *A, 
+                               int lda);
+void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, 
+                               int incx, const double *y, int incy, double *A, 
+                               int lda);
+void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, 
+                               int incx, const cuComplex *y, int incy, cuComplex *A, 
+                               int lda);
+void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, 
+                               int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, 
+                               int lda);
+
+/*------------------------------------------------------------------------*/       
+/* SPR2/HPR2 */
+void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, 
+                               int incx, const float *y, int incy, float *AP);
+void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
+                               const double *x, int incx, const double *y,
+                               int incy, double *AP);
+void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
+                               const cuComplex *x, int incx, const cuComplex *y,
+                               int incy, cuComplex *AP);
+void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
+                               int incy, cuDoubleComplex *AP);
+/* ------------------------BLAS3 Functions ------------------------------- */
+/* GEMM -- in this header transa/transb are char and alpha/beta are passed by value; NOTE(review): confirm callers do not assume pointer scalars */
+void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, 
+                               float alpha, const float *A, int lda, 
+                               const float *B, int ldb, float beta, float *C, 
+                               int ldc);
+void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
+                               double alpha, const double *A, int lda, 
+                               const double *B, int ldb, double beta, double *C, 
+                               int ldc);              
+void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, 
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               const cuComplex *B, int ldb, cuComplex beta,
+                               cuComplex *C, int ldc);
+void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
+                               int k, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               const cuDoubleComplex *B, int ldb,
+                               cuDoubleComplex beta, cuDoubleComplex *C,
+                               int ldc);                   
+/* -------------------------------------------------------*/
+/* SYRK */
+void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, 
+                               const float *A, int lda, float beta, float *C, 
+                               int ldc);
+void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
+                               double alpha, const double *A, int lda,
+                               double beta, double *C, int ldc);
+
+void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               cuComplex beta, cuComplex *C, int ldc);
+void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               cuDoubleComplex beta,
+                               cuDoubleComplex *C, int ldc);
+/* ------------------------------------------------------- */
+/* HERK */
+void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
+                               float alpha, const cuComplex *A, int lda,
+                               float beta, cuComplex *C, int ldc);
+void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
+                               double alpha,
+                               const cuDoubleComplex *A, int lda,
+                               double beta,
+                               cuDoubleComplex *C, int ldc);
+/* ------------------------------------------------------- */
+/* SYR2K */
+void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, 
+                                const float *A, int lda, const float *B, int ldb, 
+                                float beta, float *C, int ldc);
+
+void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
+                                double alpha, const double *A, int lda,
+                                const double *B, int ldb, double beta,
+                                double *C, int ldc);
+void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
+                                cuComplex alpha, const cuComplex *A, int lda,
+                                const cuComplex *B, int ldb, cuComplex beta,
+                                cuComplex *C, int ldc);
+
+void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
+                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                                const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
+                                cuDoubleComplex *C, int ldc);                             
+/* ------------------------------------------------------- */
+/* HER2K */
+void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
+                                cuComplex alpha, const cuComplex *A, int lda,
+                                const cuComplex *B, int ldb, float beta,
+                                cuComplex *C, int ldc);
+
+void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
+                                cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                                const cuDoubleComplex *B, int ldb, double beta,
+                                cuDoubleComplex *C, int ldc); 
+
+/*------------------------------------------------------------------------*/       
+/* SYMM*/
+void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, 
+                               const float *A, int lda, const float *B, int ldb,
+                               float beta, float *C, int ldc);
+void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, 
+                               const double *A, int lda, const double *B, int ldb,
+                               double beta, double *C, int ldc);
+          
+void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, 
+                               const cuComplex *A, int lda, const cuComplex *B, int ldb,
+                               cuComplex beta, cuComplex *C, int ldc);
+          
+void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, 
+                               const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
+                               cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
+/*------------------------------------------------------------------------*/       
+/* HEMM*/
+void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
+                               cuComplex alpha, const cuComplex *A, int lda,
+                               const cuComplex *B, int ldb, cuComplex beta,
+                               cuComplex *C, int ldc);
+void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
+                               cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+                               const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
+                               cuDoubleComplex *C, int ldc);  
+
+/*------------------------------------------------------------------------*/       
+/* TRSM*/
+void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
+                               int m, int n, float alpha, const float *A, int lda,
+                               float *B, int ldb);
+
+void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
+                               char diag, int m, int n, double alpha,
+                               const double *A, int lda, double *B,
+                               int ldb);
+
+void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
+                               int m, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, cuComplex *B, int ldb);
+
+void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
+                               char diag, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda,
+                               cuDoubleComplex *B, int ldb);                                                        
+/*------------------------------------------------------------------------*/       
+/* TRMM*/
+void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
+                               int m, int n, float alpha, const float *A, int lda,
+                               float *B, int ldb);
+void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
+                               char diag, int m, int n, double alpha,
+                               const double *A, int lda, double *B,
+                               int ldb);
+void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
+                               int m, int n, cuComplex alpha, const cuComplex *A,
+                               int lda, cuComplex *B, int ldb);
+void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
+                               char diag, int m, int n, cuDoubleComplex alpha,
+                               const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
+                               int ldb);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_H_) */
--- a/include/isaac/driver/external/CUDA/cublas_api.h
+++ b/include/isaac/driver/external/CUDA/cublas_api.h
--- a/include/isaac/driver/external/CUDA/cublas_v2.h
+++ b/include/isaac/driver/external/CUDA/cublas_v2.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+ 
+/*
+ * This is the public header file for the new CUBLAS library API; it maps the
+ * generic CUBLAS function names to the actual _v2 implementations.
+ */
+
+#if !defined(CUBLAS_V2_H_)
+#define CUBLAS_V2_H_
+
+#undef CUBLASAPI /* discard any earlier definition before redefining it below */
+#ifdef __CUDACC__ /* defined when the nvcc front-end is compiling this header */
+#define CUBLASAPI __host__ __device__
+#else /* any other compiler: CUBLASAPI expands to nothing */
+#define CUBLASAPI
+#endif /* __CUDACC__ */
+
+#include "cublas_api.h"
+
+#define cublasCreate         cublasCreate_v2
+#define cublasDestroy        cublasDestroy_v2
+#define cublasGetVersion     cublasGetVersion_v2
+#define cublasSetStream      cublasSetStream_v2
+#define cublasGetStream      cublasGetStream_v2
+#define cublasGetPointerMode cublasGetPointerMode_v2
+#define cublasSetPointerMode cublasSetPointerMode_v2
+
+/* Blas1 Routines   */
+
+#define cublasSnrm2          cublasSnrm2_v2
+#define cublasDnrm2          cublasDnrm2_v2 
+#define cublasScnrm2         cublasScnrm2_v2
+#define cublasDznrm2         cublasDznrm2_v2
+
+#define cublasSdot           cublasSdot_v2
+#define cublasDdot           cublasDdot_v2
+#define cublasCdotu          cublasCdotu_v2
+#define cublasCdotc          cublasCdotc_v2
+#define cublasZdotu          cublasZdotu_v2
+#define cublasZdotc          cublasZdotc_v2
+
+#define cublasSscal          cublasSscal_v2
+#define cublasDscal          cublasDscal_v2
+#define cublasCscal          cublasCscal_v2
+#define cublasCsscal         cublasCsscal_v2
+#define cublasZscal          cublasZscal_v2
+#define cublasZdscal         cublasZdscal_v2
+
+#define cublasSaxpy          cublasSaxpy_v2
+#define cublasDaxpy          cublasDaxpy_v2
+#define cublasCaxpy          cublasCaxpy_v2
+#define cublasZaxpy          cublasZaxpy_v2
+
+#define cublasScopy          cublasScopy_v2
+#define cublasDcopy          cublasDcopy_v2
+#define cublasCcopy          cublasCcopy_v2
+#define cublasZcopy          cublasZcopy_v2
+
+#define cublasSswap          cublasSswap_v2
+#define cublasDswap          cublasDswap_v2
+#define cublasCswap          cublasCswap_v2
+#define cublasZswap          cublasZswap_v2
+
+#define cublasIsamax         cublasIsamax_v2
+#define cublasIdamax         cublasIdamax_v2
+#define cublasIcamax         cublasIcamax_v2
+#define cublasIzamax         cublasIzamax_v2
+ 
+#define cublasIsamin         cublasIsamin_v2
+#define cublasIdamin         cublasIdamin_v2
+#define cublasIcamin         cublasIcamin_v2
+#define cublasIzamin         cublasIzamin_v2
+                         
+#define cublasSasum          cublasSasum_v2
+#define cublasDasum          cublasDasum_v2
+#define cublasScasum         cublasScasum_v2
+#define cublasDzasum         cublasDzasum_v2
+
+#define cublasSrot           cublasSrot_v2 
+#define cublasDrot           cublasDrot_v2 
+#define cublasCrot           cublasCrot_v2 
+#define cublasCsrot          cublasCsrot_v2
+#define cublasZrot           cublasZrot_v2 
+#define cublasZdrot          cublasZdrot_v2
+
+#define cublasSrotg          cublasSrotg_v2
+#define cublasDrotg          cublasDrotg_v2
+#define cublasCrotg          cublasCrotg_v2
+#define cublasZrotg          cublasZrotg_v2
+
+#define cublasSrotm          cublasSrotm_v2 
+#define cublasDrotm          cublasDrotm_v2 
+                                
+#define cublasSrotmg         cublasSrotmg_v2 
+#define cublasDrotmg         cublasDrotmg_v2 
+
+
+/* Blas2 Routines */
+
+#define cublasSgemv          cublasSgemv_v2
+#define cublasDgemv          cublasDgemv_v2
+#define cublasCgemv          cublasCgemv_v2
+#define cublasZgemv          cublasZgemv_v2
+
+#define cublasSgbmv          cublasSgbmv_v2
+#define cublasDgbmv          cublasDgbmv_v2
+#define cublasCgbmv          cublasCgbmv_v2
+#define cublasZgbmv          cublasZgbmv_v2
+
+#define cublasStrmv          cublasStrmv_v2
+#define cublasDtrmv          cublasDtrmv_v2
+#define cublasCtrmv          cublasCtrmv_v2
+#define cublasZtrmv          cublasZtrmv_v2
+
+#define cublasStbmv          cublasStbmv_v2
+#define cublasDtbmv          cublasDtbmv_v2
+#define cublasCtbmv          cublasCtbmv_v2
+#define cublasZtbmv          cublasZtbmv_v2
+
+#define cublasStpmv          cublasStpmv_v2
+#define cublasDtpmv          cublasDtpmv_v2
+#define cublasCtpmv          cublasCtpmv_v2
+#define cublasZtpmv          cublasZtpmv_v2
+
+#define cublasStrsv          cublasStrsv_v2
+#define cublasDtrsv          cublasDtrsv_v2
+#define cublasCtrsv          cublasCtrsv_v2
+#define cublasZtrsv          cublasZtrsv_v2
+
+#define cublasStpsv          cublasStpsv_v2
+#define cublasDtpsv          cublasDtpsv_v2
+#define cublasCtpsv          cublasCtpsv_v2
+#define cublasZtpsv          cublasZtpsv_v2
+
+#define cublasStbsv          cublasStbsv_v2
+#define cublasDtbsv          cublasDtbsv_v2
+#define cublasCtbsv          cublasCtbsv_v2
+#define cublasZtbsv          cublasZtbsv_v2
+
+#define cublasSsymv          cublasSsymv_v2
+#define cublasDsymv          cublasDsymv_v2
+#define cublasCsymv          cublasCsymv_v2
+#define cublasZsymv          cublasZsymv_v2
+#define cublasChemv          cublasChemv_v2
+#define cublasZhemv          cublasZhemv_v2
+
+#define cublasSsbmv          cublasSsbmv_v2
+#define cublasDsbmv          cublasDsbmv_v2
+#define cublasChbmv          cublasChbmv_v2
+#define cublasZhbmv          cublasZhbmv_v2
+
+#define cublasSspmv          cublasSspmv_v2
+#define cublasDspmv          cublasDspmv_v2
+#define cublasChpmv          cublasChpmv_v2
+#define cublasZhpmv          cublasZhpmv_v2
+
+
+#define cublasSger           cublasSger_v2
+#define cublasDger           cublasDger_v2
+#define cublasCgeru          cublasCgeru_v2
+#define cublasCgerc          cublasCgerc_v2
+#define cublasZgeru          cublasZgeru_v2
+#define cublasZgerc          cublasZgerc_v2
+
+#define cublasSsyr           cublasSsyr_v2
+#define cublasDsyr           cublasDsyr_v2
+#define cublasCsyr           cublasCsyr_v2
+#define cublasZsyr           cublasZsyr_v2
+#define cublasCher           cublasCher_v2
+#define cublasZher           cublasZher_v2
+
+#define cublasSspr           cublasSspr_v2
+#define cublasDspr           cublasDspr_v2
+#define cublasChpr           cublasChpr_v2
+#define cublasZhpr           cublasZhpr_v2
+
+#define cublasSsyr2          cublasSsyr2_v2
+#define cublasDsyr2          cublasDsyr2_v2
+#define cublasCsyr2          cublasCsyr2_v2
+#define cublasZsyr2          cublasZsyr2_v2
+#define cublasCher2          cublasCher2_v2
+#define cublasZher2          cublasZher2_v2
+
+#define cublasSspr2          cublasSspr2_v2
+#define cublasDspr2          cublasDspr2_v2
+#define cublasChpr2          cublasChpr2_v2
+#define cublasZhpr2          cublasZhpr2_v2
+
+/* Blas3 Routines   */
+
+#define cublasSgemm          cublasSgemm_v2
+#define cublasDgemm          cublasDgemm_v2
+#define cublasCgemm          cublasCgemm_v2
+#define cublasZgemm          cublasZgemm_v2
+
+#define cublasSsyrk          cublasSsyrk_v2
+#define cublasDsyrk          cublasDsyrk_v2
+#define cublasCsyrk          cublasCsyrk_v2
+#define cublasZsyrk          cublasZsyrk_v2
+#define cublasCherk          cublasCherk_v2
+#define cublasZherk          cublasZherk_v2
+
+#define cublasSsyr2k         cublasSsyr2k_v2
+#define cublasDsyr2k         cublasDsyr2k_v2
+#define cublasCsyr2k         cublasCsyr2k_v2
+#define cublasZsyr2k         cublasZsyr2k_v2
+#define cublasCher2k         cublasCher2k_v2
+#define cublasZher2k         cublasZher2k_v2
+
+#define cublasSsymm          cublasSsymm_v2
+#define cublasDsymm          cublasDsymm_v2
+#define cublasCsymm          cublasCsymm_v2
+#define cublasZsymm          cublasZsymm_v2
+#define cublasChemm          cublasChemm_v2
+#define cublasZhemm          cublasZhemm_v2
+
+#define cublasStrsm          cublasStrsm_v2
+#define cublasDtrsm          cublasDtrsm_v2
+#define cublasCtrsm          cublasCtrsm_v2
+#define cublasZtrsm          cublasZtrsm_v2
+
+#define cublasStrmm          cublasStrmm_v2
+#define cublasDtrmm          cublasDtrmm_v2
+#define cublasCtrmm          cublasCtrmm_v2
+#define cublasZtrmm          cublasZtrmm_v2
+
+#endif /* !defined(CUBLAS_V2_H_) */
--- a/include/isaac/driver/external/CUDA/cuda_device_runtime_api.h
+++ b/include/isaac/driver/external/CUDA/cuda_device_runtime_api.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
+#define __CUDA_DEVICE_RUNTIME_API_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDABE__)
+
+#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
+struct cudaFuncAttributes;
+/* Weak (nv_weak) device-side definitions, compiled only inside the enclosing
+ * #if (__CUDA_ARCH__ >= 350 and __CUDADEVRT_INTERNAL__ not defined). Every
+ * function below ignores its arguments and returns cudaErrorUnknown.
+ * NOTE(review): presumably these are fallbacks that the real device runtime
+ * replaces at link time -- confirm against the CUDA device-runtime docs. */
+__device__ __attribute__((nv_weak)) cudaError_t cudaMalloc(void **p, size_t s) 
+{ 
+  return cudaErrorUnknown;
+}
+
+__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) 
+{ 
+  return cudaErrorUnknown;
+}
+
+__device__ __attribute__((nv_weak)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+  return cudaErrorUnknown;
+}
+
+__device__ __attribute__((nv_weak)) cudaError_t cudaGetDevice(int *device)
+{
+  return cudaErrorUnknown;
+}
+
+__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+  return cudaErrorUnknown;
+}
+
+__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+  return cudaErrorUnknown;
+}
+
+#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
+
+#else /* defined(__CUDABE__) */
+
+#if defined(__cplusplus) && defined(__CUDACC__)         // Visible to nvcc front-end only
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)   // Visible to SM>=3.5 and "__host__ __device__" only
+
+#include "driver_types.h"
+#include "host_defines.h"
+
+extern "C"
+{
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Obtains a parameter buffer
+ *
+ * Obtains a parameter buffer which can be filled with parameters for a kernel launch.
+ * Parameters passed to ::cudaLaunchDevice must be allocated via this function.
+ *
+ * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch kernels.
+ *
+ * \param alignment - Specifies alignment requirement of the parameter buffer
+ * \param size      - Specifies size requirement in bytes
+ *
+ * \return
+ * Returns pointer to the allocated parameterBuffer
+ * \notefnerr
+ *
+ * \sa cudaLaunchDevice
+ */
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Launches a specified kernel
+ *
+ * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
+ * by calling ::cudaGetParameterBuffer().
+ *
+ * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch the kernels.
+ *
+ * \param func            - Pointer to the kernel to be launched
+ * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
+ * \param gridDimension   - Specifies grid dimensions
+ * \param blockDimension  - Specifies block dimensions
+ * \param sharedMemSize   - Specifies size of shared memory
+ * \param stream          - Specifies the stream to be used
+ *
+ * \return
+ * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
+ * \notefnerr
+ * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
+ * Guide for the detailed descriptions of launch configuration and parameter layout respectively.
+ *
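+ * \note The parameters documented above (\p func, \p parameterBuffer,
+ * \p gridDimension, \p blockDimension, \p sharedMemSize, \p stream) are those
+ * of ::cudaLaunchDevice, declared further below; the declaration that
+ * immediately follows this comment is ::cudaGetParameterBufferV2.
+ *
+ * \sa cudaGetParameterBuffer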
+ */
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+    // Device compilation with the per-thread default stream enabled: cudaLaunchDevice and
+    // cudaLaunchDeviceV2 become static inline forwards to their _ptsz entry points.
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+    {
+        return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
+    }
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
+    {
+        return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
+    }
+#else // otherwise both names are plain extern declarations with the same signatures
+    extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+    extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+#endif // defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+}
+
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
+#endif // defined(__cplusplus) && defined(__CUDACC__)
+
+#endif /* defined(__CUDABE__) */
+
+#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
--- a/include/isaac/driver/external/CUDA/cuda_fp16.h
+++ b/include/isaac/driver/external/CUDA/cuda_fp16.h
--- a/include/isaac/driver/external/CUDA/cuda_runtime.h
+++ b/include/isaac/driver/external/CUDA/cuda_runtime.h
--- a/include/isaac/driver/external/CUDA/cuda_runtime_api.h
+++ b/include/isaac/driver/external/CUDA/cuda_runtime_api.h
--- a/include/isaac/driver/external/CUDA/device_types.h
+++ b/include/isaac/driver/external/CUDA/device_types.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_TYPES_H__)
+#define __DEVICE_TYPES_H__
+
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+enum __device_builtin__ cudaRoundMode
+{
+    /* The values are the same 0..3 the compiler assigns implicitly; they are
+     * spelled out so the numbering is visible at a glance.
+     * NOTE(review): the names suggest the four rounding directions (nearest,
+     * toward zero, toward +inf, toward -inf) -- confirm in the CUDA docs. */
+    cudaRoundNearest = 0,
+    cudaRoundZero    = 1,
+    cudaRoundPosInf  = 2,
+    cudaRoundMinInf  = 3
+};
+
+#endif /* !__DEVICE_TYPES_H__ */
--- a/include/isaac/driver/external/CUDA/driver_functions.h
+++ b/include/isaac/driver/external/CUDA/driver_functions.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_FUNCTIONS_H__)
+#define __DRIVER_FUNCTIONS_H__
+
+#include "builtin_types.h"
+#include "host_defines.h"
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_MEMORY
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a cudaPitchedPtr based on input parameters
+ *
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
+ * \p p, \p xsz, and \p ysz.
+ *
+ * \param d   - Pointer to allocated memory
+ * \param p   - Pitch of allocated memory in bytes
+ * \param xsz - Logical width of allocation in elements
+ * \param ysz - Logical height of allocation in elements
+ *
+ * \return
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
+ *
+ * \sa make_cudaExtent, make_cudaPos
+ */
+static __inline__ __host__ struct cudaPitchedPtr
+make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
+{
+  /* Build the descriptor member by member; every argument lands in exactly
+   * one field and nothing else is touched. */
+  struct cudaPitchedPtr pitched;
+
+  pitched.xsize = xsz;  /* logical width of the allocation  */
+  pitched.ysize = ysz;  /* logical height of the allocation */
+  pitched.pitch = p;    /* pitch of the allocation in bytes */
+  pitched.ptr   = d;    /* pointer to the allocated memory  */
+
+  return pitched;
+}
+
+/**
+ * \brief Returns a cudaPos based on input parameters
+ *
+ * Returns a ::cudaPos based on the specified input parameters \p x,
+ * \p y, and \p z.
+ *
+ * \param x - X position
+ * \param y - Y position
+ * \param z - Z position
+ *
+ * \return
+ * ::cudaPos specified by \p x, \p y, and \p z
+ *
+ * \sa make_cudaExtent, make_cudaPitchedPtr
+ */
+static __inline__ __host__ struct cudaPos
+make_cudaPos(size_t x, size_t y, size_t z)
+{
+  /* Copy the three coordinates straight into a fresh cudaPos. */
+  struct cudaPos pos;
+
+  pos.z = z;
+  pos.y = y;
+  pos.x = x;
+
+  return pos;
+}
+
+/**
+ * \brief Returns a cudaExtent based on input parameters
+ *
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
+ * \p h, and \p d.
+ *
+ * \param w - Width in bytes
+ * \param h - Height in elements
+ * \param d - Depth in elements
+ *
+ * \return
+ * ::cudaExtent specified by \p w, \p h, and \p d
+ *
+ * \sa make_cudaPitchedPtr, make_cudaPos
+ */
+static __inline__ __host__ struct cudaExtent
+make_cudaExtent(size_t w, size_t h, size_t d)
+{
+  /* Copy width, height and depth straight into a fresh cudaExtent. */
+  struct cudaExtent extent;
+
+  extent.depth  = d;
+  extent.height = h;
+  extent.width  = w;
+
+  return extent;
+}
+
+/** @} */ /* END CUDART_MEMORY */
+
+#endif /* !__DRIVER_FUNCTIONS_H__ */
--- a/include/isaac/driver/external/CUDA/driver_types.h
+++ b/include/isaac/driver/external/CUDA/driver_types.h
--- a/include/isaac/driver/external/CUDA/host_config.h
+++ b/include/isaac/driver/external/CUDA/host_config.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__HOST_CONFIG_H__)
+#define __HOST_CONFIG_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+
+#define _CRTIMP
+#define __THROW
+
+#else /* __CUDACC_RTC__ */
+
+/* check for host compilers that are compatible with nvcc */
+#if !defined(__GNUC__) && !defined(_WIN32)
+
+#error --- !!! UNSUPPORTED COMPILER !!! ---
+
+#endif /* !__GNUC__ && !_WIN32 */
+
+#if defined(__ICC)
+
+#if __ICC != 1500 || !defined(__GNUC__) || !defined(__LP64__)
+
+#error -- unsupported ICC configuration! Only ICC 15.0 on Linux x86_64 is supported!
+
+#endif /* __ICC != 1500 || !__GNUC__ || !__LP64__ */
+
+#endif /* __ICC */
+
+#if defined(__PGIC__)
+
+#if __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !defined(__GNUC__) || !defined(__LP64__)
+
+#error -- unsupported pgc++ configuration! Only pgc++ 15.4 on Linux x86_64 is supported!
+
+#endif /* __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !__GNUC__ || !__LP64__ */
+
+#endif /* __PGIC__ */
+
+#if defined(__powerpc__)
+
+#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
+
+#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
+
+#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
+
+#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
+/* Accepted window is [0x0d010000, 0x0d020000); a version on either side is rejected, hence ||. */
+#error -- unsupported xlC version! only xlC 13.1 is supported
+
+#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
+
+#endif /* __powerpc__ */
+
+#if defined(__GNUC__)
+
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9)
+
+#error -- unsupported GNU version! gcc versions later than 4.9 are not supported!
+
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9) */
+
+#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
+#error -- clang and clang++ are the only supported host compilers on Mac OS X!
+#endif /* __APPLE__ && __MACH__ && !__clang__ */
+
+#endif /* __GNUC__ */
+
+#if defined(_WIN32)
+
+#if _MSC_VER < 1600 || _MSC_VER > 1800
+
+#error -- unsupported Microsoft Visual Studio version! Only the versions 2010, 2012, and 2013 are supported!
+
+#endif /* _MSC_VER < 1600 || _MSC_VER > 1800 */
+
+#endif /* _WIN32 */
+
+/* configure host compiler */
+#if defined(__APPLE__)
+
+#define _CRTIMP
+#define __THROW
+
+#if defined(__BLOCKS__) /* nvcc does not support closures */
+
+#undef __BLOCKS__
+
+#endif /* __BLOCKS__ */
+
+#elif defined(__ANDROID__)
+
+#define _CRTIMP
+#define __THROW
+
+#elif defined(__QNX__)
+
+#define _CRTIMP
+#define __THROW
+
+#elif defined(__GNUC__)
+
+#define _CRTIMP
+
+#include <features.h> /* for __THROW */
+
+#elif defined(_WIN32)
+
+#if _MSC_VER >= 1500
+
+#undef _USE_DECLSPECS_FOR_SAL
+#define _USE_DECLSPECS_FOR_SAL \
+        1
+
+#endif /* _MSC_VER >= 1500 */
+
+#if !defined(_CRT_NONSTDC_NO_WARNINGS)
+
+#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_NONSTDC_NO_WARNINGS */
+
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+
+#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_SECURE_NO_WARNINGS */
+
+#if !defined(NOMINMAX)
+
+#define NOMINMAX /* min and max are part of cuda runtime */
+
+#endif /* !NOMINMAX */
+
+#include <crtdefs.h> /* for _CRTIMP */
+
+#define __THROW
+
+#endif /* __APPLE__ */
+
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDACC__ */
+
+#endif /* !__HOST_CONFIG_H__ */
--- a/include/isaac/driver/external/CUDA/host_defines.h
+++ b/include/isaac/driver/external/CUDA/host_defines.h
@@ -0,0 +1,241 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__HOST_DEFINES_H__)
+#define __HOST_DEFINES_H__
+
+/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
+#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
+
+#if defined(__CUDACC_RTC__)
+#define __volatile__ volatile
+#endif /* __CUDACC_RTC__ */
+
+#define __no_return__ \
+        __attribute__((noreturn))
+        
+#if defined(__CUDACC__) || defined(__CUDA_ARCH__)
+/* gcc allows users to define attributes with underscores, 
+   e.g., __attribute__((__noinline__)).
+   Consider a non-CUDA source file (e.g. .cpp) that has the 
+   above attribute specification, and includes this header file. In that case,
+   defining __noinline__ as below  would cause a gcc compilation error.
+   Hence, only define __noinline__ when the code is being processed
+   by a  CUDA compiler component.
+*/   
+#define __noinline__ \
+        __attribute__((noinline))
+#endif /* __CUDACC__  || __CUDA_ARCH__ */       
+        
+#define __forceinline__ \
+        __inline__ __attribute__((always_inline))
+#define __align__(n) \
+        __attribute__((aligned(n)))
+#define __thread__ \
+        __thread
+#define __import__
+#define __export__
+#define __cdecl
+#define __annotate__(a) \
+        __attribute__((a))
+#define __location__(a) \
+        __annotate__(a)
+#define CUDARTAPI
+
+#elif defined(_MSC_VER)
+
+#if _MSC_VER >= 1400
+
+#define __restrict__ \
+        __restrict
+
+#else /* _MSC_VER >= 1400 */
+
+#define __restrict__
+
+#endif /* _MSC_VER >= 1400 */
+
+#define __inline__ \
+        __inline
+#define __no_return__ \
+        __declspec(noreturn)
+#define __noinline__ \
+        __declspec(noinline)
+#define __forceinline__ \
+        __forceinline
+#define __align__(n) \
+        __declspec(align(n))
+#define __thread__ \
+        __declspec(thread)
+#define __import__ \
+        __declspec(dllimport)
+#define __export__ \
+        __declspec(dllexport)
+#define __annotate__(a) \
+        __declspec(a)
+#define __location__(a) \
+        __annotate__(__##a##__)
+#define CUDARTAPI \
+        __stdcall
+
+#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
+
+#define __inline__
+
+#if !defined(__align__)
+
+#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
+
+#endif /* !__align__ */
+
+#if !defined(CUDARTAPI)
+
+#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
+
+#endif /* !CUDARTAPI */
+
+#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
+
+#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
+    (defined(_MSC_VER) && _MSC_VER < 1900) || \
+    (!defined(__GNUC__) && !defined(_MSC_VER))
+
+#define __specialization_static \
+        static
+
+#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
+         (_MSC_VER && _MSC_VER < 1900) ||
+         (!__GNUC__ && !_MSC_VER) */
+
+#define __specialization_static
+
+#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
+         (_MSC_VER && _MSC_VER < 1900) ||
+         (!__GNUC__ && !_MSC_VER) */
+
+#if !defined(__CUDACC__) && !defined(__CUDABE__)
+
+#undef __annotate__
+#define __annotate__(a)
+
+#else /* !__CUDACC__ && !__CUDABE__ */
+
+#define __launch_bounds__(...) \
+        __annotate__(launch_bounds(__VA_ARGS__))
+
+#endif /* !__CUDACC__ && !__CUDABE__ */
+
+#if defined(__CUDACC__) || defined(__CUDABE__) || \
+    defined(__GNUC__) || defined(_WIN64)
+
+#define __builtin_align__(a) \
+        __align__(a)
+
+#else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
+
+#define __builtin_align__(a)
+
+#endif /* __CUDACC__ || __CUDABE__ || __GNUC__  || _WIN64 */
+
+#define __host__ \
+        __location__(host)
+#define __device__ \
+        __location__(device)
+#define __global__ \
+        __location__(global)
+#define __shared__ \
+        __location__(shared)
+#define __constant__ \
+        __location__(constant)
+#define __managed__ \
+        __location__(managed)
+        
+#if (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !defined(__CUDACC__)
+#define __device_builtin__
+#define __device_builtin_texture_type__
+#define __device_builtin_surface_type__
+#define __cudart_builtin__
+#else /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__))  || !__CUDACC__ */
+#define __device_builtin__ \
+        __location__(device_builtin)
+#define __device_builtin_texture_type__ \
+        __location__(device_builtin_texture_type)
+#define __device_builtin_surface_type__ \
+        __location__(device_builtin_surface_type)
+#define __cudart_builtin__ \
+        __location__(cudart_builtin)
+#endif /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__))  || !__CUDACC__ */
+
+#if defined(__CUDACC__) && defined(__clang__)
+
+#if !defined(__has_feature)
+#error --- !!! The Clang version does not support __has_feature !!! ---
+#endif /* !__has_feature */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+#if (__has_feature(cxx_noexcept))
+#define NV_CLANG_ATOMIC_NOEXCEPT noexcept
+#define NV_CLANG_ATOMIC_NOEXCEPT_(x) noexcept(x)
+#else /* !__has_feature(cxx_noexcept) */
+#define NV_CLANG_ATOMIC_NOEXCEPT throw()
+#define NV_CLANG_ATOMIC_NOEXCEPT_(x)
+#endif /* __has_feature(cxx_noexcept) */
+template <typename T> struct __nv_clang_atomic_t {
+  __nv_clang_atomic_t() NV_CLANG_ATOMIC_NOEXCEPT;
+  __nv_clang_atomic_t(const T &x) NV_CLANG_ATOMIC_NOEXCEPT; 
+  operator T() volatile NV_CLANG_ATOMIC_NOEXCEPT;
+  operator T() NV_CLANG_ATOMIC_NOEXCEPT;
+};
+#define _Atomic(X) __nv_clang_atomic_t<X>
+#endif /* defined(__cplusplus) && defined(__CUDACC__) */
+
+#endif /* __CUDACC__ && __clang__ */
+
+
+#endif /* !__HOST_DEFINES_H__ */
--- a/include/isaac/driver/external/CUDA/surface_types.h
+++ b/include/isaac/driver/external/CUDA/surface_types.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_TYPES_H__)
+#define __SURFACE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaSurfaceType1D              0x01
+#define cudaSurfaceType2D              0x02
+#define cudaSurfaceType3D              0x03
+#define cudaSurfaceTypeCubemap         0x0C
+#define cudaSurfaceType1DLayered       0xF1
+#define cudaSurfaceType2DLayered       0xF2
+#define cudaSurfaceTypeCubemapLayered  0xFC
+
+/**
+ * CUDA Surface boundary modes
+ */
+enum __device_builtin__ cudaSurfaceBoundaryMode
+{
+    cudaBoundaryModeZero  = 0,    /**< Zero boundary mode */
+    cudaBoundaryModeClamp = 1,    /**< Clamp boundary mode */
+    cudaBoundaryModeTrap  = 2     /**< Trap boundary mode */
+};
+
+/**
+ * CUDA Surface format modes
+ */
+enum __device_builtin__  cudaSurfaceFormatMode
+{
+    cudaFormatModeForced = 0,     /**< Forced format mode */
+    cudaFormatModeAuto = 1        /**< Auto format mode */
+};
+
+/**
+ * CUDA Surface reference
+ */
+struct __device_builtin__ surfaceReference
+{
+    /**
+     * Channel descriptor for surface reference
+     */
+    struct cudaChannelFormatDesc channelDesc;
+};
+
+/**
+ * An opaque value that represents a CUDA Surface object
+ */
+typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#endif /* !__SURFACE_TYPES_H__ */
--- a/include/isaac/driver/external/CUDA/texture_types.h
+++ b/include/isaac/driver/external/CUDA/texture_types.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_TYPES_H__)
+#define __TEXTURE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaTextureType1D              0x01
+#define cudaTextureType2D              0x02
+#define cudaTextureType3D              0x03
+#define cudaTextureTypeCubemap         0x0C
+#define cudaTextureType1DLayered       0xF1
+#define cudaTextureType2DLayered       0xF2
+#define cudaTextureTypeCubemapLayered  0xFC
+
+/**
+ * CUDA texture address modes
+ */
+enum __device_builtin__ cudaTextureAddressMode
+{
+    cudaAddressModeWrap   = 0,    /**< Wrapping address mode */
+    cudaAddressModeClamp  = 1,    /**< Clamp to edge address mode */
+    cudaAddressModeMirror = 2,    /**< Mirror address mode */
+    cudaAddressModeBorder = 3     /**< Border address mode */
+};
+
+/**
+ * CUDA texture filter modes
+ */
+enum __device_builtin__ cudaTextureFilterMode
+{
+    cudaFilterModePoint  = 0,     /**< Point filter mode */
+    cudaFilterModeLinear = 1      /**< Linear filter mode */
+};
+
+/**
+ * CUDA texture read modes
+ */
+enum __device_builtin__ cudaTextureReadMode
+{
+    cudaReadModeElementType     = 0,  /**< Read texture as specified element type */
+    cudaReadModeNormalizedFloat = 1   /**< Read texture as normalized float */
+};
+
+/**
+ * CUDA texture reference
+ */
+struct __device_builtin__ textureReference
+{
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                          normalized;
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode   filterMode;
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode  addressMode[3];
+    /**
+     * Channel descriptor for the texture reference
+     */
+    struct cudaChannelFormatDesc channelDesc;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                          sRGB;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                 maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode   mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                        mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                        minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                        maxMipmapLevelClamp;
+    int                          __cudaReserved[15];
+};
+
+/**
+ * CUDA texture descriptor
+ */
+struct __device_builtin__ cudaTextureDesc
+{
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                         sRGB;
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+};
+
+/**
+ * An opaque value that represents a CUDA texture object
+ */
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#endif /* !__TEXTURE_TYPES_H__ */
--- a/include/isaac/driver/external/CUDA/vector_functions.h
+++ b/include/isaac/driver/external/CUDA/vector_functions.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_H__)
+#define __VECTOR_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+#include "vector_types.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__)
+#include "vector_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__VECTOR_FUNCTIONS_H__ */
--- a/include/isaac/driver/external/CUDA/vector_functions.hpp
+++ b/include/isaac/driver/external/CUDA/vector_functions.hpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_HPP__)
+#define __VECTOR_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+#include "vector_types.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
+{
+  char1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
+{
+  uchar1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
+{
+  char2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
+{
+  uchar2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
+{
+  char3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
+{
+  uchar3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
+{
+  char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
+{
+  uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
+{
+  short1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
+{
+  ushort1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
+{
+  short2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
+{
+  ushort2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
+{ 
+  short3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
+{
+  ushort3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
+{
+  short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+  ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
+{
+  int1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
+{
+  uint1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
+{
+  int2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
+{
+  uint2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
+{
+  int3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
+{
+  uint3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
+{
+  int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
+{
+  uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
+{
+  long1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
+{
+  ulong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
+{
+  long2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
+{
+  ulong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
+{
+  long3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
+{
+  ulong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
+{
+  long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
+{
+  ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
+{
+  float1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
+{
+  float2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
+{
+  float3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
+{
+  float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
+{
+  longlong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
+{
+  ulonglong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
+{
+  longlong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
+{
+  ulonglong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
+{
+  longlong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
+{
+  ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
+{
+  longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
+{
+  ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
+{
+  double1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
+{
+  double2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
+{
+  double3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
+{
+  double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#endif /* !__VECTOR_FUNCTIONS_HPP__ */
+
--- a/include/isaac/driver/external/CUDA/vector_types.h
+++ b/include/isaac/driver/external/CUDA/vector_types.h
@@ -0,0 +1,431 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_TYPES_H__)
+#define __VECTOR_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDA_LIBDEVICE__) && !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "builtin_types.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDA_LIBDEVICE__ && !__CUDACC_RTC__ */
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDABE__) && \
+    defined(_WIN32) && !defined(_WIN64)
+
+#pragma warning(push)
+#pragma warning(disable: 4201 4408)
+
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ tag                      \
+{                                                  \
+    union                                          \
+    {                                              \
+        struct { members };                        \
+        struct { long long int :1,:0; };           \
+    };                                             \
+}
+
+#else /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
+
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ __align__(8) tag         \
+{                                                  \
+    members                                        \
+}
+
+#endif /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
+
+struct __device_builtin__ char1
+{
+    signed char x;
+};
+
+struct __device_builtin__ uchar1
+{
+    unsigned char x;
+};
+
+
+struct __device_builtin__ __align__(2) char2
+{
+    signed char x, y;
+};
+
+struct __device_builtin__ __align__(2) uchar2
+{
+    unsigned char x, y;
+};
+
+struct __device_builtin__ char3
+{
+    signed char x, y, z;
+};
+
+struct __device_builtin__ uchar3
+{
+    unsigned char x, y, z;
+};
+
+struct __device_builtin__ __align__(4) char4
+{
+    signed char x, y, z, w;
+};
+
+struct __device_builtin__ __align__(4) uchar4
+{
+    unsigned char x, y, z, w;
+};
+
+struct __device_builtin__ short1
+{
+    short x;
+};
+
+struct __device_builtin__ ushort1
+{
+    unsigned short x;
+};
+
+struct __device_builtin__ __align__(4) short2
+{
+    short x, y;
+};
+
+struct __device_builtin__ __align__(4) ushort2
+{
+    unsigned short x, y;
+};
+
+struct __device_builtin__ short3
+{
+    short x, y, z;
+};
+
+struct __device_builtin__ ushort3
+{
+    unsigned short x, y, z;
+};
+
+__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
+__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
+
+struct __device_builtin__ int1
+{
+    int x;
+};
+
+struct __device_builtin__ uint1
+{
+    unsigned int x;
+};
+
+__cuda_builtin_vector_align8(int2, int x; int y;);
+__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
+
+struct __device_builtin__ int3
+{
+    int x, y, z;
+};
+
+struct __device_builtin__ uint3
+{
+    unsigned int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) int4
+{
+    int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) uint4
+{
+    unsigned int x, y, z, w;
+};
+
+struct __device_builtin__ long1
+{
+    long int x;
+};
+
+struct __device_builtin__ ulong1
+{
+    unsigned long x;
+};
+
+#if defined(__CUDACC_RTC__) || defined(_WIN32)
+__cuda_builtin_vector_align8(long2, long int x; long int y;);
+__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
+#else /* __CUDACC_RTC__ || _WIN32 */
+
+struct __device_builtin__ __align__(2*sizeof(long int)) long2
+{
+    long int x, y;
+};
+
+struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
+{
+    unsigned long int x, y;
+};
+
+#endif /* __CUDACC_RTC__ || _WIN32 */
+
+struct __device_builtin__ long3
+{
+    long int x, y, z;
+};
+
+struct __device_builtin__ ulong3
+{
+    unsigned long int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) long4
+{
+    long int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulong4
+{
+    unsigned long int x, y, z, w;
+};
+
+struct __device_builtin__ float1
+{
+    float x;
+};
+
+#if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(__arm__) && \
+    defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-pedantic"
+
+struct __device_builtin__ __attribute__((aligned(8))) float2
+{
+    float x; float y; float __cuda_gnu_arm_ice_workaround[0];
+};
+
+#pragma GCC poison __cuda_gnu_arm_ice_workaround
+#pragma GCC diagnostic pop
+
+#else /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
+         __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
+
+__cuda_builtin_vector_align8(float2, float x; float y;);
+
+#endif /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
+          __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
+
+struct __device_builtin__ float3
+{
+    float x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) float4
+{
+    float x, y, z, w;
+};
+
+struct __device_builtin__ longlong1
+{
+    long long int x;
+};
+
+struct __device_builtin__ ulonglong1
+{
+    unsigned long long int x;
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong2
+{
+    long long int x, y;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong2
+{
+    unsigned long long int x, y;
+};
+
+struct __device_builtin__ longlong3
+{
+    long long int x, y, z;
+};
+
+struct __device_builtin__ ulonglong3
+{
+    unsigned long long int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong4
+{
+    long long int x, y, z ,w;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong4
+{
+    unsigned long long int x, y, z, w;
+};
+
+struct __device_builtin__ double1
+{
+    double x;
+};
+
+struct __device_builtin__ __builtin_align__(16) double2
+{
+    double x, y;
+};
+
+struct __device_builtin__ double3
+{
+    double x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) double4
+{
+    double x, y, z, w;
+};
+
+#if !defined(__CUDACC__) && !defined(__CUDABE__) && \
+    defined(_WIN32) && !defined(_WIN64)
+
+#pragma warning(pop)
+
+#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+typedef __device_builtin__ struct char1 char1;
+typedef __device_builtin__ struct uchar1 uchar1;
+typedef __device_builtin__ struct char2 char2;
+typedef __device_builtin__ struct uchar2 uchar2;
+typedef __device_builtin__ struct char3 char3;
+typedef __device_builtin__ struct uchar3 uchar3;
+typedef __device_builtin__ struct char4 char4;
+typedef __device_builtin__ struct uchar4 uchar4;
+typedef __device_builtin__ struct short1 short1;
+typedef __device_builtin__ struct ushort1 ushort1;
+typedef __device_builtin__ struct short2 short2;
+typedef __device_builtin__ struct ushort2 ushort2;
+typedef __device_builtin__ struct short3 short3;
+typedef __device_builtin__ struct ushort3 ushort3;
+typedef __device_builtin__ struct short4 short4;
+typedef __device_builtin__ struct ushort4 ushort4;
+typedef __device_builtin__ struct int1 int1;
+typedef __device_builtin__ struct uint1 uint1;
+typedef __device_builtin__ struct int2 int2;
+typedef __device_builtin__ struct uint2 uint2;
+typedef __device_builtin__ struct int3 int3;
+typedef __device_builtin__ struct uint3 uint3;
+typedef __device_builtin__ struct int4 int4;
+typedef __device_builtin__ struct uint4 uint4;
+typedef __device_builtin__ struct long1 long1;
+typedef __device_builtin__ struct ulong1 ulong1;
+typedef __device_builtin__ struct long2 long2;
+typedef __device_builtin__ struct ulong2 ulong2;
+typedef __device_builtin__ struct long3 long3;
+typedef __device_builtin__ struct ulong3 ulong3;
+typedef __device_builtin__ struct long4 long4;
+typedef __device_builtin__ struct ulong4 ulong4;
+typedef __device_builtin__ struct float1 float1;
+typedef __device_builtin__ struct float2 float2;
+typedef __device_builtin__ struct float3 float3;
+typedef __device_builtin__ struct float4 float4;
+typedef __device_builtin__ struct longlong1 longlong1;
+typedef __device_builtin__ struct ulonglong1 ulonglong1;
+typedef __device_builtin__ struct longlong2 longlong2;
+typedef __device_builtin__ struct ulonglong2 ulonglong2;
+typedef __device_builtin__ struct longlong3 longlong3;
+typedef __device_builtin__ struct ulonglong3 ulonglong3;
+typedef __device_builtin__ struct longlong4 longlong4;
+typedef __device_builtin__ struct ulonglong4 ulonglong4;
+typedef __device_builtin__ struct double1 double1;
+typedef __device_builtin__ struct double2 double2;
+typedef __device_builtin__ struct double3 double3;
+typedef __device_builtin__ struct double4 double4;
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+struct __device_builtin__ dim3
+{
+    unsigned int x, y, z;
+#if defined(__cplusplus)
+    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+    __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
+#endif /* __cplusplus */
+};
+
+typedef __device_builtin__ struct dim3 dim3;
+
+#undef  __cuda_builtin_vector_align8
+
+#endif /* !__VECTOR_TYPES_H__ */
--- a/include/isaac/jit/generation/base.h
+++ b/include/isaac/jit/generation/base.h
@@ -85,11 +85,20 @@ public:
  virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0;
  virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0;
  std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device);
-  std::shared_ptr<base> getptr() {
-      return shared_from_this();
-  }
+  std::shared_ptr<base> getptr();
 };

+class external_base: public base // Intermediate base whose out-of-line definitions (base.cpp) return an empty source string and 0 for each resource query.
+{
+private:
+  virtual std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping); // NOTE(review): not const, whereas gemm::generate_impl is const — confirm this really overrides the virtual declared in base.
+public:
+  external_base();
+  virtual unsigned int temporary_workspace(expression_tree const &) const; // defined to return 0
+  virtual unsigned int lmem_usage(expression_tree const &) const; // defined to return 0
+  virtual unsigned int registers_usage(expression_tree const &) const; // defined to return 0
+  virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0; // left pure virtual: every derived class must supply it
+};

 class parameterized_base : public base
 {
--- a/include/isaac/jit/generation/gemm.h
+++ b/include/isaac/jit/generation/gemm.h
@@ -31,6 +31,21 @@ namespace isaac
 namespace templates
 {

+
+class cublas_gemm : public external_base // GEMM template tied to cuBLAS: its constructor records whether driver::dispatch::cublasinit() succeeded.
+{
+  bool init(); // private (class default access); returns driver::dispatch::cublasinit()
+public:
+  cublas_gemm(char A_trans, char B_trans); // stores both flags and sets init_ from cublasinit()
+  int is_invalid(expression_tree const  &, driver::Device const &) const; // result depends only on init_ and the device backend
+  std::vector<int_t> input_sizes(expression_tree const & expressions) const; // {M, N, K} as computed by infos() in gemm.cpp
+  void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
+private:
+  const char A_trans_; // 'T' makes K come from A's first extent, anything else from its second (see infos())
+  const char B_trans_; // NOTE(review): presumably the same 'T'/other convention as A_trans_ — not read in the part of gemm.cpp reviewed here
+  bool init_; // value returned by cublasinit() at construction time
+};
+
 class gemm : public parameterized_base
 {
 private:
@@ -41,16 +56,16 @@ private:
  std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const &) const;
  void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, const expression_tree::node &A, const expression_tree::node &B, const expression_tree::node &C,
                     value_scalar const &alpha, value_scalar const &beta, driver::Program const & program, std::string const & suffix, runtime::execution_options_type const & options);
-  std::vector<int_t> infos(expression_tree const & expressions,  isaac::symbolic::preset::gemm::args &arguments) const;
+
 public:
  gemm(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
       , int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch
       , int_t lf0, int_t lf1, char A_trans, char B_trans);
  std::vector<int_t> input_sizes(expression_tree const & expressions) const;
-  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &ctr);
+  void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & h);
+
 private:
  //Parameters
-
  unsigned int mL_;
  unsigned int kL_;
  unsigned int nL_;
--- a/lib/driver/dispatch.cpp
+++ b/lib/driver/dispatch.cpp
@@ -60,6 +60,10 @@ namespace driver
 #define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
 {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }

+#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
+ {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); } // 13-argument sibling of DEFINE11: emits dispatch::fname, forwarding every argument through f_impl with the fname##_ slot and the stringified name
+
+
 //Specialized helpers for OpenCL
 #define OCL_DEFINE1(ret, fname, t1) DEFINE1(clinit, opencl_, ret, fname, t1)
 #define OCL_DEFINE2(ret, fname, t1, t2) DEFINE2(clinit, opencl_, ret, fname, t1, t2)
@@ -96,6 +100,8 @@ namespace driver
 #define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
 #define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)

+#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1) // cuBLAS flavour of DEFINE1: initialiser is dispatch::cublasinit, library handle is cublas_
+#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) // same binding for 13-argument entry points

 bool dispatch::clinit()
 {
@@ -118,6 +124,16 @@ bool dispatch::nvrtcinit()
  return nvrtc_ != nullptr;
 }

+bool dispatch::cublasinit()
+{
+  // Nothing to do when already loaded; otherwise dlopen libcublas.so and, on success only, create the handle.
+  if(cublas_!=nullptr)
+    return true;
+  cublas_ = dlopen("libcublas.so", RTLD_LAZY);
+  if(cublas_!=nullptr) cublasCreate(&cublas_handle_);
+  return cublas_ != nullptr;
+}
+

 //OpenCL

@@ -196,6 +212,20 @@ NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
 NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
 NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)

+CUBLAS_DEFINE1(void, cublasCreate, cublasHandle_t*) // expands to dispatch::cublasCreate(cublasHandle_t*); NOTE(review): declared void, so whatever the underlying call returns is not propagated — confirm intended
+
+void dispatch::cublasGetStream(cudaStream_t *streamId) // forwards to the cublasGetStream_v2 symbol, prepending cublas_handle_
+{ f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", cublas_handle_, streamId); }
+
+void dispatch::cublasSetStream(cudaStream_t streamId) // forwards to the cublasSetStream_v2 symbol, prepending cublas_handle_
+{ f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", cublas_handle_, streamId); }
+
+void dispatch::cublasSgemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc) // float GEMM: forwards to the cublasSgemm_v2 symbol, prepending cublas_handle_
+{ f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", cublas_handle_, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); }
+
+void dispatch::cublasDgemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc) // double GEMM: forwards to the cublasDgemm_v2 symbol, prepending cublas_handle_
+{ f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", cublas_handle_, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); }
+
 void dispatch::release()
 {
  if(opencl_){
@@ -210,11 +240,17 @@ void dispatch::release()
    dlclose(nvrtc_);
    nvrtc_ = nullptr;
  }
+  if(cublas_){
+    dlclose(cublas_);
+    cublas_ = nullptr;
+  }
 }

 void * dispatch::opencl_;
 void * dispatch::cuda_;
 void * dispatch::nvrtc_;
+void * dispatch::cublas_; // dlopen() handle for libcublas.so: assigned in cublasinit(), dlclose()d and reset to nullptr in release()
+cublasHandle_t dispatch::cublas_handle_; // filled by cublasCreate() inside cublasinit() and passed first to every cuBLAS wrapper; NOTE(review): the visible part of release() never destroys it — confirm

 //OpenCL
 void* dispatch::clBuildProgram_;
@@ -288,5 +324,11 @@ void* dispatch::nvrtcGetPTXSize_;
 void* dispatch::nvrtcCreateProgram_;
 void* dispatch::nvrtcGetProgramLog_;

+void* dispatch::cublasCreate_; // one slot per cuBLAS entry point, handed to f_impl next to the symbol name
+void* dispatch::cublasGetStream_; // slot used with "cublasGetStream_v2"
+void* dispatch::cublasSetStream_; // slot used with "cublasSetStream_v2"
+void* dispatch::cublasSgemm_; // slot used with "cublasSgemm_v2"
+void* dispatch::cublasDgemm_; // slot used with "cublasDgemm_v2"; NOTE(review): the visible part of release() dlclose()s cublas_ without clearing these slots — confirm f_impl cannot reuse a stale pointer afterwards
+
 }
 }
--- a/lib/jit/generation/base.cpp
+++ b/lib/jit/generation/base.cpp
@@ -66,6 +66,24 @@ std::string base::generate(std::string const & suffix, expression_tree const  &
  return generate_impl(suffix, expression, device, mapping);
 }

+
+/* External base */
+external_base::external_base() // no state of its own to set up
+{}
+
+std::string external_base::generate_impl(std::string const &, expression_tree const &, driver::Device const &, symbolic::symbols_table const &) // every argument is ignored
+{ return std::string(); } // always the empty string
+
+unsigned int external_base::temporary_workspace(expression_tree const &) const // argument ignored
+{ return 0u; } // always reports zero
+
+unsigned int external_base::lmem_usage(expression_tree const &) const // argument ignored
+{ return 0u; } // always reports zero
+
+unsigned int external_base::registers_usage(expression_tree const &) const // argument ignored
+{ return 0u; } // always reports zero
+
+/* Parameterized base */
 int parameterized_base::is_invalid_impl(driver::Device const &, expression_tree const  &) const
 { return TEMPLATE_VALID; }

@@ -104,5 +122,8 @@ int parameterized_base::is_invalid(expression_tree const  & expressions, driver:
  return is_invalid_impl(device, expressions);
 }

+// Returns the result of shared_from_this() as a std::shared_ptr<base>.
+// NOTE(review): presumably inherited from std::enable_shared_from_this<base>;
+// confirm in the class declaration.
+std::shared_ptr<base> base::getptr()
+{
+  std::shared_ptr<base> self = shared_from_this();
+  return self;
+}
+
 }
 }
--- a/lib/jit/generation/gemm.cpp
+++ b/lib/jit/generation/gemm.cpp
@@ -1,4 +1,4 @@
-/*
+/*
 * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
 *
 * This file is part of ISAAC.
@@ -20,6 +20,7 @@
 */

 #include "isaac/array.h"
+#include "isaac/driver/dispatch.h"
 #include "isaac/jit/syntax/expression/preset.h"
 #include "isaac/jit/syntax/engine/process.h"
 #include "isaac/jit/generation/gemm.h"
@@ -37,8 +38,73 @@ namespace isaac
 namespace templates
 {

-  unsigned int gemm::lmem_usage(expression_tree const & expression) const
-  {
+// Runs the GEMM preset check on `tree`, stores the result in `arguments`, and
+// returns the problem sizes as {M, N, K}:
+//   M, N : C->shape[0] and C->shape[1]
+//   K    : A->shape[0] when A_trans == 'T', A->shape[1] for any other value
+std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::args& arguments, char A_trans)
+{
+  expression_tree::data_type const & nodes = tree.data();
+  std::size_t root_idx = tree.root();
+  arguments = symbolic::preset::gemm::check(nodes, root_idx);
+
+  int_t rows = arguments.C->shape[0];
+  int_t cols = arguments.C->shape[1];
+  int_t inner;
+  if(A_trans=='T')
+    inner = arguments.A->shape[0];
+  else
+    inner = arguments.A->shape[1];
+
+  return {rows, cols, inner};
+}
+
+/* ------------------ CUBLAS ------------------ */
+// Forwards to driver::dispatch::cublasinit() and hands back its result.
+bool cublas_gemm::init()
+{
+  bool const ready = driver::dispatch::cublasinit();
+  return ready;
+}
+
+// Stores the transposition flags for A and B, and records in init_ what
+// driver::dispatch::cublasinit() returned at construction time.
+cublas_gemm::cublas_gemm(char A_trans, char B_trans)
+  : A_trans_(A_trans)
+  , B_trans_(B_trans)
+  , init_(driver::dispatch::cublasinit())
+{
+}
+
+// Reports whether this template can be used on `device`.
+// Returns TEMPLATE_VALID when cuBLAS was initialized (init_) and the device
+// uses the CUDA backend; otherwise returns a non-valid code. The previous
+// expression was inverted: it produced a non-zero ("invalid") value exactly
+// in the usable case.
+// NOTE(review): -1 is used as a generic "invalid" value, assuming
+// TEMPLATE_VALID is not -1; swap in a dedicated TEMPLATE_* constant if the
+// header provides a better match.
+int cublas_gemm::is_invalid(expression_tree const  &, driver::Device const & device) const
+{
+  bool const usable = init_ && device.backend()==driver::CUDA;
+  return usable ? TEMPLATE_VALID : -1;
+}
+
+// Returns {M, N, K} for `expressions` as computed by infos() with this
+// template's A_trans_ flag. The preset arguments that infos() fills in are
+// discarded.
+std::vector<int_t> cublas_gemm::input_sizes(expression_tree const & expressions) const
+{
+  symbolic::preset::gemm::args dummy;
+  // infos() takes the tree by const reference, so the tree is passed as-is
+  // instead of casting its constness away.
+  return infos(expressions, dummy, A_trans_);
+}
+
+// Runs the GEMM described by control.x() through cuBLAS on `queue`.
+// The Program and suffix arguments are unused.
+//
+// Steps: obtain M/N/K and the A, B, C operands via infos(); save the stream
+// currently set on the cuBLAS handle and switch it to the queue's stream; call
+// cublasSgemm when C's dtype is FLOAT_TYPE and cublasDgemm otherwise; switch
+// back to the saved stream. When the execution options carry an event list,
+// an event pair is recorded around the GEMM call and appended to that list.
+//
+// The saved stream is also put back if anything between the switch and the
+// normal restore throws, so the handle used by the dispatch wrappers is not
+// left bound to this queue.
+void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & control)
+{
+  namespace drv = driver;
+  //Get GEMM info
+  symbolic::preset::gemm::args args;
+  std::vector<int_t> MNK = infos(control.x(), args, A_trans_);
+  int_t M = MNK[0], N = MNK[1], K = MNK[2];
+  CUdeviceptr cuA = args.A->array.handle.cu;
+  CUdeviceptr cuB = args.B->array.handle.cu;
+  CUdeviceptr cuC = args.C->array.handle.cu;
+  runtime::execution_options_type const & opt = control.execution_options();
+  // Any flag other than 'N' maps to CUBLAS_OP_T.
+  auto cuT = [](char xt) { return xt=='N'?CUBLAS_OP_N:CUBLAS_OP_T; };
+  //Set new stream
+  // Value-initialized so a failed query cannot leave an indeterminate handle
+  // that would later be installed as the "previous" stream.
+  cudaStream_t bkp = cudaStream_t();
+  drv::Event event(drv::CUDA);
+  drv::dispatch::cublasGetStream(&bkp);
+  drv::dispatch::cublasSetStream((cudaStream_t)queue.handle().cu());
+  try
+  {
+    values_holder alpha = args.alpha.values();
+    values_holder beta = args.beta.values();
+    if(opt.events)
+      drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu()));
+    // NOTE(review): every dtype other than FLOAT_TYPE takes the double path;
+    // confirm callers only reach this template with float or double data.
+    if(args.C->dtype==FLOAT_TYPE)
+      drv::dispatch::cublasSgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA, args.A->ld[1], (float*)cuB, args.B->ld[1], &beta.float32, (float*)cuC, args.C->ld[1]);
+    else
+      drv::dispatch::cublasDgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA, args.A->ld[1], (double*)cuB, args.B->ld[1], &beta.float64, (double*)cuC, args.C->ld[1]);
+    if(opt.events){
+      drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu()));
+      opt.events->push_back(event);
+    }
+  }
+  catch(...)
+  {
+    // Failure path: restore the saved stream before propagating the error.
+    drv::dispatch::cublasSetStream(bkp);
+    throw;
+  }
+  //Revert old stream
+  drv::dispatch::cublasSetStream(bkp);
+}
+
+
+
+/* -------------------------------------------- */
+unsigned int gemm::lmem_usage(expression_tree const & expression) const
+{
  unsigned int N = 0;
  size_t llda = (A_trans_=='N')?mL_:kL_+vwidth_;
  size_t lnda = (A_trans_=='N')?kL_:mL_;
@@ -47,25 +113,25 @@ namespace templates
  N += llda*lnda;
  N += lldb*lndb;
  return N*size_of(expression.dtype());
-  }
+}

-  unsigned int gemm::registers_usage(expression_tree const & expression) const
-  {
+unsigned int gemm::registers_usage(expression_tree const & expression) const
+{
  unsigned int N = mS_ * nS_ + mS_ * kS_ + kS_ * nS_;
  return N*size_of(expression.dtype());
-  }
+}

-  unsigned int gemm::temporary_workspace(expression_tree const & expressions) const
-  {
+unsigned int gemm::temporary_workspace(expression_tree const & expressions) const
+{
  std::vector<int_t> MNK = input_sizes(expressions);
  int_t M = MNK[0]; int_t N = MNK[1];
  if(depth_ > 1)
    return M*N*depth_;
  return 0;
-  }
+}

-  int gemm::is_invalid_impl(driver::Device const &, expression_tree const &) const
-  {
+int gemm::is_invalid_impl(driver::Device const &, expression_tree const &) const
+{
  if(Afetch_!=FETCH_FROM_LOCAL || Bfetch_!=FETCH_FROM_LOCAL)
    return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;

@@ -109,10 +175,10 @@ namespace templates
  }

  return TEMPLATE_VALID;
-  }
+}

-  std::string gemm::generate_impl(std::string const & suffix, expression_tree const & tree, driver::Device const & device, symbolic::symbols_table const &) const
-  {
+std::string gemm::generate_impl(std::string const & suffix, expression_tree const & tree, driver::Device const & device, symbolic::symbols_table const &) const
+{
  using std::string;
  using tools::to_string;

@@ -124,7 +190,7 @@ namespace templates
 #define VSTORE_LDSB(value, offset, ptr) vstore(vwidth_, sdtype, value, offset, ptr, "1", backend, lldb%vwidth_==0)

  symbolic::preset::gemm::args args;
-    infos(tree, args);
+  infos(tree, args, A_trans_);
  std::string ASTRIDE1 = (args.A->ld[0] > 1)?"*Astride1":"";
  std::string BSTRIDE1 = (args.B->ld[0] > 1)?"*Bstride1":"";
  std::string CSTRIDE1 = (args.C->ld[0] > 1)?"*Cstride1":"";
@@ -574,13 +640,13 @@ namespace templates

 #undef VLOAD
 #undef VST0RE
-  }
+}

-  void gemm::enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K,
+void gemm::enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K,
                         expression_tree::node const & A, expression_tree::node const & B, expression_tree::node const & C,
                         value_scalar const & alpha, value_scalar const & beta,
                         driver::Program const & program, std::string const & suffix, runtime::execution_options_type const & options)
-  {
+{
  using tools::align;

  if(M==0 || N==0 || K==0)
@@ -665,45 +731,34 @@ namespace templates
    options.enqueue(program.context(), reduce, global, local);
  }

-  }
+}

-  std::vector<int_t> gemm::infos(expression_tree const & tree, symbolic::preset::gemm::args& arguments) const
-  {
-    expression_tree::data_type const & array = tree.data();
-    std::size_t root = tree.root();
-    arguments = symbolic::preset::gemm::check(array, root);
-    int_t M = arguments.C->shape[0];
-    int_t N = arguments.C->shape[1];
-    int_t K = (A_trans_=='T')?arguments.A->shape[0]:arguments.A->shape[1];
-    return {M, N, K};
-  }
-
-  gemm::gemm(unsigned int vwidth
+gemm::gemm(unsigned int vwidth
           ,int_t ls0, int_t kL, int_t ls1, int_t D
           ,int_t ms, int_t ks, int_t ns
           ,fetch_type Afetch , fetch_type Bfetch
           ,int_t lf0, int_t lf1, char A_trans, char B_trans) :
  parameterized_base(vwidth, ls0, ls1), mL_(ms*ls0), kL_(kL), nL_(ns*ls1), depth_(D), mS_(ms), kS_(ks), nS_(ns),
  Afetch_(Afetch), Bfetch_(Bfetch), lf0_(lf0), lf1_(lf1), A_trans_(A_trans), B_trans_(B_trans)
-  {
+{
  if(A_trans_=='N' && B_trans_=='N') type_ = GEMM_NN;
  else if(A_trans_=='T' && B_trans_=='N') type_ = GEMM_TN;
  else if(A_trans_=='N' && B_trans_=='T') type_ = GEMM_NT;
  else if(A_trans_=='T' && B_trans_=='T') type_ = GEMM_TT;
  else throw;
-  }
+}

-  std::vector<int_t> gemm::input_sizes(expression_tree const & expressions) const
-  {
+std::vector<int_t> gemm::input_sizes(expression_tree const & expressions) const
+{
  symbolic::preset::gemm::args dummy;
-    return infos((expression_tree&)expressions, dummy);
-  }
+  return infos((expression_tree&)expressions, dummy, A_trans_);
+}

-  void gemm::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & control)
-  {
+void gemm::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & control)
+{
  expression_tree const & expressions = control.x();
  symbolic::preset::gemm::args args;
-    std::vector<int_t> MNK = infos(expressions, args);
+  std::vector<int_t> MNK = infos(expressions, args, A_trans_);
  int_t M = MNK[0];
  int_t N = MNK[1];
  int_t K = MNK[2];
@@ -713,44 +768,44 @@ namespace templates
  //Enqueue
  runtime::execution_options_type const & options = control.execution_options();
  enqueue_block(queue,  M, N, K, *args.A, *args.B, *args.C, args.alpha, args.beta, program, suffix, options);
-  }
+}

-  //
-  gemm_nn::gemm_nn(unsigned int vwidth
+//
+gemm_nn::gemm_nn(unsigned int vwidth
                 , int_t ls0, int_t KL, int_t ls1, int_t D
                 , int_t ms, int_t ks, int_t ns
                 , fetch_type Afetch , fetch_type Bfetch
                 , int_t lf0, int_t lf1) :
  gemm(vwidth, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lf0, lf1, 'N', 'N')
-  {
-  }
+{
+}

-  //
-  gemm_tn::gemm_tn(unsigned int vwidth
+//
+gemm_tn::gemm_tn(unsigned int vwidth
                 , int_t ls0, int_t KL, int_t ls1, int_t D
                 , int_t ms, int_t ks, int_t ns
                 , fetch_type Afetch , fetch_type Bfetch
                 , int_t lf0, int_t lf1) :
  gemm(vwidth, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lf0, lf1, 'T', 'N')
-  { }
+{ }

-  //
-  gemm_nt::gemm_nt(unsigned int vwidth
+//
+gemm_nt::gemm_nt(unsigned int vwidth
                 , int_t ls0, int_t KL, int_t ls1, int_t D
                 , int_t ms, int_t ks, int_t ns
                 , fetch_type Afetch , fetch_type Bfetch
                 , int_t lf0, int_t lf1) :
  gemm(vwidth, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lf0, lf1, 'N', 'T')
-  { }
+{ }

-  //
-  gemm_tt::gemm_tt(unsigned int vwidth
+//
+gemm_tt::gemm_tt(unsigned int vwidth
                 , int_t ls0, int_t KL, int_t ls1, int_t D
                 , int_t ms, int_t ks, int_t ns
                 , fetch_type Afetch , fetch_type Bfetch
                 , int_t lf0, int_t lf1) :
  gemm(vwidth, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lf0, lf1, 'T', 'T')
-  { }
+{ }

 }
 }
--- a/python/setup.py
+++ b/python/setup.py
@@ -73,7 +73,7 @@ def main():
      libraries += ['gnustl_shared']

    #Source files
-    src =  'src/lib/runtime/predictors/random_forest.cpp src/lib/runtime/profiles.cpp src/lib/runtime/database.cpp src/lib/runtime/execute.cpp src/lib/exception/driver.cpp src/lib/exception/api.cpp src/lib/random/rand.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/gemm.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/object.cpp src/lib/value_scalar.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp src/lib/driver/dispatch.cpp src/lib/driver/kernel.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/check.cpp src/lib/driver/command_queue.cpp src/lib/driver/handle.cpp src/lib/driver/context.cpp src/lib/driver/program.cpp '.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
+    src =  'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/gemm.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/predictors/random_forest.cpp src/lib/runtime/profiles.cpp src/lib/runtime/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf)  for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
    boostsrc = 'external/boost/libs/'
    for s in ['numpy','python','smart_ptr','system','thread']:
        src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]