diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2fb182135..e921f7275 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -184,7 +184,6 @@ target_link_libraries(triton
   TritonAnalysis
   TritonTransforms
   TritonGPUTransforms
-  TritonDriver
   TritonLLVMIR
   TritonPTX
   ${dialect_libs}
diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index ac7e877dc..7fb68f48a 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -26,35 +26,35 @@ target_link_libraries(triton-opt PRIVATE
 mlir_check_all_link_libraries(triton-opt)
 
 
-add_llvm_executable(triton-translate triton-translate.cpp PARTIAL_SOURCES_INTENDED)
-llvm_update_compile_flags(triton-translate)
-target_link_libraries(triton-translate PRIVATE
-        TritonAnalysis
-        TritonTransforms
-        TritonGPUTransforms
-        TritonLLVMIR
-        TritonDriver
-        ${dialect_libs}
-        ${conversion_libs}
-        # tests
-        TritonTestAnalysis
+# add_llvm_executable(triton-translate triton-translate.cpp PARTIAL_SOURCES_INTENDED)
+# llvm_update_compile_flags(triton-translate)
+# target_link_libraries(triton-translate PRIVATE
+#         TritonAnalysis
+#         TritonTransforms
+#         TritonGPUTransforms
+#         TritonLLVMIR
+#         TritonDriver
+#         ${dialect_libs}
+#         ${conversion_libs}
+#         # tests
+#         TritonTestAnalysis
 
-        LLVMCore
-        LLVMSupport
-        LLVMOption
-        LLVMCodeGen
-        LLVMAsmParser
+#         LLVMCore
+#         LLVMSupport
+#         LLVMOption
+#         LLVMCodeGen
+#         LLVMAsmParser
 
-        # MLIR core
-        MLIROptLib
-        MLIRIR
-        MLIRPass
-        MLIRSupport
-        MLIRTransforms
-        MLIRExecutionEngine
-        MLIRMathToLLVM
-        MLIRTransformUtils
-        MLIRLLVMToLLVMIRTranslation
-        MLIRNVVMToLLVMIRTranslation
-        )
-mlir_check_all_link_libraries(triton-translate)
+#         # MLIR core
+#         MLIROptLib
+#         MLIRIR
+#         MLIRPass
+#         MLIRSupport
+#         MLIRTransforms
+#         MLIRExecutionEngine
+#         MLIRMathToLLVM
+#         MLIRTransformUtils
+#         MLIRLLVMToLLVMIRTranslation
+#         MLIRNVVMToLLVMIRTranslation
+#         )
+# mlir_check_all_link_libraries(triton-translate)
diff --git a/include/triton/Target/PTX/PTXTranslation.h b/include/triton/Target/PTX/PTXTranslation.h
index 45f8e5240..df15edc73 100644
--- a/include/triton/Target/PTX/PTXTranslation.h
+++ b/include/triton/Target/PTX/PTXTranslation.h
@@ -1,34 +1,17 @@
 #ifndef TRITON_TARGET_PTXTRANSLATION_H
 #define TRITON_TARGET_PTXTRANSLATION_H
 
-#include "triton/driver/dispatch.h"
-
+#include <memory>
 #include <string>
 
-namespace mlir {
-
-class ModuleOp;
-
-} // namespace mlir
+namespace llvm {
+class Module;
+} // namespace llvm
 
 namespace triton {
 
-template <CUdevice_attribute attr> int cuGetInfo(CUdevice device) {
-  int res;
-  driver::dispatch::cuDeviceGetAttribute(&res, attr, device);
-  return res;
-}
-
-void getCuCCAndVersionFromDevice(uint64_t device, int *cc, int *version,
-                                 std::string *ptxasPath);
-
-// Translate TritonGPU IR to PTX code.
+// Translate LLVM IR to PTX code.
-std::tuple<std::string, // ptx code
-           size_t,      // PTX cc
-           int,         // PTX version
-           std::string  // ptxas path
-           >
-translateTritonGPUToPTX(mlir::ModuleOp module, uint64_t device);
+std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version);
 
 } // namespace triton
 
diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h
deleted file mode 100644
index de0fa403c..000000000
--- a/include/triton/driver/dispatch.h
+++ /dev/null
@@ -1,376 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_DRIVER_DISPATCH_H_
-#define _TRITON_DRIVER_DISPATCH_H_
-
-#include <dlfcn.h>
-#include <type_traits>
-
-// CUDA Backend
-#include "triton/external/CUDA/cuda.h"
-#include "triton/external/CUDA/nvml.h"
-
-//// HIP backend
-//#define __HIP_PLATFORM_AMD__
-#include "triton/external/hip.h"
-
-// Exceptions
-#include <iostream>
-#include <stdexcept>
-
-namespace llvm {
-class PassRegistry;
-class Module;
-} // namespace llvm
-
-namespace triton {
-namespace driver {
-
-class cu_context;
-
-template <class T> void check(T) {}
-void check(CUresult err);
-void check(hipError_t err);
-
-class dispatch {
-protected:
-  template <class F> struct return_type;
-
-  template <class R, class... A> struct return_type<R (*)(A...)> {
-    typedef R type;
-  };
-
-  typedef bool (*f_init_t)();
-
-  template <f_init_t initializer, typename FunPtrT, typename... Args>
-  static typename return_type<FunPtrT>::type
-  f_impl(void *&lib_h, FunPtrT, void *&cache, const char *name, Args... args) {
-    initializer();
-    if (cache == nullptr) {
-      cache = dlsym(lib_h, name);
-      if (cache == 0) {
-#ifdef __EXCEPTIONS
-        throw std::runtime_error("dlsym unable to load function");
-#else
-        std::cerr << "Triton: dlsym unable to load function `" << name << "`"
-                  << std::endl;
-        std::abort();
-#endif
-      }
-    }
-    FunPtrT fptr;
-    *reinterpret_cast<void **>(&fptr) = cache;
-    typename return_type<FunPtrT>::type res = (*fptr)(args...);
-    check(res);
-    return res;
-  }
-
-public:
-  static void release();
-  // Nvidia
-  static bool nvmlinit();
-  static bool cuinit();
-  // AMD
-  static bool hipinit();
-
-  /* ------------------- *
-   * CUDA
-   * ------------------- */
-  // context management
-  static CUresult cuInit(unsigned int Flags);
-  static CUresult cuCtxDestroy_v2(CUcontext ctx);
-  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags,
-                                 CUdevice dev);
-  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
-  static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
-  static CUresult cuCtxGetDevice(CUdevice *result);
-  static CUresult cuCtxEnablePeerAccess(CUcontext peerContext,
-                                        unsigned int flags);
-  static CUresult cuDriverGetVersion(int *driverVersion);
-  // device management
-  static CUresult cuDeviceGet(CUdevice *device, int ordinal);
-  static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
-  static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
-  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                       CUdevice dev);
-  static CUresult cuDeviceGetCount(int *count);
-  // link management
-  static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type,
-                                   void *data, size_t size, const char *name,
-                                   unsigned int numOptions,
-                                   CUjit_option *options, void **optionValues);
-  static CUresult cuLinkCreate_v2(unsigned int numOptions,
-                                  CUjit_option *options, void **optionValues,
-                                  CUlinkState *stateOut);
-  static CUresult cuLinkComplete(CUlinkState state, void **cubinOut,
-                                 size_t *sizeOut);
-  static CUresult cuLinkDestroy(CUlinkState state);
-  // module management
-  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t *bytes,
-                                       CUmodule hmod, const char *name);
-  static CUresult cuModuleLoad(CUmodule *module, const char *fname);
-  static CUresult cuModuleLoadData(CUmodule *module, const void *image);
-  static CUresult cuModuleUnload(CUmodule hmod);
-  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                     unsigned int numOptions,
-                                     CUjit_option *options,
-                                     void **optionValues);
-  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                      const char *name);
-  // stream management
-  static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
-  static CUresult cuStreamSynchronize(CUstream hStream);
-  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
-  static CUresult cuStreamDestroy_v2(CUstream hStream);
-  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                 unsigned int gridDimY, unsigned int gridDimZ,
-                                 unsigned int blockDimX, unsigned int blockDimY,
-                                 unsigned int blockDimZ,
-                                 unsigned int sharedMemBytes, CUstream hStream,
-                                 void **kernelParams, void **extra);
-  // function management
-  static CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                     CUfunction hfunc);
-  static CUresult cuFuncSetAttribute(CUfunction hfunc,
-                                     CUfunction_attribute attrib, int value);
-  static CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
-  // memory management
-  static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
-  static CUresult cuPointerGetAttribute(void *data,
-                                        CUpointer_attribute attribute,
-                                        CUdeviceptr ptr);
-  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N,
-                                  CUstream stream);
-  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice,
-                                  size_t ByteCount);
-  static CUresult cuMemFree_v2(CUdeviceptr dptr);
-  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice,
-                                       size_t ByteCount, CUstream hStream);
-  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice,
-                                       const void *srcHost, size_t ByteCount,
-                                       CUstream hStream);
-  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost,
-                                  size_t ByteCount);
-  // event management
-  static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
-  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                     CUevent hEnd);
-  static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
-  static CUresult cuEventDestroy_v2(CUevent hEvent);
-
-  /* ------------------- *
-   * NVML
-   * ------------------- */
-  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId,
-                                                       nvmlDevice_t *device);
-  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device,
-                                             nvmlClockType_t type,
-                                             unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device,
-                                                nvmlClockType_t type,
-                                                unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device,
-                                                      unsigned int mem_clock,
-                                                      unsigned int sm_clock);
-
-  /* ------------------- *
-   * HIP
-   * ------------------- */
-  // context management
-  static hipError_t hipInit(unsigned int Flags);
-  static hipError_t hipCtxDestroy(hipCtx_t ctx);
-  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags,
-                                 hipDevice_t dev);
-  static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
-  static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
-  static hipError_t hipCtxGetDevice(hipDevice_t *result);
-  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext,
-                                           unsigned int flags);
-  static hipError_t hipDriverGetVersion(int *driverVersion);
-  // device management
-  static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
-  static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
-  static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
-  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib,
-                                          hipDevice_t dev);
-  static hipError_t hipGetDeviceCount(int *count);
-  // module management
-  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
-                                       hipModule_t hmod, const char *name);
-  static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
-  static hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
-  static hipError_t hipModuleUnload(hipModule_t hmod);
-  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image,
-                                        unsigned int numOptions,
-                                        hipJitOption *options,
-                                        void **optionValues);
-  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
-                                         const char *name);
-  // stream management
-  static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
-  static hipError_t hipStreamSynchronize(hipStream_t hStream);
-  static hipError_t hipStreamDestroy(hipStream_t hStream);
-  static hipError_t
-  hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
-                        unsigned int gridDimY, unsigned int gridDimZ,
-                        unsigned int blockDimX, unsigned int blockDimY,
-                        unsigned int blockDimZ, unsigned int sharedMemBytes,
-                        hipStream_t hStream, void **kernelParams, void **extra);
-  // function management
-  static hipError_t hipFuncGetAttributes(hipFuncAttributes *attrib,
-                                         void *hfunc);
-  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc,
-                                        hipFuncAttribute attrib, int value);
-  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc,
-                                          hipFuncCache_t config);
-  // memory management
-  static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
-  static hipError_t hipPointerGetAttribute(void *data,
-                                           CUpointer_attribute attribute,
-                                           hipDeviceptr_t ptr);
-  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x,
-                                     size_t N, hipStream_t stream);
-  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice,
-                                  size_t ByteCount);
-  static hipError_t hipFree(hipDeviceptr_t dptr);
-  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice,
-                                       size_t ByteCount, hipStream_t hStream);
-  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice,
-                                       const void *srcHost, size_t ByteCount,
-                                       hipStream_t hStream);
-  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost,
-                                  size_t ByteCount);
-  // event management
-  static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
-  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart,
-                                        hipEvent_t hEnd);
-  static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
-  static hipError_t hipEventDestroy(hipEvent_t hEvent);
-
-private:
-  // Libraries
-  static void *cuda_;
-  static void *nvml_;
-  static void *hip_;
-
-  /* ------------------- *
-   * CUDA
-   * ------------------- */
-  // context management
-  static void *cuCtxGetCurrent_;
-  static void *cuCtxSetCurrent_;
-  static void *cuCtxDestroy_v2_;
-  static void *cuCtxCreate_v2_;
-  static void *cuCtxGetDevice_;
-  static void *cuCtxPushCurrent_v2_;
-  static void *cuCtxPopCurrent_v2_;
-  static void *cuCtxEnablePeerAccess_;
-  static void *cuDriverGetVersion_;
-  static void *cuInit_;
-  // device management
-  static void *cuDeviceGet_;
-  static void *cuDeviceGetName_;
-  static void *cuDeviceGetPCIBusId_;
-  static void *cuDeviceGetAttribute_;
-  static void *cuDeviceGetCount_;
-  // link management
-  static void *cuLinkAddData_v2_;
-  static void *cuLinkCreate_v2_;
-  static void *cuLinkDestroy_;
-  static void *cuLinkComplete_;
-  // module management
-  static void *cuModuleGetGlobal_v2_;
-  static void *cuModuleLoad_;
-  static void *cuModuleUnload_;
-  static void *cuModuleLoadDataEx_;
-  static void *cuModuleLoadData_;
-  static void *cuModuleGetFunction_;
-  // stream management
-  static void *cuStreamCreate_;
-  static void *cuStreamSynchronize_;
-  static void *cuStreamDestroy_v2_;
-  static void *cuStreamGetCtx_;
-  static void *cuLaunchKernel_;
-  // function management
-  static void *cuFuncGetAttribute_;
-  static void *cuFuncSetAttribute_;
-  static void *cuFuncSetCacheConfig_;
-  // memory management
-  static void *cuMemcpyDtoH_v2_;
-  static void *cuMemFree_v2_;
-  static void *cuMemcpyDtoHAsync_v2_;
-  static void *cuMemcpyHtoDAsync_v2_;
-  static void *cuMemcpyHtoD_v2_;
-  static void *cuMemAlloc_v2_;
-  static void *cuMemsetD8Async_;
-  static void *cuPointerGetAttribute_;
-  // event management
-  static void *cuEventCreate_;
-  static void *cuEventElapsedTime_;
-  static void *cuEventRecord_;
-  static void *cuEventDestroy_v2_;
-
-  /* ------------------- *
-   * NVML
-   * ------------------- */
-  static void *nvmlInit_v2_;
-  static void *nvmlDeviceGetHandleByPciBusId_v2_;
-  static void *nvmlDeviceGetClockInfo_;
-  static void *nvmlDeviceGetMaxClockInfo_;
-  static void *nvmlDeviceSetApplicationsClocks_;
-
-  /* ------------------- *
-   * HIP
-   * ------------------- */
-  // context management
-  static void *hipInit_;
-  static void *hipCtxDestroy_;
-  static void *hipCtxCreate_;
-  static void *hipCtxPushCurrent_;
-  static void *hipCtxPopCurrent_;
-  static void *hipCtxGetDevice_;
-  static void *hipCtxEnablePeerAccess_;
-  static void *hipDriverGetVersion_;
-  // device management
-  static void *hipGetDevice_;
-  static void *hipDeviceGetName_;
-  static void *hipDeviceGetPCIBusId_;
-  static void *hipDeviceGetAttribute_;
-  static void *hipGetDeviceCount_;
-  // module management
-  static void *hipModuleGetGlobal_;
-  static void *hipModuleLoad_;
-  static void *hipModuleLoadData_;
-  static void *hipModuleUnload_;
-  static void *hipModuleLoadDataEx_;
-  static void *hipModuleGetFunction_;
-  // stream management
-  static void *hipStreamCreate_;
-  static void *hipStreamSynchronize_;
-  static void *hipStreamDestroy_;
-  static void *hipModuleLaunchKernel_;
-  ;
-  // function management
-  static void *hipFuncGetAttributes_;
-  static void *hipFuncSetAttribute_;
-  static void *hipFuncSetCacheConfig_;
-  // memory management
-  static void *hipMalloc_;
-  static void *hipPointerGetAttribute_;
-  static void *hipMemsetD8Async_;
-  static void *hipMemcpyDtoH_;
-  static void *hipFree_;
-  static void *hipMemcpyDtoHAsync_;
-  static void *hipMemcpyHtoDAsync_;
-  static void *hipMemcpyHtoD_;
-  // event management
-  static void *hipEventCreate_;
-  static void *hipEventElapsedTime_;
-  static void *hipEventRecord_;
-  static void *hipEventDestroy_;
-};
-
-} // namespace driver
-} // namespace triton
-
-#endif
diff --git a/include/triton/driver/error.h b/include/triton/driver/error.h
deleted file mode 100644
index 229e1dee4..000000000
--- a/include/triton/driver/error.h
+++ /dev/null
@@ -1,254 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_DRIVER_ERROR_H_
-#define _TRITON_DRIVER_ERROR_H_
-
-#include "triton/driver/dispatch.h"
-#include <exception>
-
-namespace triton {
-
-namespace driver {
-
-namespace exception {
-
-namespace nvrtc {
-
-#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg)                               \
-  class name : public std::exception {                                         \
-  public:                                                                      \
-    const char *what() const throw() override { return "NVRTC: Error- " msg; } \
-  }
-
-TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory, "out of memory");
-TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure,
-                              "program creation failure");
-TRITON_CREATE_NVRTC_EXCEPTION(invalid_input, "invalid input");
-TRITON_CREATE_NVRTC_EXCEPTION(invalid_program, "invalid program");
-TRITON_CREATE_NVRTC_EXCEPTION(invalid_option, "invalid option");
-TRITON_CREATE_NVRTC_EXCEPTION(compilation, "compilation");
-TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure,
-                              "builtin operation failure");
-TRITON_CREATE_NVRTC_EXCEPTION(unknown_error, "unknown error");
-
-#undef TRITON_CREATE_NVRTC_EXCEPTION
-} // namespace nvrtc
-
-namespace cuda {
-class base : public std::exception {};
-
-#define TRITON_CREATE_CUDA_EXCEPTION(name, msg)                                \
-  class name : public base {                                                   \
-  public:                                                                      \
-    const char *what() const throw() override { return "CUDA: Error- " msg; }  \
-  }
-
-TRITON_CREATE_CUDA_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_CUDA_EXCEPTION(out_of_memory, "out of memory");
-TRITON_CREATE_CUDA_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_CUDA_EXCEPTION(deinitialized, "deinitialized");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled, "profiler disabled");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized,
-                             "profiler not initialized");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started,
-                             "profiler already started");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped,
-                             "profiler already stopped");
-TRITON_CREATE_CUDA_EXCEPTION(no_device, "no device");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_device, "invalid device");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_image, "invalid image");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_context, "invalid context");
-TRITON_CREATE_CUDA_EXCEPTION(context_already_current,
-                             "context already current");
-TRITON_CREATE_CUDA_EXCEPTION(map_failed, "map failed");
-TRITON_CREATE_CUDA_EXCEPTION(unmap_failed, "unmap failed");
-TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped, "array is mapped");
-TRITON_CREATE_CUDA_EXCEPTION(already_mapped, "already mapped");
-TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
-TRITON_CREATE_CUDA_EXCEPTION(already_acquired, "already acquired");
-TRITON_CREATE_CUDA_EXCEPTION(not_mapped, "not mapped");
-TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array, "not mapped as array");
-TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
-TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
-TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit, "unsupported limit");
-TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use, "context already in use");
-TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported,
-                             "peer access unsupported");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx, "invalid ptx");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context,
-                             "invalid graphics context");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_source, "invalid source");
-TRITON_CREATE_CUDA_EXCEPTION(file_not_found, "file not found");
-TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found,
-                             "shared object symbol not found");
-TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed,
-                             "shared object init failed");
-TRITON_CREATE_CUDA_EXCEPTION(operating_system, "operating system");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_handle, "invalid handle");
-TRITON_CREATE_CUDA_EXCEPTION(not_found, "not found");
-TRITON_CREATE_CUDA_EXCEPTION(not_ready, "not ready");
-TRITON_CREATE_CUDA_EXCEPTION(illegal_address, "illegal address");
-TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources,
-                             "launch out of resources");
-TRITON_CREATE_CUDA_EXCEPTION(launch_timeout, "launch timeout");
-TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing,
-                             "launch incompatible texturing");
-TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled,
-                             "peer access already enabled");
-TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled,
-                             "peer access not enabled");
-TRITON_CREATE_CUDA_EXCEPTION(primary_context_active, "primary context active");
-TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed, "context is destroyed");
-TRITON_CREATE_CUDA_EXCEPTION(assert_error, "assert");
-TRITON_CREATE_CUDA_EXCEPTION(too_many_peers, "too many peers");
-TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered,
-                             "host memory already registered");
-TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered,
-                             "hot memory not registered");
-TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error, "hardware stack error");
-TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction, "illegal instruction");
-TRITON_CREATE_CUDA_EXCEPTION(misaligned_address, "misaligned address");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space, "invalid address space");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_pc, "invalid pc");
-TRITON_CREATE_CUDA_EXCEPTION(launch_failed, "launch failed");
-TRITON_CREATE_CUDA_EXCEPTION(not_permitted, "not permitted");
-TRITON_CREATE_CUDA_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_CUDA_EXCEPTION(unknown, "unknown");
-
-#undef TRITON_CREATE_CUDA_EXCEPTION
-} // namespace cuda
-
-namespace cublas {
-class base : public std::exception {};
-
-#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg)                              \
-  class name : public base {                                                   \
-  public:                                                                      \
-    const char *what() const throw() override {                                \
-      return "CUBLAS: Error- " msg;                                            \
-    }                                                                          \
-  }
-
-TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed, "alloc failed");
-TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch, "arch mismatch");
-TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error, "mapping error");
-TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed");
-TRITON_CREATE_CUBLAS_EXCEPTION(internal_error, "internal error");
-TRITON_CREATE_CUBLAS_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_CUBLAS_EXCEPTION(license_error, "license error");
-TRITON_CREATE_CUBLAS_EXCEPTION(unknown, "unknown");
-
-#undef TRITON_CREATE_CUBLAS_EXCEPTION
-} // namespace cublas
-
-namespace cudnn {
-#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg)                               \
-  class name : public std::exception {                                         \
-  public:                                                                      \
-    const char *what() const throw() override { return "CUDNN: Error- " msg; } \
-  }
-
-TRITON_CREATE_CUDNN_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed, "allocation failed");
-TRITON_CREATE_CUDNN_EXCEPTION(bad_param, "bad param");
-TRITON_CREATE_CUDNN_EXCEPTION(internal_error, "internal error");
-TRITON_CREATE_CUDNN_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch, "arch mismatch");
-TRITON_CREATE_CUDNN_EXCEPTION(mapping_error, "mapping error");
-TRITON_CREATE_CUDNN_EXCEPTION(execution_failed, "execution failed");
-TRITON_CREATE_CUDNN_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_CUDNN_EXCEPTION(license_error, "license error");
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing,
-                              "prerequisite missing");
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress, "runtime in progress");
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow, "runtime fp overflow");
-} // namespace cudnn
-
-namespace hip {
-class base : public std::exception {};
-
-#define TRITON_CREATE_HIP_EXCEPTION(name, msg)                                 \
-  class name : public base {                                                   \
-  public:                                                                      \
-    const char *what() const throw() override { return "HIP: Error- " msg; }   \
-  }
-
-TRITON_CREATE_HIP_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_HIP_EXCEPTION(out_of_memory, "out of memory");
-TRITON_CREATE_HIP_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_HIP_EXCEPTION(deinitialized, "deinitialized");
-TRITON_CREATE_HIP_EXCEPTION(profiler_disabled, "profiler disabled");
-TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized,
-                            "profiler not initialized");
-TRITON_CREATE_HIP_EXCEPTION(profiler_already_started,
-                            "profiler already started");
-TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped,
-                            "profiler already stopped");
-TRITON_CREATE_HIP_EXCEPTION(no_device, "no device");
-TRITON_CREATE_HIP_EXCEPTION(invalid_device, "invalid device");
-TRITON_CREATE_HIP_EXCEPTION(invalid_image, "invalid image");
-TRITON_CREATE_HIP_EXCEPTION(invalid_context, "invalid context");
-TRITON_CREATE_HIP_EXCEPTION(context_already_current, "context already current");
-TRITON_CREATE_HIP_EXCEPTION(map_failed, "map failed");
-TRITON_CREATE_HIP_EXCEPTION(unmap_failed, "unmap failed");
-TRITON_CREATE_HIP_EXCEPTION(array_is_mapped, "array is mapped");
-TRITON_CREATE_HIP_EXCEPTION(already_mapped, "already mapped");
-TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
-TRITON_CREATE_HIP_EXCEPTION(already_acquired, "already acquired");
-TRITON_CREATE_HIP_EXCEPTION(not_mapped, "not mapped");
-TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array, "not mapped as array");
-TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
-TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
-TRITON_CREATE_HIP_EXCEPTION(unsupported_limit, "unsupported limit");
-TRITON_CREATE_HIP_EXCEPTION(context_already_in_use, "context already in use");
-TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported, "peer access unsupported");
-TRITON_CREATE_HIP_EXCEPTION(invalid_ptx, "invalid ptx");
-TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context,
-                            "invalid graphics context");
-TRITON_CREATE_HIP_EXCEPTION(invalid_source, "invalid source");
-TRITON_CREATE_HIP_EXCEPTION(file_not_found, "file not found");
-TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found,
-                            "shared object symbol not found");
-TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed,
-                            "shared object init failed");
-TRITON_CREATE_HIP_EXCEPTION(operating_system, "operating system");
-TRITON_CREATE_HIP_EXCEPTION(invalid_handle, "invalid handle");
-TRITON_CREATE_HIP_EXCEPTION(not_found, "not found");
-TRITON_CREATE_HIP_EXCEPTION(not_ready, "not ready");
-TRITON_CREATE_HIP_EXCEPTION(illegal_address, "illegal address");
-TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources, "launch out of resources");
-TRITON_CREATE_HIP_EXCEPTION(launch_timeout, "launch timeout");
-TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing,
-                            "launch incompatible texturing");
-TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled,
-                            "peer access already enabled");
-TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled, "peer access not enabled");
-TRITON_CREATE_HIP_EXCEPTION(primary_context_active, "primary context active");
-TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed, "context is destroyed");
-TRITON_CREATE_HIP_EXCEPTION(assert_error, "assert");
-TRITON_CREATE_HIP_EXCEPTION(too_many_peers, "too many peers");
-TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered,
-                            "host memory already registered");
-TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered,
-                            "hot memory not registered");
-TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error, "hardware stack error");
-TRITON_CREATE_HIP_EXCEPTION(illegal_instruction, "illegal instruction");
-TRITON_CREATE_HIP_EXCEPTION(misaligned_address, "misaligned address");
-TRITON_CREATE_HIP_EXCEPTION(invalid_address_space, "invalid address space");
-TRITON_CREATE_HIP_EXCEPTION(invalid_pc, "invalid pc");
-TRITON_CREATE_HIP_EXCEPTION(launch_failed, "launch failed");
-TRITON_CREATE_HIP_EXCEPTION(not_permitted, "not permitted");
-TRITON_CREATE_HIP_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_HIP_EXCEPTION(invalid_symbol, "invalid symbol");
-TRITON_CREATE_HIP_EXCEPTION(unknown, "unknown");
-
-#undef TRITON_CREATE_CUDA_EXCEPTION
-} // namespace hip
-
-} // namespace exception
-} // namespace driver
-} // namespace triton
-
-#endif
diff --git a/include/triton/driver/llvm.h b/include/triton/driver/llvm.h
deleted file mode 100644
index a46eb66b3..000000000
--- a/include/triton/driver/llvm.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "triton/external/CUDA/cuda.h"
-#include "triton/external/hip.h"
-#include <string>
-
-namespace llvm {
-class Module;
-}
-
-namespace triton {
-namespace driver {
-
-void init_llvm();
-std::string path_to_ptxas(int &version);
-std::string llir_to_ptx(llvm::Module *module, int cc, int version);
-std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas_path,
-                         int cc);
-CUmodule ptx_to_cumodule(const std::string &ptx, int cc);
-std::string llir_to_amdgpu(llvm::Module *module, const std::string &proc);
-hipModule_t amdgpu_to_hipmodule(const std::string &path);
-
-} // namespace driver
-} // namespace triton
diff --git a/include/triton/external/CUDA/cuda.h b/include/triton/external/CUDA/cuda.h
deleted file mode 100755
index f7bf9fc12..000000000
--- a/include/triton/external/CUDA/cuda.h
+++ /dev/null
@@ -1,18994 +0,0 @@
-/*
- * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO LICENSEE:
- *
- * This source code and/or documentation ("Licensed Deliverables") are
- * subject to NVIDIA intellectual property rights under U.S. and
- * international Copyright laws.
- *
- * These Licensed Deliverables contained herein is PROPRIETARY and
- * CONFIDENTIAL to NVIDIA and is being provided under the terms and
- * conditions of a form of NVIDIA software license agreement by and
- * between NVIDIA and Licensee ("License Agreement") or electronically
- * accepted by Licensee.  Notwithstanding any terms or conditions to
- * the contrary in the License Agreement, reproduction or disclosure
- * of the Licensed Deliverables to any third party without the express
- * written consent of NVIDIA is prohibited.
- *
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
- * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
- * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
- * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
- * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
- * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
- * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
- * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
- * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
- * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THESE LICENSED DELIVERABLES.
- *
- * U.S. Government End Users.  These Licensed Deliverables are a
- * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
- * 1995), consisting of "commercial computer software" and "commercial
- * computer software documentation" as such terms are used in 48
- * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
- * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
- * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
- * U.S. Government End Users acquire the Licensed Deliverables with
- * only those rights set forth herein.
- *
- * Any use of the Licensed Deliverables in individual and commercial
- * software must include, in the user documentation and internal
- * comments to the code, the above Disclaimer and U.S. Government End
- * Users Notice.
- */
-
-#ifndef __cuda_cuda_h__
-#define __cuda_cuda_h__
-
-#include <stdlib.h>
-#ifdef _MSC_VER
-typedef unsigned __int32 cuuint32_t;
-typedef unsigned __int64 cuuint64_t;
-#else
-#include <stdint.h>
-typedef uint32_t cuuint32_t;
-typedef uint64_t cuuint64_t;
-#endif
-
-#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
-#define __CUDA_DEPRECATED
-#elif defined(_MSC_VER)
-#define __CUDA_DEPRECATED __declspec(deprecated)
-#elif defined(__GNUC__)
-#define __CUDA_DEPRECATED __attribute__((deprecated))
-#else
-#define __CUDA_DEPRECATED
-#endif
-
-#if defined(CUDA_FORCE_API_VERSION)
-#error "CUDA_FORCE_API_VERSION is no longer supported."
-#endif
-
-#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
-    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
-    #define __CUDA_API_PTDS(api) api ## _ptds
-    #define __CUDA_API_PTSZ(api) api ## _ptsz
-#else
-    #define __CUDA_API_PTDS(api) api
-    #define __CUDA_API_PTSZ(api) api
-#endif
-
-#define cuDeviceTotalMem                    cuDeviceTotalMem_v2
-#define cuCtxCreate                         cuCtxCreate_v2
-#define cuCtxCreate_v3                      cuCtxCreate_v3
-#define cuModuleGetGlobal                   cuModuleGetGlobal_v2
-#define cuMemGetInfo                        cuMemGetInfo_v2
-#define cuMemAlloc                          cuMemAlloc_v2
-#define cuMemAllocPitch                     cuMemAllocPitch_v2
-#define cuMemFree                           cuMemFree_v2
-#define cuMemGetAddressRange                cuMemGetAddressRange_v2
-#define cuMemAllocHost                      cuMemAllocHost_v2
-#define cuMemHostGetDevicePointer           cuMemHostGetDevicePointer_v2
-#define cuMemcpyHtoD                        __CUDA_API_PTDS(cuMemcpyHtoD_v2)
-#define cuMemcpyDtoH                        __CUDA_API_PTDS(cuMemcpyDtoH_v2)
-#define cuMemcpyDtoD                        __CUDA_API_PTDS(cuMemcpyDtoD_v2)
-#define cuMemcpyDtoA                        __CUDA_API_PTDS(cuMemcpyDtoA_v2)
-#define cuMemcpyAtoD                        __CUDA_API_PTDS(cuMemcpyAtoD_v2)
-#define cuMemcpyHtoA                        __CUDA_API_PTDS(cuMemcpyHtoA_v2)
-#define cuMemcpyAtoH                        __CUDA_API_PTDS(cuMemcpyAtoH_v2)
-#define cuMemcpyAtoA                        __CUDA_API_PTDS(cuMemcpyAtoA_v2)
-#define cuMemcpyHtoAAsync                   __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
-#define cuMemcpyAtoHAsync                   __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
-#define cuMemcpy2D                          __CUDA_API_PTDS(cuMemcpy2D_v2)
-#define cuMemcpy2DUnaligned                 __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
-#define cuMemcpy3D                          __CUDA_API_PTDS(cuMemcpy3D_v2)
-#define cuMemcpyHtoDAsync                   __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
-#define cuMemcpyDtoHAsync                   __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
-#define cuMemcpyDtoDAsync                   __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
-#define cuMemcpy2DAsync                     __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
-#define cuMemcpy3DAsync                     __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
-#define cuMemsetD8                          __CUDA_API_PTDS(cuMemsetD8_v2)
-#define cuMemsetD16                         __CUDA_API_PTDS(cuMemsetD16_v2)
-#define cuMemsetD32                         __CUDA_API_PTDS(cuMemsetD32_v2)
-#define cuMemsetD2D8                        __CUDA_API_PTDS(cuMemsetD2D8_v2)
-#define cuMemsetD2D16                       __CUDA_API_PTDS(cuMemsetD2D16_v2)
-#define cuMemsetD2D32                       __CUDA_API_PTDS(cuMemsetD2D32_v2)
-#define cuArrayCreate                       cuArrayCreate_v2
-#define cuArrayGetDescriptor                cuArrayGetDescriptor_v2
-#define cuArray3DCreate                     cuArray3DCreate_v2
-#define cuArray3DGetDescriptor              cuArray3DGetDescriptor_v2
-#define cuTexRefSetAddress                  cuTexRefSetAddress_v2
-#define cuTexRefGetAddress                  cuTexRefGetAddress_v2
-#define cuGraphicsResourceGetMappedPointer  cuGraphicsResourceGetMappedPointer_v2
-#define cuCtxDestroy                        cuCtxDestroy_v2
-#define cuCtxPopCurrent                     cuCtxPopCurrent_v2
-#define cuCtxPushCurrent                    cuCtxPushCurrent_v2
-#define cuStreamDestroy                     cuStreamDestroy_v2
-#define cuEventDestroy                      cuEventDestroy_v2
-#define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v3
-#define cuLinkCreate                        cuLinkCreate_v2
-#define cuLinkAddData                       cuLinkAddData_v2
-#define cuLinkAddFile                       cuLinkAddFile_v2
-#define cuMemHostRegister                   cuMemHostRegister_v2
-#define cuGraphicsResourceSetMapFlags       cuGraphicsResourceSetMapFlags_v2
-#define cuStreamBeginCapture                __CUDA_API_PTSZ(cuStreamBeginCapture_v2)
-#define cuDevicePrimaryCtxRelease           cuDevicePrimaryCtxRelease_v2
-#define cuDevicePrimaryCtxReset             cuDevicePrimaryCtxReset_v2
-#define cuDevicePrimaryCtxSetFlags          cuDevicePrimaryCtxSetFlags_v2
-#define cuDeviceGetUuid_v2                  cuDeviceGetUuid_v2
-#define cuIpcOpenMemHandle                  cuIpcOpenMemHandle_v2
-#define cuGraphInstantiate                  cuGraphInstantiate_v2
-
-#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
-    #define cuMemcpy                            __CUDA_API_PTDS(cuMemcpy)
-    #define cuMemcpyAsync                       __CUDA_API_PTSZ(cuMemcpyAsync)
-    #define cuMemcpyPeer                        __CUDA_API_PTDS(cuMemcpyPeer)
-    #define cuMemcpyPeerAsync                   __CUDA_API_PTSZ(cuMemcpyPeerAsync)
-    #define cuMemcpy3DPeer                      __CUDA_API_PTDS(cuMemcpy3DPeer)
-    #define cuMemcpy3DPeerAsync                 __CUDA_API_PTSZ(cuMemcpy3DPeerAsync)
-    #define cuMemPrefetchAsync                  __CUDA_API_PTSZ(cuMemPrefetchAsync)
-
-    #define cuMemsetD8Async                     __CUDA_API_PTSZ(cuMemsetD8Async)
-    #define cuMemsetD16Async                    __CUDA_API_PTSZ(cuMemsetD16Async)
-    #define cuMemsetD32Async                    __CUDA_API_PTSZ(cuMemsetD32Async)
-    #define cuMemsetD2D8Async                   __CUDA_API_PTSZ(cuMemsetD2D8Async)
-    #define cuMemsetD2D16Async                  __CUDA_API_PTSZ(cuMemsetD2D16Async)
-    #define cuMemsetD2D32Async                  __CUDA_API_PTSZ(cuMemsetD2D32Async)
-
-    #define cuStreamGetPriority                 __CUDA_API_PTSZ(cuStreamGetPriority)
-    #define cuStreamGetFlags                    __CUDA_API_PTSZ(cuStreamGetFlags)
-    #define cuStreamGetCtx                      __CUDA_API_PTSZ(cuStreamGetCtx)
-    #define cuStreamWaitEvent                   __CUDA_API_PTSZ(cuStreamWaitEvent)
-    #define cuStreamEndCapture                  __CUDA_API_PTSZ(cuStreamEndCapture)
-    #define cuStreamIsCapturing                 __CUDA_API_PTSZ(cuStreamIsCapturing)
-    #define cuStreamGetCaptureInfo              __CUDA_API_PTSZ(cuStreamGetCaptureInfo)
-    #define cuStreamGetCaptureInfo_v2           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
-    #define cuStreamUpdateCaptureDependencies   __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies)
-    #define cuStreamAddCallback                 __CUDA_API_PTSZ(cuStreamAddCallback)
-    #define cuStreamAttachMemAsync              __CUDA_API_PTSZ(cuStreamAttachMemAsync)
-    #define cuStreamQuery                       __CUDA_API_PTSZ(cuStreamQuery)
-    #define cuStreamSynchronize                 __CUDA_API_PTSZ(cuStreamSynchronize)
-    #define cuEventRecord                       __CUDA_API_PTSZ(cuEventRecord)
-    #define cuEventRecordWithFlags              __CUDA_API_PTSZ(cuEventRecordWithFlags)
-    #define cuLaunchKernel                      __CUDA_API_PTSZ(cuLaunchKernel)
-    #define cuLaunchHostFunc                    __CUDA_API_PTSZ(cuLaunchHostFunc)
-    #define cuGraphicsMapResources              __CUDA_API_PTSZ(cuGraphicsMapResources)
-    #define cuGraphicsUnmapResources            __CUDA_API_PTSZ(cuGraphicsUnmapResources)
-
-    #define cuStreamWriteValue32                __CUDA_API_PTSZ(cuStreamWriteValue32)
-    #define cuStreamWaitValue32                 __CUDA_API_PTSZ(cuStreamWaitValue32)
-    #define cuStreamWriteValue64                __CUDA_API_PTSZ(cuStreamWriteValue64)
-    #define cuStreamWaitValue64                 __CUDA_API_PTSZ(cuStreamWaitValue64)
-    #define cuStreamBatchMemOp                  __CUDA_API_PTSZ(cuStreamBatchMemOp)
-
-    #define cuLaunchCooperativeKernel           __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
-
-    #define cuSignalExternalSemaphoresAsync     __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
-    #define cuWaitExternalSemaphoresAsync       __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)
-
-    #define cuGraphUpload                       __CUDA_API_PTSZ(cuGraphUpload)
-    #define cuGraphLaunch                       __CUDA_API_PTSZ(cuGraphLaunch)
-    #define cuStreamCopyAttributes              __CUDA_API_PTSZ(cuStreamCopyAttributes)
-    #define cuStreamGetAttribute                __CUDA_API_PTSZ(cuStreamGetAttribute)
-    #define cuStreamSetAttribute                __CUDA_API_PTSZ(cuStreamSetAttribute)
-    #define cuMemMapArrayAsync                  __CUDA_API_PTSZ(cuMemMapArrayAsync)
-
-    #define cuMemFreeAsync                      __CUDA_API_PTSZ(cuMemFreeAsync)
-    #define cuMemAllocAsync                     __CUDA_API_PTSZ(cuMemAllocAsync)
-    #define cuMemAllocFromPoolAsync             __CUDA_API_PTSZ(cuMemAllocFromPoolAsync)
-#endif
-
-/**
- * \file cuda.h
- * \brief Header file for the CUDA Toolkit application programming interface.
- *
- * \file cudaGL.h
- * \brief Header file for the OpenGL interoperability functions of the
- * low-level CUDA driver application programming interface.
- *
- * \file cudaD3D9.h
- * \brief Header file for the Direct3D 9 interoperability functions of the
- * low-level CUDA driver application programming interface.
- */
-
-/**
- * \defgroup CUDA_TYPES Data types used by CUDA driver
- * @{
- */
-
-/**
- * CUDA API version number
- */
-#define CUDA_VERSION 11050
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * CUDA device pointer
- * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
- */
-#if defined(_WIN64) || defined(__LP64__)
-typedef unsigned long long CUdeviceptr_v2;
-#else
-typedef unsigned int CUdeviceptr_v2;
-#endif
-typedef CUdeviceptr_v2 CUdeviceptr;                          /**< CUDA device pointer */
-
-typedef int CUdevice_v1;                                     /**< CUDA device */
-typedef CUdevice_v1 CUdevice;                                /**< CUDA device */
-typedef struct CUctx_st *CUcontext;                          /**< CUDA context */
-typedef struct CUmod_st *CUmodule;                           /**< CUDA module */
-typedef struct CUfunc_st *CUfunction;                        /**< CUDA function */
-typedef struct CUarray_st *CUarray;                          /**< CUDA array */
-typedef struct CUmipmappedArray_st *CUmipmappedArray;        /**< CUDA mipmapped array */
-typedef struct CUtexref_st *CUtexref;                        /**< CUDA texture reference */
-typedef struct CUsurfref_st *CUsurfref;                      /**< CUDA surface reference */
-typedef struct CUevent_st *CUevent;                          /**< CUDA event */
-typedef struct CUstream_st *CUstream;                        /**< CUDA stream */
-typedef struct CUgraphicsResource_st *CUgraphicsResource;    /**< CUDA graphics interop resource */
-typedef unsigned long long CUtexObject_v1;                   /**< An opaque value that represents a CUDA texture object */
-typedef CUtexObject_v1 CUtexObject;                          /**< An opaque value that represents a CUDA texture object */
-typedef unsigned long long CUsurfObject_v1;                  /**< An opaque value that represents a CUDA surface object */
-typedef CUsurfObject_v1 CUsurfObject;                        /**< An opaque value that represents a CUDA surface object */ 
-typedef struct CUextMemory_st *CUexternalMemory;             /**< CUDA external memory */
-typedef struct CUextSemaphore_st *CUexternalSemaphore;       /**< CUDA external semaphore */
-typedef struct CUgraph_st *CUgraph;                          /**< CUDA graph */
-typedef struct CUgraphNode_st *CUgraphNode;                  /**< CUDA graph node */
-typedef struct CUgraphExec_st *CUgraphExec;                  /**< CUDA executable graph */
-typedef struct CUmemPoolHandle_st *CUmemoryPool;             /**< CUDA memory pool */
-typedef struct CUuserObject_st *CUuserObject;                /**< CUDA user object for graphs */
-
-#ifndef CU_UUID_HAS_BEEN_DEFINED
-#define CU_UUID_HAS_BEEN_DEFINED
-typedef struct CUuuid_st {                                /**< CUDA definition of UUID */
-    char bytes[16];
-} CUuuid;
-#endif
-
-/**
- * CUDA IPC handle size
- */
-#define CU_IPC_HANDLE_SIZE 64
-
-/**
- * CUDA IPC event handle
- */
-typedef struct CUipcEventHandle_st {
-    char reserved[CU_IPC_HANDLE_SIZE];
-} CUipcEventHandle_v1;
-typedef CUipcEventHandle_v1 CUipcEventHandle;
-
-/**
- * CUDA IPC mem handle
- */
-typedef struct CUipcMemHandle_st {
-    char reserved[CU_IPC_HANDLE_SIZE];
-} CUipcMemHandle_v1;
-typedef CUipcMemHandle_v1 CUipcMemHandle;
-
-/**
- * CUDA Ipc Mem Flags
- */
-typedef enum CUipcMem_flags_enum {
-    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
-} CUipcMem_flags;
-
-
-/**
- * CUDA Mem Attach Flags
- */
-typedef enum CUmemAttach_flags_enum {
-    CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */
-    CU_MEM_ATTACH_HOST   = 0x2, /**< Memory cannot be accessed by any stream on any device */
-    CU_MEM_ATTACH_SINGLE = 0x4  /**< Memory can only be accessed by a single stream on the associated device */
-} CUmemAttach_flags;
-
-/**
- * Context creation flags
- */
-typedef enum CUctx_flags_enum {
-    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
-    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
-    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
-    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
-    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling
-                                         *  \deprecated This flag was deprecated as of CUDA 4.0
-                                         *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
-    CU_CTX_SCHED_MASK          = 0x07,
-    CU_CTX_MAP_HOST            = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 
-                                         *  and it no longer has any effect. All contexts 
-                                         *  as of CUDA 3.2 behave as though the flag is enabled. */
-    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
-    CU_CTX_FLAGS_MASK          = 0x1f
-} CUctx_flags;
-
-/**
- * Stream creation flags
- */
-typedef enum CUstream_flags_enum {
-    CU_STREAM_DEFAULT             = 0x0, /**< Default stream flag */
-    CU_STREAM_NON_BLOCKING        = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
-} CUstream_flags;
-
-/**
- * Legacy stream handle
- *
- * Stream handle that can be passed as a CUstream to use an implicit stream
- * with legacy synchronization behavior.
- *
- * See details of the \link_sync_behavior
- */
-#define CU_STREAM_LEGACY     ((CUstream)0x1)
-
-/**
- * Per-thread stream handle
- *
- * Stream handle that can be passed as a CUstream to use an implicit stream
- * with per-thread synchronization behavior.
- *
- * See details of the \link_sync_behavior
- */
-#define CU_STREAM_PER_THREAD ((CUstream)0x2)
-
-/**
- * Event creation flags
- */
-typedef enum CUevent_flags_enum {
-    CU_EVENT_DEFAULT        = 0x0, /**< Default event flag */
-    CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
-    CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
-    CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
-} CUevent_flags;
-
-/**
- * Event record flags
- */
-typedef enum CUevent_record_flags_enum {
-    CU_EVENT_RECORD_DEFAULT  = 0x0, /**< Default event record flag */
-    CU_EVENT_RECORD_EXTERNAL = 0x1  /**< When using stream capture, create an event record node
-                                      *  instead of the default behavior.  This flag is invalid
-                                      *  when used outside of capture. */
-} CUevent_record_flags;
-
-/**
- * Event wait flags
- */
-typedef enum CUevent_wait_flags_enum {
-    CU_EVENT_WAIT_DEFAULT  = 0x0, /**< Default event wait flag */
-    CU_EVENT_WAIT_EXTERNAL = 0x1  /**< When using stream capture, create an event wait node
-                                    *  instead of the default behavior.  This flag is invalid
-                                    *  when used outside of capture.*/
-} CUevent_wait_flags;
-
-/**
- * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64
- */
-typedef enum CUstreamWaitValue_flags_enum {
-    CU_STREAM_WAIT_VALUE_GEQ   = 0x0,   /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit
-                                             values). Note this is a cyclic comparison which ignores wraparound.
-                                             (Default behavior.) */
-    CU_STREAM_WAIT_VALUE_EQ    = 0x1,   /**< Wait until *addr == value. */
-    CU_STREAM_WAIT_VALUE_AND   = 0x2,   /**< Wait until (*addr & value) != 0. */
-    CU_STREAM_WAIT_VALUE_NOR   = 0x3,   /**< Wait until ~(*addr | value) != 0. Support for this operation can be
-                                             queried with ::cuDeviceGetAttribute() and
-                                             ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/
-    CU_STREAM_WAIT_VALUE_FLUSH = 1<<30  /**< Follow the wait operation with a flush of outstanding remote writes. This
-                                             means that, if a remote write operation is guaranteed to have reached the
-                                             device before the wait can be satisfied, that write is guaranteed to be
-                                             visible to downstream device work. The device is permitted to reorder
-                                             remote writes internally. For example, this flag would be required if
-                                             two remote writes arrive in a defined order, the wait is satisfied by the
-                                             second write, and downstream work needs to observe the first write.
-                                             Support for this operation is restricted to selected platforms and can be
-                                             queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/
-} CUstreamWaitValue_flags;
-
-/**
- * Flags for ::cuStreamWriteValue32
- */
-typedef enum CUstreamWriteValue_flags_enum {
-    CU_STREAM_WRITE_VALUE_DEFAULT           = 0x0, /**< Default behavior */
-    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1  /**< Permits the write to be reordered with writes which were issued
-                                                        before it, as a performance optimization. Normally,
-                                                        ::cuStreamWriteValue32 will provide a memory fence before the
-                                                        write, which has similar semantics to
-                                                        __threadfence_system() but is scoped to the stream
-                                                        rather than a CUDA thread. */
-} CUstreamWriteValue_flags;
-
-/**
- * Operations for ::cuStreamBatchMemOp
- */
-typedef enum CUstreamBatchMemOpType_enum {
-    CU_STREAM_MEM_OP_WAIT_VALUE_32  = 1,     /**< Represents a ::cuStreamWaitValue32 operation */
-    CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,     /**< Represents a ::cuStreamWriteValue32 operation */
-    CU_STREAM_MEM_OP_WAIT_VALUE_64  = 4,     /**< Represents a ::cuStreamWaitValue64 operation */
-    CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5,     /**< Represents a ::cuStreamWriteValue64 operation */
-    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
-                                                  standalone operation. */
-} CUstreamBatchMemOpType;
-
-/**
- * Per-operation parameters for ::cuStreamBatchMemOp
- */
-typedef union CUstreamBatchMemOpParams_union {
-    CUstreamBatchMemOpType operation;
-    struct CUstreamMemOpWaitValueParams_st {
-        CUstreamBatchMemOpType operation;
-        CUdeviceptr address;
-        union {
-            cuuint32_t value;
-            cuuint64_t value64;
-        };
-        unsigned int flags;
-        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
-    } waitValue;
-    struct CUstreamMemOpWriteValueParams_st {
-        CUstreamBatchMemOpType operation;
-        CUdeviceptr address;
-        union {
-            cuuint32_t value;
-            cuuint64_t value64;
-        };
-        unsigned int flags;
-        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
-    } writeValue;
-    struct CUstreamMemOpFlushRemoteWritesParams_st {
-        CUstreamBatchMemOpType operation;
-        unsigned int flags;
-    } flushRemoteWrites;
-    cuuint64_t pad[6];
-} CUstreamBatchMemOpParams_v1;
-typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams;
-
-/**
- * Occupancy calculator flag
- */
-typedef enum CUoccupancy_flags_enum {
-    CU_OCCUPANCY_DEFAULT                  = 0x0, /**< Default behavior */
-    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1  /**< Assume global caching is enabled and cannot be automatically turned off */
-} CUoccupancy_flags;
-
-/**
- * Flags for ::cuStreamUpdateCaptureDependencies
- */
-typedef enum CUstreamUpdateCaptureDependencies_flags_enum {
-    CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */
-    CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1  /**< Replace the dependency set with the new nodes */
-} CUstreamUpdateCaptureDependencies_flags;
-
-/**
- * Array formats
- */
-typedef enum CUarray_format_enum {
-    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
-    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
-    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
-    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
-    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
-    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
-    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
-    CU_AD_FORMAT_FLOAT          = 0x20, /**< 32-bit floating point */
-    CU_AD_FORMAT_NV12           = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
-    CU_AD_FORMAT_UNORM_INT8X1   = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
-    CU_AD_FORMAT_UNORM_INT8X2   = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
-    CU_AD_FORMAT_UNORM_INT8X4   = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
-    CU_AD_FORMAT_UNORM_INT16X1  = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
-    CU_AD_FORMAT_UNORM_INT16X2  = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
-    CU_AD_FORMAT_UNORM_INT16X4  = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
-    CU_AD_FORMAT_SNORM_INT8X1   = 0xc6, /**< 1 channel signed 8-bit normalized integer */
-    CU_AD_FORMAT_SNORM_INT8X2   = 0xc7, /**< 2 channel signed 8-bit normalized integer */
-    CU_AD_FORMAT_SNORM_INT8X4   = 0xc8, /**< 4 channel signed 8-bit normalized integer */
-    CU_AD_FORMAT_SNORM_INT16X1  = 0xc9, /**< 1 channel signed 16-bit normalized integer */
-    CU_AD_FORMAT_SNORM_INT16X2  = 0xca, /**< 2 channel signed 16-bit normalized integer */
-    CU_AD_FORMAT_SNORM_INT16X4  = 0xcb, /**< 4 channel signed 16-bit normalized integer */
-    CU_AD_FORMAT_BC1_UNORM      = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
-    CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
-    CU_AD_FORMAT_BC2_UNORM      = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
-    CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
-    CU_AD_FORMAT_BC3_UNORM      = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
-    CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
-    CU_AD_FORMAT_BC4_UNORM      = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
-    CU_AD_FORMAT_BC4_SNORM      = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
-    CU_AD_FORMAT_BC5_UNORM      = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
-    CU_AD_FORMAT_BC5_SNORM      = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
-    CU_AD_FORMAT_BC6H_UF16      = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
-    CU_AD_FORMAT_BC6H_SF16      = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
-    CU_AD_FORMAT_BC7_UNORM      = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
-    CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e  /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
-} CUarray_format;
-
-/**
- * Texture reference addressing modes
- */
-typedef enum CUaddress_mode_enum {
-    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
-    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
-    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
-    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
-} CUaddress_mode;
-
-/**
- * Texture reference filtering modes
- */
-typedef enum CUfilter_mode_enum {
-    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
-    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
-} CUfilter_mode;
-
-/**
- * Device properties
- */
-typedef enum CUdevice_attribute_enum {
-    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,                          /**< Maximum number of threads per block */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                                /**< Maximum block dimension X */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                                /**< Maximum block dimension Y */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                                /**< Maximum block dimension Z */
-    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                                 /**< Maximum grid dimension X */
-    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                                 /**< Maximum grid dimension Y */
-    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                                 /**< Maximum grid dimension Z */
-    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,                    /**< Maximum shared memory available per block in bytes */
-    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,                        /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
-    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,                          /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
-    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                                     /**< Warp size in threads */
-    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                                     /**< Maximum pitch in bytes allowed by memory copies */
-    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,                       /**< Maximum number of 32-bit registers available per block */
-    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,                           /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
-    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                                    /**< Typical clock frequency in kilohertz */
-    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                             /**< Alignment requirement for textures */
-    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                                   /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
-    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,                          /**< Number of multiprocessors on device */
-    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,                           /**< Specifies whether there is a run time limit on kernels */
-    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                                    /**< Device is integrated with host memory */
-    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,                           /**< Device can map host memory into CUDA address space */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                                  /**< Compute mode (See ::CUcomputemode for details) */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,                       /**< Maximum 1D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,                       /**< Maximum 2D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,                      /**< Maximum 2D texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,                       /**< Maximum 3D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,                      /**< Maximum 3D texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,                       /**< Maximum 3D texture depth */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,               /**< Maximum 2D layered texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,              /**< Maximum 2D layered texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,              /**< Maximum layers in a 2D layered texture */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,                 /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,                /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,             /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
-    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                             /**< Alignment requirement for surfaces */
-    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                            /**< Device can possibly execute multiple kernels concurrently */
-    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                                   /**< Device has ECC support enabled */
-    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                                    /**< PCI bus ID of the device */
-    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                                 /**< PCI device ID of the device */
-    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                                    /**< Device is using TCC driver model */
-    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                             /**< Peak memory clock frequency in kilohertz */
-    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,                       /**< Global memory bus width in bits */
-    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                                 /**< Size of L2 cache in bytes */
-    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,                /**< Maximum resident threads per multiprocessor */
-    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                            /**< Number of asynchronous engines */
-    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                            /**< Device shares a unified address space with the host */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,               /**< Maximum 1D layered texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,              /**< Maximum layers in a 1D layered texture */
-    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                              /**< Deprecated, do not use. */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,                /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,               /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,             /**< Alternate maximum 3D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,            /**< Alternate maximum 3D texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,             /**< Alternate maximum 3D texture depth */
-    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                                 /**< PCI domain ID of the device */
-    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,                       /**< Pitch alignment requirement for textures */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,                  /**< Maximum cubemap texture width/height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,          /**< Maximum cubemap layered texture width/height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,         /**< Maximum layers in a cubemap layered texture */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,                       /**< Maximum 1D surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,                       /**< Maximum 2D surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,                      /**< Maximum 2D surface height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,                       /**< Maximum 3D surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,                      /**< Maximum 3D surface height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,                       /**< Maximum 3D surface depth */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,               /**< Maximum 1D layered surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,              /**< Maximum layers in a 1D layered surface */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,               /**< Maximum 2D layered surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,              /**< Maximum 2D layered surface height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,              /**< Maximum layers in a 2D layered surface */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,                  /**< Maximum cubemap surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,          /**< Maximum cubemap layered surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,         /**< Maximum layers in a cubemap layered surface */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,                /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,                /**< Maximum 2D linear texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,               /**< Maximum 2D linear texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,                /**< Maximum 2D linear texture pitch in bytes */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,             /**< Maximum mipmapped 2D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,            /**< Maximum mipmapped 2D texture height */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,                      /**< Major compute capability version number */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,                      /**< Minor compute capability version number */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,             /**< Maximum mipmapped 1D texture width */
-    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,                   /**< Device supports stream priorities */
-    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,                     /**< Device supports caching globals in L1 */
-    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,                      /**< Device supports caching locals in L1 */
-    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,          /**< Maximum shared memory available per multiprocessor in bytes */
-    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,              /**< Maximum number of 32-bit registers available per multiprocessor */
-    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                                /**< Device can allocate managed memory on this system */
-    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                               /**< Device is on a multi-GPU board */
-    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,                      /**< Unique id for a group of devices on the same multi-GPU board */
-    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,                  /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
-    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,         /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
-    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,                        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
-    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,                     /**< Device can coherently access managed memory concurrently with the CPU */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,                  /**< Device supports compute preemption. */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,       /**< Device can access host registered memory at the same virtual address as the CPU */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,                        /**< ::cuStreamBatchMemOp and related APIs are supported. */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,                 /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,                 /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */
-    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                            /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
-    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,               /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */
-    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,             /**< Maximum optin shared memory per block */
-    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,                       /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
-    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,                       /**< Device supports host memory registration via ::cudaHostRegister. */
-    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
-    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,          /**< The host can directly access managed memory on the device without migration. */
-    CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,         /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/
-    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,         /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
-    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,  /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
-    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,           /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
-    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,       /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,                /**< Maximum number of blocks per multiprocessor */
-    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,                /**< Device supports compression of memory */
-    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,                 /**< Maximum L2 persisting lines capacity setting in bytes. */
-    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,                /**< Maximum value of CUaccessPolicyWindow::num_bytes. */
-    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,      /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
-    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,             /**< Shared memory reserved by CUDA driver per block in bytes */
-    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,                  /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
-    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,            /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
-    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,         /**< External timeline semaphore interop is supported on the device */
-    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,                       /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
-    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,                    /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
-    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,         /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
-    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,              /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
-    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,               /**< Handle types supported with mempool based IPC */
-    CU_DEVICE_ATTRIBUTE_MAX
-} CUdevice_attribute;
-
-/**
- * Legacy device properties
- */
-typedef struct CUdevprop_st {
-    int maxThreadsPerBlock;     /**< Maximum number of threads per block */
-    int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
-    int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
-    int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
-    int totalConstantMemory;    /**< Constant memory available on device in bytes */
-    int SIMDWidth;              /**< Warp size in threads */
-    int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
-    int regsPerBlock;           /**< 32-bit registers available per block */
-    int clockRate;              /**< Clock frequency in kilohertz */
-    int textureAlign;           /**< Alignment requirement for textures */
-} CUdevprop_v1;
-typedef CUdevprop_v1 CUdevprop;
-
-/**
- * Pointer information
- */
-typedef enum CUpointer_attribute_enum {
-    CU_POINTER_ATTRIBUTE_CONTEXT = 1,                     /**< The ::CUcontext on which a pointer was allocated or registered */
-    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,                 /**< The ::CUmemorytype describing the physical location of a pointer */
-    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,              /**< The address at which a pointer's memory may be accessed on the device */
-    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,                /**< The address at which a pointer's memory may be accessed on the host */
-    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,                  /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */
-    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,                 /**< Synchronize every synchronous memory operation initiated on this region */
-    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,                   /**< A process-wide unique ID for an allocated memory region*/
-    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,                  /**< Indicates if the pointer points to managed memory */
-    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9,              /**< A device ordinal of a device on which a pointer was allocated or registered */
-    CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/
-    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11,           /**< Starting address for this requested pointer */
-    CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12,                 /**< Size of the address range for this requested pointer */
-    CU_POINTER_ATTRIBUTE_MAPPED = 13,                     /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/
-    CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14,       /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/
-    CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/
-    CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16,               /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */
-    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17              /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/
-} CUpointer_attribute;
-
-/**
- * Function properties
- */
-typedef enum CUfunction_attribute_enum {
-    /**
-     * The maximum number of threads per block, beyond which a launch of the
-     * function would fail. This number depends on both the function and the
-     * device on which the function is currently loaded.
-     */
-    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
-
-    /**
-     * The size in bytes of statically-allocated shared memory required by
-     * this function. This does not include dynamically-allocated shared
-     * memory requested by the user at runtime.
-     */
-    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
-
-    /**
-     * The size in bytes of user-allocated constant memory required by this
-     * function.
-     */
-    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
-
-    /**
-     * The size in bytes of local memory used by each thread of this function.
-     */
-    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
-
-    /**
-     * The number of registers used by each thread of this function.
-     */
-    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
-
-    /**
-     * The PTX virtual architecture version for which the function was
-     * compiled. This value is the major PTX version * 10 + the minor PTX
-     * version, so a PTX version 1.3 function would return the value 13.
-     * Note that this may return the undefined value of 0 for cubins
-     * compiled prior to CUDA 3.0.
-     */
-    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
-
-    /**
-     * The binary architecture version for which the function was compiled.
-     * This value is the major binary version * 10 + the minor binary version,
-     * so a binary version 1.3 function would return the value 13. Note that
-     * this will return a value of 10 for legacy cubins that do not have a
-     * properly-encoded binary architecture version.
-     */
-    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
-
-    /**
-     * The attribute to indicate whether the function has been compiled with
-     * user specified option "-Xptxas --dlcm=ca" set .
-     */
-    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
-
-    /**
-     * The maximum size in bytes of dynamically-allocated shared memory that can be used by
-     * this function. If the user-specified dynamic shared memory size is larger than this
-     * value, the launch will fail.
-     * See ::cuFuncSetAttribute
-     */
-    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
-
-    /**
-     * On devices where the L1 cache and shared memory use the same hardware resources, 
-     * this sets the shared memory carveout preference, in percent of the total shared memory.
-     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
-     * This is only a hint, and the driver can choose a different ratio if required to execute the function.
-     * See ::cuFuncSetAttribute
-     */
-    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
-
-    CU_FUNC_ATTRIBUTE_MAX
-} CUfunction_attribute;
-
-/**
- * Function cache configurations
- */
-typedef enum CUfunc_cache_enum {
-    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
-    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
-    CU_FUNC_CACHE_PREFER_L1      = 0x02, /**< prefer larger L1 cache and smaller shared memory */
-    CU_FUNC_CACHE_PREFER_EQUAL   = 0x03  /**< prefer equal sized L1 cache and shared memory */
-} CUfunc_cache;
-
-/**
- * Shared memory configurations
- */
-typedef enum CUsharedconfig_enum {
-    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< set default shared memory bank size */
-    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< set shared memory bank width to four bytes */
-    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< set shared memory bank width to eight bytes */
-} CUsharedconfig;
-
-/**
- * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute
- */
-typedef enum CUshared_carveout_enum {
-    CU_SHAREDMEM_CARVEOUT_DEFAULT       = -1,  /**< No preference for shared memory or L1 (default) */
-    CU_SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, /**< Prefer maximum available shared memory, minimum L1 cache */
-    CU_SHAREDMEM_CARVEOUT_MAX_L1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory */
-} CUshared_carveout;
-
-/**
- * Memory types
- */
-typedef enum CUmemorytype_enum {
-    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
-    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
-    CU_MEMORYTYPE_ARRAY   = 0x03,    /**< Array memory */
-    CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
-} CUmemorytype;
-
-/**
- * Compute Modes
- */
-typedef enum CUcomputemode_enum {
-    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default compute mode (Multiple contexts allowed per device) */
-    CU_COMPUTEMODE_PROHIBITED        = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
-    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
-} CUcomputemode;
-
-/**
- * Memory advise values
- */
-typedef enum CUmem_advise_enum {
-    CU_MEM_ADVISE_SET_READ_MOSTLY          = 1, /**< Data will mostly be read and only occassionally be written to */
-    CU_MEM_ADVISE_UNSET_READ_MOSTLY        = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
-    CU_MEM_ADVISE_SET_PREFERRED_LOCATION   = 3, /**< Set the preferred location for the data as the specified device */
-    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */
-    CU_MEM_ADVISE_SET_ACCESSED_BY          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
-    CU_MEM_ADVISE_UNSET_ACCESSED_BY        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
-} CUmem_advise;
-
-typedef enum CUmem_range_attribute_enum {
-    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY            = 1, /**< Whether the range will mostly be read and only occassionally be written to */
-    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION     = 2, /**< The preferred location of the range */
-    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY            = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */
-    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4  /**< The last location to which the range was prefetched */
-} CUmem_range_attribute;
-
-/**
- * Online compiler and linker options
- */
-typedef enum CUjit_option_enum
-{
-    /**
-     * Max number of registers that a thread may use.\n
-     * Option type: unsigned int\n
-     * Applies to: compiler only
-     */
-    CU_JIT_MAX_REGISTERS = 0,
-
-    /**
-     * IN: Specifies minimum number of threads per block to target compilation
-     * for\n
-     * OUT: Returns the number of threads the compiler actually targeted.
-     * This restricts the resource utilization fo the compiler (e.g. max
-     * registers) such that a block with the given number of threads should be
-     * able to launch based on register limitations. Note, this option does not
-     * currently take into account any other resource limitations, such as
-     * shared memory utilization.\n
-     * Cannot be combined with ::CU_JIT_TARGET.\n
-     * Option type: unsigned int\n
-     * Applies to: compiler only
-     */
-    CU_JIT_THREADS_PER_BLOCK,
-
-    /**
-     * Overwrites the option value with the total wall clock time, in
-     * milliseconds, spent in the compiler and linker\n
-     * Option type: float\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_WALL_TIME,
-
-    /**
-     * Pointer to a buffer in which to print any log messages
-     * that are informational in nature (the buffer size is specified via
-     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
-     * Option type: char *\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_INFO_LOG_BUFFER,
-
-    /**
-     * IN: Log buffer size in bytes.  Log messages will be capped at this size
-     * (including null terminator)\n
-     * OUT: Amount of log buffer filled with messages\n
-     * Option type: unsigned int\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
-
-    /**
-     * Pointer to a buffer in which to print any log messages that
-     * reflect errors (the buffer size is specified via option
-     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
-     * Option type: char *\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_ERROR_LOG_BUFFER,
-
-    /**
-     * IN: Log buffer size in bytes.  Log messages will be capped at this size
-     * (including null terminator)\n
-     * OUT: Amount of log buffer filled with messages\n
-     * Option type: unsigned int\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-
-    /**
-     * Level of optimizations to apply to generated code (0 - 4), with 4
-     * being the default and highest level of optimizations.\n
-     * Option type: unsigned int\n
-     * Applies to: compiler only
-     */
-    CU_JIT_OPTIMIZATION_LEVEL,
-
-    /**
-     * No option value required. Determines the target based on the current
-     * attached context (default)\n
-     * Option type: No option value needed\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_TARGET_FROM_CUCONTEXT,
-
-    /**
-     * Target is chosen based on supplied ::CUjit_target.  Cannot be
-     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
-     * Option type: unsigned int for enumerated type ::CUjit_target\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_TARGET,
-
-    /**
-     * Specifies choice of fallback strategy if matching cubin is not found.
-     * Choice is based on supplied ::CUjit_fallback.  This option cannot be
-     * used with cuLink* APIs as the linker requires exact matches.\n
-     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
-     * Applies to: compiler only
-     */
-    CU_JIT_FALLBACK_STRATEGY,
-
-    /**
-     * Specifies whether to create debug information in output (-g)
-     * (0: false, default)\n
-     * Option type: int\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_GENERATE_DEBUG_INFO,
-
-    /**
-     * Generate verbose log messages (0: false, default)\n
-     * Option type: int\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_LOG_VERBOSE,
-
-    /**
-     * Generate line number information (-lineinfo) (0: false, default)\n
-     * Option type: int\n
-     * Applies to: compiler only
-     */
-    CU_JIT_GENERATE_LINE_INFO,
-
-    /**
-     * Specifies whether to enable caching explicitly (-dlcm) \n
-     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
-     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
-     * Applies to: compiler only
-     */
-    CU_JIT_CACHE_MODE,
-
-    /**
-     * The below jit options are used for internal purposes only, in this version of CUDA
-     */
-    CU_JIT_NEW_SM3X_OPT,
-    CU_JIT_FAST_COMPILE,
-
-    /**
-     * Array of device symbol names that will be relocated to the corresponing
-     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
-     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
-     * When loding a device module, driver will relocate all encountered
-     * unresolved symbols to the host addresses.\n
-     * It is only allowed to register symbols that correspond to unresolved
-     * global variables.\n
-     * It is illegal to register the same device symbol at multiple addresses.\n
-     * Option type: const char **\n
-     * Applies to: dynamic linker only
-     */
-    CU_JIT_GLOBAL_SYMBOL_NAMES,
-
-    /**
-     * Array of host addresses that will be used to relocate corresponding
-     * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
-     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
-     * Option type: void **\n
-     * Applies to: dynamic linker only
-     */
-    CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
-
-    /**
-     * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
-     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
-     * Option type: unsigned int\n
-     * Applies to: dynamic linker only
-     */
-    CU_JIT_GLOBAL_SYMBOL_COUNT,
-
-    /**
-     * Enable link-time optimization (-dlto) for device code (0: false, default)\n
-     * Option type: int\n
-     * Applies to: compiler and linker
-     */
-    CU_JIT_LTO,
-
-    /**
-     * Control single-precision denormals (-ftz) support (0: false, default).
-     * 1 : flushes denormal values to zero
-     * 0 : preserves denormal values
-     * Option type: int\n
-     * Applies to: link-time optimization specified with CU_JIT_LTO
-     */
-    CU_JIT_FTZ,
-
-    /**
-     * Control single-precision floating-point division and reciprocals
-     * (-prec-div) support (1: true, default).
-     * 1 : Enables the IEEE round-to-nearest mode
-     * 0 : Enables the fast approximation mode
-     * Option type: int\n
-     * Applies to: link-time optimization specified with CU_JIT_LTO
-     */
-    CU_JIT_PREC_DIV,
-
-    /**
-     * Control single-precision floating-point square root
-     * (-prec-sqrt) support (1: true, default).
-     * 1 : Enables the IEEE round-to-nearest mode
-     * 0 : Enables the fast approximation mode
-     * Option type: int\n
-     * Applies to: link-time optimization specified with CU_JIT_LTO
-     */
-    CU_JIT_PREC_SQRT,
-
-    /**
-     * Enable/Disable the contraction of floating-point multiplies
-     * and adds/subtracts into floating-point multiply-add (-fma)
-     * operations (1: Enable, default; 0: Disable).
-     * Option type: int\n
-     * Applies to: link-time optimization specified with CU_JIT_LTO
-     */
-    CU_JIT_FMA,
-
-    CU_JIT_NUM_OPTIONS
-
-} CUjit_option;
-
-/**
- * Online compilation targets
- */
-typedef enum CUjit_target_enum
-{
-    CU_TARGET_COMPUTE_20 = 20,       /**< Compute device class 2.0 */
-    CU_TARGET_COMPUTE_21 = 21,       /**< Compute device class 2.1 */
-    CU_TARGET_COMPUTE_30 = 30,       /**< Compute device class 3.0 */
-    CU_TARGET_COMPUTE_32 = 32,       /**< Compute device class 3.2 */
-    CU_TARGET_COMPUTE_35 = 35,       /**< Compute device class 3.5 */
-    CU_TARGET_COMPUTE_37 = 37,       /**< Compute device class 3.7 */
-    CU_TARGET_COMPUTE_50 = 50,       /**< Compute device class 5.0 */
-    CU_TARGET_COMPUTE_52 = 52,       /**< Compute device class 5.2 */
-    CU_TARGET_COMPUTE_53 = 53,       /**< Compute device class 5.3 */
-    CU_TARGET_COMPUTE_60 = 60,       /**< Compute device class 6.0.*/
-    CU_TARGET_COMPUTE_61 = 61,       /**< Compute device class 6.1.*/
-    CU_TARGET_COMPUTE_62 = 62,       /**< Compute device class 6.2.*/
-    CU_TARGET_COMPUTE_70 = 70,       /**< Compute device class 7.0.*/
-    CU_TARGET_COMPUTE_72 = 72,       /**< Compute device class 7.2.*/
-    CU_TARGET_COMPUTE_75 = 75,       /**< Compute device class 7.5.*/
-    CU_TARGET_COMPUTE_80 = 80,       /**< Compute device class 8.0.*/
-    CU_TARGET_COMPUTE_86 = 86        /**< Compute device class 8.6.*/
-} CUjit_target;
-
-/**
- * Cubin matching fallback strategies
- */
-typedef enum CUjit_fallback_enum
-{
-    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx if exact binary match not found */
-
-    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code if exact match not found */
-
-} CUjit_fallback;
-
-/**
- * Caching modes for dlcm
- */
-typedef enum CUjit_cacheMode_enum
-{
-    CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */
-    CU_JIT_CACHE_OPTION_CG,       /**< Compile with L1 cache disabled */
-    CU_JIT_CACHE_OPTION_CA        /**< Compile with L1 cache enabled */
-} CUjit_cacheMode;
-
-/**
- * Device code formats
- */
-typedef enum CUjitInputType_enum
-{
-    /**
-     * Compiled device-class-specific device code\n
-     * Applicable options: none
-     */
-    CU_JIT_INPUT_CUBIN = 0,
-
-    /**
-     * PTX source code\n
-     * Applicable options: PTX compiler options
-     */
-    CU_JIT_INPUT_PTX,
-
-    /**
-     * Bundle of multiple cubins and/or PTX of some device code\n
-     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
-     */
-    CU_JIT_INPUT_FATBINARY,
-
-    /**
-     * Host object with embedded device code\n
-     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
-     */
-    CU_JIT_INPUT_OBJECT,
-
-    /**
-     * Archive of host objects with embedded device code\n
-     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
-     */
-    CU_JIT_INPUT_LIBRARY,
-
-    /**
-     * High-level intermediate code for link-time optimization\n
-     * Applicable options: NVVM compiler options, PTX compiler options
-     */
-    CU_JIT_INPUT_NVVM,
-
-    CU_JIT_NUM_INPUT_TYPES
-} CUjitInputType;
-
-typedef struct CUlinkState_st *CUlinkState;
-
-/**
- * Flags to register a graphics resource
- */
-typedef enum CUgraphicsRegisterFlags_enum {
-    CU_GRAPHICS_REGISTER_FLAGS_NONE           = 0x00,
-    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY      = 0x01,
-    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD  = 0x02,
-    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST   = 0x04,
-    CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08
-} CUgraphicsRegisterFlags;
-
-/**
- * Flags for mapping and unmapping interop resources
- */
-typedef enum CUgraphicsMapResourceFlags_enum {
-    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
-    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
-    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
-} CUgraphicsMapResourceFlags;
-
-/**
- * Array indices for cube faces
- */
-typedef enum CUarray_cubemap_face_enum {
-    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
-    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
-    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
-    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap */
-    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap */
-    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap */
-} CUarray_cubemap_face;
-
-/**
- * Limits
- */
-typedef enum CUlimit_enum {
-    CU_LIMIT_STACK_SIZE                       = 0x00, /**< GPU thread stack size */
-    CU_LIMIT_PRINTF_FIFO_SIZE                 = 0x01, /**< GPU printf FIFO size */
-    CU_LIMIT_MALLOC_HEAP_SIZE                 = 0x02, /**< GPU malloc heap size */
-    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03, /**< GPU device runtime launch synchronize depth */
-    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
-    CU_LIMIT_MAX_L2_FETCH_GRANULARITY         = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
-    CU_LIMIT_PERSISTING_L2_CACHE_SIZE         = 0x06, /**< A size in bytes for L2 persisting lines cache size */
-    CU_LIMIT_MAX
-} CUlimit;
-
-/**
- * Resource types
- */
-typedef enum CUresourcetype_enum {
-    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resoure */
-    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
-    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
-    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
-} CUresourcetype;
-
-#ifdef _WIN32
-#define CUDA_CB __stdcall
-#else
-#define CUDA_CB
-#endif
-
-/**
- * CUDA host function
- * \param userData Argument value passed to the function
- */
-typedef void (CUDA_CB *CUhostFn)(void *userData);
-
-/**
- * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members.
- */
-typedef enum CUaccessProperty_enum {
-    CU_ACCESS_PROPERTY_NORMAL           = 0,    /**< Normal cache persistence. */
-    CU_ACCESS_PROPERTY_STREAMING        = 1,    /**< Streaming access is less likely to persit from cache. */
-    CU_ACCESS_PROPERTY_PERSISTING       = 2     /**< Persisting access is more likely to persist in cache.*/
-} CUaccessProperty;
-
-/**
- * Specifies an access policy for a window, a contiguous extent of memory
- * beginning at base_ptr and ending at base_ptr + num_bytes.
- * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE.
- * Partition into many segments and assign segments such that:
- * sum of "hit segments" / window == approx. ratio.
- * sum of "miss segments" / window == approx 1-ratio.
- * Segments and ratio specifications are fitted to the capabilities of
- * the architecture.
- * Accesses in a hit segment apply the hitProp access policy.
- * Accesses in a miss segment apply the missProp access policy.
- */
-typedef struct CUaccessPolicyWindow_st {
-    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
-    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
-    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
-    CUaccessProperty hitProp;           /**< ::CUaccessProperty set for hit. */
-    CUaccessProperty missProp;          /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */
-} CUaccessPolicyWindow_v1;
-typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow;
-
-/**
- * GPU kernel node parameters
- */
-typedef struct CUDA_KERNEL_NODE_PARAMS_st {
-    CUfunction func;             /**< Kernel to launch */
-    unsigned int gridDimX;       /**< Width of grid in blocks */
-    unsigned int gridDimY;       /**< Height of grid in blocks */
-    unsigned int gridDimZ;       /**< Depth of grid in blocks */
-    unsigned int blockDimX;      /**< X dimension of each thread block */
-    unsigned int blockDimY;      /**< Y dimension of each thread block */
-    unsigned int blockDimZ;      /**< Z dimension of each thread block */
-    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
-    void **kernelParams;         /**< Array of pointers to kernel parameters */
-    void **extra;                /**< Extra options */
-} CUDA_KERNEL_NODE_PARAMS_v1;
-typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS;
-
-/**
- * Memset node parameters
- */
-typedef struct CUDA_MEMSET_NODE_PARAMS_st {
-    CUdeviceptr dst;                        /**< Destination device pointer */
-    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
-    unsigned int value;                     /**< Value to be set */
-    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
-    size_t width;                           /**< Width of the row in elements */
-    size_t height;                          /**< Number of rows */
-} CUDA_MEMSET_NODE_PARAMS_v1;
-typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS;
-
-/**
- * Host node parameters
- */
-typedef struct CUDA_HOST_NODE_PARAMS_st {
-    CUhostFn fn;    /**< The function to call when the node executes */
-    void* userData; /**< Argument to pass to the function */
-} CUDA_HOST_NODE_PARAMS_v1;
-typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS;
-
-/**
- * Graph node types
- */
-typedef enum CUgraphNodeType_enum {
-    CU_GRAPH_NODE_TYPE_KERNEL           = 0, /**< GPU kernel node */
-    CU_GRAPH_NODE_TYPE_MEMCPY           = 1, /**< Memcpy node */
-    CU_GRAPH_NODE_TYPE_MEMSET           = 2, /**< Memset node */
-    CU_GRAPH_NODE_TYPE_HOST             = 3, /**< Host (executable) node */
-    CU_GRAPH_NODE_TYPE_GRAPH            = 4, /**< Node which executes an embedded graph */
-    CU_GRAPH_NODE_TYPE_EMPTY            = 5, /**< Empty (no-op) node */
-    CU_GRAPH_NODE_TYPE_WAIT_EVENT       = 6, /**< External event wait node */
-    CU_GRAPH_NODE_TYPE_EVENT_RECORD     = 7, /**< External event record node */
-    CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */
-    CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT   = 9, /**< External semaphore wait node */
-    CU_GRAPH_NODE_TYPE_MEM_ALLOC        = 10,/**< Memory Allocation Node */
-    CU_GRAPH_NODE_TYPE_MEM_FREE         = 11 /**< Memory Free Node */
-} CUgraphNodeType;
-
-typedef enum CUsynchronizationPolicy_enum {
-    CU_SYNC_POLICY_AUTO = 1,
-    CU_SYNC_POLICY_SPIN = 2,
-    CU_SYNC_POLICY_YIELD = 3,
-    CU_SYNC_POLICY_BLOCKING_SYNC = 4
-} CUsynchronizationPolicy;
-
-/**
- * Graph kernel node Attributes
- */
-typedef enum CUkernelNodeAttrID_enum {
-    CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW       = 1,    /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */
-    CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE                = 2     /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). */
-} CUkernelNodeAttrID;
-
-/**
- * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute
- */
-typedef union CUkernelNodeAttrValue_union {
-    CUaccessPolicyWindow accessPolicyWindow;    /**< Attribute ::CUaccessPolicyWindow. */
-    int cooperative;                            /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */
-} CUkernelNodeAttrValue_v1;
-typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
-
-/**
- * Possible stream capture statuses returned by ::cuStreamIsCapturing
- */
-typedef enum CUstreamCaptureStatus_enum {
-    CU_STREAM_CAPTURE_STATUS_NONE        = 0, /**< Stream is not capturing */
-    CU_STREAM_CAPTURE_STATUS_ACTIVE      = 1, /**< Stream is actively capturing */
-    CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2  /**< Stream is part of a capture sequence that
-                                                   has been invalidated, but not terminated */
-} CUstreamCaptureStatus;
-
-/**
- * Possible modes for stream capture thread interactions. For more details see
- * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode
- */
-typedef enum CUstreamCaptureMode_enum {
-    CU_STREAM_CAPTURE_MODE_GLOBAL       = 0,
-    CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
-    CU_STREAM_CAPTURE_MODE_RELAXED      = 2
-} CUstreamCaptureMode;
-
-/**
- * Stream Attributes 
- */
-typedef enum CUstreamAttrID_enum {
-    CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW    = 1,   /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */
-    CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY  = 3    /**< ::CUsynchronizationPolicy for work queued up in this stream */
-} CUstreamAttrID;
-
-/**
- * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute
- */
-typedef union CUstreamAttrValue_union {
-    CUaccessPolicyWindow accessPolicyWindow;   /**< Attribute ::CUaccessPolicyWindow. */
-    CUsynchronizationPolicy syncPolicy;        /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */
-} CUstreamAttrValue_v1;
-typedef CUstreamAttrValue_v1 CUstreamAttrValue;
-
-/**
- * Flags to specify search options. For more details see ::cuGetProcAddress
- */
-typedef enum CUdriverProcAddress_flags_enum {
-    CU_GET_PROC_ADDRESS_DEFAULT = 0,                        /**< Default search mode for driver symbols. */
-    CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0,             /**< Search for legacy versions of driver symbols. */
-    CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1  /**< Search for per-thread versions of driver symbols. */ 
-} CUdriverProcAddress_flags;
-
-/**
- * Execution Affinity Types 
- */
-typedef enum CUexecAffinityType_enum {
-    CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0,  /**< Create a context with limited SMs. */
-    CU_EXEC_AFFINITY_TYPE_MAX
-} CUexecAffinityType;
-
-/**
- * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT
- */
-typedef struct CUexecAffinitySmCount_st {
-    unsigned int val;    /**< The number of SMs the context is limited to use. */
-} CUexecAffinitySmCount_v1;
-typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount;
-
-/**
- * Execution Affinity Parameters 
- */
-typedef struct CUexecAffinityParam_st {
-    CUexecAffinityType type;
-    union {
-        CUexecAffinitySmCount smCount;    /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */
-    } param;
-} CUexecAffinityParam_v1;
-typedef CUexecAffinityParam_v1 CUexecAffinityParam;
-
-/**
- * Error codes
- */
-typedef enum cudaError_enum {
-    /**
-     * The API call returned with no errors. In the case of query calls, this
-     * also means that the operation being queried is complete (see
-     * ::cuEventQuery() and ::cuStreamQuery()).
-     */
-    CUDA_SUCCESS                              = 0,
-
-    /**
-     * This indicates that one or more of the parameters passed to the API call
-     * is not within an acceptable range of values.
-     */
-    CUDA_ERROR_INVALID_VALUE                  = 1,
-
-    /**
-     * The API call failed because it was unable to allocate enough memory to
-     * perform the requested operation.
-     */
-    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
-
-    /**
-     * This indicates that the CUDA driver has not been initialized with
-     * ::cuInit() or that initialization has failed.
-     */
-    CUDA_ERROR_NOT_INITIALIZED                = 3,
-
-    /**
-     * This indicates that the CUDA driver is in the process of shutting down.
-     */
-    CUDA_ERROR_DEINITIALIZED                  = 4,
-
-    /**
-     * This indicates profiler is not initialized for this run. This can
-     * happen when the application is running with external profiling tools
-     * like visual profiler.
-     */
-    CUDA_ERROR_PROFILER_DISABLED              = 5,
-
-    /**
-     * \deprecated
-     * This error return is deprecated as of CUDA 5.0. It is no longer an error
-     * to attempt to enable/disable the profiling via ::cuProfilerStart or
-     * ::cuProfilerStop without initialization.
-     */
-    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
-
-    /**
-     * \deprecated
-     * This error return is deprecated as of CUDA 5.0. It is no longer an error
-     * to call cuProfilerStart() when profiling is already enabled.
-     */
-    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
-
-    /**
-     * \deprecated
-     * This error return is deprecated as of CUDA 5.0. It is no longer an error
-     * to call cuProfilerStop() when profiling is already disabled.
-     */
-    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
-
-    /**
-     * This indicates that the CUDA driver that the application has loaded is a
-     * stub library. Applications that run with the stub rather than a real
-     * driver loaded will result in CUDA API returning this error.
-     */
-    CUDA_ERROR_STUB_LIBRARY                   = 34,
-
-    /**
-     * This indicates that no CUDA-capable devices were detected by the installed
-     * CUDA driver.
-     */
-    CUDA_ERROR_NO_DEVICE                      = 100,
-
-    /**
-     * This indicates that the device ordinal supplied by the user does not
-     * correspond to a valid CUDA device or that the action requested is
-     * invalid for the specified device.
-     */
-    CUDA_ERROR_INVALID_DEVICE                 = 101,
-
-    /**
-     * This error indicates that the Grid license is not applied.
-     */
-    CUDA_ERROR_DEVICE_NOT_LICENSED            = 102,
-
-    /**
-     * This indicates that the device kernel image is invalid. This can also
-     * indicate an invalid CUDA module.
-     */
-    CUDA_ERROR_INVALID_IMAGE                  = 200,
-
-    /**
-     * This most frequently indicates that there is no context bound to the
-     * current thread. This can also be returned if the context passed to an
-     * API call is not a valid handle (such as a context that has had
-     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
-     * mixes different API versions (i.e. 3010 context with 3020 API calls).
-     * See ::cuCtxGetApiVersion() for more details.
-     */
-    CUDA_ERROR_INVALID_CONTEXT                = 201,
-
-    /**
-     * This indicated that the context being supplied as a parameter to the
-     * API call was already the active context.
-     * \deprecated
-     * This error return is deprecated as of CUDA 3.2. It is no longer an
-     * error to attempt to push the active context via ::cuCtxPushCurrent().
-     */
-    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
-
-    /**
-     * This indicates that a map or register operation has failed.
-     */
-    CUDA_ERROR_MAP_FAILED                     = 205,
-
-    /**
-     * This indicates that an unmap or unregister operation has failed.
-     */
-    CUDA_ERROR_UNMAP_FAILED                   = 206,
-
-    /**
-     * This indicates that the specified array is currently mapped and thus
-     * cannot be destroyed.
-     */
-    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
-
-    /**
-     * This indicates that the resource is already mapped.
-     */
-    CUDA_ERROR_ALREADY_MAPPED                 = 208,
-
-    /**
-     * This indicates that there is no kernel image available that is suitable
-     * for the device. This can occur when a user specifies code generation
-     * options for a particular CUDA source file that do not include the
-     * corresponding device configuration.
-     */
-    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
-
-    /**
-     * This indicates that a resource has already been acquired.
-     */
-    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
-
-    /**
-     * This indicates that a resource is not mapped.
-     */
-    CUDA_ERROR_NOT_MAPPED                     = 211,
-
-    /**
-     * This indicates that a mapped resource is not available for access as an
-     * array.
-     */
-    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
-
-    /**
-     * This indicates that a mapped resource is not available for access as a
-     * pointer.
-     */
-    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
-
-    /**
-     * This indicates that an uncorrectable ECC error was detected during
-     * execution.
-     */
-    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
-
-    /**
-     * This indicates that the ::CUlimit passed to the API call is not
-     * supported by the active device.
-     */
-    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
-
-    /**
-     * This indicates that the ::CUcontext passed to the API call can
-     * only be bound to a single CPU thread at a time but is already
-     * bound to a CPU thread.
-     */
-    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
-
-    /**
-     * This indicates that peer access is not supported across the given
-     * devices.
-     */
-    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,
-
-    /**
-     * This indicates that a PTX JIT compilation failed.
-     */
-    CUDA_ERROR_INVALID_PTX                    = 218,
-
-    /**
-     * This indicates an error with OpenGL or DirectX context.
-     */
-    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,
-
-    /**
-    * This indicates that an uncorrectable NVLink error was detected during the
-    * execution.
-    */
-    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,
-
-    /**
-    * This indicates that the PTX JIT compiler library was not found.
-    */
-    CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,
-
-    /**
-     * This indicates that the provided PTX was compiled with an unsupported toolchain.
-     */
-
-    CUDA_ERROR_UNSUPPORTED_PTX_VERSION        = 222,
-
-    /**
-     * This indicates that the PTX JIT compilation was disabled.
-     */
-    CUDA_ERROR_JIT_COMPILATION_DISABLED       = 223,
-
-    /**
-     * This indicates that the ::CUexecAffinityType passed to the API call is not
-     * supported by the active device.
-     */ 
-    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY      = 224,
-
-    /**
-     * This indicates that the device kernel source is invalid. This includes
-     * compilation/linker errors encountered in device code or user error.
-     */
-    CUDA_ERROR_INVALID_SOURCE                 = 300,
-
-    /**
-     * This indicates that the file specified was not found.
-     */
-    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
-
-    /**
-     * This indicates that a link to a shared object failed to resolve.
-     */
-    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
-
-    /**
-     * This indicates that initialization of a shared object failed.
-     */
-    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
-
-    /**
-     * This indicates that an OS call failed.
-     */
-    CUDA_ERROR_OPERATING_SYSTEM               = 304,
-
-    /**
-     * This indicates that a resource handle passed to the API call was not
-     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
-     */
-    CUDA_ERROR_INVALID_HANDLE                 = 400,
-
-    /**
-     * This indicates that a resource required by the API call is not in a
-     * valid state to perform the requested operation.
-     */
-    CUDA_ERROR_ILLEGAL_STATE                  = 401,
-
-    /**
-     * This indicates that a named symbol was not found. Examples of symbols
-     * are global/constant variable names, driver function names, texture names,
-     * and surface names.
-     */
-    CUDA_ERROR_NOT_FOUND                      = 500,
-
-    /**
-     * This indicates that asynchronous operations issued previously have not
-     * completed yet. This result is not actually an error, but must be indicated
-     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
-     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
-     */
-    CUDA_ERROR_NOT_READY                      = 600,
-
-    /**
-     * While executing a kernel, the device encountered a
-     * load or store instruction on an invalid memory address.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,
-
-    /**
-     * This indicates that a launch did not occur because it did not have
-     * appropriate resources. This error usually indicates that the user has
-     * attempted to pass too many arguments to the device kernel, or the
-     * kernel launch specifies too many threads for the kernel's register
-     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
-     * when a 32-bit int is expected) is equivalent to passing too many
-     * arguments and can also result in this error.
-     */
-    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
-
-    /**
-     * This indicates that the device kernel took too long to execute. This can
-     * only occur if timeouts are enabled - see the device attribute
-     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
-
-    /**
-     * This error indicates a kernel launch that uses an incompatible texturing
-     * mode.
-     */
-    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
-
-    /**
-     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
-     * trying to re-enable peer access to a context which has already
-     * had peer access to it enabled.
-     */
-    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,
-
-    /**
-     * This error indicates that ::cuCtxDisablePeerAccess() is
-     * trying to disable peer access which has not been enabled yet
-     * via ::cuCtxEnablePeerAccess().
-     */
-    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,
-
-    /**
-     * This error indicates that the primary context for the specified device
-     * has already been initialized.
-     */
-    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
-
-    /**
-     * This error indicates that the context current to the calling thread
-     * has been destroyed using ::cuCtxDestroy, or is a primary context which
-     * has not yet been initialized.
-     */
-    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
-
-    /**
-     * A device-side assert triggered during kernel execution. The context
-     * cannot be used anymore, and must be destroyed. All existing device
-     * memory allocations from this context are invalid and must be
-     * reconstructed if the program is to continue using CUDA.
-     */
-    CUDA_ERROR_ASSERT                         = 710,
-
-    /**
-     * This error indicates that the hardware resources required to enable
-     * peer access have been exhausted for one or more of the devices
-     * passed to ::cuCtxEnablePeerAccess().
-     */
-    CUDA_ERROR_TOO_MANY_PEERS                 = 711,
-
-    /**
-     * This error indicates that the memory range passed to ::cuMemHostRegister()
-     * has already been registered.
-     */
-    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
-
-    /**
-     * This error indicates that the pointer passed to ::cuMemHostUnregister()
-     * does not correspond to any currently registered memory region.
-     */
-    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,
-
-    /**
-     * While executing a kernel, the device encountered a stack error.
-     * This can be due to stack corruption or exceeding the stack size limit.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,
-
-    /**
-     * While executing a kernel, the device encountered an illegal instruction.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,
-
-    /**
-     * While executing a kernel, the device encountered a load or store instruction
-     * on a memory address which is not aligned.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,
-
-    /**
-     * While executing a kernel, the device encountered an instruction
-     * which can only operate on memory locations in certain address spaces
-     * (global, shared, or local), but was supplied a memory address not
-     * belonging to an allowed address space.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,
-
-    /**
-     * While executing a kernel, the device program counter wrapped its address space.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_INVALID_PC                     = 718,
-
-    /**
-     * An exception occurred on the device while executing a kernel. Common
-     * causes include dereferencing an invalid device pointer and accessing
-     * out of bounds shared memory. Less common cases can be system specific - more
-     * information about these cases can be found in the system specific user guide.
-     * This leaves the process in an inconsistent state and any further CUDA work
-     * will return the same error. To continue using CUDA, the process must be terminated
-     * and relaunched.
-     */
-    CUDA_ERROR_LAUNCH_FAILED                  = 719,
-
-    /**
-     * This error indicates that the number of blocks launched per grid for a kernel that was
-     * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
-     * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
-     * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
-     * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
-     */
-    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   = 720,
-
-    /**
-     * This error indicates that the attempted operation is not permitted.
-     */
-    CUDA_ERROR_NOT_PERMITTED                  = 800,
-
-    /**
-     * This error indicates that the attempted operation is not supported
-     * on the current system or device.
-     */
-    CUDA_ERROR_NOT_SUPPORTED                  = 801,
-
-    /**
-     * This error indicates that the system is not yet ready to start any CUDA
-     * work.  To continue using CUDA, verify the system configuration is in a
-     * valid state and all required driver daemons are actively running.
-     * More information about this error can be found in the system specific
-     * user guide.
-     */
-    CUDA_ERROR_SYSTEM_NOT_READY               = 802,
-
-    /**
-     * This error indicates that there is a mismatch between the versions of
-     * the display driver and the CUDA driver. Refer to the compatibility documentation
-     * for supported versions.
-     */
-    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,
-
-    /**
-     * This error indicates that the system was upgraded to run with forward compatibility
-     * but the visible hardware detected by CUDA does not support this configuration.
-     * Refer to the compatibility documentation for the supported hardware matrix or ensure
-     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
-     * environment variable.
-     */
-    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
-
-    /**
-     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
-     */
-    CUDA_ERROR_MPS_CONNECTION_FAILED          = 805,
-
-    /**
-     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
-     */
-    CUDA_ERROR_MPS_RPC_FAILURE                = 806,
-
-    /**
-     * This error indicates that the MPS server is not ready to accept new MPS client requests.
-     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
-     */
-    CUDA_ERROR_MPS_SERVER_NOT_READY           = 807,
-
-    /**
-     * This error indicates that the hardware resources required to create MPS client have been exhausted.
-     */
-    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED        = 808,
-
-    /**
-     * This error indicates the the hardware resources required to support device connections have been exhausted.
-     */
-    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED    = 809,
-
-    /**
-     * This error indicates that the operation is not permitted when
-     * the stream is capturing.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED     = 900,
-
-    /**
-     * This error indicates that the current capture sequence on the stream
-     * has been invalidated due to a previous error.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED     = 901,
-
-    /**
-     * This error indicates that the operation would have resulted in a merge
-     * of two independent capture sequences.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_MERGE           = 902,
-
-    /**
-     * This error indicates that the capture was not initiated in this stream.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED       = 903,
-
-    /**
-     * This error indicates that the capture sequence contains a fork that was
-     * not joined to the primary stream.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_UNJOINED        = 904,
-
-    /**
-     * This error indicates that a dependency would have been created which
-     * crosses the capture sequence boundary. Only implicit in-stream ordering
-     * dependencies are allowed to cross the boundary.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_ISOLATION       = 905,
-
-    /**
-     * This error indicates a disallowed implicit dependency on a current capture
-     * sequence from cudaStreamLegacy.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT        = 906,
-
-    /**
-     * This error indicates that the operation is not permitted on an event which
-     * was last recorded in a capturing stream.
-     */
-    CUDA_ERROR_CAPTURED_EVENT                 = 907,
-
-    /**
-     * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
-     * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
-     * different thread.
-     */
-    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,
-
-    /**
-     * This error indicates that the timeout specified for the wait operation has lapsed.
-     */
-    CUDA_ERROR_TIMEOUT                        = 909,
-
-    /**
-     * This error indicates that the graph update was not performed because it included 
-     * changes which violated constraints specific to instantiated graph update.
-     */
-    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE      = 910,
-
-    /**
-     * This indicates that an async error has occurred in a device outside of CUDA.
-     * If CUDA was waiting for an external device's signal before consuming shared data,
-     * the external device signaled an error indicating that the data is not valid for
-     * consumption. This leaves the process in an inconsistent state and any further CUDA
-     * work will return the same error. To continue using CUDA, the process must be
-     * terminated and relaunched.
-     */
-    CUDA_ERROR_EXTERNAL_DEVICE               = 911,
-
-    /**
-     * This indicates that an unknown internal error has occurred.
-     */
-    CUDA_ERROR_UNKNOWN                        = 999
-} CUresult;
-
-/**
- * P2P Attributes
- */
-typedef enum CUdevice_P2PAttribute_enum {
-    CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK                     = 0x01,  /**< A relative value indicating the performance of the link between two devices */
-    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED                     = 0x02,  /**< P2P Access is enable */
-    CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED              = 0x03,  /**< Atomic operation over the link supported */
-    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED              = 0x04,  /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
-    CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED          = 0x04   /**< Accessing CUDA arrays over the link supported */
-} CUdevice_P2PAttribute;
-
-/**
- * CUDA stream callback
- * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback.  May be NULL.
- * \param status ::CUDA_SUCCESS or any persistent error on the stream.
- * \param userData User parameter provided at registration.
- */
-typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);
-
-/**
- * Block size to per-block dynamic shared memory mapping for a certain
- * kernel \param blockSize Block size of the kernel.
- *
- * \return The dynamic shared memory needed by a block.
- */
-typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
-
-/**
- * If set, host memory is portable between CUDA contexts.
- * Flag for ::cuMemHostAlloc()
- */
-#define CU_MEMHOSTALLOC_PORTABLE        0x01
-
-/**
- * If set, host memory is mapped into CUDA address space and
- * ::cuMemHostGetDevicePointer() may be called on the host pointer.
- * Flag for ::cuMemHostAlloc()
- */
-#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
-
-/**
- * If set, host memory is allocated as write-combined - fast to write,
- * faster to DMA, slow to read except via SSE4 streaming load instruction
- * (MOVNTDQA).
- * Flag for ::cuMemHostAlloc()
- */
-#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
-
-/**
- * If set, host memory is portable between CUDA contexts.
- * Flag for ::cuMemHostRegister()
- */
-#define CU_MEMHOSTREGISTER_PORTABLE     0x01
-
-/**
- * If set, host memory is mapped into CUDA address space and
- * ::cuMemHostGetDevicePointer() may be called on the host pointer.
- * Flag for ::cuMemHostRegister()
- */
-#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
-
-/**
- * If set, the passed memory pointer is treated as pointing to some
- * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
- * On Windows the flag is a no-op.
- * On Linux that memory is marked as non cache-coherent for the GPU and
- * is expected to be physically contiguous. It may return
- * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
- * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
- * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
- * is returned.
- * Flag for ::cuMemHostRegister()
- */
-#define CU_MEMHOSTREGISTER_IOMEMORY     0x04
-
-/**
-* If set, the passed memory pointer is treated as pointing to memory that is
-* considered read-only by the device.  On platforms without
-* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
-* required in order to register memory mapped to the CPU as read-only.  Support
-* for the use of this flag can be queried from the device attribute
-* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
-* a current context associated with a device that does not have this attribute
-* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
-*/
-#define CU_MEMHOSTREGISTER_READ_ONLY    0x08
-
-/**
- * 2D memory copy parameters
- */
-typedef struct CUDA_MEMCPY2D_st {
-    size_t srcXInBytes;         /**< Source X in bytes */
-    size_t srcY;                /**< Source Y */
-
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost;        /**< Source host pointer */
-    CUdeviceptr srcDevice;      /**< Source device pointer */
-    CUarray srcArray;           /**< Source array reference */
-    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
-
-    size_t dstXInBytes;         /**< Destination X in bytes */
-    size_t dstY;                /**< Destination Y */
-
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost;              /**< Destination host pointer */
-    CUdeviceptr dstDevice;      /**< Destination device pointer */
-    CUarray dstArray;           /**< Destination array reference */
-    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
-
-    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
-    size_t Height;              /**< Height of 2D memory copy */
-} CUDA_MEMCPY2D_v2;
-typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D;
-
-/**
- * 3D memory copy parameters
- */
-typedef struct CUDA_MEMCPY3D_st {
-    size_t srcXInBytes;         /**< Source X in bytes */
-    size_t srcY;                /**< Source Y */
-    size_t srcZ;                /**< Source Z */
-    size_t srcLOD;              /**< Source LOD */
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost;        /**< Source host pointer */
-    CUdeviceptr srcDevice;      /**< Source device pointer */
-    CUarray srcArray;           /**< Source array reference */
-    void *reserved0;            /**< Must be NULL */
-    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
-    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
-
-    size_t dstXInBytes;         /**< Destination X in bytes */
-    size_t dstY;                /**< Destination Y */
-    size_t dstZ;                /**< Destination Z */
-    size_t dstLOD;              /**< Destination LOD */
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost;              /**< Destination host pointer */
-    CUdeviceptr dstDevice;      /**< Destination device pointer */
-    CUarray dstArray;           /**< Destination array reference */
-    void *reserved1;            /**< Must be NULL */
-    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
-    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
-
-    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
-    size_t Height;              /**< Height of 3D memory copy */
-    size_t Depth;               /**< Depth of 3D memory copy */
-} CUDA_MEMCPY3D_v2;
-typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D;
-
-/**
- * 3D memory cross-context copy parameters
- */
-typedef struct CUDA_MEMCPY3D_PEER_st {
-    size_t srcXInBytes;         /**< Source X in bytes */
-    size_t srcY;                /**< Source Y */
-    size_t srcZ;                /**< Source Z */
-    size_t srcLOD;              /**< Source LOD */
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost;        /**< Source host pointer */
-    CUdeviceptr srcDevice;      /**< Source device pointer */
-    CUarray srcArray;           /**< Source array reference */
-    CUcontext srcContext;       /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
-    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
-    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
-
-    size_t dstXInBytes;         /**< Destination X in bytes */
-    size_t dstY;                /**< Destination Y */
-    size_t dstZ;                /**< Destination Z */
-    size_t dstLOD;              /**< Destination LOD */
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost;              /**< Destination host pointer */
-    CUdeviceptr dstDevice;      /**< Destination device pointer */
-    CUarray dstArray;           /**< Destination array reference */
-    CUcontext dstContext;       /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
-    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
-    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
-
-    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
-    size_t Height;              /**< Height of 3D memory copy */
-    size_t Depth;               /**< Depth of 3D memory copy */
-} CUDA_MEMCPY3D_PEER_v1;
-typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER;
-
-/**
- * Array descriptor
- */
-typedef struct CUDA_ARRAY_DESCRIPTOR_st
-{
-    size_t Width;             /**< Width of array */
-    size_t Height;            /**< Height of array */
-
-    CUarray_format Format;    /**< Array format */
-    unsigned int NumChannels; /**< Channels per array element */
-} CUDA_ARRAY_DESCRIPTOR_v2;
-typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR;
-
-/**
- * 3D array descriptor
- */
-typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
-{
-    size_t Width;             /**< Width of 3D array */
-    size_t Height;            /**< Height of 3D array */
-    size_t Depth;             /**< Depth of 3D array */
-
-    CUarray_format Format;    /**< Array format */
-    unsigned int NumChannels; /**< Channels per array element */
-    unsigned int Flags;       /**< Flags */
-} CUDA_ARRAY3D_DESCRIPTOR_v2;
-typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR;
-
-/**
- * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
- */
-#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1
-
-/**
- * CUDA array sparse properties
- */
-typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st {
-    struct {
-        unsigned int width;     /**< Width of sparse tile in elements */
-        unsigned int height;    /**< Height of sparse tile in elements */
-        unsigned int depth;     /**< Depth of sparse tile in elements */
-    } tileExtent;
-
-    /**
-     * First mip level at which the mip tail begins.
-     */
-    unsigned int miptailFirstLevel;
-    /**
-     * Total size of the mip tail.
-     */
-    unsigned long long miptailSize;
-    /**
-     * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
-     */
-    unsigned int flags;
-    unsigned int reserved[4];
-} CUDA_ARRAY_SPARSE_PROPERTIES_v1;
-typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES;
-
-/**
- * CUDA Resource descriptor
- */
-typedef struct CUDA_RESOURCE_DESC_st
-{
-    CUresourcetype resType;                   /**< Resource type */
-
-    union {
-        struct {
-            CUarray hArray;                   /**< CUDA array */
-        } array;
-        struct {
-            CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
-        } mipmap;
-        struct {
-            CUdeviceptr devPtr;               /**< Device pointer */
-            CUarray_format format;            /**< Array format */
-            unsigned int numChannels;         /**< Channels per array element */
-            size_t sizeInBytes;               /**< Size in bytes */
-        } linear;
-        struct {
-            CUdeviceptr devPtr;               /**< Device pointer */
-            CUarray_format format;            /**< Array format */
-            unsigned int numChannels;         /**< Channels per array element */
-            size_t width;                     /**< Width of the array in elements */
-            size_t height;                    /**< Height of the array in elements */
-            size_t pitchInBytes;              /**< Pitch between two rows in bytes */
-        } pitch2D;
-        struct {
-            int reserved[32];
-        } reserved;
-    } res;
-
-    unsigned int flags;                       /**< Flags (must be zero) */
-} CUDA_RESOURCE_DESC_v1;
-typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC;
-
-/**
- * Texture descriptor
- */
-typedef struct CUDA_TEXTURE_DESC_st {
-    CUaddress_mode addressMode[3];  /**< Address modes */
-    CUfilter_mode filterMode;       /**< Filter mode */
-    unsigned int flags;             /**< Flags */
-    unsigned int maxAnisotropy;     /**< Maximum anisotropy ratio */
-    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
-    float mipmapLevelBias;          /**< Mipmap level bias */
-    float minMipmapLevelClamp;      /**< Mipmap minimum level clamp */
-    float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
-    float borderColor[4];           /**< Border Color */
-    int reserved[12];
-} CUDA_TEXTURE_DESC_v1;
-typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC;
-
-/**
- * Resource view format
- */
-typedef enum CUresourceViewFormat_enum
-{
-    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
-    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
-    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
-    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */
-    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
-    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
-    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
-    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
-    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
-    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
-} CUresourceViewFormat;
-
-/**
- * Resource view descriptor
- */
-typedef struct CUDA_RESOURCE_VIEW_DESC_st
-{
-    CUresourceViewFormat format;   /**< Resource view format */
-    size_t width;                  /**< Width of the resource view */
-    size_t height;                 /**< Height of the resource view */
-    size_t depth;                  /**< Depth of the resource view */
-    unsigned int firstMipmapLevel; /**< First defined mipmap level */
-    unsigned int lastMipmapLevel;  /**< Last defined mipmap level */
-    unsigned int firstLayer;       /**< First layer index */
-    unsigned int lastLayer;        /**< Last layer index */
-    unsigned int reserved[16];
-} CUDA_RESOURCE_VIEW_DESC_v1;
-typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC;
-
-/**
- * GPU Direct v3 tokens
- */
-typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
-    unsigned long long p2pToken;
-    unsigned int vaSpaceToken;
-} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1;
-typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
-
-/**
-* Access flags that specify the level of access the current context's device has
-* on the memory referenced.
-*/
-typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum {
-    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE      = 0x0,   /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */
-    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ      = 0x1,   /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. */
-    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3    /**< Read-write access, the device has full read-write access to the memory */
-} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS;
-
-/**
- * Kernel launch parameters
- */
-typedef struct CUDA_LAUNCH_PARAMS_st {
-    CUfunction function;         /**< Kernel to launch */
-    unsigned int gridDimX;       /**< Width of grid in blocks */
-    unsigned int gridDimY;       /**< Height of grid in blocks */
-    unsigned int gridDimZ;       /**< Depth of grid in blocks */
-    unsigned int blockDimX;      /**< X dimension of each thread block */
-    unsigned int blockDimY;      /**< Y dimension of each thread block */
-    unsigned int blockDimZ;      /**< Z dimension of each thread block */
-    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
-    CUstream hStream;            /**< Stream identifier */
-    void **kernelParams;         /**< Array of pointers to kernel parameters */
-} CUDA_LAUNCH_PARAMS_v1;
-typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS;
-
-/**
- * External memory handle types
- */
-typedef enum CUexternalMemoryHandleType_enum {
-    /**
-     * Handle is an opaque file descriptor
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
-    /**
-     * Handle is an opaque shared NT handle
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
-    /**
-     * Handle is an opaque, globally shared handle
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
-    /**
-     * Handle is a D3D12 heap object
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
-    /**
-     * Handle is a D3D12 committed resource
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
-    /**
-     * Handle is a shared NT handle to a D3D11 resource
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
-    /**
-     * Handle is a globally shared handle to a D3D11 resource
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
-    /**
-     * Handle is an NvSciBuf object
-     */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
-} CUexternalMemoryHandleType;
-
-/**
- * Indicates that the external memory object is a dedicated resource
- */
-#define CUDA_EXTERNAL_MEMORY_DEDICATED   0x1
-
-/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
- * contains this flag, it indicates that signaling an external semaphore object
- * should skip performing appropriate memory synchronization operations over all
- * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
- * which otherwise are performed by default to ensure data coherency with other
- * importers of the same NvSciBuf memory objects.
- */
-#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01
-
-/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
- * contains this flag, it indicates that waiting on an external semaphore object
- * should skip performing appropriate memory synchronization operations over all
- * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
- * which otherwise are performed by default to ensure data coherency with other
- * importers of the same NvSciBuf memory objects.
- */
-#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02
-
-/**
- * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
- * it indicates that application needs signaler specific NvSciSyncAttr
- * to be filled by ::cuDeviceGetNvSciSyncAttributes.
- */
-#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1
-
-/**
- * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
- * it indicates that application needs waiter specific NvSciSyncAttr
- * to be filled by ::cuDeviceGetNvSciSyncAttributes.
- */
-#define CUDA_NVSCISYNC_ATTR_WAIT 0x2
-/**
- * External memory handle descriptor
- */
-typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
-    /**
-     * Type of the handle
-     */
-    CUexternalMemoryHandleType type;
-    union {
-        /**
-         * File descriptor referencing the memory object. Valid
-         * when type is
-         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD
-         */
-        int fd;
-        /**
-         * Win32 handle referencing the semaphore object. Valid when
-         * type is one of the following:
-         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32
-         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
-         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP
-         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
-         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
-         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
-         * Exactly one of 'handle' and 'name' must be non-NULL. If
-         * type is one of the following:
-         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
-         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
-         * then 'name' must be NULL.
-         */
-        struct {
-            /**
-             * Valid NT handle. Must be NULL if 'name' is non-NULL
-             */
-            void *handle;
-            /**
-             * Name of a valid memory object.
-             * Must be NULL if 'handle' is non-NULL.
-             */
-            const void *name;
-        } win32;
-        /**
-         * A handle representing an NvSciBuf Object. Valid when type
-         * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF
-         */
-        const void *nvSciBufObject;
-    } handle;
-    /**
-     * Size of the memory allocation
-     */
-    unsigned long long size;
-    /**
-     * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED
-     */
-    unsigned int flags;
-    unsigned int reserved[16];
-} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1;
-typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
-
-/**
- * External memory buffer descriptor
- */
-typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
-    /**
-     * Offset into the memory object where the buffer's base is
-     */
-    unsigned long long offset;
-    /**
-     * Size of the buffer
-     */
-    unsigned long long size;
-    /**
-     * Flags reserved for future use. Must be zero.
-     */
-    unsigned int flags;
-    unsigned int reserved[16];
-} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;
-typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
-
-/**
- * External memory mipmap descriptor
- */
-typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
-    /**
-     * Offset into the memory object where the base level of the
-     * mipmap chain is.
-     */
-    unsigned long long offset;
-    /**
-     * Format, dimension and type of base level of the mipmap chain
-     */
-    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
-    /**
-     * Total number of levels in the mipmap chain
-     */
-    unsigned int numLevels;
-    unsigned int reserved[16];
-} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1;
-typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
-
-/**
- * External semaphore handle types
- */
-typedef enum CUexternalSemaphoreHandleType_enum {
-    /**
-     * Handle is an opaque file descriptor
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD             = 1,
-    /**
-     * Handle is an opaque shared NT handle
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32          = 2,
-    /**
-     * Handle is an opaque, globally shared handle
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT      = 3,
-    /**
-     * Handle is a shared NT handle referencing a D3D12 fence object
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE           = 4,
-    /**
-     * Handle is a shared NT handle referencing a D3D11 fence object
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE           = 5,
-    /**
-     * Opaque handle to NvSciSync Object
-	 */
-	CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC             = 6,
-    /**
-     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX     = 7,
-    /**
-     * Handle is a globally shared handle referencing a D3D11 keyed mutex object
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8,
-    /**
-     * Handle is an opaque file descriptor referencing a timeline semaphore
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9,
-    /**
-     * Handle is an opaque shared NT handle referencing a timeline semaphore
-     */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
-} CUexternalSemaphoreHandleType;
-
-/**
- * External semaphore handle descriptor
- */
-typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
-    /**
-     * Type of the handle
-     */
-    CUexternalSemaphoreHandleType type;
-    union {
-        /**
-         * File descriptor referencing the semaphore object. Valid
-         * when type is one of the following:
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD
-         */
-        int fd;
-        /**
-         * Win32 handle referencing the semaphore object. Valid when
-         * type is one of the following:
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
-         * Exactly one of 'handle' and 'name' must be non-NULL. If
-         * type is one of the following:
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
-         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
-         * then 'name' must be NULL.
-         */
-        struct {
-            /**
-             * Valid NT handle. Must be NULL if 'name' is non-NULL
-             */
-            void *handle;
-            /**
-             * Name of a valid synchronization primitive.
-             * Must be NULL if 'handle' is non-NULL.
-             */
-            const void *name;
-        } win32;
-        /**
-         * Valid NvSciSyncObj. Must be non NULL
-         */
-        const void* nvSciSyncObj;
-    } handle;
-    /**
-     * Flags reserved for the future. Must be zero.
-     */
-    unsigned int flags;
-    unsigned int reserved[16];
-} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1;
-typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
-
-/**
- * External semaphore signal parameters
- */
-typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
-    struct {
-        /**
-         * Parameters for fence objects
-         */
-        struct {
-            /**
-             * Value of fence to be signaled
-             */
-            unsigned long long value;
-        } fence;
-        union {
-            /**
-             * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
-             * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
-             */
-            void *fence;
-            unsigned long long reserved;
-        } nvSciSync;
-        /**
-         * Parameters for keyed mutex objects
-         */
-        struct {
-            /**
-             * Value of key to release the mutex with
-             */
-            unsigned long long key;
-        } keyedMutex;
-        unsigned int reserved[12];
-    } params;
-    /**
-     * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to
-     * signal a ::CUexternalSemaphore of type
-     * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-     * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates
-     * that while signaling the ::CUexternalSemaphore, no memory synchronization
-     * operations should be performed for any external memory object imported
-     * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-     * For all other types of ::CUexternalSemaphore, flags must be zero.
-     */
-    unsigned int flags;
-    unsigned int reserved[16];
-} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;
-typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
-
-/**
- * External semaphore wait parameters
- */
-typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
-    struct {
-        /**
-         * Parameters for fence objects
-         */
-        struct {
-            /**
-             * Value of fence to be waited on
-             */
-            unsigned long long value;
-        } fence;
-        /**
-         * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType
-         * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
-         */
-        union {
-            void *fence;
-            unsigned long long reserved;
-        } nvSciSync;
-        /**
-         * Parameters for keyed mutex objects
-         */
-        struct {
-            /**
-             * Value of key to acquire the mutex with
-             */
-            unsigned long long key;
-            /**
-             * Timeout in milliseconds to wait to acquire the mutex
-             */
-            unsigned int timeoutMs;
-        } keyedMutex;
-        unsigned int reserved[10];
-    } params;
-    /**
-     * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
-     * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
-     * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
-     * which indicates that while waiting for the ::CUexternalSemaphore, no memory
-     * synchronization operations should be performed for any external memory
-     * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-     * For all other types of ::CUexternalSemaphore, flags must be zero.
-     */
-    unsigned int flags;
-    unsigned int reserved[16];
-} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1;
-typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
-
-/**
- * Semaphore signal node parameters
- */
-typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st {
-    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
-    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
-    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
-} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1;
-typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS;
-
-/**
- * Semaphore wait node parameters
- */
-typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st {
-    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
-    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
-    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
-} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1;
-typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS;
-
-typedef unsigned long long CUmemGenericAllocationHandle_v1;
-typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
-
-/**
- * Flags for specifying particular handle types
- */
-typedef enum CUmemAllocationHandleType_enum {
-    CU_MEM_HANDLE_TYPE_NONE                  = 0x0,  /**< Does not allow any export mechanism. > */
-    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
-    CU_MEM_HANDLE_TYPE_WIN32                 = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
-    CU_MEM_HANDLE_TYPE_WIN32_KMT             = 0x4,  /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
-    CU_MEM_HANDLE_TYPE_MAX                   = 0x7FFFFFFF
-} CUmemAllocationHandleType;
-
-/**
- * Specifies the memory protection flags for mapping.
- */
-typedef enum CUmemAccess_flags_enum {
-    CU_MEM_ACCESS_FLAGS_PROT_NONE        = 0x0,  /**< Default, make the address range not accessible */
-    CU_MEM_ACCESS_FLAGS_PROT_READ        = 0x1,  /**< Make the address range read accessible */
-    CU_MEM_ACCESS_FLAGS_PROT_READWRITE   = 0x3,  /**< Make the address range read-write accessible */
-    CU_MEM_ACCESS_FLAGS_PROT_MAX         = 0x7FFFFFFF
-} CUmemAccess_flags;
-
-/**
- * Specifies the type of location
- */
-typedef enum CUmemLocationType_enum {
-    CU_MEM_LOCATION_TYPE_INVALID = 0x0,
-    CU_MEM_LOCATION_TYPE_DEVICE  = 0x1,  /**< Location is a device location, thus id is a device ordinal */
-    CU_MEM_LOCATION_TYPE_MAX     = 0x7FFFFFFF
-} CUmemLocationType;
-
-/**
-* Defines the allocation types available
-*/
-typedef enum CUmemAllocationType_enum {
-    CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
-
-    /** This allocation type is 'pinned', i.e. cannot migrate from its current
-      * location while the application is actively using it
-      */
-    CU_MEM_ALLOCATION_TYPE_PINNED  = 0x1,
-    CU_MEM_ALLOCATION_TYPE_MAX     = 0x7FFFFFFF
-} CUmemAllocationType;
-
-/**
-* Flag for requesting different optimal and required granularities for an allocation.
-*/
-typedef enum CUmemAllocationGranularity_flags_enum {
-    CU_MEM_ALLOC_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity for allocation */
-    CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for allocation for best performance */
-} CUmemAllocationGranularity_flags;
-
-/**
- * Sparse subresource types
- */
-typedef enum CUarraySparseSubresourceType_enum {
-    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
-    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
-} CUarraySparseSubresourceType;
-
-/**
- * Memory operation types
- */
-typedef enum CUmemOperationType_enum {
-    CU_MEM_OPERATION_TYPE_MAP = 1,
-    CU_MEM_OPERATION_TYPE_UNMAP = 2
-} CUmemOperationType;
-
-/**
- * Memory handle types
- */
-typedef enum CUmemHandleType_enum {
-    CU_MEM_HANDLE_TYPE_GENERIC = 0
-} CUmemHandleType;
-
-/**
- * Specifies the CUDA array or CUDA mipmapped array memory mapping information
- */
-typedef struct CUarrayMapInfo_st {    
-    CUresourcetype resourceType;                    /**< Resource type */
-
-    union {
-        CUmipmappedArray mipmap;
-        CUarray array;
-    } resource;
-
-    CUarraySparseSubresourceType subresourceType;   /**< Sparse subresource type */
-
-    union {
-        struct {
-            unsigned int level;                     /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */            
-            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
-            unsigned int offsetX;                   /**< Starting X offset in elements */
-            unsigned int offsetY;                   /**< Starting Y offset in elements */
-            unsigned int offsetZ;                   /**< Starting Z offset in elements */            
-            unsigned int extentWidth;               /**< Width in elements */
-            unsigned int extentHeight;              /**< Height in elements */
-            unsigned int extentDepth;               /**< Depth in elements */
-        } sparseLevel;
-        struct {
-            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
-            unsigned long long offset;              /**< Offset within mip tail */
-            unsigned long long size;                /**< Extent in bytes */
-        } miptail;
-    } subresource;
-    
-    CUmemOperationType memOperationType;            /**< Memory operation type */
-    CUmemHandleType memHandleType;                  /**< Memory handle type */
-
-    union {
-        CUmemGenericAllocationHandle memHandle;
-    } memHandle;
-    
-    unsigned long long offset;                      /**< Offset within the memory */
-    unsigned int deviceBitMask;                     /**< Device ordinal bit mask */
-    unsigned int flags;                             /**< flags for future use, must be zero now. */
-    unsigned int reserved[2];                       /**< Reserved for future use, must be zero now. */
-} CUarrayMapInfo_v1;
-typedef CUarrayMapInfo_v1 CUarrayMapInfo;
-
-/**
- * Specifies a memory location.
- */
-typedef struct CUmemLocation_st {
-    CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */
-    int id;                 /**< identifier for a given this location's ::CUmemLocationType. */
-} CUmemLocation_v1;
-typedef CUmemLocation_v1 CUmemLocation;
-
-/**
- * Specifies compression attribute for an allocation.
- */
-typedef enum CUmemAllocationCompType_enum {
-    CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */
-    CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating  compressible memory */
-} CUmemAllocationCompType;
-
-/**
- * This flag if set indicates that the memory will be used as a tile pool.
- */
-#define CU_MEM_CREATE_USAGE_TILE_POOL    0x1
-
-/**
-* Specifies the allocation properties for a allocation.
-*/
-typedef struct CUmemAllocationProp_st {
-    /** Allocation type */
-    CUmemAllocationType type;
-    /** requested ::CUmemAllocationHandleType */
-    CUmemAllocationHandleType requestedHandleTypes;
-    /** Location of allocation */
-    CUmemLocation location;
-    /**
-     * Windows-specific POBJECT_ATTRIBUTES required when
-     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This object atributes structure
-     * includes security attributes that define
-     * the scope of which exported allocations may be tranferred to other
-     * processes.  In all other cases, this field is required to be zero.
-     */
-    void *win32HandleMetaData;
-    struct {
-         /**
-         * Allocation hint for requesting compressible memory.
-         * On devices that support Compute Data Compression, compressible
-         * memory can be used to accelerate accesses to data with unstructured
-         * sparsity and other compressible data patterns. Applications are 
-         * expected to query allocation property of the handle obtained with 
-         * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to 
-         * validate if the obtained allocation is compressible or not. Note that 
-         * compressed memory may not be mappable on all devices.
-         */
-         unsigned char compressionType;
-         unsigned char gpuDirectRDMACapable;
-         /** Bitmask indicating intended usage for this allocation */
-         unsigned short usage;
-         unsigned char reserved[4];
-    } allocFlags;
-} CUmemAllocationProp_v1;
-typedef CUmemAllocationProp_v1 CUmemAllocationProp;
-
-/**
- * Memory access descriptor
- */
-typedef struct CUmemAccessDesc_st {
-    CUmemLocation location;        /**< Location on which the request is to change it's accessibility */
-    CUmemAccess_flags flags;       /**< ::CUmemProt accessibility flags to set on the request */
-} CUmemAccessDesc_v1;
-typedef CUmemAccessDesc_v1 CUmemAccessDesc;
-
-typedef enum CUgraphExecUpdateResult_enum {
-    CU_GRAPH_EXEC_UPDATE_SUCCESS                     = 0x0, /**< The update succeeded */
-    CU_GRAPH_EXEC_UPDATE_ERROR                       = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
-    CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED      = 0x2, /**< The update failed because the topology changed */
-    CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED     = 0x3, /**< The update failed because a node type changed */
-    CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED      = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
-    CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED    = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
-    CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED         = 0x6, /**< The update failed because something about the node is not supported */
-    CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7 /**< The update failed because the function of a kernel node changed in an unsupported way */
-} CUgraphExecUpdateResult;
-
-/**
- * CUDA memory pool attributes
- */
-typedef enum CUmemPool_attribute_enum {
-    /**
-     * (value type = int)
-     * Allow cuMemAllocAsync to use memory asynchronously freed
-     * in another streams as long as a stream ordering dependency
-     * of the allocating stream on the free action exists.
-     * Cuda events and null stream interactions can create the required
-     * stream ordered dependencies. (default enabled)
-     */
-    CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,
-
-    /**
-     * (value type = int)
-     * Allow reuse of already completed frees when there is no dependency
-     * between the free and allocation. (default enabled)
-     */
-    CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
-
-    /**
-     * (value type = int)
-     * Allow cuMemAllocAsync to insert new stream dependencies
-     * in order to establish the stream ordering required to reuse
-     * a piece of memory released by cuFreeAsync (default enabled).
-     */
-    CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
-
-    /**
-     * (value type = cuuint64_t)
-     * Amount of reserved memory in bytes to hold onto before trying
-     * to release memory back to the OS. When more than the release
-     * threshold bytes of memory are held by the memory pool, the
-     * allocator will try to release memory back to the OS on the
-     * next call to stream, event or context synchronize. (default 0)
-     */
-    CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-
-    /**
-     * (value type = cuuint64_t)
-     * Amount of backing memory currently allocated for the mempool.
-     */
-    CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
-
-    /**
-     * (value type = cuuint64_t)
-     * High watermark of backing memory allocated for the mempool since the
-     * last time it was reset. High watermark can only be reset to zero.
-     */
-    CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
-
-    /**
-     * (value type = cuuint64_t)
-     * Amount of memory from the pool that is currently in use by the application.
-     */
-    CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
-
-    /**
-     * (value type = cuuint64_t)
-     * High watermark of the amount of memory from the pool that was in use by the application since
-     * the last time it was reset. High watermark can only be reset to zero.
-     */
-    CU_MEMPOOL_ATTR_USED_MEM_HIGH
-} CUmemPool_attribute;
-
-/**
- * Specifies the properties of allocations made from the pool.
- */
-typedef struct CUmemPoolProps_st {
-    CUmemAllocationType allocType;         /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */
-    CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */
-    CUmemLocation location;                /**< Location where allocations should reside. */
-    /**
-     * Windows-specific LPSECURITYATTRIBUTES required when
-     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This security attribute defines
-     * the scope of which exported allocations may be tranferred to other
-     * processes.  In all other cases, this field is required to be zero.
-     */
-    void *win32SecurityAttributes;
-    unsigned char reserved[64]; /**< reserved for future use, must be 0 */
-} CUmemPoolProps_v1;
-typedef CUmemPoolProps_v1 CUmemPoolProps;
-
-/**
- * Opaque data for exporting a pool allocation
- */
-typedef struct CUmemPoolPtrExportData_st {
-    unsigned char reserved[64];
-} CUmemPoolPtrExportData_v1;
-typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData;
-
-/**
- * Memory allocation node parameters
- */
-typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st {
-    /**
-    * in: location where the allocation should reside (specified in ::location).
-    * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
-    */
-    CUmemPoolProps poolProps;
-    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
-    size_t accessDescCount; /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
-    size_t bytesize; /**< in: size in bytes of the requested allocation */
-    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
-} CUDA_MEM_ALLOC_NODE_PARAMS;
-
-typedef enum CUgraphMem_attribute_enum {
-    /**
-     * (value type = cuuint64_t)
-     * Amount of memory, in bytes, currently associated with graphs
-     */
-    CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
-
-    /**
-     * (value type = cuuint64_t)
-     * High watermark of memory, in bytes, associated with graphs since the
-     * last time it was reset.  High watermark can only be reset to zero.
-     */
-    CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
-
-    /**
-     * (value type = cuuint64_t)
-     * Amount of memory, in bytes, currently allocated for use by
-     * the CUDA graphs asynchronous allocator.
-     */
-    CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
-
-    /**
-     * (value type = cuuint64_t)
-     * High watermark of memory, in bytes, currently allocated for use by
-     * the CUDA graphs asynchronous allocator.
-     */
-    CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
-} CUgraphMem_attribute;
-
-/**
- * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only
- * waits for prior work in the stream corresponding to that GPU to complete before the
- * kernel begins execution.
- */
-#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC   0x01
-
-/**
- * If set, any subsequent work pushed in a stream that participated in a call to
- * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
- * the GPU corresponding to that stream to complete before it begins execution.
- */
-#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC  0x02
-
-/**
- * If set, the CUDA array is a collection of layers, where each layer is either a 1D
- * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
- * of layers, not the depth of a 3D array.
- */
-#define CUDA_ARRAY3D_LAYERED        0x01
-
-/**
- * Deprecated, use CUDA_ARRAY3D_LAYERED
- */
-#define CUDA_ARRAY3D_2DARRAY        0x01
-
-/**
- * This flag must be set in order to bind a surface reference
- * to the CUDA array
- */
-#define CUDA_ARRAY3D_SURFACE_LDST   0x02
-
-/**
- * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
- * width of such a CUDA array must be equal to its height, and Depth must be six.
- * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
- * and Depth must be a multiple of six.
- */
-#define CUDA_ARRAY3D_CUBEMAP        0x04
-
-/**
- * This flag must be set in order to perform texture gather operations
- * on a CUDA array.
- */
-#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
-
-/**
- * This flag if set indicates that the CUDA
- * array is a DEPTH_TEXTURE.
- */
-#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10
-
-/**
- * This flag indicates that the CUDA array may be bound as a color target
- * in an external graphics API
- */
-#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20
-
-/**
- * This flag if set indicates that the CUDA array or CUDA mipmapped array
- * is a sparse CUDA array or CUDA mipmapped array respectively
- */
-#define CUDA_ARRAY3D_SPARSE 0x40
-
-/**
- * Override the texref format with a format inferred from the array.
- * Flag for ::cuTexRefSetArray()
- */
-#define CU_TRSA_OVERRIDE_FORMAT 0x01
-
-/**
- * Read the texture as integers rather than promoting the values to floats
- * in the range [0,1].
- * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
- */
-#define CU_TRSF_READ_AS_INTEGER         0x01
-
-/**
- * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
- * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
- */
-#define CU_TRSF_NORMALIZED_COORDINATES  0x02
-
-/**
- * Perform sRGB->linear conversion during texture read.
- * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
- */
-#define CU_TRSF_SRGB  0x10
-
- /**
-  * Disable any trilinear filtering optimizations.
-  * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
-  */
-#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION  0x20
-
-/**
- * End of array terminator for the \p extra parameter to
- * ::cuLaunchKernel
- */
-#define CU_LAUNCH_PARAM_END            ((void*)0x00)
-
-/**
- * Indicator that the next value in the \p extra parameter to
- * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
- * parameters used for launching kernel \p f.  This buffer needs to
- * honor all alignment/padding requirements of the individual parameters.
- * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
- * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
- * effect.
- */
-#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
-
-/**
- * Indicator that the next value in the \p extra parameter to
- * ::cuLaunchKernel will be a pointer to a size_t which contains the
- * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
- * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
- * in the \p extra array if the value associated with
- * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
- */
-#define CU_LAUNCH_PARAM_BUFFER_SIZE    ((void*)0x02)
-
-/**
- * For texture references loaded into the module, use default texunit from
- * texture reference.
- */
-#define CU_PARAM_TR_DEFAULT -1
-
-/**
- * Device that represents the CPU
- */
-#define CU_DEVICE_CPU               ((CUdevice)-1)
-
-/**
- * Device that represents an invalid device
- */
-#define CU_DEVICE_INVALID           ((CUdevice)-2)
-
-/**
- * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
- */
-typedef enum CUflushGPUDirectRDMAWritesOptions_enum {
-    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST   = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */
-    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */
-} CUflushGPUDirectRDMAWritesOptions;
-
-/**
- * Platform native ordering for GPUDirect RDMA writes
- */
-typedef enum CUGPUDirectRDMAWritesOrdering_enum {
-    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE        = 0,   /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */
-    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER       = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */
-    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200  /**< Any CUDA device in the system can consistently consume remote writes to this device. */
-} CUGPUDirectRDMAWritesOrdering;
-
-/**
- * The scopes for ::cuFlushGPUDirectRDMAWrites
- */
-typedef enum CUflushGPUDirectRDMAWritesScope_enum {
-    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER       = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
-    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
-} CUflushGPUDirectRDMAWritesScope;
- 
-/**
- * The targets for ::cuFlushGPUDirectRDMAWrites
- */
-typedef enum CUflushGPUDirectRDMAWritesTarget_enum {
-    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
-} CUflushGPUDirectRDMAWritesTarget;
-
-/**
- * The additional write options for ::cuGraphDebugDotPrint
- */
-typedef enum CUgraphDebugDot_flags_enum {
-    CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE                        = 1<<0,  /** Output all debug data as if every debug flag is enabled */
-    CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES                  = 1<<1,  /** Use CUDA Runtime structures for output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS             = 1<<2,  /** Adds CUDA_KERNEL_NODE_PARAMS values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS             = 1<<3,  /** Adds CUDA_MEMCPY3D values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS             = 1<<4,  /** Adds CUDA_MEMSET_NODE_PARAMS values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS               = 1<<5,  /** Adds CUDA_HOST_NODE_PARAMS values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS              = 1<<6,  /** Adds CUevent handle from record and wait nodes to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS   = 1<<7,  /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS     = 1<<8,  /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES         = 1<<9,  /** Adds CUkernelNodeAttrValue values to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES                        = 1<<10, /** Adds node handles and every kernel function handle to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS          = 1<<11, /** Adds memory alloc node parameters to output */
-    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS           = 1<<12  /** Adds memory free node parameters to output */
-} CUgraphDebugDot_flags;
-
-/**
- * Flags for user objects for graphs
- */
-typedef enum CUuserObject_flags_enum {
-    CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
-} CUuserObject_flags;
-
-/**
- * Flags for retaining user object references for graphs
- */
-typedef enum CUuserObjectRetain_flags_enum {
-    CU_GRAPH_USER_OBJECT_MOVE = 1  /**< Transfer references from the caller rather than creating new references. */
-} CUuserObjectRetain_flags;
-
-/**
- * Flags for instantiating a graph
- */
-typedef enum CUgraphInstantiate_flags_enum {
-    CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH  = 1  /**< Automatically free memory allocated in a graph before relaunching. */
-} CUgraphInstantiate_flags;
-
-/** @} */ /* END CUDA_TYPES */
-
-#if defined(__GNUC__)
-  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
-    #pragma GCC visibility push(default)
-  #endif
-#endif
-
-#ifdef _WIN32
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
-
-/**
- * \defgroup CUDA_ERROR Error Handling
- *
- * ___MANBRIEF___ error handling functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the error handling functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Gets the string description of an error code
- *
- * Sets \p *pStr to the address of a NULL-terminated string description
- * of the error code \p error.
- * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
- * will be returned and \p *pStr will be set to the NULL address.
- *
- * \param error - Error code to convert to string
- * \param pStr - Address of the string pointer.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::CUresult,
- * ::cudaGetErrorString
- */
-CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr);
-
-/**
- * \brief Gets the string representation of an error code enum name
- *
- * Sets \p *pStr to the address of a NULL-terminated string representation
- * of the name of the enum error code \p error.
- * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
- * will be returned and \p *pStr will be set to the NULL address.
- *
- * \param error - Error code to convert to string
- * \param pStr - Address of the string pointer.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::CUresult,
- * ::cudaGetErrorName
- */
-CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr);
-
-/** @} */ /* END CUDA_ERROR */
-
-/**
- * \defgroup CUDA_INITIALIZE Initialization
- *
- * ___MANBRIEF___ initialization functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the initialization functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Initialize the CUDA driver API
- *
- * Initializes the driver API and must be called before any other function from
- * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit()
- * has not been called, any function from the driver API will return
- * ::CUDA_ERROR_NOT_INITIALIZED.
- *
- * \param Flags - Initialization flag for CUDA.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
- * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
- * \notefnerr
- */
-CUresult CUDAAPI cuInit(unsigned int Flags);
-
-/** @} */ /* END CUDA_INITIALIZE */
-
-/**
- * \defgroup CUDA_VERSION Version Management
- *
- * ___MANBRIEF___ version management functions of the low-level CUDA driver
- * API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the version management functions of the low-level
- * CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Returns the latest CUDA version supported by driver
- *
- * Returns in \p *driverVersion the version of CUDA supported by
- * the driver.  The version is returned as
- * (1000 &times; major + 10 &times; minor). For example, CUDA 9.2
- * would be represented by 9020.
- *
- * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
- * \p driverVersion is NULL.
- *
- * \param driverVersion - Returns the CUDA driver version
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cudaDriverGetVersion,
- * ::cudaRuntimeGetVersion
- */
-CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
-
-/** @} */ /* END CUDA_VERSION */
-
-/**
- * \defgroup CUDA_DEVICE Device Management
- *
- * ___MANBRIEF___ device management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the device management functions of the low-level
- * CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Returns a handle to a compute device
- *
- * Returns in \p *device a device handle given an ordinal in the range <b>[0,
- * ::cuDeviceGetCount()-1]</b>.
- *
- * \param device  - Returned device handle
- * \param ordinal - Device number to get handle for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGetLuid,
- * ::cuDeviceTotalMem,
- * ::cuDeviceGetExecAffinitySupport
- */
-CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
-
-/**
- * \brief Returns the number of compute-capable devices
- *
- * Returns in \p *count the number of devices with compute capability greater
- * than or equal to 2.0 that are available for execution. If there is no such
- * device, ::cuDeviceGetCount() returns 0.
- *
- * \param count - Returned number of compute-capable devices
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGetLuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem,
- * ::cuDeviceGetExecAffinitySupport,
- * ::cudaGetDeviceCount
- */
-CUresult CUDAAPI cuDeviceGetCount(int *count);
-
-/**
- * \brief Returns an identifer string for the device
- *
- * Returns an ASCII string identifying the device \p dev in the NULL-terminated
- * string pointed to by \p name. \p len specifies the maximum length of the
- * string that may be returned.
- *
- * \param name - Returned identifier string for the device
- * \param len  - Maximum length of string to store in \p name
- * \param dev  - Device to get identifier string for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGetLuid,
- * ::cuDeviceGetCount,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem,
- * ::cuDeviceGetExecAffinitySupport,
- * ::cudaGetDeviceProperties
- */
-CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
-
-/**
- * \brief Return an UUID for the device
- *
- * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
- * supplant this version in 12.0, which is retained for minor version compatibility.
- *
- * Returns 16-octets identifing the device \p dev in the structure
- * pointed by the \p uuid.
- *
- * \param uuid - Returned UUID
- * \param dev  - Device to get identifier string for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetUuid_v2
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetLuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem,
- * ::cuDeviceGetExecAffinitySupport,
- * ::cudaGetDeviceProperties
- */
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
-
-/**
- * \brief Return an UUID for the device (11.4+)
- *
- * Returns 16-octets identifing the device \p dev in the structure
- * pointed by the \p uuid. If the device is in MIG mode, returns its
- * MIG UUID which uniquely identifies the subscribed MIG compute instance.
- *
- * \param uuid - Returned UUID
- * \param dev  - Device to get identifier string for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetLuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem,
- * ::cudaGetDeviceProperties
- */
-CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev);
-
-/**
- * \brief Return an LUID and device node mask for the device
- *
- * Return identifying information (\p luid and \p deviceNodeMask) to allow
- * matching device with graphics APIs.
- *
- * \param luid - Returned LUID
- * \param deviceNodeMask - Returned device node mask
- * \param dev  - Device to get identifier string for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem,
- * ::cuDeviceGetExecAffinitySupport,
- * ::cudaGetDeviceProperties
- */
-CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev);
-
-/**
- * \brief Returns the total amount of memory on the device
- *
- * Returns in \p *bytes the total amount of memory available on the device
- * \p dev in bytes.
- *
- * \param bytes - Returned memory available on device in bytes
- * \param dev   - Device handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGet,
- * ::cuDeviceGetExecAffinitySupport,
- * ::cudaMemGetInfo
- */
-CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
-
-/**
- * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
- *
- * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture
- * for given \p format and \p numChannels.
- *
- * \param maxWidthInElements    - Returned maximum number of texture elements allocatable for given \p format and \p numChannels.
- * \param format                - Texture format.
- * \param numChannels           - Number of channels per texture element.
- * \param dev                   - Device handle.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGet,
- * ::cudaMemGetInfo,
- * ::cuDeviceTotalMem
- */
-CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev);
-
-/**
- * \brief Returns information about the device
- *
- * Returns in \p *pi the integer value of the attribute \p attrib on device
- * \p dev. The supported attributes are:
- * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
- *   block;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
- * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
- * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
- * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
- * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
- *   shared memory available to a thread block in bytes
- * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
- *   __constant__ variables in a CUDA C kernel in bytes
- * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
- * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
- *   memory copy functions that involve memory regions allocated through
- *   ::cuMemAllocPitch()
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
- *  texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
- *  for a 1D texture bound to linear memory
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
- *  mipmapped 1D texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
- *  texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
- *  texture height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
- *  for a 2D texture bound to linear memory
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
- *  for a 2D texture bound to linear memory
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
- *  in bytes for a 2D texture bound to linear memory
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
- *  mipmapped 2D texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
- *  mipmapped 2D texture height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
- *  texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
- *  texture height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
- *  texture depth
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
- *  Alternate maximum 3D texture width, 0 if no alternate
- *  maximum 3D texture size is supported
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
- *  Alternate maximum 3D texture height, 0 if no alternate
- *  maximum 3D texture size is supported
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
- *  Alternate maximum 3D texture depth, 0 if no alternate
- *  maximum 3D texture size is supported
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
- *  Maximum cubemap texture width or height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
- *  Maximum 1D layered texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
- *   Maximum layers in a 1D layered texture
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
- *  Maximum 2D layered texture width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
- *   Maximum 2D layered texture height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
- *   Maximum layers in a 2D layered texture
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
- *   Maximum cubemap layered texture width or height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
- *   Maximum layers in a cubemap layered texture
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
- *   Maximum 1D surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
- *   Maximum 2D surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
- *   Maximum 2D surface height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
- *   Maximum 3D surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
- *   Maximum 3D surface height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
- *   Maximum 3D surface depth
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
- *   Maximum 1D layered surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
- *   Maximum layers in a 1D layered surface
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
- *   Maximum 2D layered surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
- *   Maximum 2D layered surface height
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
- *   Maximum layers in a 2D layered surface
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
- *   Maximum cubemap surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
- *   Maximum cubemap layered surface width
- * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
- *   Maximum layers in a cubemap layered surface
- * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
- *   registers available to a thread block
- * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
- * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
- *   base addresses aligned to ::textureAlign bytes do not need an offset
- *   applied to texture fetches
- * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
- *   for 2D texture references bound to pitched memory
- * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
- *   memory between host and device while executing a kernel, or 0 if not
- * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
- *   the device
- * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
- *   for kernels executed on the device, or 0 if not
- * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
- *   memory subsystem, or 0 if not
- * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
- *   memory into the CUDA address space, or 0 if not
- * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
- *   in. Available modes are as follows:
- *   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
- *     can have multiple CUDA contexts present at a single time.
- *   - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
- *     prohibited from creating new CUDA contexts.
- *   - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS:  Compute-exclusive-process mode - Device
- *     can have only one context used by a single process at a time.
- * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
- *   executing multiple kernels within the same context simultaneously, or 0 if
- *   not. It is not guaranteed that multiple kernels will be resident
- *   on the device concurrently so this feature should not be relied upon for
- *   correctness.
- * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
- *    device, 0 if error correction is disabled or not supported by the device
- * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
- * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
- *   of the device
- * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
- * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
- *    is only available on Tesla hardware running Windows Vista or later
- * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
- * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
- * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
- * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
- * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
- *   the host, or 0 if not
- * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
- * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
- * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
- *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device
- * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
- *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device
- * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
- *   shared memory available to a multiprocessor in bytes; this amount is shared
- *   by all thread blocks simultaneously resident on a multiprocessor
- * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
- *   registers available to a multiprocessor; this number is shared by all thread
- *   blocks simultaneously resident on a multiprocessor
- * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
- *   on this system, 0 if allocating managed memory is not supported by the device on this system.
- * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
- * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
- *   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
- * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host
- *   supports native atomic operations.
- * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
- *   (in floating-point operations per second) to double precision performance.
- * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device suppports coherently accessing
- *   pageable memory without calling cudaHostRegister on it.
- * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
- *   concurrently with the CPU.
- * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
- * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
- *   memory at the same virtual address as the CPU.
- * -  ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size
- *    suported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call.
- *    For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
- * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
- *   page tables.
- * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration.
- * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED:  Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
- * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
- * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:  Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
- * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
- * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
- * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
- * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes 
- * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
- * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
- * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
- * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
- * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
- * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
- * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
- * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
- * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
- *
- * \param pi     - Returned device attribute value
- * \param attrib - Device attribute to query
- * \param dev    - Device handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem,
- * ::cuDeviceGetExecAffinitySupport,
- * ::cudaDeviceGetAttribute,
- * ::cudaGetDeviceProperties
- */
-CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
-
-/**
- * \brief Return NvSciSync attributes that this device can support.
- *
- * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that
- * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList
- * can be used to create an NvSciSync object that matches this device's capabilities.
- * 
- * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
- * already set this API will return ::CUDA_ERROR_INVALID_VALUE.
- * 
- * The applications should set \p nvSciSyncAttrList to a valid 
- * NvSciSyncAttrList failing which this API will return
- * ::CUDA_ERROR_INVALID_HANDLE.
- * 
- * The \p flags controls how applications intends to use
- * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
- * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to 
- * signal an NvSciSync on this CUDA device.
- * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to 
- * wait on an NvSciSync on this CUDA device.
- *
- * At least one of these flags must be set, failing which the API
- * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal
- * to one another: a developer may set both these flags that allows to
- * set both wait and signal specific attributes in the same \p nvSciSyncAttrList.
- *
- * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
- * \param dev                   - Valid Cuda Device to get NvSciSync attributes for.
- * \param flags                 - flags describing NvSciSync usage.
- *
- * \return
- *
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa
- * ::cuImportExternalSemaphore,
- * ::cuDestroyExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags);
-
-/**
- * \brief Sets the current memory pool of a device
- *
- * The memory pool must be local to the specified device.
- * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device.
- * By default, a device's current memory pool is its default memory pool.
- *
- * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different
- * than the one the stream runs on. 
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync
- */
-CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool);
-
-/**
- * \brief Gets the current mempool for a device
- *
- * Returns the last pool provided to ::cuDeviceSetMemPool for this device
- * or the device's default memory pool if ::cuDeviceSetMemPool has never been called.
- * By default the current mempool is the default mempool for a device.
- * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool.
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool
- */
-CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev);
-
-/**
- * \brief Returns the default mempool of a device
- *
- * The default mempool of a device contains device memory from that device.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev);
-
-/**
- * \brief Blocks until remote writes are visible to the specified scope
- *
- * Blocks until GPUDirect RDMA writes to the target context via mappings
- * created through APIs like nvidia_p2p_get_pages (see
- * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
- * visible to the specified scope.
- *
- * If the scope equals or lies within the scope indicated by
- * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
- * will be a no-op and can be safely omitted for performance. This can be
- * determined by comparing the numerical values between the two enums, with
- * smaller scopes having smaller values.
- *
- * Users may query support for this API via
- * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS.
- *
- * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
- * \param scope  - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \notefnerr
- *
- */
-CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
-
-/** @} */ /* END CUDA_DEVICE */
-
-/**
- * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated device management functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the device management functions of the low-level
- * CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Returns properties for a selected device
- *
- * \deprecated
- *
- * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
- *
- * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
- * structure is defined as:
- *
- * \code
-     typedef struct CUdevprop_st {
-     int maxThreadsPerBlock;
-     int maxThreadsDim[3];
-     int maxGridSize[3];
-     int sharedMemPerBlock;
-     int totalConstantMemory;
-     int SIMDWidth;
-     int memPitch;
-     int regsPerBlock;
-     int clockRate;
-     int textureAlign
-  } CUdevprop;
- * \endcode
- * where:
- *
- * - ::maxThreadsPerBlock is the maximum number of threads per block;
- * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block;
- * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid;
- * - ::sharedMemPerBlock is the total amount of shared memory available per
- *   block in bytes;
- * - ::totalConstantMemory is the total amount of constant memory available on
- *   the device in bytes;
- * - ::SIMDWidth is the warp size;
- * - ::memPitch is the maximum pitch allowed by the memory copy functions that
- *   involve memory regions allocated through ::cuMemAllocPitch();
- * - ::regsPerBlock is the total number of registers available per block;
- * - ::clockRate is the clock frequency in kilohertz;
- * - ::textureAlign is the alignment requirement; texture base addresses that
- *   are aligned to ::textureAlign bytes do not need an offset applied to
- *   texture fetches.
- *
- * \param prop - Returned properties of device
- * \param dev  - Device to get properties for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
-
-/**
- * \brief Returns the compute capability of the device
- *
- * \deprecated
- *
- * This function was deprecated as of CUDA 5.0 and its functionality superceded
- * by ::cuDeviceGetAttribute().
- *
- * Returns in \p *major and \p *minor the major and minor revision numbers that
- * define the compute capability of the device \p dev.
- *
- * \param major - Major revision number
- * \param minor - Minor revision number
- * \param dev   - Device handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
-
-/** @} */ /* END CUDA_DEVICE_DEPRECATED */
-
-/**
- * \defgroup CUDA_PRIMARY_CTX Primary Context Management
- *
- * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
- * API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the primary context management functions of the low-level
- * CUDA driver application programming interface.
- *
- * The primary context is unique per device and shared with the CUDA runtime API.
- * These functions allow integration with other libraries using CUDA.
- *
- * @{
- */
-
-/**
- * \brief Retain the primary context on the GPU
- *
- * Retains the primary context on the device.
- * Once the user successfully retains the primary context, the primary context
- * will be active and available to the user until the user releases it
- * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset().
- * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack.
- *
- * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN
- * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function
- * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to
- * determine the compute mode  of the device.
- * The <i>nvidia-smi</i> tool can be used to set the compute mode for
- * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
- * -h option to it.
- *
- * Please note that the primary context always supports pinned allocations. Other
- * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
- *
- * \param pctx  - Returned context handle of the new context
- * \param dev   - Device for which primary context is requested
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa ::cuDevicePrimaryCtxRelease,
- * ::cuDevicePrimaryCtxSetFlags,
- * ::cuCtxCreate,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
-
-/**
- * \brief Release the primary context on the GPU
- *
- * Releases the primary context interop on the device.
- * A retained context should always be released once the user is done using
- * it. The context is automatically reset once the last reference to it is
- * released. This behavior is different when the primary context was retained
- * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary
- * context remains always active.
- *
- * Releasing a primary context that has not been previously retained will
- * fail with ::CUDA_ERROR_INVALID_CONTEXT.
- *
- * Please note that unlike ::cuCtxDestroy() this method does not pop the context
- * from stack in any circumstances.
- *
- * \param dev - Device which primary context is released
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuDevicePrimaryCtxRetain,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
-
-/**
- * \brief Set flags for the primary context
- *
- * Sets the flags for the primary context on the device overwriting perviously
- * set ones.
- *
- * The three LSBs of the \p flags parameter can be used to control how the OS
- * thread, which owns the CUDA context at the time of an API call, interacts
- * with the OS scheduler when waiting for results from the GPU. Only one of
- * the scheduling flags can be set when creating a context.
- *
- * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
- * results from the GPU. This can decrease latency when waiting for the GPU,
- * but may lower the performance of CPU threads if they are performing work in
- * parallel with the CUDA thread.
- *
- * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
- * results from the GPU. This can increase latency when waiting for the GPU,
- * but can increase the performance of CPU threads performing work in parallel
- * with the GPU.
- *
- * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- * synchronization primitive when waiting for the GPU to finish work.
- *
- * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- * synchronization primitive when waiting for the GPU to finish work. <br>
- * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
- * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
- *
- * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
- * uses a heuristic based on the number of active CUDA contexts in the
- * process \e C and the number of logical processors in the system \e P. If
- * \e C > \e P, then CUDA will yield to other OS threads when waiting for
- * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
- * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
- * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
- * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
- * for low-powered devices.
- *
- * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
- * after resizing local memory for a kernel. This can prevent thrashing by
- * local memory allocations when launching many kernels with high local
- * memory usage at the cost of potentially increased memory usage. <br>
- * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
- * by this flag is now the default and cannot be disabled.
- *
- * \param dev   - Device for which the primary context flags are set
- * \param flags - New flags for the device
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \notefnerr
- *
- * \sa ::cuDevicePrimaryCtxRetain,
- * ::cuDevicePrimaryCtxGetState,
- * ::cuCtxCreate,
- * ::cuCtxGetFlags,
- * ::cudaSetDeviceFlags
- */
-CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
-
-/**
- * \brief Get the state of the primary context
- *
- * Returns in \p *flags the flags for the primary context of \p dev, and in
- * \p *active whether it is active.  See ::cuDevicePrimaryCtxSetFlags for flag
- * values.
- *
- * \param dev    - Device to get primary context flags for
- * \param flags  - Pointer to store flags
- * \param active - Pointer to store context state; 0 = inactive, 1 = active
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \notefnerr
- *
- * \sa
- * ::cuDevicePrimaryCtxSetFlags,
- * ::cuCtxGetFlags,
- * ::cudaGetDeviceFlags
- */
-CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
-
-/**
- * \brief Destroy all allocations and reset all state on the primary context
- *
- * Explicitly destroys and cleans up all resources associated with the current
- * device in the current process.
- *
- * Note that it is responsibility of the calling function to ensure that no
- * other module in the process is using the device any more. For that reason
- * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
- * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
- * even after resetting the device.
- * Resetting the primary context does not release it, an application that has
- * retained the primary context should explicitly release its usage.
- *
- * \param dev - Device for which primary context is destroyed
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
- * \notefnerr
- *
- * \sa ::cuDevicePrimaryCtxRetain,
- * ::cuDevicePrimaryCtxRelease,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cudaDeviceReset
- */
-CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
-
-/** @} */ /* END CUDA_PRIMARY_CTX */
-
-/**
- * \brief Returns information about the execution affinity support of the device.
- *
- * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
- * The supported types are:
- * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device,
- *   or 0 if not;
- *
- * \param pi   - 1 if the execution affinity type \p type is supported by the device, or 0 if not
- * \param type - Execution affinity type to query
- * \param dev  - Device handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetCount,
- * ::cuDeviceGetName,
- * ::cuDeviceGetUuid,
- * ::cuDeviceGet,
- * ::cuDeviceTotalMem
- */
-CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev);
-
-/**
- * \defgroup CUDA_CTX Context Management
- *
- * ___MANBRIEF___ context management functions of the low-level CUDA driver
- * API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the context management functions of the low-level
- * CUDA driver application programming interface.
- *
- * Please note that some functions are described in
- * \ref CUDA_PRIMARY_CTX "Primary Context Management" section.
- *
- * @{
- */
-
-/**
- * \brief Create a CUDA context
- *
- * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain.
- *
- * Creates a new CUDA context and associates it with the calling thread. The
- * \p flags parameter is described below. The context is created with a usage
- * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or
- * when done using the context. If a context is already current to the thread,
- * it is supplanted by the newly created context and may be restored by a subsequent
- * call to ::cuCtxPopCurrent().
- *
- * The three LSBs of the \p flags parameter can be used to control how the OS
- * thread, which owns the CUDA context at the time of an API call, interacts
- * with the OS scheduler when waiting for results from the GPU. Only one of
- * the scheduling flags can be set when creating a context.
- *
- * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
- * results from the GPU. This can decrease latency when waiting for the GPU,
- * but may lower the performance of CPU threads if they are performing work in
- * parallel with the CUDA thread.
- *
- * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
- * results from the GPU. This can increase latency when waiting for the GPU,
- * but can increase the performance of CPU threads performing work in parallel
- * with the GPU.
- *
- * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- * synchronization primitive when waiting for the GPU to finish work.
- *
- * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- * synchronization primitive when waiting for the GPU to finish work. <br>
- * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
- * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
- *
- * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
- * uses a heuristic based on the number of active CUDA contexts in the
- * process \e C and the number of logical processors in the system \e P. If
- * \e C > \e P, then CUDA will yield to other OS threads when waiting for
- * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
- * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
- * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
- * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
- * for low-powered devices.
- *
- * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
- * This flag must be set in order to allocate pinned host memory that is
- * accessible to the GPU.
- *
- * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
- * after resizing local memory for a kernel. This can prevent thrashing by
- * local memory allocations when launching many kernels with high local
- * memory usage at the cost of potentially increased memory usage. <br>
- * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
- * by this flag is now the default and cannot be disabled.
- * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
- *
- * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
- * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
- * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
- * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
- * the compute mode for * devices.
- * Documentation for <i>nvidia-smi</i> can be obtained by passing a
- * -h option to it.
- *
- * \param pctx  - Returned context handle of the new context
- * \param flags - Context creation flags
- * \param dev   - Device to create context on
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
-
-/**
- * \brief Create a CUDA context with execution affinity
- *
- * Creates a new CUDA context with execution affinity and associates it with
- * the calling thread. The \p paramsArray and \p flags parameter are described below.
- * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must
- * call ::cuCtxDestroy() or when done using the context. If a context is already
- * current to the thread, it is supplanted by the newly created context and may
- * be restored by a subsequent call to ::cuCtxPopCurrent().
- *
- * The type and the amount of execution resource the context can use is limited by \p paramsArray
- * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
- * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
- * the latter execution affinity parameter overrides the former execution affinity parameter.
- * The supported execution affinity types are:
- * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
- *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
- *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
- *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
- *   is only supported under Volta+ MPS.
- *
- * The three LSBs of the \p flags parameter can be used to control how the OS
- * thread, which owns the CUDA context at the time of an API call, interacts
- * with the OS scheduler when waiting for results from the GPU. Only one of
- * the scheduling flags can be set when creating a context.
- *
- * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
- * results from the GPU. This can decrease latency when waiting for the GPU,
- * but may lower the performance of CPU threads if they are performing work in
- * parallel with the CUDA thread.
- *
- * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
- * results from the GPU. This can increase latency when waiting for the GPU,
- * but can increase the performance of CPU threads performing work in parallel
- * with the GPU.
- *
- * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- * synchronization primitive when waiting for the GPU to finish work.
- *
- * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
- * synchronization primitive when waiting for the GPU to finish work. <br>
- * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
- * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
- *
- * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
- * uses a heuristic based on the number of active CUDA contexts in the
- * process \e C and the number of logical processors in the system \e P. If
- * \e C > \e P, then CUDA will yield to other OS threads when waiting for
- * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
- * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
- * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
- * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
- * for low-powered devices.
- *
- * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
- * This flag must be set in order to allocate pinned host memory that is
- * accessible to the GPU.
- *
- * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
- * after resizing local memory for a kernel. This can prevent thrashing by
- * local memory allocations when launching many kernels with high local
- * memory usage at the cost of potentially increased memory usage. <br>
- * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
- * by this flag is now the default and cannot be disabled.
- * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
- *
- * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
- * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
- * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
- * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
- * the compute mode for * devices.
- * Documentation for <i>nvidia-smi</i> can be obtained by passing a
- * -h option to it.
- *
- * \param pctx        - Returned context handle of the new context
- * \param paramsArray - Execution affinity parameters
- * \param numParams   - Number of execution affinity parameters
- * \param flags       - Context creation flags
- * \param dev         - Device to create context on
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::CUexecAffinityParam
- */
-CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
-
-/**
- * \brief Destroy a CUDA context
- *
- * Destroys the CUDA context specified by \p ctx.  The context \p ctx will be
- * destroyed regardless of how many threads it is current to.
- * It is the responsibility of the calling function to ensure that no API
- * call issues using \p ctx while ::cuCtxDestroy() is executing.
- *
- * Destroys and cleans up all resources associated with the context.
- * It is the caller's responsibility to ensure that the context or its resources
- * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
- * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
- * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
- * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
- *
- * If \p ctx is current to the calling thread then \p ctx will also be
- * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
- * were called).  If \p ctx is current to other threads, then \p ctx will
- * remain current to those threads, and attempting to access \p ctx from
- * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
- *
- * \param ctx - Context to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
-
-/**
- * \brief Pushes a context on the current CPU thread
- *
- * Pushes the given context \p ctx onto the CPU thread's stack of current
- * contexts. The specified context becomes the CPU thread's current context, so
- * all CUDA functions that operate on the current context are affected.
- *
- * The previous current context may be made current again by calling
- * ::cuCtxDestroy() or ::cuCtxPopCurrent().
- *
- * \param ctx - Context to push
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
-
-/**
- * \brief Pops the current CUDA context from the current CPU thread.
- *
- * Pops the current CUDA context from the CPU thread and passes back the
- * old context handle in \p *pctx. That context may then be made current
- * to a different CPU thread by calling ::cuCtxPushCurrent().
- *
- * If a context was current to the CPU thread before ::cuCtxCreate() or
- * ::cuCtxPushCurrent() was called, this function makes that context current to
- * the CPU thread again.
- *
- * \param pctx - Returned new context handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
-
-/**
- * \brief Binds the specified CUDA context to the calling CPU thread
- *
- * Binds the specified CUDA context to the calling CPU thread.
- * If \p ctx is NULL then the CUDA context previously bound to the
- * calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
- *
- * If there exists a CUDA context stack on the calling CPU thread, this
- * will replace the top of that stack with \p ctx.
- * If \p ctx is NULL then this will be equivalent to popping the top
- * of the calling CPU thread's CUDA context stack (or a no-op if the
- * calling CPU thread's CUDA context stack is empty).
- *
- * \param ctx - Context to bind to the calling CPU thread
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa
- * ::cuCtxGetCurrent,
- * ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cudaSetDevice
- */
-CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
-
-/**
- * \brief Returns the CUDA context bound to the calling CPU thread.
- *
- * Returns in \p *pctx the CUDA context bound to the calling CPU thread.
- * If no context is bound to the calling CPU thread then \p *pctx is
- * set to NULL and ::CUDA_SUCCESS is returned.
- *
- * \param pctx - Returned context handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * \notefnerr
- *
- * \sa
- * ::cuCtxSetCurrent,
- * ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cudaGetDevice
- */
-CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
-
-/**
- * \brief Returns the device ID for the current context
- *
- * Returns in \p *device the ordinal of the current context's device.
- *
- * \param device - Returned device ID for the current context
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cudaGetDevice
- */
-CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
-
-/**
- * \brief Returns the flags for the current context
- *
- * Returns in \p *flags the flags of the current context. See ::cuCtxCreate
- * for flag values.
- *
- * \param flags - Pointer to store flags of current context
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetCurrent,
- * ::cuCtxGetDevice,
- * ::cuCtxGetLimit,
- * ::cuCtxGetSharedMemConfig,
- * ::cuCtxGetStreamPriorityRange,
- * ::cudaGetDeviceFlags
- */
-CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags);
-
-/**
- * \brief Block for a context's tasks to complete
- *
- * Blocks until the device has completed all preceding requested tasks.
- * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
- * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
- * CPU thread will block until the GPU context has finished its work.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cudaDeviceSynchronize
- */
-CUresult CUDAAPI cuCtxSynchronize(void);
-
-/**
- * \brief Set resource limits
- *
- * Setting \p limit to \p value is a request by the application to update
- * the current limit maintained by the context. The driver is free to
- * modify the requested value to meet h/w requirements (this could be
- * clamping to minimum or maximum values, rounding up to nearest element
- * size, etc). The application can use ::cuCtxGetLimit() to find out exactly
- * what the limit has been set to.
- *
- * Setting each ::CUlimit has its own specific restrictions, so each is
- * discussed here.
- *
- * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread.
- *   The driver automatically increases the per-thread stack size
- *   for each kernel launch as needed. This size isn't reset back to the
- *   original value after each launch. Setting this value will take effect 
- *   immediately, and if necessary, the device will block until all preceding 
- *   requested tasks are complete.
- *
- * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
- *   by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
- *   must be performed before launching any kernel that uses the ::printf()
- *   device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
- *   by the ::malloc() and ::free() device system calls. Setting
- *   ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
- *   that uses the ::malloc() or ::free() device system calls, otherwise
- *   ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
- *   a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
- *   this limit must be performed before any launch of a kernel that uses the
- *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
- *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
- *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
- *   violated. This limit can be set smaller than the default or up the maximum
- *   launch depth of 24. When setting this limit, keep in mind that additional
- *   levels of sync depth require the driver to reserve large amounts of device
- *   memory which can no longer be used for user allocations. If these
- *   reservations of device memory fail, ::cuCtxSetLimit() will return
- *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
- *   This limit is only applicable to devices of compute capability 3.5 and
- *   higher. Attempting to set this limit on devices of compute capability less
- *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
- *   returned.
- *
- * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
- *   outstanding device runtime launches that can be made from the current
- *   context. A grid is outstanding from the point of launch up until the grid
- *   is known to have been completed. Device runtime launches which violate
- *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
- *   ::cudaGetLastError() is called after launch. If more pending launches than
- *   the default (2048 launches) are needed for a module using the device
- *   runtime, this limit can be increased. Keep in mind that being able to
- *   sustain additional pending launches will require the driver to reserve
- *   larger amounts of device memory upfront which can no longer be used for
- *   allocations. If these reservations fail, ::cuCtxSetLimit() will return
- *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
- *   This limit is only applicable to devices of compute capability 3.5 and
- *   higher. Attempting to set this limit on devices of compute capability less
- *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
- *   returned.
- *
- * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
- *   Values can range from 0B to 128B. This is purely a performence hint and
- *   it can be ignored or clamped depending on the platform.
- *
- * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes availabe for
- *   persisting L2 cache. This is purely a performance hint and it can be
- *   ignored or clamped depending on the platform.
- *
- * \param limit - Limit to set
- * \param value - Size of limit
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNSUPPORTED_LIMIT,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSynchronize,
- * ::cudaDeviceSetLimit
- */
-CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
-
-/**
- * \brief Returns resource limits
- *
- * Returns in \p *pvalue the current size of \p limit.  The supported
- * ::CUlimit values are:
- * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread.
- * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the
- *   ::printf() device system call.
- * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the
- *   ::malloc() and ::free() device system calls.
- * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread
- *   can issue the device runtime call ::cudaDeviceSynchronize() to wait on
- *   child grid launches to complete.
- * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding
- *   device runtime launches that can be made from this context.
- * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity.
- * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes
- *
- * \param limit  - Limit to query
- * \param pvalue - Returned size of limit
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNSUPPORTED_LIMIT
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cudaDeviceGetLimit
- */
-CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
-
-/**
- * \brief Returns the preferred cache configuration for the current context.
- *
- * On devices where the L1 cache and shared memory use the same hardware
- * resources, this function returns through \p pconfig the preferred cache configuration
- * for the current context. This is only a preference. The driver will use
- * the requested configuration if possible, but it is free to choose a different
- * configuration if required to execute functions.
- *
- * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
- * where the size of the L1 cache and shared memory are fixed.
- *
- * The supported cache configurations are:
- * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
- * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
- * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
- * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
- *
- * \param pconfig - Returned cache configuration
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cuFuncSetCacheConfig,
- * ::cudaDeviceGetCacheConfig
- */
-CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
-
-/**
- * \brief Sets the preferred cache configuration for the current context.
- *
- * On devices where the L1 cache and shared memory use the same hardware
- * resources, this sets through \p config the preferred cache configuration for
- * the current context. This is only a preference. The driver will use
- * the requested configuration if possible, but it is free to choose a different
- * configuration if required to execute the function. Any function preference
- * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide
- * setting. Setting the context-wide cache configuration to
- * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
- * to not change the cache configuration unless required to launch the kernel.
- *
- * This setting does nothing on devices where the size of the L1 cache and
- * shared memory are fixed.
- *
- * Launching a kernel with a different preference than the most recent
- * preference setting may insert a device-side synchronization point.
- *
- * The supported cache configurations are:
- * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
- * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
- * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
- * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
- *
- * \param config - Requested cache configuration
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cuFuncSetCacheConfig,
- * ::cudaDeviceSetCacheConfig
- */
-CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
-
-/**
- * \brief Returns the current shared memory configuration for the current context.
- *
- * This function will return in \p pConfig the current size of shared memory banks
- * in the current context. On devices with configurable shared memory banks,
- * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
- * subsequent kernel launches will by default use the new bank size. When
- * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
- * memory, it will return the fixed bank size of the hardware.
- *
- * The returned bank configurations can be either:
- * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:  shared memory bank width is
- *   four bytes.
- * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will
- *   eight bytes.
- *
- * \param pConfig - returned shared memory configuration
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cuCtxGetSharedMemConfig,
- * ::cuFuncSetCacheConfig,
- * ::cudaDeviceGetSharedMemConfig
- */
-CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
-
-/**
- * \brief Sets the shared memory configuration for the current context.
- *
- * On devices with configurable shared memory banks, this function will set
- * the context's shared memory bank size which is used for subsequent kernel
- * launches.
- *
- * Changed the shared memory configuration between launches may insert a device
- * side synchronization point between those launches.
- *
- * Changing the shared memory bank size will not increase shared memory usage
- * or affect occupancy of kernels, but may have major effects on performance.
- * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- * but will change what kinds of accesses to shared memory will result in bank
- * conflicts.
- *
- * This function will do nothing on devices with fixed shared memory bank size.
- *
- * The supported bank configurations are:
- * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
- *   setting (currently, four bytes).
- * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
- *   be natively four bytes.
- * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
- *   be natively eight bytes.
- *
- * \param config - requested shared memory configuration
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cuCtxGetSharedMemConfig,
- * ::cuFuncSetCacheConfig,
- * ::cudaDeviceSetSharedMemConfig
- */
-CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
-
-/**
- * \brief Gets the context's API version.
- *
- * Returns a version number in \p version corresponding to the capabilities of
- * the context (e.g. 3010 or 3020), which library developers can use to direct
- * callers to a specific API version. If \p ctx is NULL, returns the API version
- * used to create the currently bound context.
- *
- * Note that new API versions are only introduced when context capabilities are
- * changed that break binary compatibility, so the API version and driver version
- * may be different. For example, it is valid for the API version to be 3020 while
- * the driver version is 4020.
- *
- * \param ctx     - Context to check
- * \param version - Pointer to version
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
-
-/**
- * \brief Returns numerical values that correspond to the least and
- * greatest stream priorities.
- *
- * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
- * to the least and greatest stream priorities respectively. Stream priorities
- * follow a convention where lower numbers imply greater priorities. The range of
- * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
- * If the user attempts to create a stream with a priority value that is
- * outside the meaningful range as specified by this API, the priority is
- * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
- * respectively. See ::cuStreamCreateWithPriority for details on creating a
- * priority stream.
- * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
- * is not desired.
- *
- * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
- * the current context's device does not support stream priorities
- * (see ::cuDeviceGetAttribute).
- *
- * \param leastPriority    - Pointer to an int in which the numerical value for least
- *                           stream priority is returned
- * \param greatestPriority - Pointer to an int in which the numerical value for greatest
- *                           stream priority is returned
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \notefnerr
- *
- * \sa ::cuStreamCreateWithPriority,
- * ::cuStreamGetPriority,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize,
- * ::cudaDeviceGetStreamPriorityRange
- */
-CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
-
-/**
- * \brief Resets all persisting lines in cache to normal status.
- *
- * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal
- * status. Takes effect on function return.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
-
-/**
- * \brief Returns the execution affinity setting for the current context.
- *
- * Returns in \p *pExecAffinity the current value of \p type. The supported
- * ::CUexecAffinityType values are:
- * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use.
- *
- * \param type          - Execution affinity type to query
- * \param pExecAffinity - Returned execution affinity
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY
- * \notefnerr
- *
- * \sa
- * ::CUexecAffinityParam
- */
-CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
-
-
-/** @} */ /* END CUDA_CTX */
-
-/**
- * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the deprecated context management functions of the low-level
- * CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Increment a context's usage-count
- *
- * \deprecated
- *
- * Note that this function is deprecated and should not be used.
- *
- * Increments the usage count of the context and passes back a context handle
- * in \p *pctx that must be passed to ::cuCtxDetach() when the application is
- * done with the context. ::cuCtxAttach() fails if there is no context current
- * to the thread.
- *
- * Currently, the \p flags parameter must be 0.
- *
- * \param pctx  - Returned context handle of the current context
- * \param flags - Context attach flags (must be 0)
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxDetach,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
-
-/**
- * \brief Decrement a context's usage-count
- *
- * \deprecated
- *
- * Note that this function is deprecated and should not be used.
- *
- * Decrements the usage count of the context \p ctx, and destroys the context
- * if the usage count goes to 0. The context must be a handle that was passed
- * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
- * calling thread.
- *
- * \param ctx - Context to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuCtxCreate,
- * ::cuCtxDestroy,
- * ::cuCtxGetApiVersion,
- * ::cuCtxGetCacheConfig,
- * ::cuCtxGetDevice,
- * ::cuCtxGetFlags,
- * ::cuCtxGetLimit,
- * ::cuCtxPopCurrent,
- * ::cuCtxPushCurrent,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxSetLimit,
- * ::cuCtxSynchronize
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
-
-/** @} */ /* END CUDA_CTX_DEPRECATED */
-
-
-/**
- * \defgroup CUDA_MODULE Module Management
- *
- * ___MANBRIEF___ module management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the module management functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Loads a compute module
- *
- * Takes a filename \p fname and loads the corresponding module \p module into
- * the current context. The CUDA driver API does not attempt to lazily
- * allocate the resources needed by a module; if the memory for functions and
- * data (constant and global) needed by the module cannot be allocated,
- * ::cuModuleLoad() fails. The file should be a \e cubin file as output by
- * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
- * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
- *
- * \param module - Returned module
- * \param fname  - Filename of module to load
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_PTX,
- * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
- * ::CUDA_ERROR_NOT_FOUND,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_FILE_NOT_FOUND,
- * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
- * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload
- */
-CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
-
-/**
- * \brief Load a module's data
- *
- * Takes a pointer \p image and loads the corresponding module \p module into
- * the current context. The pointer may be obtained by mapping a \e cubin or
- * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
- * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
- * object into the executable resources and using operating system calls such
- * as Windows \c FindResource() to obtain the pointer.
- *
- * \param module - Returned module
- * \param image  - Module data to load
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_PTX,
- * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
- * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload
- */
-CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
-
-/**
- * \brief Load a module's data with options
- *
- * Takes a pointer \p image and loads the corresponding module \p module into
- * the current context. The pointer may be obtained by mapping a \e cubin or
- * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
- * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
- * object into the executable resources and using operating system calls such
- * as Windows \c FindResource() to obtain the pointer. Options are passed as
- * an array via \p options and any corresponding parameters are passed in
- * \p optionValues. The number of total options is supplied via \p numOptions.
- * Any outputs will be returned via \p optionValues.
- *
- * \param module       - Returned module
- * \param image        - Module data to load
- * \param numOptions   - Number of options
- * \param options      - Options for JIT
- * \param optionValues - Option values for JIT
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_PTX,
- * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
- * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload
- */
-CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-
-/**
- * \brief Load a module's data
- *
- * Takes a pointer \p fatCubin and loads the corresponding module \p module
- * into the current context. The pointer represents a <i>fat binary</i> object,
- * which is a collection of different \e cubin and/or \e PTX files, all
- * representing the same device code, but compiled and optimized for different
- * architectures.
- *
- * Prior to CUDA 4.0, there was no documented API for constructing and using
- * fat binary objects by programmers.  Starting with CUDA 4.0, fat binary
- * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
- * More information can be found in the \b nvcc document.
- *
- * \param module   - Returned module
- * \param fatCubin - Fat binary to load
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_PTX,
- * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
- * ::CUDA_ERROR_NOT_FOUND,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
- * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleUnload
- */
-CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
-
-/**
- * \brief Unloads a module
- *
- * Unloads a module \p hmod from the current context.
- *
- * \param hmod - Module to unload
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_destroy_ub
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary
- */
-CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
-
-/**
- * \brief Returns a function handle
- *
- * Returns in \p *hfunc the handle of the function of name \p name located in
- * module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
- * returns ::CUDA_ERROR_NOT_FOUND.
- *
- * \param hfunc - Returned function handle
- * \param hmod  - Module to retrieve function from
- * \param name  - Name of function to retrieve
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload
- */
-CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
-
-/**
- * \brief Returns a global pointer from a module
- *
- * Returns in \p *dptr and \p *bytes the base pointer and size of the
- * global of name \p name located in module \p hmod. If no variable of that name
- * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both
- * parameters \p dptr and \p bytes are optional. If one of them is
- * NULL, it is ignored.
- *
- * \param dptr  - Returned global device pointer
- * \param bytes - Returned global size in bytes
- * \param hmod  - Module to retrieve global from
- * \param name  - Name of global to retrieve
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload,
- * ::cudaGetSymbolAddress,
- * ::cudaGetSymbolSize
- */
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
-
-/**
- * \brief Returns a handle to a texture reference
- *
- * Returns in \p *pTexRef the handle of the texture reference of name \p name
- * in the module \p hmod. If no texture reference of that name exists,
- * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
- * handle should not be destroyed, since it will be destroyed when the module
- * is unloaded.
- *
- * \param pTexRef  - Returned texture reference
- * \param hmod     - Module to retrieve texture reference from
- * \param name     - Name of texture reference to retrieve
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetSurfRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload,
- * ::cudaGetTextureReference
- */
-CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
-
-/**
- * \brief Returns a handle to a surface reference
- *
- * Returns in \p *pSurfRef the handle of the surface reference of name \p name
- * in the module \p hmod. If no surface reference of that name exists,
- * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
- *
- * \param pSurfRef  - Returned surface reference
- * \param hmod     - Module to retrieve surface reference from
- * \param name     - Name of surface reference to retrieve
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuModuleGetFunction,
- * ::cuModuleGetGlobal,
- * ::cuModuleGetTexRef,
- * ::cuModuleLoad,
- * ::cuModuleLoadData,
- * ::cuModuleLoadDataEx,
- * ::cuModuleLoadFatBinary,
- * ::cuModuleUnload,
- * ::cudaGetSurfaceReference
- */
-CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
-
-/**
- * \brief Creates a pending JIT linker invocation.
- *
- * If the call is successful, the caller owns the returned CUlinkState, which
- * should eventually be destroyed with ::cuLinkDestroy.  The
- * device code machine size (32 or 64 bit) will match the calling application.
- *
- * Both linker and compiler options may be specified.  Compiler options will
- * be applied to inputs to this linker action which must be compiled from PTX.
- * The options ::CU_JIT_WALL_TIME,
- * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
- * will accumulate data until the CUlinkState is destroyed.
- *
- * \p optionValues must remain valid for the life of the CUlinkState if output
- * options are used.  No other references to inputs are maintained after this
- * call returns.
- *
- * \param numOptions   Size of options arrays
- * \param options      Array of linker and compiler options
- * \param optionValues Array of option values, each cast to void *
- * \param stateOut     On success, this will contain a CUlinkState to specify
- *                     and complete this action
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
- * \notefnerr
- *
- * \sa ::cuLinkAddData,
- * ::cuLinkAddFile,
- * ::cuLinkComplete,
- * ::cuLinkDestroy
- */
-CUresult CUDAAPI
-cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
-
-/**
- * \brief Add an input to a pending linker invocation
- *
- * Ownership of \p data is retained by the caller.  No reference is retained to any
- * inputs after this call returns.
- *
- * This method accepts only compiler options, which are used if the data must
- * be compiled from PTX, and does not accept any of
- * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
- * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
- *
- * \param state        A pending linker action.
- * \param type         The type of the input data.
- * \param data         The input data.  PTX must be NULL-terminated.
- * \param size         The length of the input data.
- * \param name         An optional name for this input in log messages.
- * \param numOptions   Size of options.
- * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate).
- * \param optionValues Array of option values, each cast to void *.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_IMAGE,
- * ::CUDA_ERROR_INVALID_PTX,
- * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_NO_BINARY_FOR_GPU
- *
- * \sa ::cuLinkCreate,
- * ::cuLinkAddFile,
- * ::cuLinkComplete,
- * ::cuLinkDestroy
- */
-CUresult CUDAAPI
-cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
-    unsigned int numOptions, CUjit_option *options, void **optionValues);
-
-/**
- * \brief Add a file input to a pending linker invocation
- *
- * No reference is retained to any inputs after this call returns.
- *
- * This method accepts only compiler options, which are used if the input
- * must be compiled from PTX, and does not accept any of
- * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
- * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
- *
- * This method is equivalent to invoking ::cuLinkAddData on the contents
- * of the file.
- *
- * \param state        A pending linker action
- * \param type         The type of the input data
- * \param path         Path to the input file
- * \param numOptions   Size of options
- * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate)
- * \param optionValues Array of option values, each cast to void *
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_FILE_NOT_FOUND
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_IMAGE,
- * ::CUDA_ERROR_INVALID_PTX,
- * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_NO_BINARY_FOR_GPU
- *
- * \sa ::cuLinkCreate,
- * ::cuLinkAddData,
- * ::cuLinkComplete,
- * ::cuLinkDestroy
- */
-CUresult CUDAAPI
-cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
-    unsigned int numOptions, CUjit_option *options, void **optionValues);
-
-/**
- * \brief Complete a pending linker invocation
- *
- * Completes the pending linker action and returns the cubin image for the linked
- * device code, which can be used with ::cuModuleLoadData.  The cubin is owned by
- * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy.
- * This call does not destroy \p state.
- *
- * \param state    A pending linker invocation
- * \param cubinOut On success, this will point to the output image
- * \param sizeOut  Optional parameter to receive the size of the generated image
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuLinkCreate,
- * ::cuLinkAddData,
- * ::cuLinkAddFile,
- * ::cuLinkDestroy,
- * ::cuModuleLoadData
- */
-CUresult CUDAAPI
-cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
-
-/**
- * \brief Destroys state for a JIT linker invocation.
- *
- * \param state State object for the linker invocation
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_HANDLE
- *
- * \sa ::cuLinkCreate
- */
-CUresult CUDAAPI
-cuLinkDestroy(CUlinkState state);
-
-/** @} */ /* END CUDA_MODULE */
-
-
-/**
- * \defgroup CUDA_MEM Memory Management
- *
- * ___MANBRIEF___ memory management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the memory management functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Gets free and total memory
- *
- * Returns in \p *total the total amount of memory available to the the current context.
- * Returns in \p *free the amount of memory on the device that is free according to the OS.
- * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
- *
- * \param free  - Returned free memory in bytes
- * \param total - Returned total memory in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemGetInfo
- */
-CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
-
-/**
- * \brief Allocates device memory
- *
- * Allocates \p bytesize bytes of linear memory on the device and returns in
- * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
- *
- * \param dptr     - Returned device pointer
- * \param bytesize - Requested allocation size in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMalloc
- */
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
-
-/**
- * \brief Allocates pitched device memory
- *
- * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
- * the device and returns in \p *dptr a pointer to the allocated memory. The
- * function may pad the allocation to ensure that corresponding pointers in
- * any given row will continue to meet the alignment requirements for
- * coalescing as the address is updated from row to row. \p ElementSizeBytes
- * specifies the size of the largest reads and writes that will be performed
- * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
- * memory transactions are not possible on other data sizes). If
- * \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
- * the kernel will run correctly, but possibly at reduced speed. The pitch
- * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
- * allocation. The intended usage of pitch is as a separate parameter of the
- * allocation, used to compute addresses within the 2D array. Given the row
- * and column of an array element of type \b T, the address is computed as:
- * \code
-   T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
- * \endcode
- *
- * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
- * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
- * recommended that programmers consider performing pitch allocations using
- * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
- * especially true if the application will be performing 2D memory copies
- * between different regions of device memory (whether linear memory or CUDA
- * arrays).
- *
- * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
- * to match or exceed the alignment requirement for texture binding with
- * ::cuTexRefSetAddress2D().
- *
- * \param dptr             - Returned device pointer
- * \param pPitch           - Returned pitch of allocation in bytes
- * \param WidthInBytes     - Requested allocation width in bytes
- * \param Height           - Requested allocation height in rows
- * \param ElementSizeBytes - Size of largest reads/writes for range
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMallocPitch
- */
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
-
-/**
- * \brief Frees device memory
- *
- * Frees the memory space pointed to by \p dptr, which must have been returned
- * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch().
- *
- * \param dptr - Pointer to memory to free
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaFree
- */
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
-
-/**
- * \brief Get information on memory allocations
- *
- * Returns the base address in \p *pbase and size in \p *psize of the
- * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
- * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
- * of them is NULL, it is ignored.
- *
- * \param pbase - Returned base address
- * \param psize - Returned size of device memory allocation
- * \param dptr  - Device pointer to query
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_NOT_FOUND,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
- */
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
-
-/**
- * \brief Allocates page-locked host memory
- *
- * Allocates \p bytesize bytes of host memory that is page-locked and
- * accessible to the device. The driver tracks the virtual memory ranges
- * allocated with this function and automatically accelerates calls to
- * functions such as ::cuMemcpy(). Since the memory can be accessed directly by
- * the device, it can be read or written with much higher bandwidth than
- * pageable memory obtained with functions such as ::malloc(). Allocating
- * excessive amounts of memory with ::cuMemAllocHost() may degrade system
- * performance, since it reduces the amount of memory available to the system
- * for paging. As a result, this function is best used sparingly to allocate
- * staging areas for data exchange between host and device.
- *
- * Note all host memory allocated using ::cuMemHostAlloc() will automatically
- * be immediately accessible to all contexts on all devices which support unified
- * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
- * The device pointer that may be used to access this host memory from those
- * contexts is always equal to the returned host pointer \p *pp.
- * See \ref CUDA_UNIFIED for additional details.
- *
- * \param pp       - Returned host pointer to page-locked memory
- * \param bytesize - Requested allocation size in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMallocHost
- */
-CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
-
-/**
- * \brief Frees page-locked host memory
- *
- * Frees the memory space pointed to by \p p, which must have been returned by
- * a previous call to ::cuMemAllocHost().
- *
- * \param p - Pointer to memory to free
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaFreeHost
- */
-CUresult CUDAAPI cuMemFreeHost(void *p);
-
-/**
- * \brief Allocates page-locked host memory
- *
- * Allocates \p bytesize bytes of host memory that is page-locked and accessible
- * to the device. The driver tracks the virtual memory ranges allocated with
- * this function and automatically accelerates calls to functions such as
- * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
- * it can be read or written with much higher bandwidth than pageable memory
- * obtained with functions such as ::malloc(). Allocating excessive amounts of
- * pinned memory may degrade system performance, since it reduces the amount
- * of memory available to the system for paging. As a result, this function is
- * best used sparingly to allocate staging areas for data exchange between
- * host and device.
- *
- * The \p Flags parameter enables different options to be specified that
- * affect the allocation, as follows.
- *
- * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
- *   considered as pinned memory by all CUDA contexts, not just the one that
- *   performed the allocation.
- *
- * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
- *   space. The device pointer to the memory may be obtained by calling
- *   ::cuMemHostGetDevicePointer().
- *
- * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
- *   (WC). WC memory can be transferred across the PCI Express bus more
- *   quickly on some system configurations, but cannot be read efficiently by
- *   most CPUs. WC memory is a good option for buffers that will be written by
- *   the CPU and read by the GPU via mapped pinned memory or host->device
- *   transfers.
- *
- * All of these flags are orthogonal to one another: a developer may allocate
- * memory that is portable, mapped and/or write-combined with no restrictions.
- *
- * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
- * devices that do not support mapped pinned memory. The failure is deferred
- * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
- * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
- *
- * The memory allocated by this function must be freed with ::cuMemFreeHost().
- *
- * Note all host memory allocated using ::cuMemHostAlloc() will automatically
- * be immediately accessible to all contexts on all devices which support unified
- * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
- * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
- * that may be used to access this host memory from those contexts is always equal
- * to the returned host pointer \p *pp.  If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
- * is specified, then the function ::cuMemHostGetDevicePointer() must be used
- * to query the device pointer, even if the context supports unified addressing.
- * See \ref CUDA_UNIFIED for additional details.
- *
- * \param pp       - Returned host pointer to page-locked memory
- * \param bytesize - Requested allocation size in bytes
- * \param Flags    - Flags for allocation request
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaHostAlloc
- */
-CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
-
-/**
- * \brief Passes back device pointer of mapped pinned memory
- *
- * Passes back the device pointer \p pdptr corresponding to the mapped, pinned
- * host buffer \p p allocated by ::cuMemHostAlloc.
- *
- * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
- * flag was not specified at the time the memory was allocated, or if the
- * function is called on a GPU that does not support mapped pinned memory.
- *
- * For devices that have a non-zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
- * can also be accessed from the device using the host pointer \p p.
- * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
- * match the original host pointer \p p and depends on the devices visible to the
- * application. If all devices visible to the application have a non-zero value for the
- * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
- * will match the original pointer \p p. If any device visible to the application
- * has a zero value for the device attribute, the device pointer returned by
- * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
- * but it will be suitable for use on all devices provided Unified Virtual Addressing
- * is enabled. In such systems, it is valid to access the memory using either pointer
- * on devices that have a non-zero value for the device attribute. Note however that
- * such devices should access the memory using only of the two pointers and not both.
- *
- * \p Flags provides for future releases. For now, it must be set to 0.
- *
- * \param pdptr - Returned device pointer
- * \param p     - Host pointer
- * \param Flags - Options (must be 0)
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaHostGetDevicePointer
- */
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
-
-/**
- * \brief Passes back flags that were used for a pinned allocation
- *
- * Passes back the flags \p pFlags that were specified when allocating
- * the pinned host buffer \p p allocated by ::cuMemHostAlloc.
- *
- * ::cuMemHostGetFlags() will fail if the pointer does not reside in
- * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
- *
- * \param pFlags - Returned flags word
- * \param p     - Host pointer
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cuMemAllocHost,
- * ::cuMemHostAlloc,
- * ::cudaHostGetFlags
- */
-CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
-
-/**
- * \brief Allocates memory that will be automatically managed by the Unified Memory system
- *
- * Allocates \p bytesize bytes of managed memory on the device and returns in
- * \p *dptr a pointer to the allocated memory. If the device doesn't support
- * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support
- * for managed memory can be queried using the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p bytesize
- * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
- * is valid on the CPU and on all GPUs in the system that support managed memory.
- * All accesses to this pointer must obey the Unified Memory programming model.
- *
- * \p flags specifies the default stream association for this allocation.
- * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
- * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
- * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
- * allocation should not be accessed from devices that have a zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
- * ::cuStreamAttachMemAsync will be required to enable access on such devices.
- *
- * If the association is later changed via ::cuStreamAttachMemAsync to
- * a single stream, the default association as specifed during ::cuMemAllocManaged
- * is restored when that stream is destroyed. For __managed__ variables, the
- * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
- * stream is an asynchronous operation, and as a result, the change to default
- * association won't happen until all work in the stream has completed.
- *
- * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
- *
- * Device memory oversubscription is possible for GPUs that have a non-zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
- * such GPUs may be evicted from device memory to host memory at any time by the Unified
- * Memory driver in order to make room for other allocations.
- *
- * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
- * API returns and instead may be populated on access. In such systems, managed memory can
- * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
- * maintain data locality and prevent excessive page faults to the extent possible. The application
- * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
- * can also explicitly migrate memory to a desired processor's memory via
- * ::cuMemPrefetchAsync.
- *
- * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
- * with each other, the physical storage for managed memory is created on the GPU which is active
- * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
- * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
- * memory among such GPUs.
- *
- * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
- * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
- * is zero for at least one of those GPUs, the location chosen for physical storage of managed
- * memory is system-dependent.
- * - On Linux, the location chosen will be device memory as long as the current set of active
- * contexts are on devices that either have peer-to-peer support with each other or have a
- * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
- * If there is an active context on a GPU that does not have a non-zero value for that device
- * attribute and it does not have peer-to-peer support with the other devices that have active
- * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
- * Note that this means that managed memory that is located in device memory is migrated to
- * host memory if a new context is created on a GPU that doesn't have a non-zero value for
- * the device attribute and does not support peer-to-peer with at least one of the other devices
- * that has an active context. This in turn implies that context creation may fail if there is
- * insufficient host memory to migrate all managed allocations.
- * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
- * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
- * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
- * restrict CUDA to only use those GPUs that have peer-to-peer support.
- * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
- * non-zero value to force the driver to always use device memory for physical storage.
- * When this environment variable is set to a non-zero value, all contexts created in
- * that process on devices that support managed memory have to be peer-to-peer compatible
- * with each other. Context creation will fail if a context is created on a device that
- * supports managed memory and is not peer-to-peer compatible with any of the other
- * managed memory supporting devices on which contexts were previously created, even if
- * those contexts have been destroyed. These environment variables are described
- * in the CUDA programming guide under the "CUDA environment variables" section.
- * - On ARM, managed memory is not available on discrete gpu with Drive PX-2.
- *
- * \param dptr     - Returned device pointer
- * \param bytesize - Requested allocation size in bytes
- * \param flags    - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync,
- * ::cudaMallocManaged
- */
-CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
-
-/**
- * \brief Returns a handle to a compute device
- *
- * Returns in \p *device a device handle given a PCI bus ID string.
- *
- * \param dev      - Returned device handle
- *
- * \param pciBusId - String in one of the following forms:
- * [domain]:[bus]:[device].[function]
- * [domain]:[bus]:[device]
- * [bus]:[device].[function]
- * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGet,
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetPCIBusId,
- * ::cudaDeviceGetByPCIBusId
- */
-CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId);
-
-/**
- * \brief Returns a PCI Bus Id string for the device
- *
- * Returns an ASCII string identifying the device \p dev in the NULL-terminated
- * string pointed to by \p pciBusId. \p len specifies the maximum length of the
- * string that may be returned.
- *
- * \param pciBusId - Returned identifier string for the device in the following format
- * [domain]:[bus]:[device].[function]
- * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
- * pciBusId should be large enough to store 13 characters including the NULL-terminator.
- *
- * \param len      - Maximum length of string to store in \p name
- *
- * \param dev      - Device to get identifier string for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceGet,
- * ::cuDeviceGetAttribute,
- * ::cuDeviceGetByPCIBusId,
- * ::cudaDeviceGetPCIBusId
- */
-CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
-
-/**
- * \brief Gets an interprocess handle for a previously allocated event
- *
- * Takes as input a previously allocated event. This event must have been
- * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
- * flags set. This opaque handle may be copied into other processes and
- * opened with ::cuIpcOpenEventHandle to allow efficient hardware
- * synchronization between GPU work in different processes.
- *
- * After the event has been opened in the importing process,
- * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
- * ::cuEventQuery may be used in either process. Performing operations
- * on the imported event after the exported event has been freed
- * with ::cuEventDestroy will result in undefined behavior.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode
- *
- * \param pHandle - Pointer to a user allocated CUipcEventHandle
- *                    in which to return the opaque event handle
- * \param event   - Event allocated with ::CU_EVENT_INTERPROCESS and
- *                    ::CU_EVENT_DISABLE_TIMING flags.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_MAP_FAILED,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuEventCreate,
- * ::cuEventDestroy,
- * ::cuEventSynchronize,
- * ::cuEventQuery,
- * ::cuStreamWaitEvent,
- * ::cuIpcOpenEventHandle,
- * ::cuIpcGetMemHandle,
- * ::cuIpcOpenMemHandle,
- * ::cuIpcCloseMemHandle,
- * ::cudaIpcGetEventHandle
- */
-CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
-
-/**
- * \brief Opens an interprocess event handle for use in the current process
- *
- * Opens an interprocess event handle exported from another process with
- * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
- * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
- * This event must be freed with ::cuEventDestroy.
- *
- * Performing operations on the imported event after the exported event has
- * been freed with ::cuEventDestroy will result in undefined behavior.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode
- *
- * \param phEvent - Returns the imported event
- * \param handle  - Interprocess handle to open
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_MAP_FAILED,
- * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuEventCreate,
- * ::cuEventDestroy,
- * ::cuEventSynchronize,
- * ::cuEventQuery,
- * ::cuStreamWaitEvent,
- * ::cuIpcGetEventHandle,
- * ::cuIpcGetMemHandle,
- * ::cuIpcOpenMemHandle,
- * ::cuIpcCloseMemHandle,
- * ::cudaIpcOpenEventHandle
- */
-CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
-
-/**
- * \brief Gets an interprocess memory handle for an existing device memory
- * allocation
- *
- * Takes a pointer to the base of an existing device memory allocation created
- * with ::cuMemAlloc and exports it for use in another process. This is a
- * lightweight operation and may be called multiple times on an allocation
- * without adverse effects.
- *
- * If a region of memory is freed with ::cuMemFree and a subsequent call
- * to ::cuMemAlloc returns memory with the same device address,
- * ::cuIpcGetMemHandle will return a unique handle for the
- * new memory.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode
- *
- * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
- *                    the handle in.
- * \param dptr    - Base pointer to previously allocated device memory
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_MAP_FAILED,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuMemAlloc,
- * ::cuMemFree,
- * ::cuIpcGetEventHandle,
- * ::cuIpcOpenEventHandle,
- * ::cuIpcOpenMemHandle,
- * ::cuIpcCloseMemHandle,
- * ::cudaIpcGetMemHandle
- */
-CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
-
-/**
- * \brief Opens an interprocess memory handle exported from another process
- * and returns a device pointer usable in the local process.
- *
- * Maps memory exported from another process with ::cuIpcGetMemHandle into
- * the current device address space. For contexts on different devices
- * ::cuIpcOpenMemHandle can attempt to enable peer access between the
- * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
- * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
- * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
- *
- * Contexts that may open ::CUipcMemHandles are restricted in the following way.
- * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
- * by one ::CUcontext per ::CUdevice per other process.
- *
- * If the memory handle has already been opened by the current context, the
- * reference count on the handle is incremented by 1 and the existing device pointer
- * is returned.
- *
- * Memory returned from ::cuIpcOpenMemHandle must be freed with
- * ::cuIpcCloseMemHandle.
- *
- * Calling ::cuMemFree on an exported memory region before calling
- * ::cuIpcCloseMemHandle in the importing context will result in undefined
- * behavior.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode
- *
- * \param pdptr  - Returned device pointer
- * \param handle - ::CUipcMemHandle to open
- * \param Flags  - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_MAP_FAILED,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_TOO_MANY_PEERS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \note No guarantees are made about the address returned in \p *pdptr.
- * In particular, multiple processes may not receive the same address for the same \p handle.
- *
- * \sa
- * ::cuMemAlloc,
- * ::cuMemFree,
- * ::cuIpcGetEventHandle,
- * ::cuIpcOpenEventHandle,
- * ::cuIpcGetMemHandle,
- * ::cuIpcCloseMemHandle,
- * ::cuCtxEnablePeerAccess,
- * ::cuDeviceCanAccessPeer,
- * ::cudaIpcOpenMemHandle
- */
-CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
-
-/**
- * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle
- *
- * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1.
- * When the reference count reaches 0, this API unmaps the memory. The original allocation
- * in the exporting process as well as imported mappings in other processes
- * will be unaffected.
- *
- * Any resources used to enable peer access will be freed if this is the
- * last mapping using them.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode
- *
- * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_MAP_FAILED,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \sa
- * ::cuMemAlloc,
- * ::cuMemFree,
- * ::cuIpcGetEventHandle,
- * ::cuIpcOpenEventHandle,
- * ::cuIpcGetMemHandle,
- * ::cuIpcOpenMemHandle,
- * ::cudaIpcCloseMemHandle
- */
-CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
-
-/**
- * \brief Registers an existing host memory range for use by CUDA
- *
- * Page-locks the memory range specified by \p p and \p bytesize and maps it
- * for the device(s) as specified by \p Flags. This memory range also is added
- * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
- * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
- * directly by the device, it can be read or written with much higher bandwidth
- * than pageable memory that has not been registered.  Page-locking excessive
- * amounts of memory may degrade system performance, since it reduces the amount
- * of memory available to the system for paging. As a result, this function is
- * best used sparingly to register staging areas for data exchange between
- * host and device.
- *
- * This function has limited support on Mac OS X. OS 10.7 or higher is required.
- *
- * The \p Flags parameter enables different options to be specified that
- * affect the allocation, as follows.
- *
- * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
- *   considered as pinned memory by all CUDA contexts, not just the one that
- *   performed the allocation.
- *
- * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
- *   space. The device pointer to the memory may be obtained by calling
- *   ::cuMemHostGetDevicePointer().
- *
- * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
- *   I/O memory space, e.g. the PCI Express resource of a 3rd party device.
- *
- * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
- *   that is considered read-only by the device.  On platforms without
- *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
- *   required in order to register memory mapped to the CPU as read-only.  Support
- *   for the use of this flag can be queried from the device attribute
- *   ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
- *   a current context associated with a device that does not have this attribute
- *   set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
- *
- * All of these flags are orthogonal to one another: a developer may page-lock
- * memory that is portable or mapped with no restrictions.
- *
- * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
- * devices that do not support mapped pinned memory. The failure is deferred
- * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
- * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
- *
- * For devices that have a non-zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
- * can also be accessed from the device using the host pointer \p p.
- * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
- * match the original host pointer \p ptr and depends on the devices visible to the
- * application. If all devices visible to the application have a non-zero value for the
- * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
- * will match the original pointer \p ptr. If any device visible to the application
- * has a zero value for the device attribute, the device pointer returned by
- * ::cuMemHostGetDevicePointer() will not match the original host pointer \p ptr,
- * but it will be suitable for use on all devices provided Unified Virtual Addressing
- * is enabled. In such systems, it is valid to access the memory using either pointer
- * on devices that have a non-zero value for the device attribute. Note however that
- * such devices should access the memory using only of the two pointers and not both.
- *
- * The memory page-locked by this function must be unregistered with
- * ::cuMemHostUnregister().
- *
- * \param p        - Host pointer to memory to page-lock
- * \param bytesize - Size in bytes of the address range to page-lock
- * \param Flags    - Flags for allocation request
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
- * ::CUDA_ERROR_NOT_PERMITTED,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa
- * ::cuMemHostUnregister,
- * ::cuMemHostGetFlags,
- * ::cuMemHostGetDevicePointer,
- * ::cudaHostRegister
- */
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-
-/**
- * \brief Unregisters a memory range that was registered with cuMemHostRegister.
- *
- * Unmaps the memory range whose base address is specified by \p p, and makes
- * it pageable again.
- *
- * The base address must be the same one specified to ::cuMemHostRegister().
- *
- * \param p - Host pointer to memory to unregister
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
- * \notefnerr
- *
- * \sa
- * ::cuMemHostRegister,
- * ::cudaHostUnregister
- */
-CUresult CUDAAPI cuMemHostUnregister(void *p);
-
-/**
- * \brief Copies memory
- *
- * Copies data between two pointers.
- * \p dst and \p src are base pointers of the destination and source, respectively.
- * \p ByteCount specifies the number of bytes to copy.
- * Note that this function infers the type of the transfer (host to host, host to
- *   device, device to device, or device to host) from the pointer values.  This
- *   function is only allowed in contexts which support unified addressing.
- *
- * \param dst - Destination unified virtual address space pointer
- * \param src - Source unified virtual address space pointer
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy,
- * ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol
- */
-CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
-
-/**
- * \brief Copies device memory between two contexts
- *
- * Copies from device memory in one context to device memory in another
- * context. \p dstDevice is the base device pointer of the destination memory
- * and \p dstContext is the destination context.  \p srcDevice is the base
- * device pointer of the source memory and \p srcContext is the source pointer.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstDevice  - Destination device pointer
- * \param dstContext - Destination context
- * \param srcDevice  - Source device pointer
- * \param srcContext - Source context
- * \param ByteCount  - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
- * ::cuMemcpy3DPeerAsync,
- * ::cudaMemcpyPeer
- */
-CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
-
-/**
- * \brief Copies memory from Host to Device
- *
- * Copies from host memory to device memory. \p dstDevice and \p srcHost are
- * the base addresses of the destination and source, respectively. \p ByteCount
- * specifies the number of bytes to copy.
- *
- * \param dstDevice - Destination device pointer
- * \param srcHost   - Source host pointer
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy,
- * ::cudaMemcpyToSymbol
- */
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
-
-/**
- * \brief Copies memory from Device to Host
- *
- * Copies from device to host memory. \p dstHost and \p srcDevice specify the
- * base pointers of the destination and source, respectively. \p ByteCount
- * specifies the number of bytes to copy.
- *
- * \param dstHost   - Destination host pointer
- * \param srcDevice - Source device pointer
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy,
- * ::cudaMemcpyFromSymbol
- */
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
-
-/**
- * \brief Copies memory from Device to Device
- *
- * Copies from device memory to device memory. \p dstDevice and \p srcDevice
- * are the base pointers of the destination and source, respectively.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstDevice - Destination device pointer
- * \param srcDevice - Source device pointer
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy,
- * ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol
- */
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
-
-/**
- * \brief Copies memory from Device to Array
- *
- * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
- * specify the CUDA array handle and starting index of the destination data.
- * \p srcDevice specifies the base pointer of the source. \p ByteCount
- * specifies the number of bytes to copy.
- *
- * \param dstArray  - Destination array
- * \param dstOffset - Offset in bytes of destination array
- * \param srcDevice - Source device pointer
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpyToArray
- */
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
-
-/**
- * \brief Copies memory from Array to Device
- *
- * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
- * base pointer of the destination and must be naturally aligned with the CUDA
- * array elements. \p srcArray and \p srcOffset specify the CUDA array handle
- * and the offset in bytes into the array where the copy is to begin.
- * \p ByteCount specifies the number of bytes to copy and must be evenly
- * divisible by the array element size.
- *
- * \param dstDevice - Destination device pointer
- * \param srcArray  - Source array
- * \param srcOffset - Offset in bytes of source array
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpyFromArray
- */
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-
-/**
- * \brief Copies memory from Host to Array
- *
- * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
- * specify the CUDA array handle and starting offset in bytes of the destination
- * data.  \p pSrc specifies the base address of the source. \p ByteCount specifies
- * the number of bytes to copy.
- *
- * \param dstArray  - Destination array
- * \param dstOffset - Offset in bytes of destination array
- * \param srcHost   - Source host pointer
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpyToArray
- */
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
-
-/**
- * \brief Copies memory from Array to Host
- *
- * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
- * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
- * array handle and starting offset in bytes of the source data.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstHost   - Destination device pointer
- * \param srcArray  - Source array
- * \param srcOffset - Offset in bytes of source array
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpyFromArray
- */
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-
-/**
- * \brief Copies memory from Array to Array
- *
- * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
- * specify the handles of the destination and source CUDA arrays for the copy,
- * respectively. \p dstOffset and \p srcOffset specify the destination and
- * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
- * bytes to be copied. The size of the elements in the CUDA arrays need not be
- * the same format, but the elements must be the same size; and count must be
- * evenly divisible by that size.
- *
- * \param dstArray  - Destination array
- * \param dstOffset - Offset in bytes of destination array
- * \param srcArray  - Source array
- * \param srcOffset - Offset in bytes of source array
- * \param ByteCount - Size of memory copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpyArrayToArray
- */
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-
-/**
- * \brief Copies memory for 2D arrays
- *
- * Perform a 2D memory copy according to the parameters specified in \p pCopy.
- * The ::CUDA_MEMCPY2D structure is defined as:
- *
- * \code
-   typedef struct CUDA_MEMCPY2D_st {
-      unsigned int srcXInBytes, srcY;
-      CUmemorytype srcMemoryType;
-          const void *srcHost;
-          CUdeviceptr srcDevice;
-          CUarray srcArray;
-          unsigned int srcPitch;
-
-      unsigned int dstXInBytes, dstY;
-      CUmemorytype dstMemoryType;
-          void *dstHost;
-          CUdeviceptr dstDevice;
-          CUarray dstArray;
-          unsigned int dstPitch;
-
-      unsigned int WidthInBytes;
-      unsigned int Height;
-   } CUDA_MEMCPY2D;
- * \endcode
- * where:
- * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- *   source and destination, respectively; ::CUmemorytype_enum is defined as:
- *
- * \code
-   typedef enum CUmemorytype_enum {
-      CU_MEMORYTYPE_HOST = 0x01,
-      CU_MEMORYTYPE_DEVICE = 0x02,
-      CU_MEMORYTYPE_ARRAY = 0x03,
-      CU_MEMORYTYPE_UNIFIED = 0x04
-   } CUmemorytype;
- * \endcode
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::srcArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
- * specify the (host) base address of the source data and the bytes per row to
- * apply. ::srcArray is ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
- * specify the (device) base address of the source data and the bytes per row
- * to apply. ::srcArray is ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
- * ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- * specify the (host) base address of the destination data and the bytes per
- * row to apply. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::dstArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- * specify the (device) base address of the destination data and the bytes per
- * row to apply. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
- * ignored.
- *
- * - ::srcXInBytes and ::srcY specify the base address of the source data for
- *   the copy.
- *
- * \par
- * For host pointers, the starting address is
- * \code
-  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::dstXInBytes and ::dstY specify the base address of the destination data
- *   for the copy.
- *
- * \par
- * For host pointers, the base address is
- * \code
-  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
- *   the 2D copy being performed.
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
- *   ::WidthInBytes + dstXInBytes.
- *
- * \par
- * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
- * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
- * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
- * (device to device, CUDA array to device, CUDA array to CUDA array),
- * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
- * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
- * significantly slower in the cases where ::cuMemcpy2D() would have returned
- * an error code.
- *
- * \param pCopy - Parameters for the memory copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy2D,
- * ::cudaMemcpy2DToArray,
- * ::cudaMemcpy2DFromArray
- */
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
-
-/**
- * \brief Copies memory for 2D arrays
- *
- * Perform a 2D memory copy according to the parameters specified in \p pCopy.
- * The ::CUDA_MEMCPY2D structure is defined as:
- *
- * \code
-   typedef struct CUDA_MEMCPY2D_st {
-      unsigned int srcXInBytes, srcY;
-      CUmemorytype srcMemoryType;
-      const void *srcHost;
-      CUdeviceptr srcDevice;
-      CUarray srcArray;
-      unsigned int srcPitch;
-      unsigned int dstXInBytes, dstY;
-      CUmemorytype dstMemoryType;
-      void *dstHost;
-      CUdeviceptr dstDevice;
-      CUarray dstArray;
-      unsigned int dstPitch;
-      unsigned int WidthInBytes;
-      unsigned int Height;
-   } CUDA_MEMCPY2D;
- * \endcode
- * where:
- * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- *   source and destination, respectively; ::CUmemorytype_enum is defined as:
- *
- * \code
-   typedef enum CUmemorytype_enum {
-      CU_MEMORYTYPE_HOST = 0x01,
-      CU_MEMORYTYPE_DEVICE = 0x02,
-      CU_MEMORYTYPE_ARRAY = 0x03,
-      CU_MEMORYTYPE_UNIFIED = 0x04
-   } CUmemorytype;
- * \endcode
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::srcArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
- * specify the (host) base address of the source data and the bytes per row to
- * apply. ::srcArray is ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
- * specify the (device) base address of the source data and the bytes per row
- * to apply. ::srcArray is ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
- * ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- *   specify the (unified virtual address space) base address of the destination
- *   data and the bytes per row to apply.  ::dstArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- * specify the (host) base address of the destination data and the bytes per
- * row to apply. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- * specify the (device) base address of the destination data and the bytes per
- * row to apply. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
- * ignored.
- *
- * - ::srcXInBytes and ::srcY specify the base address of the source data for
- *   the copy.
- *
- * \par
- * For host pointers, the starting address is
- * \code
-  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::dstXInBytes and ::dstY specify the base address of the destination data
- *   for the copy.
- *
- * \par
- * For host pointers, the base address is
- * \code
-  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
- *   the 2D copy being performed.
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
- *   ::WidthInBytes + ::dstXInBytes.
- *
- * \par
- * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
- * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
- * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
- * (device to device, CUDA array to device, CUDA array to CUDA array),
- * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
- * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
- * significantly slower in the cases where ::cuMemcpy2D() would have returned
- * an error code.
- *
- * \param pCopy - Parameters for the memory copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy2D,
- * ::cudaMemcpy2DToArray,
- * ::cudaMemcpy2DFromArray
- */
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
-
-/**
- * \brief Copies memory for 3D arrays
- *
- * Perform a 3D memory copy according to the parameters specified in
- * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
- *
- * \code
-        typedef struct CUDA_MEMCPY3D_st {
-
-            unsigned int srcXInBytes, srcY, srcZ;
-            unsigned int srcLOD;
-            CUmemorytype srcMemoryType;
-                const void *srcHost;
-                CUdeviceptr srcDevice;
-                CUarray srcArray;
-                unsigned int srcPitch;  // ignored when src is array
-                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
-
-            unsigned int dstXInBytes, dstY, dstZ;
-            unsigned int dstLOD;
-            CUmemorytype dstMemoryType;
-                void *dstHost;
-                CUdeviceptr dstDevice;
-                CUarray dstArray;
-                unsigned int dstPitch;  // ignored when dst is array
-                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
-
-            unsigned int WidthInBytes;
-            unsigned int Height;
-            unsigned int Depth;
-        } CUDA_MEMCPY3D;
- * \endcode
- * where:
- * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- *   source and destination, respectively; ::CUmemorytype_enum is defined as:
- *
- * \code
-   typedef enum CUmemorytype_enum {
-      CU_MEMORYTYPE_HOST = 0x01,
-      CU_MEMORYTYPE_DEVICE = 0x02,
-      CU_MEMORYTYPE_ARRAY = 0x03,
-      CU_MEMORYTYPE_UNIFIED = 0x04
-   } CUmemorytype;
- * \endcode
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::srcArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
- * ::srcHeight specify the (host) base address of the source data, the bytes
- * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- * ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
- * ::srcHeight specify the (device) base address of the source data, the bytes
- * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- * ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
- * ::srcHeight are ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- *   specify the (unified virtual address space) base address of the destination
- *   data and the bytes per row to apply.  ::dstArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
- * ::dstHeight specify the (host) base address of the destination data, the
- * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray is
- * ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
- * ::dstHeight specify the (device) base address of the destination data, the
- * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray is
- * ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
- * ::dstHeight are ignored.
- *
- * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
- *   data for the copy.
- *
- * \par
- * For host pointers, the starting address is
- * \code
-  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
- *   destination data for the copy.
- *
- * \par
- * For host pointers, the base address is
- * \code
-  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
- *   and depth of the 3D copy being performed.
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
- *   ::WidthInBytes + ::dstXInBytes.
- * - If specified, ::srcHeight must be greater than or equal to ::Height +
- *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
- *
- * \par
- * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
- * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
- *
- * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
- * set to 0.
- *
- * \param pCopy - Parameters for the memory copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMemcpy3D
- */
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
-
-/**
- * \brief Copies memory between contexts
- *
- * Perform a 3D memory copy according to the parameters specified in
- * \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
- * for documentation of its parameters.
- *
- * \param pCopy - Parameters for the memory copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_sync
- *
- * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
- * ::cuMemcpy3DPeerAsync,
- * ::cudaMemcpy3DPeer
- */
-CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
-
-/**
- * \brief Copies memory asynchronously
- *
- * Copies data between two pointers.
- * \p dst and \p src are base pointers of the destination and source, respectively.
- * \p ByteCount specifies the number of bytes to copy.
- * Note that this function infers the type of the transfer (host to host, host to
- *   device, device to device, or device to host) from the pointer values.  This
- *   function is only allowed in contexts which support unified addressing.
- *
- * \param dst       - Destination unified virtual address space pointer
- * \param src       - Source unified virtual address space pointer
- * \param ByteCount - Size of memory copy in bytes
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpyAsync,
- * ::cudaMemcpyToSymbolAsync,
- * ::cudaMemcpyFromSymbolAsync
- */
-CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies device memory between two contexts asynchronously.
- *
- * Copies from device memory in one context to device memory in another
- * context. \p dstDevice is the base device pointer of the destination memory
- * and \p dstContext is the destination context.  \p srcDevice is the base
- * device pointer of the source memory and \p srcContext is the source context.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstDevice  - Destination device pointer
- * \param dstContext - Destination context
- * \param srcDevice  - Source device pointer
- * \param srcContext - Source context
- * \param ByteCount  - Size of memory copy in bytes
- * \param hStream    - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
- * ::cuMemcpy3DPeerAsync,
- * ::cudaMemcpyPeerAsync
- */
-CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies memory from Host to Device
- *
- * Copies from host memory to device memory. \p dstDevice and \p srcHost are
- * the base addresses of the destination and source, respectively. \p ByteCount
- * specifies the number of bytes to copy.
- *
- * \param dstDevice - Destination device pointer
- * \param srcHost   - Source host pointer
- * \param ByteCount - Size of memory copy in bytes
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpyAsync,
- * ::cudaMemcpyToSymbolAsync
- */
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies memory from Device to Host
- *
- * Copies from device to host memory. \p dstHost and \p srcDevice specify the
- * base pointers of the destination and source, respectively. \p ByteCount
- * specifies the number of bytes to copy.
- *
- * \param dstHost   - Destination host pointer
- * \param srcDevice - Source device pointer
- * \param ByteCount - Size of memory copy in bytes
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpyAsync,
- * ::cudaMemcpyFromSymbolAsync
- */
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies memory from Device to Device
- *
- * Copies from device memory to device memory. \p dstDevice and \p srcDevice
- * are the base pointers of the destination and source, respectively.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstDevice - Destination device pointer
- * \param srcDevice - Source device pointer
- * \param ByteCount - Size of memory copy in bytes
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpyAsync,
- * ::cudaMemcpyToSymbolAsync,
- * ::cudaMemcpyFromSymbolAsync
- */
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies memory from Host to Array
- *
- * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
- * specify the CUDA array handle and starting offset in bytes of the
- * destination data. \p srcHost specifies the base address of the source.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstArray  - Destination array
- * \param dstOffset - Offset in bytes of destination array
- * \param srcHost   - Source host pointer
- * \param ByteCount - Size of memory copy in bytes
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpyToArrayAsync
- */
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies memory from Array to Host
- *
- * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
- * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
- * array handle and starting offset in bytes of the source data.
- * \p ByteCount specifies the number of bytes to copy.
- *
- * \param dstHost   - Destination pointer
- * \param srcArray  - Source array
- * \param srcOffset - Offset in bytes of source array
- * \param ByteCount - Size of memory copy in bytes
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- * \note_memcpy
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpyFromArrayAsync
- */
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
-
-/**
- * \brief Copies memory for 2D arrays
- *
- * Perform a 2D memory copy according to the parameters specified in \p pCopy.
- * The ::CUDA_MEMCPY2D structure is defined as:
- *
- * \code
-   typedef struct CUDA_MEMCPY2D_st {
-      unsigned int srcXInBytes, srcY;
-      CUmemorytype srcMemoryType;
-      const void *srcHost;
-      CUdeviceptr srcDevice;
-      CUarray srcArray;
-      unsigned int srcPitch;
-      unsigned int dstXInBytes, dstY;
-      CUmemorytype dstMemoryType;
-      void *dstHost;
-      CUdeviceptr dstDevice;
-      CUarray dstArray;
-      unsigned int dstPitch;
-      unsigned int WidthInBytes;
-      unsigned int Height;
-   } CUDA_MEMCPY2D;
- * \endcode
- * where:
- * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- *   source and destination, respectively; ::CUmemorytype_enum is defined as:
- *
- * \code
-   typedef enum CUmemorytype_enum {
-      CU_MEMORYTYPE_HOST = 0x01,
-      CU_MEMORYTYPE_DEVICE = 0x02,
-      CU_MEMORYTYPE_ARRAY = 0x03,
-      CU_MEMORYTYPE_UNIFIED = 0x04
-   } CUmemorytype;
- * \endcode
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
- * specify the (host) base address of the source data and the bytes per row to
- * apply. ::srcArray is ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::srcArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
- * specify the (device) base address of the source data and the bytes per row
- * to apply. ::srcArray is ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
- * ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- *   specify the (unified virtual address space) base address of the destination
- *   data and the bytes per row to apply.  ::dstArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- * specify the (host) base address of the destination data and the bytes per
- * row to apply. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- * specify the (device) base address of the destination data and the bytes per
- * row to apply. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
- * ignored.
- *
- * - ::srcXInBytes and ::srcY specify the base address of the source data for
- *   the copy.
- *
- * \par
- * For host pointers, the starting address is
- * \code
-  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::dstXInBytes and ::dstY specify the base address of the destination data
- *   for the copy.
- *
- * \par
- * For host pointers, the base address is
- * \code
-  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
- *   the 2D copy being performed.
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
- *   ::WidthInBytes + ::dstXInBytes.
- * - If specified, ::srcHeight must be greater than or equal to ::Height +
- *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
- *
- * \par
- * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
- * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
- * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
- * (device to device, CUDA array to device, CUDA array to CUDA array),
- * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
- *
- * \param pCopy   - Parameters for the memory copy
- * \param hStream - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpy2DAsync,
- * ::cudaMemcpy2DToArrayAsync,
- * ::cudaMemcpy2DFromArrayAsync
- */
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
-
-/**
- * \brief Copies memory for 3D arrays
- *
- * Perform a 3D memory copy according to the parameters specified in
- * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
- *
- * \code
-        typedef struct CUDA_MEMCPY3D_st {
-
-            unsigned int srcXInBytes, srcY, srcZ;
-            unsigned int srcLOD;
-            CUmemorytype srcMemoryType;
-                const void *srcHost;
-                CUdeviceptr srcDevice;
-                CUarray srcArray;
-                unsigned int srcPitch;  // ignored when src is array
-                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
-
-            unsigned int dstXInBytes, dstY, dstZ;
-            unsigned int dstLOD;
-            CUmemorytype dstMemoryType;
-                void *dstHost;
-                CUdeviceptr dstDevice;
-                CUarray dstArray;
-                unsigned int dstPitch;  // ignored when dst is array
-                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
-
-            unsigned int WidthInBytes;
-            unsigned int Height;
-            unsigned int Depth;
-        } CUDA_MEMCPY3D;
- * \endcode
- * where:
- * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
- *   source and destination, respectively; ::CUmemorytype_enum is defined as:
- *
- * \code
-   typedef enum CUmemorytype_enum {
-      CU_MEMORYTYPE_HOST = 0x01,
-      CU_MEMORYTYPE_DEVICE = 0x02,
-      CU_MEMORYTYPE_ARRAY = 0x03,
-      CU_MEMORYTYPE_UNIFIED = 0x04
-   } CUmemorytype;
- * \endcode
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::srcArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
- * ::srcHeight specify the (host) base address of the source data, the bytes
- * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- * ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
- * ::srcHeight specify the (device) base address of the source data, the bytes
- * per row, and the height of each 2D slice of the 3D array. ::srcArray is
- * ignored.
- *
- * \par
- * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
- * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
- * ::srcHeight are ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
- *   specify the (unified virtual address space) base address of the source data
- *   and the bytes per row to apply.  ::dstArray is ignored.
- * This value may be used only if unified addressing is supported in the calling
- *   context.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
- * specify the (host) base address of the destination data, the bytes per row,
- * and the height of each 2D slice of the 3D array. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
- * specify the (device) base address of the destination data, the bytes per
- * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
- *
- * \par
- * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
- * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
- * ::dstHeight are ignored.
- *
- * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
- *   data for the copy.
- *
- * \par
- * For host pointers, the starting address is
- * \code
-  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
- * element size.
- *
- * - dstXInBytes, ::dstY and ::dstZ specify the base address of the
- *   destination data for the copy.
- *
- * \par
- * For host pointers, the base address is
- * \code
-  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
- * \endcode
- *
- * \par
- * For device pointers, the starting address is
- * \code
-  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
- * \endcode
- *
- * \par
- * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
- * element size.
- *
- * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
- *   and depth of the 3D copy being performed.
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
- *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
- *   ::WidthInBytes + dstXInBytes.
- * - If specified, ::srcHeight must be greater than or equal to ::Height +
- *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
- *
- * \par
- * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
- * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
- *
- * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
- * set to 0.
- *
- * \param pCopy - Parameters for the memory copy
- * \param hStream - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemcpy3DAsync
- */
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
-
-/**
- * \brief Copies memory between contexts asynchronously.
- *
- * Perform a 3D memory copy according to the parameters specified in
- * \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
- * for documentation of its parameters.
- *
- * \param pCopy - Parameters for the memory copy
- * \param hStream - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
- * ::cuMemcpy3DPeerAsync,
- * ::cudaMemcpy3DPeerAsync
- */
-CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
-
-/**
- * \brief Initializes device memory
- *
- * Sets the memory range of \p N 8-bit values to the specified value
- * \p uc.
- *
- * \param dstDevice - Destination device pointer
- * \param uc        - Value to set
- * \param N         - Number of elements
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset
- */
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
-
-/**
- * \brief Initializes device memory
- *
- * Sets the memory range of \p N 16-bit values to the specified value
- * \p us. The \p dstDevice pointer must be two byte aligned.
- *
- * \param dstDevice - Destination device pointer
- * \param us        - Value to set
- * \param N         - Number of elements
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset
- */
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
-
-/**
- * \brief Initializes device memory
- *
- * Sets the memory range of \p N 32-bit values to the specified value
- * \p ui. The \p dstDevice pointer must be four byte aligned.
- *
- * \param dstDevice - Destination device pointer
- * \param ui        - Value to set
- * \param N         - Number of elements
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32Async,
- * ::cudaMemset
- */
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
-
-/**
- * \brief Initializes device memory
- *
- * Sets the 2D memory range of \p Width 8-bit values to the specified value
- * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
- * specifies the number of bytes between each row. This function performs
- * fastest when the pitch is one that has been passed back by
- * ::cuMemAllocPitch().
- *
- * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
- * \param uc        - Value to set
- * \param Width     - Width of row
- * \param Height    - Number of rows
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset2D
- */
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
-
-/**
- * \brief Initializes device memory
- *
- * Sets the 2D memory range of \p Width 16-bit values to the specified value
- * \p us. \p Height specifies the number of rows to set, and \p dstPitch
- * specifies the number of bytes between each row. The \p dstDevice pointer
- * and \p dstPitch offset must be two byte aligned. This function performs
- * fastest when the pitch is one that has been passed back by
- * ::cuMemAllocPitch().
- *
- * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
- * \param us        - Value to set
- * \param Width     - Width of row
- * \param Height    - Number of rows
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset2D
- */
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
-
-/**
- * \brief Initializes device memory
- *
- * Sets the 2D memory range of \p Width 32-bit values to the specified value
- * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
- * specifies the number of bytes between each row. The \p dstDevice pointer
- * and \p dstPitch offset must be four byte aligned. This function performs
- * fastest when the pitch is one that has been passed back by
- * ::cuMemAllocPitch().
- *
- * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
- * \param ui        - Value to set
- * \param Width     - Width of row
- * \param Height    - Number of rows
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset2D
- */
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
-
-/**
- * \brief Sets device memory
- *
- * Sets the memory range of \p N 8-bit values to the specified value
- * \p uc.
- *
- * \param dstDevice - Destination device pointer
- * \param uc        - Value to set
- * \param N         - Number of elements
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemsetAsync
- */
-CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
-
-/**
- * \brief Sets device memory
- *
- * Sets the memory range of \p N 16-bit values to the specified value
- * \p us. The \p dstDevice pointer must be two byte aligned.
- *
- * \param dstDevice - Destination device pointer
- * \param us        - Value to set
- * \param N         - Number of elements
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemsetAsync
- */
-CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
-
-/**
- * \brief Sets device memory
- *
- * Sets the memory range of \p N 32-bit values to the specified value
- * \p ui. The \p dstDevice pointer must be four byte aligned.
- *
- * \param dstDevice - Destination device pointer
- * \param ui        - Value to set
- * \param N         - Number of elements
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32,
- * ::cudaMemsetAsync
- */
-CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
-
-/**
- * \brief Sets device memory
- *
- * Sets the 2D memory range of \p Width 8-bit values to the specified value
- * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
- * specifies the number of bytes between each row. This function performs
- * fastest when the pitch is one that has been passed back by
- * ::cuMemAllocPitch().
- *
- * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
- * \param uc        - Value to set
- * \param Width     - Width of row
- * \param Height    - Number of rows
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset2DAsync
- */
-CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
-
-/**
- * \brief Sets device memory
- *
- * Sets the 2D memory range of \p Width 16-bit values to the specified value
- * \p us. \p Height specifies the number of rows to set, and \p dstPitch
- * specifies the number of bytes between each row. The \p dstDevice pointer
- * and \p dstPitch offset must be two byte aligned. This function performs
- * fastest when the pitch is one that has been passed back by
- * ::cuMemAllocPitch().
- *
- * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
- * \param us        - Value to set
- * \param Width     - Width of row
- * \param Height    - Number of rows
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset2DAsync
- */
-CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
-
-/**
- * \brief Sets device memory
- *
- * Sets the 2D memory range of \p Width 32-bit values to the specified value
- * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
- * specifies the number of bytes between each row. The \p dstDevice pointer
- * and \p dstPitch offset must be four byte aligned. This function performs
- * fastest when the pitch is one that has been passed back by
- * ::cuMemAllocPitch().
- *
- * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
- * \param ui        - Value to set
- * \param Width     - Width of row
- * \param Height    - Number of rows
- * \param hStream   - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- * \note_memset
- * \note_null_stream
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
- * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
- * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
- * ::cuMemsetD32, ::cuMemsetD32Async,
- * ::cudaMemset2DAsync
- */
-CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
-
-/**
- * \brief Creates a 1D or 2D CUDA array
- *
- * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
- * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
- * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
- *
- * \code
-    typedef struct {
-        unsigned int Width;
-        unsigned int Height;
-        CUarray_format Format;
-        unsigned int NumChannels;
-    } CUDA_ARRAY_DESCRIPTOR;
- * \endcode
- * where:
- *
- * - \p Width, and \p Height are the width, and height of the CUDA array (in
- * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
- * otherwise;
- * - ::Format specifies the format of the elements; ::CUarray_format is
- * defined as:
- * \code
-    typedef enum CUarray_format_enum {
-        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
-        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
-        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
-        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
-        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
-        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
-        CU_AD_FORMAT_HALF = 0x10,
-        CU_AD_FORMAT_FLOAT = 0x20
-    } CUarray_format;
- *  \endcode
- * - \p NumChannels specifies the number of packed components per CUDA array
- * element; it may be 1, 2, or 4;
- *
- * Here are examples of CUDA array descriptions:
- *
- * Description for a CUDA array of 2048 floats:
- * \code
-    CUDA_ARRAY_DESCRIPTOR desc;
-    desc.Format = CU_AD_FORMAT_FLOAT;
-    desc.NumChannels = 1;
-    desc.Width = 2048;
-    desc.Height = 1;
- * \endcode
- *
- * Description for a 64 x 64 CUDA array of floats:
- * \code
-    CUDA_ARRAY_DESCRIPTOR desc;
-    desc.Format = CU_AD_FORMAT_FLOAT;
-    desc.NumChannels = 1;
-    desc.Width = 64;
-    desc.Height = 64;
- * \endcode
- *
- * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
- * float16's:
- * \code
-    CUDA_ARRAY_DESCRIPTOR desc;
-    desc.Format = CU_AD_FORMAT_HALF;
-    desc.NumChannels = 4;
-    desc.Width = width;
-    desc.Height = height;
- * \endcode
- *
- * Description for a \p width x \p height CUDA array of 16-bit elements, each
- * of which is two 8-bit unsigned chars:
- * \code
-    CUDA_ARRAY_DESCRIPTOR arrayDesc;
-    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
-    desc.NumChannels = 2;
-    desc.Width = width;
-    desc.Height = height;
- * \endcode
- *
- * \param pHandle        - Returned array
- * \param pAllocateArray - Array descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMallocArray
- */
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
-
-/**
- * \brief Get a 1D or 2D CUDA array descriptor
- *
- * Returns in \p *pArrayDescriptor a descriptor containing information on the
- * format and dimensions of the CUDA array \p hArray. It is useful for
- * subroutines that have been passed a CUDA array, but need to know the CUDA
- * array parameters for validation or other purposes.
- *
- * \param pArrayDescriptor - Returned array descriptor
- * \param hArray           - Array to get descriptor of
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaArrayGetInfo
- */
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-
-/**
- * \brief Returns the layout properties of a sparse CUDA array
- *
- * Returns the layout properties of a sparse CUDA array in \p sparseProperties
- * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE 
- * ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
- * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
- * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
- * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
- * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties 
- * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to.
- *
- * \return
- * ::CUDA_SUCCESS
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
- * \param[in] array - CUDA array to get the sparse properties of
- * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync
- */
-CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array);
-
-/**
- * \brief Returns the layout properties of a sparse CUDA mipmapped array
- *
- * Returns the sparse array layout properties in \p sparseProperties
- * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE 
- * ::CUDA_ERROR_INVALID_VALUE will be returned.
- *
- * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the
- * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
- * is less than that of the tile.
- * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
- * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. 
- * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer.
- * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero.
- *
- * \return
- * ::CUDA_SUCCESS
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
- * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of
- * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync
- */
-CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap);
-
-/**
- * \brief Gets a CUDA array plane from a CUDA array
- *
- * Returns in \p pPlaneArray a CUDA array that represents a single format plane
- * of the CUDA array \p hArray.
- *
- * If \p planeIdx is greater than the maximum number of planes in this array or if the array does
- * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns
- * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
- * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
- * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
- *
- * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
- * \param hArray        - Multiplanar CUDA array
- * \param planeIdx      - Plane index
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::cuArrayCreate,
- * ::cudaGetArrayPlane
- */
-CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
-
-/**
- * \brief Destroys a CUDA array
- *
- * Destroys the CUDA array \p hArray.
- *
- * \param hArray - Array to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_ARRAY_IS_MAPPED,
- * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaFreeArray
- */
-CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
-
-/**
- * \brief Creates a 3D CUDA array
- *
- * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
- * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
- * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
- *
- * \code
-    typedef struct {
-        unsigned int Width;
-        unsigned int Height;
-        unsigned int Depth;
-        CUarray_format Format;
-        unsigned int NumChannels;
-        unsigned int Flags;
-    } CUDA_ARRAY3D_DESCRIPTOR;
- * \endcode
- * where:
- *
- * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
- * CUDA array (in elements); the following types of CUDA arrays can be allocated:
- *     - A 1D array is allocated if \p Height and \p Depth extents are both zero.
- *     - A 2D array is allocated if only \p Depth extent is zero.
- *     - A 3D array is allocated if all three extents are non-zero.
- *     - A 1D layered CUDA array is allocated if only \p Height is zero and the
- *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
- *       of layers is determined by the depth extent.
- *     - A 2D layered CUDA array is allocated if all three extents are non-zero and
- *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
- *       of layers is determined by the depth extent.
- *     - A cubemap CUDA array is allocated if all three extents are non-zero and the
- *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
- *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
- *       where the six layers represent the six faces of a cube. The order of the six
- *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
- *     - A cubemap layered CUDA array is allocated if all three extents are non-zero,
- *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
- *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
- *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
- *       consists of a collection of cubemaps. The first six layers represent the first
- *       cubemap, the next six layers form the second cubemap, and so on.
- *
- * - ::Format specifies the format of the elements; ::CUarray_format is
- * defined as:
- * \code
-    typedef enum CUarray_format_enum {
-        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
-        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
-        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
-        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
-        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
-        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
-        CU_AD_FORMAT_HALF = 0x10,
-        CU_AD_FORMAT_FLOAT = 0x20
-    } CUarray_format;
- *  \endcode
- *
- * - \p NumChannels specifies the number of packed components per CUDA array
- * element; it may be 1, 2, or 4;
- *
- * - ::Flags may be set to
- *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
- *     \p Depth specifies the number of layers, not the depth of a 3D array.
- *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
- *     If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
- *     to a surface reference.
- *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
- *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
- *     then \p Depth must be a multiple of six.
- *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
- *     Texture gather can only be performed on 2D CUDA arrays.
- *
- * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
- * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
- * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
- *
- * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
- * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
- * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
- *
- * <table>
- * <tr><td><b>CUDA array type</b></td>
- * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
- * (depth range)}</b></td>
- * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
- * {(width range in elements), (height range), (depth range)}</b></td></tr>
- * <tr><td>1D</td>
- * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
- * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
- * <tr><td>2D</td>
- * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
- * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
- * <tr><td>3D</td>
- * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
- * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
- * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
- * (1,SURFACE3D_DEPTH) }</small></td></tr>
- * <tr><td>1D Layered</td>
- * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
- * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
- * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
- * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
- * <tr><td>2D Layered</td>
- * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
- * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
- * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
- * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
- * <tr><td>Cubemap</td>
- * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
- * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
- * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
- * <tr><td>Cubemap Layered</td>
- * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
- * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
- * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
- * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
- * </table>
- *
- * Here are examples of CUDA array descriptions:
- *
- * Description for a CUDA array of 2048 floats:
- * \code
-    CUDA_ARRAY3D_DESCRIPTOR desc;
-    desc.Format = CU_AD_FORMAT_FLOAT;
-    desc.NumChannels = 1;
-    desc.Width = 2048;
-    desc.Height = 0;
-    desc.Depth = 0;
- * \endcode
- *
- * Description for a 64 x 64 CUDA array of floats:
- * \code
-    CUDA_ARRAY3D_DESCRIPTOR desc;
-    desc.Format = CU_AD_FORMAT_FLOAT;
-    desc.NumChannels = 1;
-    desc.Width = 64;
-    desc.Height = 64;
-    desc.Depth = 0;
- * \endcode
- *
- * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
- * 4x16-bit float16's:
- * \code
-    CUDA_ARRAY3D_DESCRIPTOR desc;
-    desc.Format = CU_AD_FORMAT_HALF;
-    desc.NumChannels = 4;
-    desc.Width = width;
-    desc.Height = height;
-    desc.Depth = depth;
- * \endcode
- *
- * \param pHandle        - Returned array
- * \param pAllocateArray - 3D array descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaMalloc3DArray
- */
-CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
-
-/**
- * \brief Get a 3D CUDA array descriptor
- *
- * Returns in \p *pArrayDescriptor a descriptor containing information on the
- * format and dimensions of the CUDA array \p hArray. It is useful for
- * subroutines that have been passed a CUDA array, but need to know the CUDA
- * array parameters for validation or other purposes.
- *
- * This function may be called on 1D and 2D arrays, in which case the \p Height
- * and/or \p Depth members of the descriptor struct will be set to 0.
- *
- * \param pArrayDescriptor - Returned 3D array descriptor
- * \param hArray           - 3D array to get descriptor of
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
- * \notefnerr
- *
- * \sa ::cuArray3DCreate, ::cuArrayCreate,
- * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
- * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
- * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
- * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
- * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
- * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
- * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
- * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
- * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
- * ::cudaArrayGetInfo
- */
-CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-
-/**
- * \brief Creates a CUDA mipmapped array
- *
- * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
- * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle.
- * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is
- * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
- *
- * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
- *
- * \code
-    typedef struct {
-        unsigned int Width;
-        unsigned int Height;
-        unsigned int Depth;
-        CUarray_format Format;
-        unsigned int NumChannels;
-        unsigned int Flags;
-    } CUDA_ARRAY3D_DESCRIPTOR;
- * \endcode
- * where:
- *
- * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
- * CUDA array (in elements); the following types of CUDA arrays can be allocated:
- *     - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero.
- *     - A 2D mipmapped array is allocated if only \p Depth extent is zero.
- *     - A 3D mipmapped array is allocated if all three extents are non-zero.
- *     - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the
- *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
- *       of layers is determined by the depth extent.
- *     - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
- *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
- *       of layers is determined by the depth extent.
- *     - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
- *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
- *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
- *       where the six layers represent the six faces of a cube. The order of the six
- *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
- *     - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero,
- *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
- *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
- *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
- *       consists of a collection of cubemaps. The first six layers represent the first
- *       cubemap, the next six layers form the second cubemap, and so on.
- *
- * - ::Format specifies the format of the elements; ::CUarray_format is
- * defined as:
- * \code
-    typedef enum CUarray_format_enum {
-        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
-        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
-        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
-        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
-        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
-        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
-        CU_AD_FORMAT_HALF = 0x10,
-        CU_AD_FORMAT_FLOAT = 0x20
-    } CUarray_format;
- *  \endcode
- *
- * - \p NumChannels specifies the number of packed components per CUDA array
- * element; it may be 1, 2, or 4;
- *
- * - ::Flags may be set to
- *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
- *     \p Depth specifies the number of layers, not the depth of a 3D array.
- *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of
- *     the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to
- *     bind a mipmap level of the CUDA mipmapped array to a surface reference.
-  *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be
- *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
- *     then \p Depth must be a multiple of six.
- *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather.
- *     Texture gather can only be performed on 2D CUDA mipmapped arrays.
- *
- * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
- * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
- * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH.
- *
- * <table>
- * <tr><td><b>CUDA array type</b></td>
- * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
- * (depth range)}</b></td>
- * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
- * {(width range in elements), (height range), (depth range)}</b></td></tr>
- * <tr><td>1D</td>
- * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
- * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
- * <tr><td>2D</td>
- * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
- * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
- * <tr><td>3D</td>
- * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
- * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
- * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
- * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
- * (1,SURFACE3D_DEPTH) }</small></td></tr>
- * <tr><td>1D Layered</td>
- * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
- * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
- * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
- * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
- * <tr><td>2D Layered</td>
- * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
- * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
- * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
- * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
- * <tr><td>Cubemap</td>
- * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
- * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
- * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
- * <tr><td>Cubemap Layered</td>
- * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
- * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
- * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
- * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
- * </table>
- *
- *
- * \param pHandle             - Returned mipmapped array
- * \param pMipmappedArrayDesc - mipmapped array descriptor
- * \param numMipmapLevels     - Number of mipmap levels
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cuMipmappedArrayDestroy,
- * ::cuMipmappedArrayGetLevel,
- * ::cuArrayCreate,
- * ::cudaMallocMipmappedArray
- */
-CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
-
-/**
- * \brief Gets a mipmap level of a CUDA mipmapped array
- *
- * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level
- * of the CUDA mipmapped array \p hMipmappedArray.
- *
- * If \p level is greater than the maximum number of levels in this mipmapped array,
- * ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * \param pLevelArray     - Returned mipmap level CUDA array
- * \param hMipmappedArray - CUDA mipmapped array
- * \param level           - Mipmap level
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::cuMipmappedArrayCreate,
- * ::cuMipmappedArrayDestroy,
- * ::cuArrayCreate,
- * ::cudaGetMipmappedArrayLevel
- */
-CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
-
-/**
- * \brief Destroys a CUDA mipmapped array
- *
- * Destroys the CUDA mipmapped array \p hMipmappedArray.
- *
- * \param hMipmappedArray - Mipmapped array to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_ARRAY_IS_MAPPED,
- * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
- * \notefnerr
- *
- * \sa
- * ::cuMipmappedArrayCreate,
- * ::cuMipmappedArrayGetLevel,
- * ::cuArrayCreate,
- * ::cudaFreeMipmappedArray
- */
-CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
-
-/** @} */ /* END CUDA_MEM */
-
-/**
- * \defgroup CUDA_VA Virtual Memory Management
- *
- * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the virtual memory management functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
-* \brief Allocate an address range reservation. 
-* 
-* Reserves a virtual address range based on the given parameters, giving
-* the starting address of the range in \p ptr.  This API requires a system that
-* supports UVA.  The size and address parameters must be a multiple of the
-* host page size and the alignment must be a power of two or zero for default
-* alignment.
-*
-* \param[out] ptr       - Resulting pointer to start of virtual address range allocated
-* \param[in]  size      - Size of the reserved virtual address range requested
-* \param[in]  alignment - Alignment of the reserved virtual address range requested
-* \param[in]  addr      - Fixed starting address range requested
-* \param[in]  flags     - Currently unused, must be zero
-* \return
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_OUT_OF_MEMORY,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemAddressFree
-*/
-CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
-
-/**
-* \brief Free an address range reservation.
-* 
-* Frees a virtual address range reserved by cuMemAddressReserve.  The size
-* must match what was given to memAddressReserve and the ptr given must
-* match what was returned from memAddressReserve.
-*
-* \param[in] ptr  - Starting address of the virtual address range to free
-* \param[in] size - Size of the virtual address region to free
-* \return
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemAddressReserve
-*/
-CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
-
-/**
-* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
-*
-* This creates a memory allocation on the target device specified through the
-* \p prop strcuture. The created allocation will not have any device or host
-* mappings. The generic memory \p handle for the allocation can be
-* mapped to the address space of calling process via ::cuMemMap. This handle
-* cannot be transmitted directly to other processes (see
-* ::cuMemExportToShareableHandle).  On Windows, the caller must also pass
-* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
-* limits or allows access to this handle for a recepient process (see
-* ::CUmemAllocationProp::win32HandleMetaData for more).  The \p size of this
-* allocation must be a multiple of the the value given via
-* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
-* flag.
-* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
-* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
-* and sparse CUDA mipmapped arrays.
-* (see ::cuMemMapArrayAsync).
-*
-* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
-* \param[in]  size   - Size of the allocation requested
-* \param[in]  prop   - Properties of the allocation to create.
-* \param[in]  flags  - flags for future use, must be zero now.
-* \return
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_OUT_OF_MEMORY,
-* ::CUDA_ERROR_INVALID_DEVICE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-* \notefnerr
-*
-* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
-*/
-CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
-
-/**
-* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
-* 
-* Frees the memory that was allocated on a device through cuMemCreate.
-*
-* The memory allocation will be freed when all outstanding mappings to the memory
-* are unmapped and when all outstanding references to the handle (including it's
-* shareable counterparts) are also released. The generic memory handle can be
-* freed when there are still outstanding mappings made with this handle. Each
-* time a recepient process imports a shareable handle, it needs to pair it with
-* ::cuMemRelease for the handle to be freed.  If \p handle is not a valid handle
-* the behavior is undefined. 
-*
-* \param[in] handle Value of handle which was returned previously by cuMemCreate.
-* \return
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-* \notefnerr
-*
-* \sa ::cuMemCreate
-*/
-CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle);
-
-/**
-* \brief Maps an allocation handle to a reserved virtual address range.
-*
-* Maps bytes of memory represented by \p handle starting from byte \p offset to
-* \p size to address range [\p addr, \p addr + \p size]. This range must be an
-* address reservation previously reserved with ::cuMemAddressReserve, and
-* \p offset + \p size must be less than the size of the memory allocation.
-* Both \p ptr, \p size, and \p offset must be a multiple of the value given via
-* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
-* 
-* Please note calling ::cuMemMap does not make the address accessible,
-* the caller needs to update accessibility of a contiguous mapped VA
-* range by calling ::cuMemSetAccess.
-* 
-* Once a recipient process obtains a shareable memory handle
-* from ::cuMemImportFromShareableHandle, the process must
-* use ::cuMemMap to map the memory into its address ranges before
-* setting accessibility with ::cuMemSetAccess.
-*  
-* ::cuMemMap can only create mappings on VA range reservations 
-* that are not currently mapped.
-* 
-* \param[in] ptr    - Address where memory will be mapped. 
-* \param[in] size   - Size of the memory mapping. 
-* \param[in] offset - Offset into the memory represented by 
-*                   - \p handle from which to start mapping
-*                   - Note: currently must be zero.
-* \param[in] handle - Handle to a shareable memory 
-* \param[in] flags  - flags for future use, must be zero now. 
-* \return
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_INVALID_DEVICE,
-* ::CUDA_ERROR_OUT_OF_MEMORY,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-* \notefnerr
-*
-* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle
-*/
-CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
-
-/**
- * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays
- *
- * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays.
- * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
- * The structure ::CUarrayMapInfo is defined as follow:
- \code
-     typedef struct CUarrayMapInfo_st {
-        CUresourcetype resourceType;                   
-        union {
-            CUmipmappedArray mipmap;
-            CUarray array;
-        } resource;
-
-        CUarraySparseSubresourceType subresourceType;   
-        union {
-            struct {
-                unsigned int level;                     
-                unsigned int layer;                     
-                unsigned int offsetX;                   
-                unsigned int offsetY;                   
-                unsigned int offsetZ;                   
-                unsigned int extentWidth;               
-                unsigned int extentHeight;              
-                unsigned int extentDepth;               
-            } sparseLevel;
-            struct {
-                unsigned int layer;
-                unsigned long long offset;              
-                unsigned long long size;                
-            } miptail;
-        } subresource;
-
-        CUmemOperationType memOperationType;
-        
-        CUmemHandleType memHandleType;                  
-        union {
-            CUmemGenericAllocationHandle memHandle;
-        } memHandle;
-
-        unsigned long long offset;                      
-        unsigned int deviceBitMask;                     
-        unsigned int flags;                             
-        unsigned int reserved[2];                       
-    } CUarrayMapInfo;
- \endcode
- *
- * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on.
- * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then 
- * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle.
- * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using
- * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE. 
- * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned.
- * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY 
- * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle.
- * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
- * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE.
- *
- * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. 
- * ::CUarraySparseSubresourceType_enum is defined as:
- \code
-    typedef enum CUarraySparseSubresourceType_enum {
-        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
-        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
-    } CUarraySparseSubresourceType;
- \endcode
- *
- * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a
- * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which
- * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by 
- * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type.
- *
- * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
- * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents.
- * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY
- * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively.
- * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight
- * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively.
- * These offsets and extents must be aligned to the corresponding tile dimension.
- * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise,
- * must be zero.
- * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise,
- * must be zero.
- * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth
- * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays.
- * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties
- *
- * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
- * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in 
- * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size.
- * Both, mip tail offset and mip tail size must be aligned to the tile size. 
- * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags
- * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index.
- * Otherwise, must be zero.
- * 
- * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as:
- \code
-    typedef enum CUmemOperationType_enum {
-        CU_MEM_OPERATION_TYPE_MAP = 1,
-        CU_MEM_OPERATION_TYPE_UNMAP = 2
-    } CUmemOperationType;
- \endcode
- * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource 
- * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. 
- * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, 
- * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC.
- * 
- * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation
- * is performed. ::CUarrayMapInfo::memHandle must be NULL.
- *
- * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. 
- * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. 
- * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match 
- * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle.
- *
- * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- *
- * \param[in] mapInfoList - List of ::CUarrayMapInfo
- * \param[in] count       - Count of ::CUarrayMapInfo  in \p mapInfoList
- * \param[in] hStream     - Stream identifier for the stream to use for map or unmap operations
- *
- * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties
- */
-CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo  *mapInfoList, unsigned int count, CUstream hStream);
-
-/**
-* \brief Unmap the backing memory of a given address range.
-*
-* The range must be the entire contiguous address range that was mapped to.  In
-* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped
-* by ::cuMemCreate / ::cuMemMap.  Any backing memory allocations will be freed
-* if there are no existing mappings and there are no unreleased memory handles.
-*
-* When ::cuMemUnmap returns successfully the address range is converted to an
-* address reservation and can be used for a future calls to ::cuMemMap.  Any new
-* mapping to this virtual address will need to have access granted through
-* ::cuMemSetAccess, as all mappings start with no accessibility setup.
-*
-* \param[in] ptr  - Starting address for the virtual address range to unmap
-* \param[in] size - Size of the virtual address range to unmap
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-* \notefnerr
-* \note_sync
-*
-* \sa ::cuMemCreate, ::cuMemAddressReserve
-*/
-CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size);
-
-/**
-* \brief Set the access flags for each location specified in \p desc for the given virtual address range
-* 
-* Given the virtual address range via \p ptr and \p size, and the locations
-* in the array given by \p desc and \p count, set the access flags for the
-* target locations.  The range must be a fully mapped address range
-* containing all allocations created by ::cuMemMap / ::cuMemCreate.
-*
-* \param[in] ptr   - Starting address for the virtual address range
-* \param[in] size  - Length of the virtual address range
-* \param[in] desc  - Array of ::CUmemAccessDesc that describe how to change the
-*                  - mapping for each location specified
-* \param[in] count - Number of ::CUmemAccessDesc in \p desc
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_INVALID_DEVICE,
-* ::CUDA_ERROR_NOT_SUPPORTED
-* \notefnerr
-* \note_sync
-*
-* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap
-*/
-CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);
-
-/**
-* \brief Get the access \p flags set for the given \p location and \p ptr
-*
-* \param[out] flags   - Flags set for this location
-* \param[in] location - Location in which to check the flags for
-* \param[in] ptr      - Address in which to check the access flags for
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_INVALID_DEVICE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemSetAccess
-*/
-CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr);
-
-/**
-* \brief Exports an allocation to a requested shareable handle type
-*
-* Given a CUDA memory handle, create a shareable memory
-* allocation handle that can be used to share the memory with other
-* processes. The recipient process can convert the shareable handle back into a
-* CUDA memory handle using ::cuMemImportFromShareableHandle and map
-* it with ::cuMemMap. The implementation of what this handle is and how it
-* can be transferred is defined by the requested handle type in \p handleType
-*
-* Once all shareable handles are closed and the allocation is released, the allocated
-* memory referenced will be released back to the OS and uses of the CUDA handle afterward
-* will lead to undefined behavior.
-*
-* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
-* that support importing memory from the shareable type
-*
-* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
-* \param[in] handle           - CUDA handle for the memory allocation
-* \param[in] handleType       - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
-* \param[in] flags            - Reserved, must be zero
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemImportFromShareableHandle
-*/
-CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags);
-
-/**
-* \brief Imports an allocation from a requested shareable handle type.
-*
-* If the current process cannot support the memory described by this shareable
-* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED.
-*
-* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc)
-* created on devices under an SLI group may not be supported, and thus this API will
-* return CUDA_ERROR_NOT_SUPPORTED.
-* There is no guarantee that the contents of \p handle will be the same CUDA memory handle
-* for the same given OS shareable handle, or the same underlying allocation.
-*
-* \param[out] handle       - CUDA Memory handle for the memory allocation.
-* \param[in]  osHandle     - Shareable Handle representing the memory allocation that is to be imported. 
-* \param[in]  shHandleType - handle type of the exported handle ::CUmemAllocationHandleType.
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease
-*/
-CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
-
-/**
-* \brief Calculates either the minimal or recommended granularity 
-*
-* Calculates either the minimal or recommended granularity
-* for a given allocation specification and returns it in granularity.  This
-* granularity can be used as a multiple for alignment, size, or address mapping.
-*
-* \param[out] granularity Returned granularity.
-* \param[in]  prop Property for which to determine the granularity for
-* \param[in]  option Determines which granularity to return
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemCreate, ::cuMemMap
-*/
-CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
-
-/**
-* \brief Retrieve the contents of the property structure defining properties for this handle
-*
-* \param[out] prop  - Pointer to a properties structure which will hold the information about this handle
-* \param[in] handle - Handle which to perform the query on
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle
-*/
-CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle);
-
-/**
-* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation.
-*
-* The handle is guaranteed to be the same handle value used to map the memory. If the address
-* requested is not mapped, the function will fail. The returned handle must be released with
-* corresponding number of calls to ::cuMemRelease.
-*
-* \note The address \p addr, can be any address in a range previously mapped
-* by ::cuMemMap, and not necessarily the start address.
-*
-* \param[out] handle CUDA Memory handle for the backing memory allocation.
-* \param[in] addr Memory address to query, that has been mapped previously.
-* \returns
-* ::CUDA_SUCCESS,
-* ::CUDA_ERROR_INVALID_VALUE,
-* ::CUDA_ERROR_NOT_INITIALIZED,
-* ::CUDA_ERROR_DEINITIALIZED,
-* ::CUDA_ERROR_NOT_PERMITTED,
-* ::CUDA_ERROR_NOT_SUPPORTED
-*
-* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap
-*/
-CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr);
-
-/** @} */ /* END CUDA_VA */
-
-/**
- * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator
- *
- * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
- *                Functions for controlling the behavior of the underlying allocator.
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the stream ordered memory allocator exposed by the
- * low-level CUDA driver application programming interface.
- *
- * @{
- *
- * \section CUDA_MALLOC_ASYNC_overview overview
- *
- * The asynchronous allocator allows the user to allocate and free in stream order.
- * All asynchronous accesses of the allocation must happen between
- * the stream executions of the allocation and the free. If the memory is accessed
- * outside of the promised stream order, a use before allocation / use after free error
- * will cause undefined behavior.
- *
- * The allocator is free to reallocate the memory as long as it can guarantee
- * that compliant memory accesses will not overlap temporally.
- * The allocator may refer to internal stream ordering as well as inter-stream dependencies
- * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
- * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
- *
- * \section CUDA_MALLOC_ASYNC_support Supported Platforms
- *
- * Whether or not a device supports the integrated stream ordered memory allocator
- * may be queried by calling ::cuDeviceGetAttribute() with the device attribute
- * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
- */
-
-/**
- * \brief Frees memory with stream ordered semantics
- *
- * Inserts a free operation into \p hStream.
- * The allocation must not be accessed after stream execution reaches the free.
- * After this API returns, accessing the memory from any subsequent work launched on the GPU
- * or querying its pointer attributes results in undefined behavior.
- *
- * \note During stream capture, this function results in the creation of a free node and
- *       must therefore be passed the address of a graph allocation.
- * 
- * \param dptr - memory to free
- * \param hStream - The stream establishing the stream ordering contract. 
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
- * ::CUDA_ERROR_NOT_SUPPORTED
- */
-CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
-
-/**
- * \brief Allocates memory with stream ordered semantics
- *
- * Inserts an allocation operation into \p hStream.
- * A pointer to the allocated memory is returned immediately in *dptr.
- * The allocation must not be accessed until the the allocation operation completes.
- * The allocation comes from the memory pool current to the stream's device.
- *
- * \note The default memory pool of a device contains device memory from that device.
- * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
- *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
- *       operation completes before work submitted in a separate stream runs. 
- * \note During stream capture, this function results in the creation of an allocation node.  In this case,
- *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
- *       are used to set the node's creation parameters.
- *
- * \param[out] dptr    - Returned device pointer
- * \param[in] bytesize - Number of bytes to allocate
- * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool,
- *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
- *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
- */
-CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
-
-/**
- * \brief Tries to release memory back to the OS
- *
- * Releases memory back to the OS until the pool contains fewer than minBytesToKeep
- * reserved bytes, or there is no more memory that the allocator can safely release.
- * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
- * The OS allocations may happen at different granularity from the user allocations.
- *
- * \note: Allocations that have not been freed count as outstanding. 
- * \note: Allocations that have been asynchronously freed but whose completion has
- *        not been observed on the host (eg. by a synchronize) can count as outstanding.
- *
- * \param[in] pool           - The memory pool to trim
- * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
- * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
- * at least minBytesToKeep bytes reserved after the operation.
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
- *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep);
-
-/**
- * \brief Sets attributes of a memory pool
- *
- * Supported attributes are:
- * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
- *                    Amount of reserved memory in bytes to hold onto before trying
- *                    to release memory back to the OS. When more than the release
- *                    threshold bytes of memory are held by the memory pool, the
- *                    allocator will try to release memory back to the OS on the
- *                    next call to stream, event or context synchronize. (default 0)
- * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
- *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
- *                    in another stream as long as a stream ordering dependency
- *                    of the allocating stream on the free action exists.
- *                    Cuda events and null stream interactions can create the required
- *                    stream ordered dependencies. (default enabled)
- * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
- *                    Allow reuse of already completed frees when there is no dependency
- *                    between the free and allocation. (default enabled)
- * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
- *                    Allow ::cuMemAllocAsync to insert new stream dependencies
- *                    in order to establish the stream ordering required to reuse
- *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
- * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
- *                    Reset the high watermark that tracks the amount of backing memory that was
- *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
- * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
- *                    Reset the high watermark that tracks the amount of used memory that was
- *                    allocated for the memory pool.
- *
- * \param[in] pool  - The memory pool to modify
- * \param[in] attr  - The attribute to modify
- * \param[in] value - Pointer to the value to assign
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
- *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
-
-/**
- * \brief Gets attributes of a memory pool
- *
- * Supported attributes are:
- * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
- *                    Amount of reserved memory in bytes to hold onto before trying
- *                    to release memory back to the OS. When more than the release
- *                    threshold bytes of memory are held by the memory pool, the
- *                    allocator will try to release memory back to the OS on the
- *                    next call to stream, event or context synchronize. (default 0)
- * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
- *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
- *                    in another stream as long as a stream ordering dependency
- *                    of the allocating stream on the free action exists.
- *                    Cuda events and null stream interactions can create the required
- *                    stream ordered dependencies. (default enabled)
- * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
- *                    Allow reuse of already completed frees when there is no dependency
- *                    between the free and allocation. (default enabled)
- * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
- *                    Allow ::cuMemAllocAsync to insert new stream dependencies
- *                    in order to establish the stream ordering required to reuse
- *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
- * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t)
- *                    Amount of backing memory currently allocated for the mempool
- * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
- *                    High watermark of backing memory allocated for the mempool since the
- *                    last time it was reset.
- * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t)
- *                    Amount of memory from the pool that is currently in use by the application.
- * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
- *                    High watermark of the amount of memory from the pool that was in use by the application.
- *
- * \param[in] pool   - The memory pool to get attributes of
- * \param[in] attr   - The attribute to get 
- * \param[out] value - Retrieved value
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
- *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
-
-/**
- * \brief Controls visibility of pools between devices
- *
- * \param[in] pool  - The pool being modified
- * \param[in] map   - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu.
- * \param[in] count - Number of descriptors in the map array.
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
- *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count);
-
-/**
- * \brief Returns the accessibility of a pool from a device
- *
- * Returns the accessibility of the pool's memory from the specified location. 
- *
- * \param[out] flags   - the accessibility of the pool from the specified location
- * \param[in] memPool  - the pool being queried
- * \param[in] location - the location accessing the pool
- *
- * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
- *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location);
-
-/**
- * \brief Creates a memory pool
- *
- * Creates a CUDA memory pool and returns the handle in \p pool.  The \p poolProps determines
- * the properties of the pool such as the backing device and IPC capabilities. 
- *
- * By default, the pool's memory will be accessible from the device it is allocated on.
- *
- * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY,
- * ::CUDA_ERROR_NOT_SUPPORTED
- *
- * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool,
- *     ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle
- */
-CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps);
-
-/**
- * \brief Destroys the specified memory pool
- *
- * If any pointers obtained from this pool haven't been freed or
- * the pool has free operations that haven't completed
- * when ::cuMemPoolDestroy is invoked, the function will return immediately and the
- * resources associated with the pool will be released automatically
- * once there are no more outstanding allocations. 
- *
- * Destroying the current mempool of a device sets the default mempool of
- * that device as the current mempool for that device.
- *
- * \note A device's default memory pool cannot be destroyed.
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool,
- *     ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate
- */
-CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool);
-
-/**
- * \brief Allocates memory from a specified pool with stream ordered semantics.
- *
- * Inserts an allocation operation into \p hStream.
- * A pointer to the allocated memory is returned immediately in *dptr.
- * The allocation must not be accessed until the the allocation operation completes.
- * The allocation comes from the specified memory pool.
- *
- * \note
- *    -  The specified memory pool may be from a device different than that of the specified \p hStream. 
- * 
- *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
- *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
- *       operation completes before work submitted in a separate stream runs. 
- *
- * \note During stream capture, this function results in the creation of an allocation node.  In this case,
- *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
- *       are used to set the node's creation parameters.
- *
- * \param[out] dptr    - Returned device pointer
- * \param[in] bytesize - Number of bytes to allocate
- * \param[in] pool     - The pool to allocate from 
- * \param[in] hStream  - The stream establishing the stream ordering semantic
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
- *     ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess,
- *     ::cuMemPoolSetAttribute
- */
-CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
-
-/**
- * \brief Exports a memory pool to the requested handle type.
- *
- * Given an IPC capable mempool, create an OS handle to share the pool with another process.
- * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle.
- * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
- * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
- * handle type.
- *
- * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
- *
- * \param[out] handle_out  - Returned OS handle 
- * \param[in] pool         - pool to export 
- * \param[in] handleType   - the type of handle to create 
- * \param[in] flags        - must be 0 
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer,
- *     ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync,
- *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
- *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
- */
-CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
-
-/**
- * \brief imports a memory pool from a shared handle.
- *
- * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer.
- *
- * \note Imported memory pools do not support creating new allocations.
- *       As such imported memory pools may not be used in cuDeviceSetMemPool
- *       or ::cuMemAllocFromPoolAsync calls.
- *
- * \param[out] pool_out    - Returned memory pool
- * \param[in] handle       - OS handle of the pool to open 
- * \param[in] handleType   - The type of handle being imported 
- * \param[in] flags        - must be 0 
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer
- */
-CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
-        CUmemoryPool *pool_out,
-        void *handle,
-        CUmemAllocationHandleType handleType,
-        unsigned long long flags);
-
-/**
- * \brief Export data to share a memory pool allocation between processes.
- *
- * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool.
- * The recipient process can import the allocation with the ::cuMemPoolImportPointer api.
- * The data is not a handle and may be shared through any IPC mechanism.
- *
- * \param[out] shareData_out - Returned export data  
- * \param[in] ptr            - pointer to memory being exported
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer
- */
-CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr);
-
-/**
- * \brief Import a memory pool allocation from another process.
- *
- * Returns in \p ptr_out a pointer to the imported memory.
- * The imported memory must not be accessed before the allocation operation completes
- * in the exporting process. The imported memory must be freed from all importing processes before
- * being freed in the exporting process. The pointer may be freed with cuMemFree
- * or cuMemFreeAsync.  If cuMemFreeAsync is used, the free must be completed
- * on the importing process before the free operation on the exporting process.
- *
- * \note The cuMemFreeAsync api may be used in the exporting process before
- *       the cuMemFreeAsync operation completes in its stream as long as the
- *       cuMemFreeAsync in the exporting process specifies a stream with
- *       a stream dependency on the importing process's cuMemFreeAsync.
- *
- * \param[out] ptr_out  - pointer to imported memory
- * \param[in] pool      - pool from which to import
- * \param[in] shareData - data specifying the memory to import
- *
- * \returns
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- *
- * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer
- */
-CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData);
-
-/** @} */ /* END CUDA_MALLOC_ASYNC */
-
-/**
- * \defgroup CUDA_UNIFIED Unified Addressing
- *
- * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver
- * API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the unified addressing functions of the
- * low-level CUDA driver application programming interface.
- *
- * @{
- *
- * \section CUDA_UNIFIED_overview Overview
- *
- * CUDA devices can share a unified address space with the host.
- * For these devices there is no distinction between a device
- * pointer and a host pointer -- the same pointer value may be
- * used to access memory from the host program and from a kernel
- * running on the device (with exceptions enumerated below).
- *
- * \section CUDA_UNIFIED_support Supported Platforms
- *
- * Whether or not a device supports unified addressing may be
- * queried by calling ::cuDeviceGetAttribute() with the device
- * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
- *
- * Unified addressing is automatically enabled in 64-bit processes
- *
- * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values
- *
- * It is possible to look up information about the memory which backs a
- * pointer value.  For instance, one may want to know if a pointer points
- * to host or device memory.  As another example, in the case of device
- * memory, one may want to know on which CUDA device the memory
- * resides.  These properties may be queried using the function
- * ::cuPointerGetAttribute()
- *
- * Since pointers are unique, it is not necessary to specify information
- * about the pointers specified to the various copy functions in the
- * CUDA API.  The function ::cuMemcpy() may be used to perform a copy
- * between two pointers, ignoring whether they point to host or device
- * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
- * unnecessary for devices supporting unified addressing).  For
- * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
- * used to specify that the CUDA driver should infer the location of the
- * pointer from its value.
- *
- * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
- *
- * All host memory allocated in all contexts using ::cuMemAllocHost() and
- * ::cuMemHostAlloc() is always directly accessible from all contexts on
- * all devices that support unified addressing.  This is the case regardless
- * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
- * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
- *
- * The pointer value through which allocated host memory may be accessed
- * in kernels on all devices that support unified addressing is the same
- * as the pointer value through which that memory is accessed on the host,
- * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
- * pointer for these allocations.
- *
- * Note that this is not the case for memory allocated using the flag
- * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
- *
- * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
- *
- * Upon enabling direct access from a context that supports unified addressing
- * to another peer context that supports unified addressing using
- * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using
- * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
- * by the current context.  The device pointer value through
- * which any peer memory may be accessed in the current context
- * is the same pointer value through which that memory may be
- * accessed in the peer context.
- *
- * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
- *
- * Not all memory may be accessed on devices through the same pointer
- * value through which they are accessed on the host.  These exceptions
- * are host memory registered using ::cuMemHostRegister() and host memory
- * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED.  For these
- * exceptions, there exists a distinct host and device address for the
- * memory.  The device address is guaranteed to not overlap any valid host
- * pointer range and is guaranteed to have the same value across all
- * contexts that support unified addressing.
- *
- * This device address may be queried using ::cuMemHostGetDevicePointer()
- * when a context using unified addressing is current.  Either the host
- * or the unified device pointer value may be used to refer to this memory
- * through ::cuMemcpy() and similar functions using the
- * ::CU_MEMORYTYPE_UNIFIED memory type.
- *
- */
-
-/**
- * \brief Returns information about a pointer
- *
- * The supported attributes are:
- *
- * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
- *
- *      Returns in \p *data the ::CUcontext in which \p ptr was allocated or
- *      registered.
- *      The type of \p data must be ::CUcontext *.
- *
- *      If \p ptr was not allocated by, mapped by, or registered with
- *      a ::CUcontext which uses unified virtual addressing then
- *      ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
- *
- *      Returns in \p *data the physical memory type of the memory that
- *      \p ptr addresses as a ::CUmemorytype enumerated value.
- *      The type of \p data must be unsigned int.
- *
- *      If \p ptr addresses device memory then \p *data is set to
- *      ::CU_MEMORYTYPE_DEVICE.  The particular ::CUdevice on which the
- *      memory resides is the ::CUdevice of the ::CUcontext returned by the
- *      ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
- *
- *      If \p ptr addresses host memory then \p *data is set to
- *      ::CU_MEMORYTYPE_HOST.
- *
- *      If \p ptr was not allocated by, mapped by, or registered with
- *      a ::CUcontext which uses unified virtual addressing then
- *      ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- *      If the current ::CUcontext does not support unified virtual
- *      addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
- *
- * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
- *
- *      Returns in \p *data the device pointer value through which
- *      \p ptr may be accessed by kernels running in the current
- *      ::CUcontext.
- *      The type of \p data must be CUdeviceptr *.
- *
- *      If there exists no device pointer value through which
- *      kernels running in the current ::CUcontext may access
- *      \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- *      If there is no current ::CUcontext then
- *      ::CUDA_ERROR_INVALID_CONTEXT is returned.
- *
- *      Except in the exceptional disjoint addressing cases discussed
- *      below, the value returned in \p *data will equal the input
- *      value \p ptr.
- *
- * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
- *
- *      Returns in \p *data the host pointer value through which
- *      \p ptr may be accessed by by the host program.
- *      The type of \p data must be void **.
- *      If there exists no host pointer value through which
- *      the host program may directly access \p ptr then
- *      ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- *      Except in the exceptional disjoint addressing cases discussed
- *      below, the value returned in \p *data will equal the input
- *      value \p ptr.
- *
- * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
- *
- *      Returns in \p *data two tokens for use with the nv-p2p.h Linux
- *      kernel interface. \p data must be a struct of type
- *      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
- *
- *      \p ptr must be a pointer to memory obtained from :cuMemAlloc().
- *      Note that p2pToken and vaSpaceToken are only valid for the
- *      lifetime of the source allocation. A subsequent allocation at
- *      the same address may return completely different tokens.
- *      Querying this attribute has a side effect of setting the attribute
- *      ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
- *      \p ptr points to.
- *
- * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
- *
- *      A boolean attribute which when set, ensures that synchronous memory operations
- *      initiated on the region of memory that \p ptr points to will always synchronize.
- *      See further documentation in the section titled "API synchronization behavior"
- *      to learn more about cases when synchronous memory operations can
- *      exhibit asynchronous behavior.
- *
- * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
- *
- *      Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
- *      \p data must point to an unsigned long long.
- *
- *      \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
- *      Every memory allocation from any of the CUDA memory allocation APIs will
- *      have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs
- *      from previous freed allocations. IDs are only unique within a single process.
- *
- *
- * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED:
- *
- *      Returns in \p *data a boolean that indicates whether the pointer points to
- *      managed memory or not.
- *
- *      If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL:
- *
- *      Returns in \p *data an integer representing a device ordinal of a device against
- *      which the memory was allocated or registered.
- *
- * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE:
- *
- *      Returns in \p *data a boolean that indicates if this pointer maps to
- *      an allocation that is suitable for ::cudaIpcGetMemHandle.
- *
- * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR:
- *
- *      Returns in \p *data the starting address for the allocation referenced
- *      by the device pointer \p ptr.  Note that this is not necessarily the
- *      address of the mapped region, but the address of the mappable address
- *      range \p ptr references (e.g. from ::cuMemAddressReserve).
- *
- * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE:
- *
- *      Returns in \p *data the size for the allocation referenced by the device
- *      pointer \p ptr.  Note that this is not necessarily the size of the mapped
- *      region, but the size of the mappable address range \p ptr references
- *      (e.g. from ::cuMemAddressReserve).  To retrieve the size of the mapped
- *      region, see ::cuMemGetAddressRange
- *
- * - ::CU_POINTER_ATTRIBUTE_MAPPED:
- *
- *      Returns in \p *data a boolean that indicates if this pointer is in a
- *      valid address range that is mapped to a backing allocation.
- *
- * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES:
- *
- *      Returns a bitmask of the allowed handle types for an allocation that may
- *      be passed to ::cuMemExportToShareableHandle.
- * 
- * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE:
- * 
- *      Returns in \p *data the handle to the mempool that the allocation was obtained from.
- *
- * \par
- *
- * Note that for most allocations in the unified virtual address space
- * the host and device pointer for accessing the allocation will be the
- * same.  The exceptions to this are
- *  - user memory registered using ::cuMemHostRegister
- *  - host memory allocated using ::cuMemHostAlloc with the
- *    ::CU_MEMHOSTALLOC_WRITECOMBINED flag
- * For these types of allocation there will exist separate, disjoint host
- * and device addresses for accessing the allocation.  In particular
- *  - The host address will correspond to an invalid unmapped device address
- *    (which will result in an exception if accessed from the device)
- *  - The device address will correspond to an invalid unmapped host address
- *    (which will result in an exception if accessed from the host).
- * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
- * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
- * and device addresses from either address.
- *
- * \param data      - Returned pointer attribute value
- * \param attribute - Pointer attribute to query
- * \param ptr       - Pointer
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuPointerSetAttribute,
- * ::cuMemAlloc,
- * ::cuMemFree,
- * ::cuMemAllocHost,
- * ::cuMemFreeHost,
- * ::cuMemHostAlloc,
- * ::cuMemHostRegister,
- * ::cuMemHostUnregister,
- * ::cudaPointerGetAttributes
- */
-CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
-
-/**
- * \brief Prefetches memory to the specified destination device
- *
- * Prefetches memory to the specified destination device.  \p devPtr is the
- * base device pointer of the memory to be prefetched and \p dstDevice is the
- * destination device. \p count specifies the number of bytes to copy. \p hStream
- * is the stream in which the operation is enqueued. The memory range must refer
- * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
- *
- * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
- * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
- * must be non-zero. Additionally, \p hStream must be associated with a device that has a
- * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
- *
- * The start address and end address of the memory range will be rounded down and rounded up
- * respectively to be aligned to CPU page size before the prefetch operation is enqueued
- * in the stream.
- *
- * If no physical memory has been allocated for this region, then this memory region
- * will be populated and mapped on the destination device. If there's insufficient
- * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
- * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
- * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
- *
- * By default, any mappings to the previous location of the migrated pages are removed and
- * mappings for the new location are only setup on \p dstDevice. The exact behavior however
- * also depends on the settings applied to this memory range via ::cuMemAdvise as described
- * below:
- *
- * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
- * then that subset will create a read-only copy of the pages on \p dstDevice.
- *
- * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
- * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
- * preferred location of any pages in the memory range.
- *
- * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
- * then mappings to those pages from all the appropriate processors are updated to
- * refer to the new location if establishing such a mapping is possible. Otherwise,
- * those mappings are cleared.
- *
- * Note that this API is not required for functionality and only serves to improve performance
- * by allowing the application to migrate data to a suitable location before it is accessed.
- * Memory accesses to this range are always coherent and are allowed even when the data is
- * actively being migrated.
- *
- * Note that this function is asynchronous with respect to the host and all work
- * on other devices.
- *
- * \param devPtr    - Pointer to be prefetched
- * \param count     - Size in bytes
- * \param dstDevice - Destination device to prefetch to
- * \param hStream    - Stream to enqueue prefetch operation
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
- * ::cuMemcpy3DPeerAsync, ::cuMemAdvise,
- * ::cudaMemPrefetchAsync
- */
-CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
-
-/**
- * \brief Advise about the usage of a given memory range
- *
- * Advise the Unified Memory subsystem about the usage pattern for the memory range
- * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
- * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
- * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
- * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
- * memory provided it represents a valid, host-accessible region of memory and all additional constraints
- * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
- * memory range results in an error being returned.
- *
- * The \p advice parameter can take the following values:
- * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
- * from and only occasionally written to. Any read accesses from any processor to this region will create a
- * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
- * is called on this region, it will create a read-only copy of the data on the destination processor.
- * If any processor writes to this region, all copies of the corresponding page will be invalidated
- * except for the one where the write occurred. The \p device argument is ignored for this advice.
- * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
- * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
- * Also, if a context is created on a device that does not have the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
- * all such contexts are destroyed.
- * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
- * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
- * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
- * will not create a read-only copy when that device accesses this memory region.
- *
- * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
- * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
- * copies of the data will be collapsed into a single copy. The location for the collapsed
- * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
- * copies was resident at that location. Otherwise, the location chosen is arbitrary.
- *
- * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
- * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
- * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location
- * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
- * when a fault occurs on that memory region. If the data is already in its preferred location and the
- * faulting processor can establish a mapping without requiring the data to be migrated, then
- * data migration will be avoided. On the other hand, if the data is not in its preferred location
- * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
- * it. It is important to note that setting the preferred location does not prevent data prefetching
- * done using ::cuMemPrefetchAsync.
- * Having a preferred location can override the page thrash detection and resolution logic in the Unified
- * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
- * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
- * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
- * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
- * policies associated with that advice will override the policies of this advice, unless read accesses from
- * \p device will not result in a read-only copy being created on that device as outlined in description for
- * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
- * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
- * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
- * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
- * then this call has no effect. Note however that this behavior may change in the future.
- *
- * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
- * and changes the preferred location to none.
- *
- * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
- * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then
- * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
- * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
- * it causes the data to always be mapped in the specified processor's page tables, as long as the
- * location of the data permits a mapping to be established. If the data gets migrated for any reason,
- * the mappings are updated accordingly.
- * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
- * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
- * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
- * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
- * migration may be too high. But preventing faults can still help improve performance, and so having
- * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
- * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
- * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
- * page in host memory.
- * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
- * policies associated with that advice will override the policies of this advice. Additionally, if the
- * preferred location of this memory region or any subset of it is also \p device, then the policies
- * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
- * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
- * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
- * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
- * then this call has no effect.
- *
- * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
- * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
- * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
- * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
- * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
- * then this call has no effect.
- *
- * \param devPtr - Pointer to memory to set the advice for
- * \param count  - Size in bytes of the memory range
- * \param advice - Advice to be applied for the specified memory range
- * \param device - Device to apply the advice for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
- * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync,
- * ::cudaMemAdvise
- */
-CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device);
-
-/**
- * \brief Query an attribute of a given memory range
- *
- * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
- * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
- * __managed__ variables.
- *
- * The \p attribute parameter can take the following values:
- * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted
- * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
- * memory range have read-duplication enabled, or 0 otherwise.
- * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be
- * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
- * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
- * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
- * if either all the pages don't have the same preferred location or some of the pages don't have a
- * preferred location at all. Note that the actual location of the pages in the memory range at the time of
- * the query may be different from the preferred location.
- * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
- * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
- * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
- * If any device does not have that advice set for the entire memory range, that device will not be included.
- * If \p data is larger than the number of devices that have that advice set for that memory range,
- * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12
- * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
- * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
- * that advice set, then only as many devices will be returned as can fit in the array. There is no
- * guarantee on which specific devices will be returned, however.
- * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
- * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
- * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
- * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
- * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
- * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
- * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to
- * whether the prefetch operation to that location has completed or even begun.
- *
- * \param data      - A pointers to a memory location where the result
- *                    of each attribute query will be written to.
- * \param dataSize  - Array containing the size of data
- * \param attribute - The attribute to query
- * \param devPtr    - Start of the range to query
- * \param count     - Size of the range to query
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
- * ::cuMemAdvise,
- * ::cudaMemRangeGetAttribute
- */
-CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
-
-/**
- * \brief Query attributes of a given memory range.
- *
- * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
- * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
- * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
- * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
- * The results of the query will be stored in \p data.
- *
- * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for
- * attribute descriptions and restrictions.
- *
- * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
- * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
- * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY
- * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
- *
- * \param data          - A two-dimensional array containing pointers to memory
- *                        locations where the result of each attribute query will be written to.
- * \param dataSizes     - Array containing the sizes of each result
- * \param attributes    - An array of attributes to query
- *                        (numAttributes and the number of attributes in this array should match)
- * \param numAttributes - Number of attributes to query
- * \param devPtr        - Start of the range to query
- * \param count         - Size of the range to query
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
- * ::cuMemPrefetchAsync,
- * ::cudaMemRangeGetAttributes
- */
-CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
-
-/**
- * \brief Set attributes on a previously allocated memory region
- *
- * The supported attributes are:
- *
- * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
- *
- *      A boolean attribute that can either be set (1) or unset (0). When set,
- *      the region of memory that \p ptr points to is guaranteed to always synchronize
- *      memory operations that are synchronous. If there are some previously initiated
- *      synchronous memory operations that are pending when this attribute is set, the
- *      function does not return until those memory operations are complete.
- *      See further documentation in the section titled "API synchronization behavior"
- *      to learn more about cases when synchronous memory operations can
- *      exhibit asynchronous behavior.
- *      \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
- *
- * \param value     - Pointer to memory containing the value to be set
- * \param attribute - Pointer attribute to set
- * \param ptr       - Pointer to a memory region allocated using CUDA memory allocation APIs
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa ::cuPointerGetAttribute,
- * ::cuPointerGetAttributes,
- * ::cuMemAlloc,
- * ::cuMemFree,
- * ::cuMemAllocHost,
- * ::cuMemFreeHost,
- * ::cuMemHostAlloc,
- * ::cuMemHostRegister,
- * ::cuMemHostUnregister
- */
-CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
-
-/**
- * \brief Returns information about a pointer.
- *
- * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions):
- *
- * - ::CU_POINTER_ATTRIBUTE_CONTEXT
- * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE
- * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER
- * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER
- * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
- * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID
- * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED
- * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
- * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
- * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE
- * - ::CU_POINTER_ATTRIBUTE_MAPPED
- * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
- * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
- * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
- *
- * \param numAttributes - Number of attributes to query
- * \param attributes    - An array of attributes to query
- *                      (numAttributes and the number of attributes in this array should match)
- * \param data          - A two-dimensional array containing pointers to memory
- *                      locations where the result of each attribute query will be written to.
- * \param ptr           - Pointer to query
- *
- * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr
- * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values
- * and CUDA_SUCCESS is returned.
- *
- * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA
- * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuPointerGetAttribute,
- * ::cuPointerSetAttribute,
- * ::cudaPointerGetAttributes
- */
-CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr);
-
-/** @} */ /* END CUDA_UNIFIED */
-
-/**
- * \defgroup CUDA_STREAM Stream Management
- *
- * ___MANBRIEF___ stream management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the stream management functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Create a stream
- *
- * Creates a stream and returns a handle in \p phStream.  The \p Flags argument
- * determines behaviors of the stream.
- *
- * Valid values for \p Flags are:
- * - ::CU_STREAM_DEFAULT: Default stream creation flag.
- * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created
- *   stream may run concurrently with work in stream 0 (the NULL stream), and that
- *   the created stream should perform no implicit synchronization with stream 0.
- *
- * \param phStream - Returned newly created stream
- * \param Flags    - Parameters for stream creation
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuStreamDestroy,
- * ::cuStreamCreateWithPriority,
- * ::cuStreamGetPriority,
- * ::cuStreamGetFlags,
- * ::cuStreamWaitEvent,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamAddCallback,
- * ::cudaStreamCreate,
- * ::cudaStreamCreateWithFlags
- */
-CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
-
-/**
- * \brief Create a stream with the given priority
- *
- * Creates a stream with the specified priority and returns a handle in \p phStream.
- * This API alters the scheduler priority of work in the stream. Work in a higher
- * priority stream may preempt work already executing in a low priority stream.
- *
- * \p priority follows a convention where lower numbers represent higher priorities.
- * '0' represents default priority. The range of meaningful numerical priorities can
- * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
- * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
- * it will automatically be clamped to the lowest or the highest number in the range.
- *
- * \param phStream    - Returned newly created stream
- * \param flags       - Flags for stream creation. See ::cuStreamCreate for a list of
- *                      valid flags
- * \param priority    - Stream priority. Lower numbers represent higher priorities.
- *                      See ::cuCtxGetStreamPriorityRange for more information about
- *                      meaningful stream priorities that can be passed.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \note Stream priorities are supported only on GPUs
- * with compute capability 3.5 or higher.
- *
- * \note In the current implementation, only compute kernels launched in
- * priority streams are affected by the stream's priority. Stream priorities have
- * no effect on host-to-device and device-to-host memory operations.
- *
- * \sa ::cuStreamDestroy,
- * ::cuStreamCreate,
- * ::cuStreamGetPriority,
- * ::cuCtxGetStreamPriorityRange,
- * ::cuStreamGetFlags,
- * ::cuStreamWaitEvent,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamAddCallback,
- * ::cudaStreamCreateWithPriority
- */
-CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority);
-
-
-/**
- * \brief Query the priority of a given stream
- *
- * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
- * and return the priority in \p priority. Note that if the stream was created with a
- * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
- * this function returns the clamped priority.
- * See ::cuStreamCreateWithPriority for details about priority clamping.
- *
- * \param hStream    - Handle to the stream to be queried
- * \param priority   - Pointer to a signed integer in which the stream's priority is returned
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuStreamDestroy,
- * ::cuStreamCreate,
- * ::cuStreamCreateWithPriority,
- * ::cuCtxGetStreamPriorityRange,
- * ::cuStreamGetFlags,
- * ::cudaStreamGetPriority
- */
-CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
-
-/**
- * \brief Query the flags of a given stream
- *
- * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
- * and return the flags in \p flags.
- *
- * \param hStream    - Handle to the stream to be queried
- * \param flags      - Pointer to an unsigned integer in which the stream's flags are returned
- *                     The value returned in \p flags is a logical 'OR' of all flags that
- *                     were used while creating this stream. See ::cuStreamCreate for the list
- *                     of valid flags
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa ::cuStreamDestroy,
- * ::cuStreamCreate,
- * ::cuStreamGetPriority,
- * ::cudaStreamGetFlags
- */
-CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
-
-/**
- * \brief Query the context associated with a stream
- *
- * Returns the CUDA context that the stream is associated with.
- *
- * The stream handle \p hStream can refer to any of the following:
- * <ul>
- *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
- *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
- *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
- *   The returned context is the context that was active in the calling thread when the
- *   stream was created. Passing an invalid handle will result in undefined behavior.</li>
- *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
- *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
- *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
- *   Specifying any of the special handles will return the context current to the
- *   calling thread. If no context is current to the calling thread,
- *   ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
- * </ul>
- *
- * \param hStream - Handle to the stream to be queried
- * \param pctx    - Returned context associated with the stream
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * \notefnerr
- *
- * \sa ::cuStreamDestroy,
- * ::cuStreamCreateWithPriority,
- * ::cuStreamGetPriority,
- * ::cuStreamGetFlags,
- * ::cuStreamWaitEvent,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamAddCallback,
- * ::cudaStreamCreate,
- * ::cudaStreamCreateWithFlags
- */
-CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
-
-/**
- * \brief Make a compute stream wait on an event
- *
- * Makes all future work submitted to \p hStream wait for all work captured in
- * \p hEvent.  See ::cuEventRecord() for details on what is captured by an event.
- * The synchronization will be performed efficiently on the device when applicable.
- * \p hEvent may be from a different context or device than \p hStream.
- *
- * flags include:
- * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag.
- * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external
- *   event node when performing stream capture. This flag is invalid outside
- *   of stream capture.
- *
- * \param hStream - Stream to wait
- * \param hEvent  - Event to wait on (may not be NULL)
- * \param Flags   - See ::CUevent_capture_flags
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuEventRecord,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamAddCallback,
- * ::cuStreamDestroy,
- * ::cudaStreamWaitEvent
- */
-CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
-
-/**
- * \brief Add a callback to a compute stream
- *
- * \note This function is slated for eventual deprecation and removal. If
- * you do not require the callback to execute in case of a device error,
- * consider using ::cuLaunchHostFunc. Additionally, this function is not
- * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike
- * ::cuLaunchHostFunc.
- *
- * Adds a callback to be called on the host after all currently enqueued
- * items in the stream have completed.  For each
- * cuStreamAddCallback call, the callback will be executed exactly once.
- * The callback will block later work in the stream until it is finished.
- *
- * The callback may be passed ::CUDA_SUCCESS or an error code.  In the event
- * of a device error, all subsequently executed callbacks will receive an
- * appropriate ::CUresult.
- *
- * Callbacks must not make any CUDA API calls.  Attempting to use a CUDA API
- * will result in ::CUDA_ERROR_NOT_PERMITTED.  Callbacks must not perform any
- * synchronization that may depend on outstanding device work or other callbacks
- * that are not mandated to run earlier.  Callbacks without a mandated order
- * (in independent streams) execute in undefined order and may be serialized.
- *
- * For the purposes of Unified Memory, callback execution makes a number of
- * guarantees:
- * <ul>
- *   <li>The callback stream is considered idle for the duration of the
- *   callback.  Thus, for example, a callback may always use memory attached
- *   to the callback stream.</li>
- *   <li>The start of execution of a callback has the same effect as
- *   synchronizing an event recorded in the same stream immediately prior to
- *   the callback.  It thus synchronizes streams which have been "joined"
- *   prior to the callback.</li>
- *   <li>Adding device work to any stream does not have the effect of making
- *   the stream active until all preceding host functions and stream callbacks
- *   have executed.  Thus, for
- *   example, a callback might use global attached memory even if work has
- *   been added to another stream, if the work has been ordered behind the
- *   callback with an event.</li>
- *   <li>Completion of a callback does not cause a stream to become
- *   active except as described above.  The callback stream will remain idle
- *   if no device work follows the callback, and will remain idle across
- *   consecutive callbacks without device work in between.  Thus, for example,
- *   stream synchronization can be done by signaling from a callback at the
- *   end of the stream.</li>
- * </ul>
- *
- * \param hStream  - Stream to add callback to
- * \param callback - The function to call once preceding stream operations are complete
- * \param userData - User specified data to be passed to the callback function
- * \param flags    - Reserved for future use, must be 0
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamWaitEvent,
- * ::cuStreamDestroy,
- * ::cuMemAllocManaged,
- * ::cuStreamAttachMemAsync,
- * ::cuStreamLaunchHostFunc,
- * ::cudaStreamAddCallback
- */
-CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
-
-/**
- * \brief Begins graph capture on a stream
- *
- * Begin graph capture on \p hStream. When a stream is in capture mode, all operations
- * pushed into the stream will not be executed, but will instead be captured into
- * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated
- * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which
- * it was initiated, and it may only be initiated if the stream is not already in capture
- * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
- * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
- *
- * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
- * called on this stream from the same thread.
- *
- * \param hStream - Stream in which to initiate capture
- * \param mode    - Controls the interaction of this capture sequence with other API
- *                  calls that are potentially unsafe. For more details see
- *                  ::cuThreadExchangeStreamCaptureMode.
- *
- * \note Kernels captured using this API must not use texture and surface references.
- *       Reading or writing through any texture or surface reference is undefined
- *       behavior. This restriction does not apply to texture and surface objects.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cuStreamCreate,
- * ::cuStreamIsCapturing,
- * ::cuStreamEndCapture,
- * ::cuThreadExchangeStreamCaptureMode
- */
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode);
-
-/**
- * \brief Swaps the stream capture interaction mode for a thread
- *
- * Sets the calling thread's stream capture interaction mode to the value contained
- * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
- * facilitate deterministic behavior across function or module boundaries, callers
- * are encouraged to use this API in a push-pop fashion: \code
-     CUstreamCaptureMode mode = desiredMode;
-     cuThreadExchangeStreamCaptureMode(&mode);
-     ...
-     cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode
- * \endcode
- *
- * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call
- * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
- * not enqueued asynchronously to a stream, and is not observed by stream capture.
- * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture
- * depended on the allocation being replayed whenever the graph is launched, the
- * captured graph would be invalid.
- *
- * Therefore, stream capture places restrictions on API calls that can be made within
- * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This
- * behavior can be controlled via this API and flags to ::cuStreamBeginCapture.
- *
- * A thread's mode is one of the following:
- * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has
- *   an ongoing capture sequence that was not initiated with
- *   \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread
- *   has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL,
- *   this thread is prohibited from potentially unsafe API calls.
- * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture
- *   sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited
- *   from potentially unsafe API calls. Concurrent capture sequences in other threads
- *   are ignored.
- * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially
- *   unsafe API calls. Note that the thread is still prohibited from API calls which
- *   necessarily conflict with stream capture, for example, attempting ::cuEventQuery
- *   on an event that was last recorded inside a capture sequence.
- *
- * \param mode - Pointer to mode value to swap with the current mode
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cuStreamBeginCapture
- */
-CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode);
-
-/**
- * \brief Ends capture on a stream, returning the captured graph
- *
- * End capture on \p hStream, returning the captured graph via \p phGraph.
- * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture.
- * If capture was invalidated, due to a violation of the rules of stream capture, then
- * a NULL graph will be returned.
- *
- * If the \p mode argument to ::cuStreamBeginCapture was not
- * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as
- * ::cuStreamBeginCapture.
- *
- * \param hStream - Stream to query
- * \param phGraph - The captured graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
- * \notefnerr
- *
- * \sa
- * ::cuStreamCreate,
- * ::cuStreamBeginCapture,
- * ::cuStreamIsCapturing
- */
-CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
-
-/**
- * \brief Returns a stream's capture status
- *
- * Return the capture status of \p hStream via \p captureStatus. After a successful
- * call, \p *captureStatus will contain one of the following:
- * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing.
- * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing.
- * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error
- *   has invalidated the capture sequence. The capture sequence must be terminated
- *   with ::cuStreamEndCapture on the stream where it was initiated in order to
- *   continue using \p hStream.
- *
- * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while
- * a blocking stream in the same context is capturing, it will return
- * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified
- * after the call. The blocking stream capture is not invalidated.
- *
- * When a blocking stream is capturing, the legacy stream is in an
- * unusable state until the blocking stream capture is terminated. The legacy
- * stream is not supported for stream capture, but attempted use would have an
- * implicit dependency on the capturing stream(s).
- *
- * \param hStream       - Stream to query
- * \param captureStatus - Returns the stream's capture status
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
- * \notefnerr
- *
- * \sa
- * ::cuStreamCreate,
- * ::cuStreamBeginCapture,
- * ::cuStreamEndCapture
- */
-CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
-
-/**
- * \brief Query capture status of a stream
- *
- * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will
- * supplant this version in 12.0, which is retained for minor version compatibility.
- *
- * Query the capture status of a stream and and get an id for 
- * the capture sequence, which is unique over the lifetime of the process.
- *
- * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
- * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
- *
- * A valid id is returned only if both of the following are true:
- * - the call returns CUDA_SUCCESS
- * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
- * \notefnerr
- *
- * \sa
- * ::cuStreamGetCaptureInfo_v2,
- * ::cuStreamBeginCapture,
- * ::cuStreamIsCapturing
- */
-CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
-
-/**
- * \brief Query a stream's capture state (11.3+)
- *
- * Query stream state related to stream capture.
- *
- * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
- * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
- *
- * Valid data (other than capture status) is returned only if both of the following are true:
- * - the call returns CUDA_SUCCESS
- * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
- *
- * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the
- * previous version in 12.0. Developers requiring compatibility across minor versions to
- * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback
- * path.
- *
- * \param hStream - The stream to query
- * \param captureStatus_out - Location to return the capture status of the stream; required
- * \param id_out - Optional location to return an id for the capture sequence, which is
- *           unique over the lifetime of the process
- * \param graph_out - Optional location to return the graph being captured into. All
- *           operations other than destroy and node removal are permitted on the graph
- *           while the capture sequence is in progress. This API does not transfer
- *           ownership of the graph, which is transferred or destroyed at
- *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
- *           end of capture for certain errors. Nodes that are or become
- *           unreachable from the original stream at ::cuStreamEndCapture due to direct
- *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
- * \param dependencies_out - Optional location to store a pointer to an array of nodes.
- *           The next node to be captured in the stream will depend on this set of nodes,
- *           absent operations such as event wait which modify this set. The array pointer
- *           is valid until the next API call which operates on the stream or until end of
- *           capture. The node handles may be copied out and are valid until they or the
- *           graph is destroyed. The driver-owned array may also be passed directly to
- *           APIs that operate on the graph (not the stream) without copying.
- * \param numDependencies_out - Optional location to store the size of the array
- *           returned in dependencies_out.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuStreamGetCaptureInfo,
- * ::cuStreamBeginCapture,
- * ::cuStreamIsCapturing,
- * ::cuStreamUpdateCaptureDependencies
- */
-CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
-        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
-
-/**
- * \brief Update the set of dependencies in a capturing stream (11.3+)
- *
- * Modifies the dependency set of a capturing stream. The dependency set is the set
- * of nodes that the next captured node in the stream will depend on.
- *
- * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
- * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
- * the API is added to the existing set or replaces it. A flags value of 0 defaults
- * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
- *
- * Nodes that are removed from the dependency set via this API do not result in
- * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
- * ::cuStreamEndCapture.
- *
- * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
- *
- * This API is new in CUDA 11.3. Developers requiring compatibility across minor
- * versions to CUDA 11.0 should not use this API or provide a fallback.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_ILLEGAL_STATE
- *
- * \sa
- * ::cuStreamBeginCapture,
- * ::cuStreamGetCaptureInfo,
- * ::cuStreamGetCaptureInfo_v2
- */
-CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
-
-/**
- * \brief Attach memory to a stream asynchronously
- *
- * Enqueues an operation in \p hStream to specify stream association of
- * \p length bytes of memory starting from \p dptr. This function is a
- * stream-ordered operation, meaning that it is dependent on, and will
- * only take effect when, previous work in stream has completed. Any
- * previous association is automatically replaced.
- *
- * \p dptr must point to one of the following types of memories:
- * - managed memory declared using the __managed__ keyword or allocated with
- *   ::cuMemAllocManaged.
- * - a valid host-accessible region of system-allocated pageable memory. This
- *   type of memory may only be specified if the device associated with the
- *   stream reports a non-zero value for the device attribute
- *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
- *
- * For managed allocations, \p length must be either zero or the entire
- * allocation's size. Both indicate that the entire allocation's stream
- * association is being changed. Currently, it is not possible to change stream
- * association for a portion of a managed allocation.
- *
- * For pageable host allocations, \p length must be non-zero.
- *
- * The stream association is specified using \p flags which must be
- * one of ::CUmemAttach_flags.
- * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
- * by any stream on any device.
- * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
- * that it won't access the memory on the device from any stream on a device that
- * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
- * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with
- * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
- * the program makes a guarantee that it will only access the memory on the device
- * from \p hStream. It is illegal to attach singly to the NULL stream, because the
- * NULL stream is a virtual global stream and not a specific stream. An error will
- * be returned in this case.
- *
- * When memory is associated with a single stream, the Unified Memory system will
- * allow CPU access to this memory region so long as all operations in \p hStream
- * have completed, regardless of whether other streams are active. In effect,
- * this constrains exclusive ownership of the managed memory region by
- * an active GPU to per-stream activity instead of whole-GPU activity.
- *
- * Accessing memory on the device from streams that are not associated with
- * it will produce undefined results. No error checking is performed by the
- * Unified Memory system to ensure that kernels launched into other streams
- * do not access this region.
- *
- * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
- * via events, synchronization or other means to ensure legal access to memory
- * at all times. Data visibility and coherency will be changed appropriately
- * for all kernels which follow a stream-association change.
- *
- * If \p hStream is destroyed while data is associated with it, the association is
- * removed and the association reverts to the default visibility of the allocation
- * as specified at ::cuMemAllocManaged. For __managed__ variables, the default
- * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
- * asynchronous operation, and as a result, the change to default association won't
- * happen until all work in the stream has completed.
- *
- * \param hStream - Stream in which to enqueue the attach operation
- * \param dptr    - Pointer to memory (must be a pointer to managed memory or
- *                  to a valid host-accessible region of system-allocated
- *                  pageable memory)
- * \param length  - Length of memory
- * \param flags   - Must be one of ::CUmemAttach_flags
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamWaitEvent,
- * ::cuStreamDestroy,
- * ::cuMemAllocManaged,
- * ::cudaStreamAttachMemAsync
- */
-CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
-
-/**
- * \brief Determine status of a compute stream
- *
- * Returns ::CUDA_SUCCESS if all operations in the stream specified by
- * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
- *
- * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
- * is equivalent to having called ::cuStreamSynchronize().
- *
- * \param hStream - Stream to query status of
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_READY
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuStreamWaitEvent,
- * ::cuStreamDestroy,
- * ::cuStreamSynchronize,
- * ::cuStreamAddCallback,
- * ::cudaStreamQuery
- */
-CUresult CUDAAPI cuStreamQuery(CUstream hStream);
-
-/**
- * \brief Wait until a stream's tasks are completed
- *
- * Waits until the device has completed all operations in the stream specified
- * by \p hStream. If the context was created with the
- * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
- * stream is finished with all of its tasks.
- *
- * \param hStream - Stream to wait for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE
-
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuStreamDestroy,
- * ::cuStreamWaitEvent,
- * ::cuStreamQuery,
- * ::cuStreamAddCallback,
- * ::cudaStreamSynchronize
- */
-CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
-
-/**
- * \brief Destroys a stream
- *
- * Destroys the stream specified by \p hStream.
- *
- * In case the device is still doing work in the stream \p hStream
- * when ::cuStreamDestroy() is called, the function will return immediately
- * and the resources associated with \p hStream will be released automatically
- * once the device has completed all work in \p hStream.
- *
- * \param hStream - Stream to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuStreamWaitEvent,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamAddCallback,
- * ::cudaStreamDestroy
- */
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
-
-/**
- * \brief Copies attributes from source stream to destination stream.
- *
- * Copies attributes from source stream \p src to destination stream \p dst.
- * Both streams must have the same context.
- *
- * \param[out] dst Destination stream
- * \param[in] src Source stream
- * For list of attributes see ::CUstreamAttrID
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src);
-
-/**
- * \brief Queries stream attribute.
- *
- * Queries attribute \p attr from \p hStream and stores it in corresponding
- * member of \p value_out.
- *
- * \param[in] hStream
- * \param[in] attr
- * \param[out] value_out
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      CUstreamAttrValue *value_out);
-
-/**
- * \brief Sets stream attribute.
- *
- * Sets attribute \p attr on \p hStream from corresponding attribute of
- * \p value. The updated attribute will be applied to subsequent work
- * submitted to the stream. It will not affect previously submitted work.
- *
- * \param[out] hStream
- * \param[in] attr
- * \param[in] value
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
-                                      const CUstreamAttrValue *value);
-
-/** @} */ /* END CUDA_STREAM */
-
-
-/**
- * \defgroup CUDA_EVENT Event Management
- *
- * ___MANBRIEF___ event management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the event management functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Creates an event
- *
- * Creates an event *phEvent for the current context with the flags specified via
- * \p Flags. Valid flags include:
- * - ::CU_EVENT_DEFAULT: Default event creation flag.
- * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
- *   synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
- *   an event created with this flag will block until the event has actually
- *   been recorded.
- * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
- *   to record timing data.  Events created with this flag specified and
- *   the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
- *   performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
- * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an
- *   interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must
- *   be specified along with ::CU_EVENT_DISABLE_TIMING.
- *
- * \param phEvent - Returns newly created event
- * \param Flags   - Event creation flags
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \notefnerr
- *
- * \sa
- * ::cuEventRecord,
- * ::cuEventQuery,
- * ::cuEventSynchronize,
- * ::cuEventDestroy,
- * ::cuEventElapsedTime,
- * ::cudaEventCreate,
- * ::cudaEventCreateWithFlags
- */
-CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
-
-/**
- * \brief Records an event
- *
- * Captures in \p hEvent the contents of \p hStream at the time of this call.
- * \p hEvent and \p hStream must be from the same context.
- * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
- * examine or wait for completion of the work that was captured. Uses of
- * \p hStream after this call do not modify \p hEvent. See note on default
- * stream behavior for what is captured in the default case.
- *
- * ::cuEventRecord() can be called multiple times on the same event and
- * will overwrite the previously captured state. Other APIs such as
- * ::cuStreamWaitEvent() use the most recently captured state at the time
- * of the API call, and are not affected by later calls to
- * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an
- * event represents an empty set of work, so for example ::cuEventQuery()
- * would return ::CUDA_SUCCESS.
- *
- * \param hEvent  - Event to record
- * \param hStream - Stream to record event for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuEventCreate,
- * ::cuEventQuery,
- * ::cuEventSynchronize,
- * ::cuStreamWaitEvent,
- * ::cuEventDestroy,
- * ::cuEventElapsedTime,
- * ::cudaEventRecord,
- * ::cuEventRecordWithFlags
- */
-CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
-
-/**
- * \brief Records an event
- *
- * Captures in \p hEvent the contents of \p hStream at the time of this call.
- * \p hEvent and \p hStream must be from the same context.
- * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
- * examine or wait for completion of the work that was captured. Uses of
- * \p hStream after this call do not modify \p hEvent. See note on default
- * stream behavior for what is captured in the default case.
- *
- * ::cuEventRecordWithFlags() can be called multiple times on the same event and
- * will overwrite the previously captured state. Other APIs such as
- * ::cuStreamWaitEvent() use the most recently captured state at the time
- * of the API call, and are not affected by later calls to
- * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an
- * event represents an empty set of work, so for example ::cuEventQuery()
- * would return ::CUDA_SUCCESS.
- *
- * flags include:
- * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag.
- * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external
- *   event node when performing stream capture. This flag is invalid outside
- *   of stream capture.
- *
- * \param hEvent  - Event to record
- * \param hStream - Stream to record event for
- * \param flags   - See ::CUevent_capture_flags
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuEventCreate,
- * ::cuEventQuery,
- * ::cuEventSynchronize,
- * ::cuStreamWaitEvent,
- * ::cuEventDestroy,
- * ::cuEventElapsedTime,
- * ::cuEventRecord,
- * ::cudaEventRecord
- */
-CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
-
-/**
- * \brief Queries an event's status
- *
- * Queries the status of all work currently captured by \p hEvent. See
- * ::cuEventRecord() for details on what is captured by an event.
- *
- * Returns ::CUDA_SUCCESS if all captured work has been completed, or
- * ::CUDA_ERROR_NOT_READY if any captured work is incomplete.
- *
- * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
- * is equivalent to having called ::cuEventSynchronize().
- *
- * \param hEvent - Event to query
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_READY
- * \notefnerr
- *
- * \sa ::cuEventCreate,
- * ::cuEventRecord,
- * ::cuEventSynchronize,
- * ::cuEventDestroy,
- * ::cuEventElapsedTime,
- * ::cudaEventQuery
- */
-CUresult CUDAAPI cuEventQuery(CUevent hEvent);
-
-/**
- * \brief Waits for an event to complete
- *
- * Waits until the completion of all work currently captured in \p hEvent.
- * See ::cuEventRecord() for details on what is captured by an event.
- *
- * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
- * flag will cause the calling CPU thread to block until the event has
- * been completed by the device.  If the ::CU_EVENT_BLOCKING_SYNC flag has
- * not been set, then the CPU thread will busy-wait until the event has
- * been completed by the device.
- *
- * \param hEvent - Event to wait for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuEventCreate,
- * ::cuEventRecord,
- * ::cuEventQuery,
- * ::cuEventDestroy,
- * ::cuEventElapsedTime,
- * ::cudaEventSynchronize
- */
-CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
-
-/**
- * \brief Destroys an event
- *
- * Destroys the event specified by \p hEvent.
- *
- * An event may be destroyed before it is complete (i.e., while
- * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the
- * call does not block on completion of the event, and any associated
- * resources will automatically be released asynchronously at completion.
- *
- * \param hEvent - Event to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuEventCreate,
- * ::cuEventRecord,
- * ::cuEventQuery,
- * ::cuEventSynchronize,
- * ::cuEventElapsedTime,
- * ::cudaEventDestroy
- */
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
-
-/**
- * \brief Computes the elapsed time between two events
- *
- * Computes the elapsed time between two events (in milliseconds with a
- * resolution of around 0.5 microseconds).
- *
- * If either event was last recorded in a non-NULL stream, the resulting time
- * may be greater than expected (even if both used the same stream handle). This
- * happens because the ::cuEventRecord() operation takes place asynchronously
- * and there is no guarantee that the measured latency is actually just between
- * the two events. Any number of other different stream operations could execute
- * in between the two measured events, thus altering the timing in a significant
- * way.
- *
- * If ::cuEventRecord() has not been called on either event then
- * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
- * on both events but one or both of them has not yet been completed (that is,
- * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
- * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
- * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
- * ::CUDA_ERROR_INVALID_HANDLE.
- *
- * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
- * \param hStart        - Starting event
- * \param hEnd          - Ending event
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_READY
- * \notefnerr
- *
- * \sa ::cuEventCreate,
- * ::cuEventRecord,
- * ::cuEventQuery,
- * ::cuEventSynchronize,
- * ::cuEventDestroy,
- * ::cudaEventElapsedTime
- */
-CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
-
-/** @} */ /* END CUDA_EVENT */
-
-/**
- * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability
- *
- * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the external resource interoperability functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
- /**
- * \brief Imports an external memory object
- *
- * Imports an externally allocated memory object and returns
- * a handle to that in \p extMem_out.
- *
- * The properties of the handle being imported must be described in
- * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure
- * is defined as follows:
- *
- * \code
-        typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
-            CUexternalMemoryHandleType type;
-            union {
-                int fd;
-                struct {
-                    void *handle;
-                    const void *name;
-                } win32;
-                const void *nvSciBufObject;
-            } handle;
-            unsigned long long size;
-            unsigned int flags;
-        } CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
- * \endcode
- *
- * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type
- * of handle being imported. ::CUexternalMemoryHandleType is
- * defined as:
- *
- * \code
-        typedef enum CUexternalMemoryHandleType_enum {
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
-        } CUexternalMemoryHandleType;
- * \endcode
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid
- * file descriptor referencing a memory object. Ownership of
- * the file descriptor is transferred to the CUDA driver when the
- * handle is imported successfully. Performing any operations on the
- * file descriptor after it is imported results in undefined behavior.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
- * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
- * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
- * is not NULL, then it must represent a valid shared NT handle that
- * references a memory object. Ownership of this handle is
- * not transferred to CUDA after the import operation, so the
- * application must release the handle using the appropriate system
- * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must point to a NULL-terminated array of
- * UTF-16 characters that refers to a memory object.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
- * be non-NULL and
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
- * must be NULL. The handle specified must be a globally shared KMT
- * handle. This handle does not hold a reference to the underlying
- * object, and thus will be invalid when all references to the
- * memory object are destroyed.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one
- * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
- * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
- * is not NULL, then it must represent a valid shared NT handle that
- * is returned by ID3D12Device::CreateSharedHandle when referring to a
- * ID3D12Heap object. This handle holds a reference to the underlying
- * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must point to a NULL-terminated array of
- * UTF-16 characters that refers to a ID3D12Heap object.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one
- * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
- * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
- * is not NULL, then it must represent a valid shared NT handle that
- * is returned by ID3D12Device::CreateSharedHandle when referring to a
- * ID3D12Resource object. This handle holds a reference to the
- * underlying object. If
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must point to a NULL-terminated array of
- * UTF-16 characters that refers to a ID3D12Resource object.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
- * represent a valid shared NT handle that is returned by
- * IDXGIResource1::CreateSharedHandle when referring to a
- * ID3D11Resource object. If
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must point to a NULL-terminated array of
- * UTF-16 characters that refers to a ID3D11Resource object.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
- * represent a valid shared KMT handle that is returned by
- * IDXGIResource::GetSharedHandle when referring to a
- * ID3D11Resource object and
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
- * must be NULL.
- *
- * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL
- * and reference a valid NvSciBuf object.
- * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
- * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync
- * as appropriate barriers to maintain coherence between CUDA and the other drivers.
- * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
- * for memory synchronization.
- *
- *
- * The size of the memory object must be specified in
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size.
- *
- * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in
- * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the
- * resource is a dedicated resource. The definition of what a
- * dedicated resource is outside the scope of this extension.
- * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type
- * is one of the following:
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
- *
- * \param extMem_out    - Returned handle to an external memory object
- * \param memHandleDesc - Memory import handle descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
- * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
- * as well as appropriate Vulkan pipeline barriers to maintain coherence between
- * CPU and GPU. For more information on these APIs, please refer to "Synchronization
- * and Cache Control" chapter from Vulkan specification.
- *
- * \sa ::cuDestroyExternalMemory,
- * ::cuExternalMemoryGetMappedBuffer,
- * ::cuExternalMemoryGetMappedMipmappedArray
- */
-CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc);
-
-/**
- * \brief Maps a buffer onto an imported memory object
- *
- * Maps a buffer onto an imported memory object and returns a device
- * pointer in \p devPtr.
- *
- * The properties of the buffer being mapped must be described in
- * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is
- * defined as follows:
- *
- * \code
-        typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
-            unsigned long long offset;
-            unsigned long long size;
-            unsigned int flags;
-        } CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
- * \endcode
- *
- * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in
- * the memory object where the buffer's base address is.
- * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer.
- * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero.
- *
- * The offset and size have to be suitably aligned to match the
- * requirements of the external API. Mapping two buffers whose ranges
- * overlap may or may not result in the same virtual address being
- * returned for the overlapped portion. In such cases, the application
- * must ensure that all accesses to that region from the GPU are
- * volatile. Otherwise writes made via one address are not guaranteed
- * to be visible via the other address, even if they're issued by the
- * same thread. It is recommended that applications map the combined
- * range instead of mapping separate buffers and then apply the
- * appropriate offsets to the returned pointer to derive the
- * individual buffers.
- *
- * The returned pointer \p devPtr must be freed using ::cuMemFree.
- *
- * \param devPtr     - Returned device pointer to buffer
- * \param extMem     - Handle to external memory object
- * \param bufferDesc - Buffer descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuImportExternalMemory,
- * ::cuDestroyExternalMemory,
- * ::cuExternalMemoryGetMappedMipmappedArray
- */
-CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc);
-
-/**
- * \brief Maps a CUDA mipmapped array onto an external memory object
- *
- * Maps a CUDA mipmapped array onto an external object and returns a
- * handle to it in \p mipmap.
- *
- * The properties of the CUDA mipmapped array being mapped must be
- * described in \p mipmapDesc. The structure
- * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows:
- *
- * \code
-        typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
-            unsigned long long offset;
-            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
-            unsigned int numLevels;
-        } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
- * \endcode
- *
- * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the
- * offset in the memory object where the base level of the mipmap
- * chain is.
- * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes
- * the format, dimensions and type of the base level of the mipmap
- * chain. For further details on these parameters, please refer to the
- * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped
- * array is bound as a color target in the graphics API, then the flag
- * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in
- * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags.
- * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies
- * the total number of levels in the mipmap chain.
- *
- * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
- * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
- *
- * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
- *
- * \param mipmap     - Returned CUDA mipmapped array
- * \param extMem     - Handle to external memory object
- * \param mipmapDesc - CUDA array descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuImportExternalMemory,
- * ::cuDestroyExternalMemory,
- * ::cuExternalMemoryGetMappedBuffer
- */
-CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc);
-
-/**
- * \brief Destroys an external memory object.
- *
- * Destroys the specified external memory object. Any existing buffers
- * and CUDA mipmapped arrays mapped onto this object must no longer be
- * used and must be explicitly freed using ::cuMemFree and
- * ::cuMipmappedArrayDestroy respectively.
- *
- * \param extMem - External memory object to be destroyed
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuImportExternalMemory,
- * ::cuExternalMemoryGetMappedBuffer,
- * ::cuExternalMemoryGetMappedMipmappedArray
- */
-CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
-
-/**
- * \brief Imports an external semaphore
- *
- * Imports an externally allocated synchronization object and returns
- * a handle to that in \p extSem_out.
- *
- * The properties of the handle being imported must be described in
- * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is
- * defined as follows:
- *
- * \code
-        typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
-            CUexternalSemaphoreHandleType type;
-            union {
-                int fd;
-                struct {
-                    void *handle;
-                    const void *name;
-                } win32;
-                const void* NvSciSyncObj;
-            } handle;
-            unsigned int flags;
-        } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
- * \endcode
- *
- * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of
- * handle being imported. ::CUexternalSemaphoreHandleType is defined
- * as:
- *
- * \code
-        typedef enum CUexternalSemaphoreHandleType_enum {
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
-        } CUexternalSemaphoreHandleType;
- * \endcode
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
- * file descriptor referencing a synchronization object. Ownership of
- * the file descriptor is transferred to the CUDA driver when the
- * handle is imported successfully. Performing any operations on the
- * file descriptor after it is imported results in undefined behavior.
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
- * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
- * NULL. If
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
- * is not NULL, then it must represent a valid shared NT handle that
- * references a synchronization object. Ownership of this handle is
- * not transferred to CUDA after the import operation, so the
- * application must release the handle using the appropriate system
- * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must name a valid synchronization object.
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must
- * be non-NULL and
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
- * must be NULL. The handle specified must be a globally shared KMT
- * handle. This handle does not hold a reference to the underlying
- * object, and thus will be invalid when all references to the
- * synchronization object are destroyed.
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
- * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
- * NULL. If
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
- * is not NULL, then it must represent a valid shared NT handle that
- * is returned by ID3D12Device::CreateSharedHandle when referring to a
- * ID3D12Fence object. This handle holds a reference to the underlying
- * object. If
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must name a valid synchronization object that
- * refers to a valid ID3D12Fence object.
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
- * represents a valid shared NT handle that is returned by
- * ID3D11Fence::CreateSharedHandle. If
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must name a valid synchronization object that
- * refers to a valid ID3D11Fence object.
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
- * represents a valid NvSciSyncObj.
- *
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
- * represents a valid shared NT handle that
- * is returned by IDXGIResource1::CreateSharedHandle when referring to
- * a IDXGIKeyedMutex object. If
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must name a valid synchronization object that
- * refers to a valid IDXGIKeyedMutex object.
- *
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
- * represents a valid shared KMT handle that
- * is returned by IDXGIResource::GetSharedHandle when referring to
- * a IDXGIKeyedMutex object and
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
- * 
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
- * file descriptor referencing a synchronization object. Ownership of
- * the file descriptor is transferred to the CUDA driver when the
- * handle is imported successfully. Performing any operations on the
- * file descriptor after it is imported results in undefined behavior.
- * 
- * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
- * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
- * NULL. If
- * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
- * is not NULL, then it must represent a valid shared NT handle that
- * references a synchronization object. Ownership of this handle is
- * not transferred to CUDA after the import operation, so the
- * application must release the handle using the appropriate system
- * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
- * is not NULL, then it must name a valid synchronization object.
- *
- * \param extSem_out    - Returned handle to an external semaphore
- * \param semHandleDesc - Semaphore import handle descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuDestroyExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc);
-
-/**
- * \brief Signals a set of external semaphore objects
- *
- * Enqueues a signal operation on a set of externally allocated
- * semaphore object in the specified stream. The operations will be
- * executed when all prior operations in the stream complete.
- *
- * The exact semantics of signaling a semaphore depends on the type of
- * the object.
- *
- * If the semaphore object is any one of the following types:
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
- * then signaling the semaphore will set it to the signaled state.
- *
- * If the semaphore object is any one of the following types:
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
- * then the semaphore will be set to the value specified in
- * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value.
- *
- * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
- * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence
- * to a value that can be used by subsequent waiters of the same NvSciSync object
- * to order operations with those currently submitted in \p stream. Such an update
- * will overwrite previous contents of
- * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default,
- * signaling such an external semaphore object causes appropriate memory synchronization
- * operations to be performed over all external memory objects that are imported as
- * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses
- * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
- * These operations can be skipped by specifying the flag
- * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
- * performance optimization when data coherency is not required. But specifying this
- * flag in scenarios where data coherency is required results in undefined behavior.
- * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
- * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
- * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
- * CUDA_ERROR_NOT_SUPPORTED.
- *
- * If the semaphore object is any one of the following types:
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
- * then the keyed mutex will be released with the key specified in
- * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key.
- *
- * \param extSemArray - Set of external semaphores to be signaled
- * \param paramsArray - Array of semaphore parameters
- * \param numExtSems  - Number of semaphores to signal
- * \param stream      - Stream to enqueue the signal operations in
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuImportExternalSemaphore,
- * ::cuDestroyExternalSemaphore,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
-
-/**
- * \brief Waits on a set of external semaphore objects
- *
- * Enqueues a wait operation on a set of externally allocated
- * semaphore object in the specified stream. The operations will be
- * executed when all prior operations in the stream complete.
- *
- * The exact semantics of waiting on a semaphore depends on the type
- * of the object.
- *
- * If the semaphore object is any one of the following types:
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
- * then waiting on the semaphore will wait until the semaphore reaches
- * the signaled state. The semaphore will then be reset to the
- * unsignaled state. Therefore for every signal operation, there can
- * only be one wait operation.
- *
- * If the semaphore object is any one of the following types:
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
- * then waiting on the semaphore will wait until the value of the
- * semaphore is greater than or equal to
- * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
- *
- * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
- * then, waiting on the semaphore will wait until the
- * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the
- * signaler of the NvSciSyncObj that was associated with this semaphore object.
- * By default, waiting on such an external semaphore object causes appropriate
- * memory synchronization operations to be performed over all external memory objects
- * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
- * any subsequent accesses made by other importers of the same set of NvSciBuf memory
- * object(s) are coherent. These operations can be skipped by specifying the flag
- * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
- * performance optimization when data coherency is not required. But specifying this
- * flag in scenarios where data coherency is required results in undefined behavior.
- * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
- * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
- * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return
- * CUDA_ERROR_NOT_SUPPORTED.
- *
- * If the semaphore object is any one of the following types:
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
- * then the keyed mutex will be acquired when it is released with the key 
- * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key 
- * or until the timeout specified by
- * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs
- * has lapsed. The timeout interval can either be a finite value
- * specified in milliseconds or an infinite value. In case an infinite
- * value is specified the timeout never elapses. The windows INFINITE
- * macro must be used to specify infinite timeout.
- *
- * \param extSemArray - External semaphores to be waited on
- * \param paramsArray - Array of semaphore parameters
- * \param numExtSems  - Number of semaphores to wait on
- * \param stream      - Stream to enqueue the wait operations in
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_TIMEOUT
- * \notefnerr
- *
- * \sa ::cuImportExternalSemaphore,
- * ::cuDestroyExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
-
-/**
- * \brief Destroys an external semaphore
- *
- * Destroys an external semaphore object and releases any references
- * to the underlying resource. Any outstanding signals or waits must
- * have completed before the semaphore is destroyed.
- *
- * \param extSem - External semaphore to be destroyed
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa ::cuImportExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
-
-/** @} */ /* END CUDA_EXTRES_INTEROP */
-
-/**
- * \defgroup CUDA_MEMOP Stream memory operations
- *
- * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the stream memory operations of the low-level CUDA
- * driver application programming interface.
- *
- * The whole set of operations is disabled by default. Users are required
- * to explicitly enable them, e.g. on Linux by passing the kernel module
- * parameter shown below:
- *     modprobe nvidia NVreg_EnableStreamMemOPs=1
- * There is currently no way to enable these operations on other operating
- * systems.
- *
- * Users can programmatically query whether the device supports these
- * operations with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
- *
- * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.
- *
- * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64()
- * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and
- * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
- *
- * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and
- * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform
- * hardware features and can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.
- *
- * Note that all memory pointers passed as parameters to these operations
- * are device pointers. Where necessary a device pointer should be
- * obtained, for example with ::cuMemHostGetDevicePointer().
- *
- * None of the operations accepts pointers to managed memory buffers
- * (::cuMemAllocManaged).
- *
- * @{
- */
-
-/**
- * \brief Wait on a memory location
- *
- * Enqueues a synchronization of the stream on the given memory location. Work
- * ordered after the operation will block until the given condition on the
- * memory is satisfied. By default, the condition is to wait for
- * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal.
- * Other condition types can be specified via \p flags.
- *
- * If the memory was registered via ::cuMemHostRegister(), the device pointer
- * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
- * be used with managed memory (::cuMemAllocManaged).
- *
- * Support for this can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
- *
- * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.
- *
- * \param stream The stream to synchronize on the memory location.
- * \param addr The memory location to wait on.
- * \param value The value to compare with the memory location.
- * \param flags See ::CUstreamWaitValue_flags.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuStreamWaitValue64,
- * ::cuStreamWriteValue32,
- * ::cuStreamWriteValue64,
- * ::cuStreamBatchMemOp,
- * ::cuMemHostRegister,
- * ::cuStreamWaitEvent
- */
-CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
-
-/**
- * \brief Wait on a memory location
- *
- * Enqueues a synchronization of the stream on the given memory location. Work
- * ordered after the operation will block until the given condition on the
- * memory is satisfied. By default, the condition is to wait for
- * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
- * Other condition types can be specified via \p flags.
- *
- * If the memory was registered via ::cuMemHostRegister(), the device pointer
- * should be obtained with ::cuMemHostGetDevicePointer().
- *
- * Support for this can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
- *
- * \param stream The stream to synchronize on the memory location.
- * \param addr The memory location to wait on.
- * \param value The value to compare with the memory location.
- * \param flags See ::CUstreamWaitValue_flags.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuStreamWaitValue32,
- * ::cuStreamWriteValue32,
- * ::cuStreamWriteValue64,
- * ::cuStreamBatchMemOp,
- * ::cuMemHostRegister,
- * ::cuStreamWaitEvent
- */
-CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
-
-/**
- * \brief Write a value to memory
- *
- * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
- * flag is passed, the write is preceded by a system-wide memory fence,
- * equivalent to a __threadfence_system() but scoped to the stream
- * rather than a CUDA thread.
- *
- * If the memory was registered via ::cuMemHostRegister(), the device pointer
- * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
- * be used with managed memory (::cuMemAllocManaged).
- *
- * Support for this can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
- *
- * \param stream The stream to do the write in.
- * \param addr The device address to write to.
- * \param value The value to write.
- * \param flags See ::CUstreamWriteValue_flags.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuStreamWriteValue64,
- * ::cuStreamWaitValue32,
- * ::cuStreamWaitValue64,
- * ::cuStreamBatchMemOp,
- * ::cuMemHostRegister,
- * ::cuEventRecord
- */
-CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
-
-/**
- * \brief Write a value to memory
- *
- * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
- * flag is passed, the write is preceded by a system-wide memory fence,
- * equivalent to a __threadfence_system() but scoped to the stream
- * rather than a CUDA thread.
- *
- * If the memory was registered via ::cuMemHostRegister(), the device pointer
- * should be obtained with ::cuMemHostGetDevicePointer().
- *
- * Support for this can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
- *
- * \param stream The stream to do the write in.
- * \param addr The device address to write to.
- * \param value The value to write.
- * \param flags See ::CUstreamWriteValue_flags.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuStreamWriteValue32,
- * ::cuStreamWaitValue32,
- * ::cuStreamWaitValue64,
- * ::cuStreamBatchMemOp,
- * ::cuMemHostRegister,
- * ::cuEventRecord
- */
-CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
-
-/**
- * \brief Batch operations to synchronize the stream via memory operations
- *
- * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32().
- * Batching operations may avoid some performance overhead in both the API call
- * and the device execution versus adding them to the stream in separate API
- * calls. The operations are enqueued in the order they appear in the array.
- *
- * See ::CUstreamBatchMemOpType for the full set of supported operations, and
- * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(),
- * and ::cuStreamWriteValue64() for details of specific operations.
- *
- * Basic support for this can be queried with ::cuDeviceGetAttribute() and
- * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details
- * on querying support for specific operations.
- *
- * \param stream The stream to enqueue the operations in.
- * \param count The number of operations in the array. Must be less than 256.
- * \param paramArray The types and parameters of the individual operations.
- * \param flags Reserved for future expansion; must be 0.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \notefnerr
- *
- * \sa ::cuStreamWaitValue32,
- * ::cuStreamWaitValue64,
- * ::cuStreamWriteValue32,
- * ::cuStreamWriteValue64,
- * ::cuMemHostRegister
- */
-CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
-
-/** @} */ /* END CUDA_MEMOP */
-
-/**
- * \defgroup CUDA_EXEC Execution Control
- *
- * ___MANBRIEF___ execution control functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the execution control functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Returns information about a function
- *
- * Returns in \p *pi the integer value of the attribute \p attrib on the kernel
- * given by \p hfunc. The supported attributes are:
- * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
- *   per block, beyond which a launch of the function would fail. This number
- *   depends on both the function and the device on which the function is
- *   currently loaded.
- * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
- *   statically-allocated shared memory per block required by this function.
- *   This does not include dynamically-allocated shared memory requested by
- *   the user at runtime.
- * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
- *   constant memory required by this function.
- * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
- *   used by each thread of this function.
- * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
- *   of this function.
- * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
- *   which the function was compiled. This value is the major PTX version * 10
- *   + the minor PTX version, so a PTX version 1.3 function would return the
- *   value 13. Note that this may return the undefined value of 0 for cubins
- *   compiled prior to CUDA 3.0.
- * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
- *   which the function was compiled. This value is the major binary
- *   version * 10 + the minor binary version, so a binary version 1.3 function
- *   would return the value 13. Note that this will return a value of 10 for
- *   legacy cubins that do not have a properly-encoded binary architecture
- *   version.
- * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has
- *   been compiled with user specified option "-Xptxas --dlcm=ca" set .
- * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
- *   dynamically-allocated shared memory.
- * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
- *   cache split ratio in percent of total shared memory.
- *
- * \param pi     - Returned attribute value
- * \param attrib - Attribute requested
- * \param hfunc  - Function to query attribute of
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuFuncSetCacheConfig,
- * ::cuLaunchKernel,
- * ::cudaFuncGetAttributes,
- * ::cudaFuncSetAttribute
- */
-CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
-
-/**
- * \brief Sets information about a function
- *
- * This call sets the value of a specified attribute \p attrib on the kernel given
- * by \p hfunc to an integer value specified by \p val
- * This function returns CUDA_SUCCESS if the new value of the attribute could be
- * successfully set. If the set fails, this call will return an error.
- * Not all attributes can have values set. Attempting to set a value on a read-only
- * attribute will result in an error (CUDA_ERROR_INVALID_VALUE)
- *
- * Supported attributes for the cuFuncSetAttribute call are:
- * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of
- *   dynamically-allocated shared memory. The value should contain the requested
- *   maximum size of dynamically-allocated shared memory. The sum of this value and
- *   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
- *   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
- *   The maximal size of requestable dynamic shared memory may differ by GPU
- *   architecture.
- * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
- *   cache and shared memory use the same hardware resources, this sets the shared memory
- *   carveout preference, in percent of the total shared memory. 
- *   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
- *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
- *
- * \param hfunc  - Function to query attribute of
- * \param attrib - Attribute requested
- * \param value   - The value to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuFuncSetCacheConfig,
- * ::cuLaunchKernel,
- * ::cudaFuncGetAttributes,
- * ::cudaFuncSetAttribute
- */
-CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
-
-/**
- * \brief Sets the preferred cache configuration for a device function
- *
- * On devices where the L1 cache and shared memory use the same hardware
- * resources, this sets through \p config the preferred cache configuration for
- * the device function \p hfunc. This is only a preference. The driver will use
- * the requested configuration if possible, but it is free to choose a different
- * configuration if required to execute \p hfunc.  Any context-wide preference
- * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
- * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
- * that case, the current context-wide setting will be used.
- *
- * This setting does nothing on devices where the size of the L1 cache and
- * shared memory are fixed.
- *
- * Launching a kernel with a different preference than the most recent
- * preference setting may insert a device-side synchronization point.
- *
- *
- * The supported cache configurations are:
- * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
- * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
- * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
- * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
- *
- * \param hfunc  - Kernel to configure cache for
- * \param config - Requested cache configuration
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuFuncGetAttribute,
- * ::cuLaunchKernel,
- * ::cudaFuncSetCacheConfig
- */
-CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
-
-/**
- * \brief Sets the shared memory configuration for a device function.
- *
- * On devices with configurable shared memory banks, this function will
- * force all subsequent launches of the specified device function to have
- * the given shared memory bank size configuration. On any given launch of the
- * function, the shared memory configuration of the device will be temporarily
- * changed if needed to suit the function's preferred configuration. Changes in
- * shared memory configuration between subsequent launches of functions,
- * may introduce a device side synchronization point.
- *
- * Any per-function setting of shared memory bank size set via
- * ::cuFuncSetSharedMemConfig will override the context wide setting set with
- * ::cuCtxSetSharedMemConfig.
- *
- * Changing the shared memory bank size will not increase shared memory usage
- * or affect occupancy of kernels, but may have major effects on performance.
- * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- * but will change what kinds of accesses to shared memory will result in bank
- * conflicts.
- *
- * This function will do nothing on devices with fixed shared memory bank size.
- *
- * The supported bank configurations are:
- * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
- *   configuration when launching this function.
- * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
- *   be natively four bytes when launching this function.
- * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
- *   be natively eight bytes when launching this function.
- *
- * \param hfunc  - kernel to be given a shared memory config
- * \param config - requested shared memory configuration
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuCtxGetSharedMemConfig,
- * ::cuCtxSetSharedMemConfig,
- * ::cuFuncGetAttribute,
- * ::cuLaunchKernel,
- * ::cudaFuncSetSharedMemConfig
- */
-CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
-
-/**
- * \brief Returns a module handle
- *
- * Returns in \p *hmod the handle of the module that function \p hfunc
- * is located in. The lifetime of the module corresponds to the lifetime of
- * the context it was loaded in or until the module is explicitly unloaded.
- *
- * The CUDA runtime manages its own modules loaded into the primary context.
- * If the handle returned by this API refers to a module loaded by the CUDA runtime,
- * calling ::cuModuleUnload() on that module will result in undefined behavior.
- *
- * \param hmod - Returned module handle
- * \param hfunc   - Function to retrieve module for
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_FOUND
- * \notefnerr
- *
- */
-CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc);
-
-/**
- * \brief Launches a CUDA function
- *
- * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
- * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
- * \p blockDimZ threads.
- *
- * \p sharedMemBytes sets the amount of dynamic shared memory that will be
- * available to each thread block.
- *
- * Kernel parameters to \p f can be specified in one of two ways:
- *
- * 1) Kernel parameters can be specified via \p kernelParams.  If \p f
- * has N parameters, then \p kernelParams needs to be an array of N
- * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
- * must point to a region of memory from which the actual kernel
- * parameter will be copied.  The number of kernel parameters and their
- * offsets and sizes do not need to be specified as that information is
- * retrieved directly from the kernel's image.
- *
- * 2) Kernel parameters can also be packaged by the application into
- * a single buffer that is passed in via the \p extra parameter.
- * This places the burden on the application of knowing each kernel
- * parameter's size and alignment/padding within the buffer.  Here is
- * an example of using the \p extra parameter in this manner:
- * \code
-    size_t argBufferSize;
-    char argBuffer[256];
-
-    // populate argBuffer and argBufferSize
-
-    void *config[] = {
-        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
-        CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
-        CU_LAUNCH_PARAM_END
-    };
-    status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
- * \endcode
- *
- * The \p extra parameter exists to allow ::cuLaunchKernel to take
- * additional less commonly used arguments.  \p extra specifies a list of
- * names of extra settings and their corresponding values.  Each extra
- * setting name is immediately followed by the corresponding value.  The
- * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
- *
- * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
- *   array;
- * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
- *   value in \p extra will be a pointer to a buffer containing all
- *   the kernel parameters for launching kernel \p f;
- * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
- *   value in \p extra will be a pointer to a size_t containing the
- *   size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER;
- *
- * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
- * parameters are specified with both \p kernelParams and \p extra
- * (i.e. both \p kernelParams and \p extra are non-NULL).
- *
- * Calling ::cuLaunchKernel() invalidates the persistent function state
- * set through the following deprecated APIs:
- *  ::cuFuncSetBlockShape(),
- *  ::cuFuncSetSharedSize(),
- *  ::cuParamSetSize(),
- *  ::cuParamSeti(),
- *  ::cuParamSetf(),
- *  ::cuParamSetv().
- *
- * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
- * been compiled with toolchain version 3.2 or later so that it will
- * contain kernel parameter information, or have no kernel parameters.
- * If either of these conditions is not met, then ::cuLaunchKernel() will
- * return ::CUDA_ERROR_INVALID_IMAGE.
- *
- * \param f              - Kernel to launch
- * \param gridDimX       - Width of grid in blocks
- * \param gridDimY       - Height of grid in blocks
- * \param gridDimZ       - Depth of grid in blocks
- * \param blockDimX      - X dimension of each thread block
- * \param blockDimY      - Y dimension of each thread block
- * \param blockDimZ      - Z dimension of each thread block
- * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
- * \param hStream        - Stream identifier
- * \param kernelParams   - Array of pointers to kernel parameters
- * \param extra          - Extra options
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_IMAGE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_LAUNCH_FAILED,
- * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuFuncSetCacheConfig,
- * ::cuFuncGetAttribute,
- * ::cudaLaunchKernel
- */
-CUresult CUDAAPI cuLaunchKernel(CUfunction f,
-                                unsigned int gridDimX,
-                                unsigned int gridDimY,
-                                unsigned int gridDimZ,
-                                unsigned int blockDimX,
-                                unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes,
-                                CUstream hStream,
-                                void **kernelParams,
-                                void **extra);
-
-/**
- * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute
- *
- * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
- * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
- * \p blockDimZ threads.
- *
- * \p sharedMemBytes sets the amount of dynamic shared memory that will be
- * available to each thread block.
- *
- * The device on which this kernel is invoked must have a non-zero value for
- * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH.
- *
- * The total number of blocks launched cannot exceed the maximum number of blocks per
- * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
- * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
- * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
- *
- * The kernel cannot make use of CUDA dynamic parallelism.
- *
- * Kernel parameters must be specified via \p kernelParams.  If \p f
- * has N parameters, then \p kernelParams needs to be an array of N
- * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
- * must point to a region of memory from which the actual kernel
- * parameter will be copied.  The number of kernel parameters and their
- * offsets and sizes do not need to be specified as that information is
- * retrieved directly from the kernel's image.
- *
- * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is
- * the same as function state set through ::cuLaunchKernel API
- *
- * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous
- * block shape, shared size and parameter info associated with \p f
- * is overwritten.
- *
- * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have
- * been compiled with toolchain version 3.2 or later so that it will
- * contain kernel parameter information, or have no kernel parameters.
- * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will
- * return ::CUDA_ERROR_INVALID_IMAGE.
- *
- * \param f              - Kernel to launch
- * \param gridDimX       - Width of grid in blocks
- * \param gridDimY       - Height of grid in blocks
- * \param gridDimZ       - Depth of grid in blocks
- * \param blockDimX      - X dimension of each thread block
- * \param blockDimY      - Y dimension of each thread block
- * \param blockDimZ      - Z dimension of each thread block
- * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
- * \param hStream        - Stream identifier
- * \param kernelParams   - Array of pointers to kernel parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_IMAGE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_LAUNCH_FAILED,
- * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuFuncSetCacheConfig,
- * ::cuFuncGetAttribute,
- * ::cuLaunchCooperativeKernelMultiDevice,
- * ::cudaLaunchCooperativeKernel
- */
-CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
-                                unsigned int gridDimX,
-                                unsigned int gridDimY,
-                                unsigned int gridDimZ,
-                                unsigned int blockDimX,
-                                unsigned int blockDimY,
-                                unsigned int blockDimZ,
-                                unsigned int sharedMemBytes,
-                                CUstream hStream,
-                                void **kernelParams);
-
-/**
- * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute
- *
- * \deprecated This function is deprecated as of CUDA 11.3.
- *
- * Invokes kernels as specified in the \p launchParamsList array where each element
- * of the array specifies all the parameters required to perform a single kernel launch.
- * These kernels can cooperate and synchronize as they execute. The size of the array is
- * specified by \p numDevices.
- *
- * No two kernels can be launched on the same device. All the devices targeted by this
- * multi-device launch must be identical. All devices must have a non-zero value for the
- * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH.
- *
- * All kernels launched must be identical with respect to the compiled code. Note that
- * any __device__, __constant__ or __managed__ variables present in the module that owns
- * the kernel launched on each device, are independently instantiated on every device.
- * It is the application's responsiblity to ensure these variables are initialized and
- * used appropriately.
- *
- * The size of the grids as specified in blocks, the size of the blocks themselves
- * and the amount of shared memory used by each thread block must also match across
- * all launched kernels.
- *
- * The streams used to launch these kernels must have been created via either ::cuStreamCreate
- * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD
- * cannot be used.
- *
- * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
- * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
- * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
- * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the
- * total number of blocks launched per device has to match across all devices, the maximum
- * number of blocks that can be launched per device will be limited by the device with the
- * least number of multiprocessors.
- *
- * The kernels cannot make use of CUDA dynamic parallelism.
- *
- * The ::CUDA_LAUNCH_PARAMS structure is defined as:
- * \code
-        typedef struct CUDA_LAUNCH_PARAMS_st
-        {
-            CUfunction function;
-            unsigned int gridDimX;
-            unsigned int gridDimY;
-            unsigned int gridDimZ;
-            unsigned int blockDimX;
-            unsigned int blockDimY;
-            unsigned int blockDimZ;
-            unsigned int sharedMemBytes;
-            CUstream hStream;
-            void **kernelParams;
-        } CUDA_LAUNCH_PARAMS;
- * \endcode
- * where:
- * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must
- *   be identical with respect to the compiled code.
- * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across
- *   all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across
- *   all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across
- *   all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across
- *   all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across
- *   all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across
- *   all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes.
- *   This must match across all kernels launched.
- * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot
- *   be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated
- *   with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function.
- * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If
- *   ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams
- *   needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through
- *   ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual
- *   kernel parameter will be copied. The number of kernel parameters and their offsets and sizes
- *   do not need to be specified as that information is retrieved directly from the kernel's image.
- *
- * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
- * streams has completed. This behavior can be overridden by specifying the flag
- * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel
- * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
- * execution.
- *
- * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
- * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
- * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified,
- * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
- * on the GPU corresponding to that stream to complete before it begins execution.
- *
- * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is
- * the same as function state set through ::cuLaunchKernel API when called individually for each
- * element in \p launchParamsList.
- *
- * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous
- * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function
- * in \p launchParamsList is overwritten.
- *
- * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have
- * been compiled with toolchain version 3.2 or later so that it will
- * contain kernel parameter information, or have no kernel parameters.
- * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will
- * return ::CUDA_ERROR_INVALID_IMAGE.
- *
- * \param launchParamsList - List of launch parameters, one per device
- * \param numDevices       - Size of the \p launchParamsList array
- * \param flags            - Flags to control launch behavior
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_IMAGE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_LAUNCH_FAILED,
- * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuCtxGetCacheConfig,
- * ::cuCtxSetCacheConfig,
- * ::cuFuncSetCacheConfig,
- * ::cuFuncGetAttribute,
- * ::cuLaunchCooperativeKernel,
- * ::cudaLaunchCooperativeKernelMultiDevice
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags);
-
-/**
- * \brief Enqueues a host function call in a stream
- *
- * Enqueues a host function to run in a stream.  The function will be called
- * after currently enqueued work and will block work added after it.
- *
- * The host function must not make any CUDA API calls.  Attempting to use a
- * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required.
- * The host function must not perform any synchronization that may depend on
- * outstanding CUDA work not mandated to run earlier.  Host functions without a
- * mandated order (such as in independent streams) execute in undefined order
- * and may be serialized.
- *
- * For the purposes of Unified Memory, execution makes a number of guarantees:
- * <ul>
- *   <li>The stream is considered idle for the duration of the function's
- *   execution.  Thus, for example, the function may always use memory attached
- *   to the stream it was enqueued in.</li>
- *   <li>The start of execution of the function has the same effect as
- *   synchronizing an event recorded in the same stream immediately prior to
- *   the function.  It thus synchronizes streams which have been "joined"
- *   prior to the function.</li>
- *   <li>Adding device work to any stream does not have the effect of making
- *   the stream active until all preceding host functions and stream callbacks
- *   have executed.  Thus, for
- *   example, a function might use global attached memory even if work has
- *   been added to another stream, if the work has been ordered behind the
- *   function call with an event.</li>
- *   <li>Completion of the function does not cause a stream to become
- *   active except as described above.  The stream will remain idle
- *   if no device work follows the function, and will remain idle across
- *   consecutive host functions or stream callbacks without device work in
- *   between.  Thus, for example,
- *   stream synchronization can be done by signaling from a host function at the
- *   end of the stream.</li>
- * </ul>
- *
- * Note that, in contrast to ::cuStreamAddCallback, the function will not be
- * called in the event of an error in the CUDA context.
- *
- * \param hStream  - Stream to enqueue function call in
- * \param fn       - The function to call once preceding stream operations are complete
- * \param userData - User-specified data to be passed to the function
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_SUPPORTED
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuStreamCreate,
- * ::cuStreamQuery,
- * ::cuStreamSynchronize,
- * ::cuStreamWaitEvent,
- * ::cuStreamDestroy,
- * ::cuMemAllocManaged,
- * ::cuStreamAttachMemAsync,
- * ::cuStreamAddCallback
- */
-CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
-
-/** @} */ /* END CUDA_EXEC */
-
-/**
- * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the deprecated execution control functions of the
- * low-level CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Sets the block-dimensions for the function
- *
- * \deprecated
- *
- * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
- * created when the kernel given by \p hfunc is launched.
- *
- * \param hfunc - Kernel to specify dimensions of
- * \param x     - X dimension
- * \param y     - Y dimension
- * \param z     - Z dimension
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuFuncSetSharedSize,
- * ::cuFuncSetCacheConfig,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSeti,
- * ::cuParamSetf,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
-
-/**
- * \brief Sets the dynamic shared-memory size for the function
- *
- * \deprecated
- *
- * Sets through \p bytes the amount of dynamic shared memory that will be
- * available to each thread block when the kernel given by \p hfunc is launched.
- *
- * \param hfunc - Kernel to specify dynamic shared-memory size for
- * \param bytes - Dynamic shared-memory size per thread in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetCacheConfig,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSeti,
- * ::cuParamSetf,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
-
-/**
- * \brief Sets the parameter size for the function
- *
- * \deprecated
- *
- * Sets through \p numbytes the total size in bytes needed by the function
- * parameters of the kernel corresponding to \p hfunc.
- *
- * \param hfunc    - Kernel to set parameter size for
- * \param numbytes - Size of parameter list in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetf,
- * ::cuParamSeti,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
-
-/**
- * \brief Adds an integer parameter to the function's argument list
- *
- * \deprecated
- *
- * Sets an integer parameter that will be specified the next time the
- * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset.
- *
- * \param hfunc  - Kernel to add parameter to
- * \param offset - Offset to add parameter to argument list
- * \param value  - Value of parameter
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSetf,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value);
-
-/**
- * \brief Adds a floating-point parameter to the function's argument list
- *
- * \deprecated
- *
- * Sets a floating-point parameter that will be specified the next time the
- * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset.
- *
- * \param hfunc  - Kernel to add parameter to
- * \param offset - Offset to add parameter to argument list
- * \param value  - Value of parameter
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSeti,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value);
-
-/**
- * \brief Adds arbitrary data to the function's argument list
- *
- * \deprecated
- *
- * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
- * into the parameter space of the kernel corresponding to \p hfunc. \p offset
- * is a byte offset.
- *
- * \param hfunc    - Kernel to add data to
- * \param offset   - Offset to add data to argument list
- * \param ptr      - Pointer to arbitrary data
- * \param numbytes - Size of data to copy in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSetf,
- * ::cuParamSeti,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
-
-/**
- * \brief Launches a CUDA function
- *
- * \deprecated
- *
- * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
- * contains the number of threads specified by a previous call to
- * ::cuFuncSetBlockShape().
- *
- * The block shape, dynamic shared memory size, and parameter information
- * must be set using
- *  ::cuFuncSetBlockShape(),
- *  ::cuFuncSetSharedSize(),
- *  ::cuParamSetSize(),
- *  ::cuParamSeti(),
- *  ::cuParamSetf(), and
- *  ::cuParamSetv()
- * prior to calling this function.
- *
- * Launching a function via ::cuLaunchKernel() invalidates the function's
- * block shape, dynamic shared memory size, and parameter information. After
- * launching via cuLaunchKernel, this state must be re-initialized prior to
- * calling this function. Failure to do so results in undefined behavior.
- *
- * \param f - Kernel to launch
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_LAUNCH_FAILED,
- * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSetf,
- * ::cuParamSeti,
- * ::cuParamSetv,
- * ::cuLaunchGrid,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f);
-
-/**
- * \brief Launches a CUDA function
- *
- * \deprecated
- *
- * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
- * blocks. Each block contains the number of threads specified by a previous
- * call to ::cuFuncSetBlockShape().
- *
- * The block shape, dynamic shared memory size, and parameter information
- * must be set using
- *  ::cuFuncSetBlockShape(),
- *  ::cuFuncSetSharedSize(),
- *  ::cuParamSetSize(),
- *  ::cuParamSeti(),
- *  ::cuParamSetf(), and
- *  ::cuParamSetv()
- * prior to calling this function.
- *
- * Launching a function via ::cuLaunchKernel() invalidates the function's
- * block shape, dynamic shared memory size, and parameter information. After
- * launching via cuLaunchKernel, this state must be re-initialized prior to
- * calling this function. Failure to do so results in undefined behavior.
- *
- * \param f           - Kernel to launch
- * \param grid_width  - Width of grid in blocks
- * \param grid_height - Height of grid in blocks
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_LAUNCH_FAILED,
- * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSetf,
- * ::cuParamSeti,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGridAsync,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
-
-/**
- * \brief Launches a CUDA function
- *
- * \deprecated
- *
- * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
- * blocks. Each block contains the number of threads specified by a previous
- * call to ::cuFuncSetBlockShape().
- *
- * The block shape, dynamic shared memory size, and parameter information
- * must be set using
- *  ::cuFuncSetBlockShape(),
- *  ::cuFuncSetSharedSize(),
- *  ::cuParamSetSize(),
- *  ::cuParamSeti(),
- *  ::cuParamSetf(), and
- *  ::cuParamSetv()
- * prior to calling this function.
- *
- * Launching a function via ::cuLaunchKernel() invalidates the function's
- * block shape, dynamic shared memory size, and parameter information. After
- * launching via cuLaunchKernel, this state must be re-initialized prior to
- * calling this function. Failure to do so results in undefined behavior.
- *
- * \param f           - Kernel to launch
- * \param grid_width  - Width of grid in blocks
- * \param grid_height - Height of grid in blocks
- * \param hStream     - Stream identifier
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_LAUNCH_FAILED,
- * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
- * ::CUDA_ERROR_LAUNCH_TIMEOUT,
- * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
- * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
- *
- * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no),
- *       this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by
- *       growing the per-thread stack as needed per launch and not shrinking it afterwards.
- *
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cuFuncSetBlockShape,
- * ::cuFuncSetSharedSize,
- * ::cuFuncGetAttribute,
- * ::cuParamSetSize,
- * ::cuParamSetf,
- * ::cuParamSeti,
- * ::cuParamSetv,
- * ::cuLaunch,
- * ::cuLaunchGrid,
- * ::cuLaunchKernel
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
-
-
-/**
- * \brief Adds a texture-reference to the function's argument list
- *
- * \deprecated
- *
- * Makes the CUDA array or linear memory bound to the texture reference
- * \p hTexRef available to a device program as a texture. In this version of
- * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
- * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
- *
- * \param hfunc   - Kernel to add texture-reference to
- * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
- * \param hTexRef - Texture-reference to add to argument list
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
-/** @} */ /* END CUDA_EXEC_DEPRECATED */
-
-/**
- * \defgroup CUDA_GRAPH Graph Management
- *
- * ___MANBRIEF___ graph management functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the graph management functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Creates a graph
- *
- * Creates an empty graph, which is returned via \p phGraph.
- *
- * \param phGraph - Returns newly created graph
- * \param flags   - Graph creation flags, must be 0
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode,
- * ::cuGraphInstantiate,
- * ::cuGraphDestroy,
- * ::cuGraphGetNodes,
- * ::cuGraphGetRootNodes,
- * ::cuGraphGetEdges,
- * ::cuGraphClone
- */
-CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
-
-/**
- * \brief Creates a kernel execution node and adds it to a graph
- *
- * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * The CUDA_KERNEL_NODE_PARAMS structure is defined as:
- *
- * \code
- *  typedef struct CUDA_KERNEL_NODE_PARAMS_st {
- *      CUfunction func;
- *      unsigned int gridDimX;
- *      unsigned int gridDimY;
- *      unsigned int gridDimZ;
- *      unsigned int blockDimX;
- *      unsigned int blockDimY;
- *      unsigned int blockDimZ;
- *      unsigned int sharedMemBytes;
- *      void **kernelParams;
- *      void **extra;
- *  } CUDA_KERNEL_NODE_PARAMS;
- * \endcode
- *
- * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x
- * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains
- * (\p blockDimX x \p blockDimY x \p blockDimZ) threads.
- *
- * \p sharedMemBytes sets the amount of dynamic shared memory that will be
- * available to each thread block.
- *
- * Kernel parameters to \p func can be specified in one of two ways:
- *
- * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
- * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
- * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
- * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
- * to be specified as that information is retrieved directly from the kernel's image.
- *
- * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single
- * buffer that is passed in via \p extra. This places the burden on the application of knowing each
- * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists
- * to allow this function to take additional less commonly used arguments. \p extra specifies
- * a list of names of extra settings and their corresponding values. Each extra setting name is
- * immediately followed by the corresponding value. The list must be terminated with either NULL or
- * CU_LAUNCH_PARAM_END.
- *
- * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
- *   array;
- * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
- *   value in \p extra will be a pointer to a buffer
- *   containing all the kernel parameters for launching kernel
- *   \p func;
- * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
- *   value in \p extra will be a pointer to a size_t
- *   containing the size of the buffer specified with
- *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
- *
- * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both
- * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL).
- * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel.
- *
- * The \p kernelParams or \p extra array, as well as the argument values it points to,
- * are copied during this call.
- *
- * \note Kernels launched using graphs must not use texture and surface references. Reading or
- *       writing through any texture or surface reference is undefined behavior.
- *       This restriction does not apply to texture and surface objects.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param nodeParams      - Parameters for the GPU execution node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchKernel,
- * ::cuLaunchCooperativeKernel,
- * ::cuGraphKernelNodeGetParams,
- * ::cuGraphKernelNodeSetParams,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Returns a kernel node's parameters
- *
- * Returns the parameters of kernel node \p hNode in \p nodeParams.
- * The \p kernelParams or \p extra array returned in \p nodeParams,
- * as well as the argument values it points to, are owned by the node.
- * This memory remains valid until the node is destroyed or its
- * parameters are modified, and should not be modified
- * directly. Use ::cuGraphKernelNodeSetParams to update the
- * parameters of this node.
- *
- * The params will contain either \p kernelParams or \p extra,
- * according to which of these was most recently set on the node.
- *
- * \param hNode      - Node to get the parameters for
- * \param nodeParams - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchKernel,
- * ::cuGraphAddKernelNode,
- * ::cuGraphKernelNodeSetParams
- */
-CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Sets a kernel node's parameters
- *
- * Sets the parameters of kernel node \p hNode to \p nodeParams.
- *
- * \param hNode      - Node to set the parameters for
- * \param nodeParams - Parameters to copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchKernel,
- * ::cuGraphAddKernelNode,
- * ::cuGraphKernelNodeGetParams
- */
-CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Creates a memcpy node and adds it to a graph
- *
- * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * When the graph is launched, the node will perform the memcpy described by \p copyParams.
- * See ::cuMemcpy3D() for a description of the structure and its restrictions.
- *
- * Memcpy nodes have some additional restrictions with regards to managed memory, if the
- * system contains at least one device which has a zero value for the device attribute
- * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer
- * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed
- * for those operand(s). The managed memory will be treated as residing on either the
- * host or the device, depending on which memory type is specified.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param copyParams      - Parameters for the memory copy
- * \param ctx             - Context on which to run the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuMemcpy3D,
- * ::cuGraphMemcpyNodeGetParams,
- * ::cuGraphMemcpyNodeSetParams,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
-
-/**
- * \brief Returns a memcpy node's parameters
- *
- * Returns the parameters of memcpy node \p hNode in \p nodeParams.
- *
- * \param hNode      - Node to get the parameters for
- * \param nodeParams - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuMemcpy3D,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphMemcpyNodeSetParams
- */
-CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams);
-
-/**
- * \brief Sets a memcpy node's parameters
- *
- * Sets the parameters of memcpy node \p hNode to \p nodeParams.
- *
- * \param hNode      - Node to set the parameters for
- * \param nodeParams - Parameters to copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuMemcpy3D,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphMemcpyNodeGetParams
- */
-CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams);
-
-/**
- * \brief Creates a memset node and adds it to a graph
- *
- * Creates a new memset node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * The element size must be 1, 2, or 4 bytes.
- * When the graph is launched, the node will perform the memset described by \p memsetParams.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param memsetParams    - Parameters for the memory set
- * \param ctx             - Context on which to run the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_CONTEXT
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuMemsetD2D32,
- * ::cuGraphMemsetNodeGetParams,
- * ::cuGraphMemsetNodeSetParams,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemcpyNode
- */
-CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
-
-/**
- * \brief Returns a memset node's parameters
- *
- * Returns the parameters of memset node \p hNode in \p nodeParams.
- *
- * \param hNode      - Node to get the parameters for
- * \param nodeParams - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuMemsetD2D32,
- * ::cuGraphAddMemsetNode,
- * ::cuGraphMemsetNodeSetParams
- */
-CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Sets a memset node's parameters
- *
- * Sets the parameters of memset node \p hNode to \p nodeParams.
- *
- * \param hNode      - Node to set the parameters for
- * \param nodeParams - Parameters to copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuMemsetD2D32,
- * ::cuGraphAddMemsetNode,
- * ::cuGraphMemsetNodeGetParams
- */
-CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Creates a host execution node and adds it to a graph
- *
- * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * When the graph is launched, the node will invoke the specified CPU function.
- * Host nodes are not supported under MPS with pre-Volta GPUs.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param nodeParams      - Parameters for the host node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchHostFunc,
- * ::cuGraphHostNodeGetParams,
- * ::cuGraphHostNodeSetParams,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Returns a host node's parameters
- *
- * Returns the parameters of host node \p hNode in \p nodeParams.
- *
- * \param hNode      - Node to get the parameters for
- * \param nodeParams - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchHostFunc,
- * ::cuGraphAddHostNode,
- * ::cuGraphHostNodeSetParams
- */
-CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Sets a host node's parameters
- *
- * Sets the parameters of host node \p hNode to \p nodeParams.
- *
- * \param hNode      - Node to set the parameters for
- * \param nodeParams - Parameters to copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchHostFunc,
- * ::cuGraphAddHostNode,
- * ::cuGraphHostNodeGetParams
- */
-CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Creates a child graph node and adds it to a graph
- *
- * Creates a new node which executes an embedded graph, and adds it to \p hGraph with
- * \p numDependencies dependencies specified via \p dependencies.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * If \p hGraph contains allocation or free nodes, this call will return an error.
- *
- * The node executes an embedded child graph. The child graph is cloned in this call.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param childGraph      - The graph to clone into this node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphChildGraphNodeGetGraph,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode,
- * ::cuGraphClone
- */
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
-
-/**
- * \brief Gets a handle to the embedded graph of a child graph node
- *
- * Gets a handle to the embedded graph in a child graph node. This call
- * does not clone the graph. Changes to the graph will be reflected in
- * the node, and the node retains ownership of the graph.
- *
- * Allocation and free nodes cannot be added to the returned graph.
- * Attempting to do so will return an error.
- *
- * \param hNode   - Node to get the embedded graph for
- * \param phGraph - Location to store a handle to the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphNodeFindInClone
- */
-CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph);
-
-/**
- * \brief Creates an empty node and adds it to a graph
- *
- * Creates a new node which performs no operation, and adds it to \p hGraph with
- * \p numDependencies dependencies specified via \p dependencies.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * An empty node performs no operation during execution, but can be used for
- * transitive ordering. For example, a phased execution graph with 2 groups of n
- * nodes with a barrier between them can be represented using an empty node and
- * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
-
-/**
- * \brief Creates an event record node and adds it to a graph
- *
- * Creates a new event record node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and event specified in \p event.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * Each launch of the graph will record \p event to capture execution of the
- * node's dependencies.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param event           - Event for the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventWaitNode,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode,
- */
-CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
- 
-/**
- * \brief Returns the event associated with an event record node
- *
- * Returns the event of event record node \p hNode in \p event_out.
- *
- * \param hNode     - Node to get the event for
- * \param event_out - Pointer to return the event
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphEventRecordNodeSetEvent,
- * ::cuGraphEventWaitNodeGetEvent,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent
- */
-CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
-
-/**
- * \brief Sets an event record node's event
- *
- * Sets the event of event record node \p hNode to \p event.
- *
- * \param hNode - Node to set the event for
- * \param event - Event to use
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphEventRecordNodeGetEvent,
- * ::cuGraphEventWaitNodeSetEvent,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent
- */
-CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event);
-
-/**
- * \brief Creates an event wait node and adds it to a graph
- *
- * Creates a new event wait node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and event specified in \p event.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries.
- * A handle to the new node will be returned in \p phGraphNode.
- *
- * The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
- * for details on what is captured by an event. \p event may be from a different context
- * or device than the launch stream.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param event           - Event for the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventRecordNode,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode,
- */
-CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
-
-/**
- * \brief Returns the event associated with an event wait node
- *
- * Returns the event of event wait node \p hNode in \p event_out.
- *
- * \param hNode     - Node to get the event for
- * \param event_out - Pointer to return the event
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphEventWaitNodeSetEvent,
- * ::cuGraphEventRecordNodeGetEvent,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent
- */
-CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
-
-/**
- * \brief Sets an event wait node's event
- *
- * Sets the event of event wait node \p hNode to \p event.
- *
- * \param hNode - Node to set the event for
- * \param event - Event to use
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphEventWaitNodeGetEvent,
- * ::cuGraphEventRecordNodeSetEvent,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent
- */
-CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event);
-
-/**
- * \brief Creates an external semaphore signal node and adds it to a graph
- *
- * Creates a new external semaphore signal node and adds it to \p hGraph with \p
- * numDependencies dependencies specified via \p dependencies and arguments specified
- * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
- * node will be placed at the root of the graph. \p dependencies may not have any
- * duplicate entries. A handle to the new node will be returned in \p phGraphNode.
- *
- * Performs a signal operation on a set of externally allocated semaphore objects
- * when the node is launched.  The operation(s) will occur after all of the node's
- * dependencies have completed.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param nodeParams      - Parameters for the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphExternalSemaphoresSignalNodeGetParams,
- * ::cuGraphExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuImportExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode,
- */
-CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Returns an external semaphore signal node's parameters
- *
- * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
- * The \p extSemArray and \p paramsArray returned in \p params_out,
- * are owned by the node.  This memory remains valid until the node is destroyed or its
- * parameters are modified, and should not be modified
- * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
- * parameters of this node.
- *
- * \param hNode      - Node to get the parameters for
- * \param params_out - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchKernel,
- * ::cuGraphAddExternalSemaphoresSignalNode,
- * ::cuGraphExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out);
-
-/**
- * \brief Sets an external semaphore signal node's parameters
- *
- * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
- *
- * \param hNode      - Node to set the parameters for
- * \param nodeParams - Parameters to copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddExternalSemaphoresSignalNode,
- * ::cuGraphExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Creates an external semaphore wait node and adds it to a graph
- *
- * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
- * to the new node will be returned in \p phGraphNode.
- *
- * Performs a wait operation on a set of externally allocated semaphore objects
- * when the node is launched.  The node's dependencies will not be launched until
- * the wait operation has completed.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param nodeParams      - Parameters for the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphExternalSemaphoresWaitNodeGetParams,
- * ::cuGraphExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphAddExternalSemaphoresSignalNode,
- * ::cuImportExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode,
- */
-CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Returns an external semaphore wait node's parameters
- *
- * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
- * The \p extSemArray and \p paramsArray returned in \p params_out,
- * are owned by the node.  This memory remains valid until the node is destroyed or its
- * parameters are modified, and should not be modified
- * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
- * parameters of this node.
- *
- * \param hNode      - Node to get the parameters for
- * \param params_out - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuLaunchKernel,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuGraphExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out);
-
-/**
- * \brief Sets an external semaphore wait node's parameters
- *
- * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
- *
- * \param hNode      - Node to set the parameters for
- * \param nodeParams - Parameters to copy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuGraphExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync
- */
-CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Creates an allocation node and adds it to a graph
- *
- * Creates a new allocation node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
- * to the new node will be returned in \p phGraphNode.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param nodeParams      - Parameters for the node
- *
- * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
- * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
- *
- * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
- * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
- * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
- * owning graph.
- *
- * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
- * graph which are ordered after the allocation node, but also by stream operations ordered after the
- * graph's execution but before the allocation is freed.
- *
- * Allocations which are not freed in the same graph can be freed by:
- * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
- * - launching a graph with a free node for that allocation; or
- * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
- * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
- * 
- * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
- * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
- * in another graph, a free node can no longer be added to the owning graph.
- *
- * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
- * - Nodes and edges of the graph cannot be deleted.
- * - The graph cannot be used in a child node.
- * - Only one instantiation of the graph may exist at any point in time.
- * - The graph cannot be cloned.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddMemFreeNode,
- * ::cuGraphMemAllocNodeGetParams,
- * ::cuDeviceGraphMemTrim,
- * ::cuDeviceGetGraphMemAttribute,
- * ::cuDeviceSetGraphMemAttribute,
- * ::cuMemAllocAsync,
- * ::cuMemFreeAsync,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphAddExternalSemaphoresSignalNode,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Returns a memory alloc node's parameters
- *
- * Returns the parameters of a memory alloc node \p hNode in \p params_out.
- * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the
- * node.  This memory remains valid until the node is destroyed.  The returned
- * parameters must not be modified.
- *
- * \param hNode      - Node to get the parameters for
- * \param params_out - Pointer to return the parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddMemAllocNode,
- * ::cuGraphMemFreeNodeGetParams
- */
-CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
-
-/**
- * \brief Creates a memory free node and adds it to a graph
- *
- * Creates a new memory free node and adds it to \p hGraph with \p numDependencies
- * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
- * It is possible for \p numDependencies to be 0, in which case the node will be placed
- * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
- * to the new node will be returned in \p phGraphNode.
- *
- * \param phGraphNode     - Returns newly created node
- * \param hGraph          - Graph to which to add the node
- * \param dependencies    - Dependencies of the node
- * \param numDependencies - Number of dependencies
- * \param dptr            - Address of memory to free
- *
- * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
- * - an allocation twice in the same graph.
- * - an address that was not returned by an allocation node.
- * - an invalid address.
- *
- * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
- * - Nodes and edges of the graph cannot be deleted.
- * - The graph cannot be used in a child node.
- * - Only one instantiation of the graph may exist at any point in time.
- * - The graph cannot be cloned.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddMemAllocNode,
- * ::cuGraphMemFreeNodeGetParams,
- * ::cuDeviceGraphMemTrim,
- * ::cuDeviceGetGraphMemAttribute,
- * ::cuDeviceSetGraphMemAttribute,
- * ::cuMemAllocAsync,
- * ::cuMemFreeAsync,
- * ::cuGraphCreate,
- * ::cuGraphDestroyNode,
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphAddExternalSemaphoresSignalNode,
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
-
-/**
- * \brief Returns a memory free node's parameters
- *
- * Returns the address of a memory free node \p hNode in \p dptr_out.
- *
- * \param hNode    - Node to get the parameters for
- * \param dptr_out - Pointer to return the device address
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddMemFreeNode,
- * ::cuGraphMemAllocNodeGetParams
- */
-CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out);
-
-/**
- * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS.
- *
- * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
- * freed back to the operating system.
- *
- * \param device - The device for which cached memory should be freed.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_DEVICE
- *
- * \sa
- * ::cuGraphAddMemAllocNode,
- * ::cuGraphAddMemFreeNode,
- * ::cuDeviceSetGraphMemAttribute,
- * ::cuDeviceGetGraphMemAttribute
- */
-CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
-
-/**
- * \brief Query asynchronous allocation attributes related to graphs
- *
- * Valid attributes are:
- *
- * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
- * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
- *   last time it was reset.  High watermark can only be reset to zero.
- * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by
- *   the CUDA graphs asynchronous allocator.
- * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
- *   the CUDA graphs asynchronous allocator.
- *
- * \param device - Specifies the scope of the query
- * \param attr - attribute to get
- * \param value - retrieved value
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_DEVICE
- *
- * \sa
- * ::cuDeviceSetGraphMemAttribute,
- * ::cuGraphAddMemAllocNode,
- * ::cuGraphAddMemFreeNode
- */
-CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
-
-/**
- * \brief Set asynchronous allocation attributes related to graphs
- *
- * Valid attributes are:
- *
- * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
- *   last time it was reset.  High watermark can only be reset to zero.
- * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
- *   the CUDA graphs asynchronous allocator.
- *
- * \param device - Specifies the scope of the query
- * \param attr - attribute to get
- * \param value - pointer to value to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_DEVICE
- *
- * \sa
- * ::cuDeviceGetGraphMemAttribute,
- * ::cuGraphAddMemAllocNode,
- * ::cuGraphAddMemFreeNode
- */
-CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
-
-/**
- * \brief Clones a graph
- *
- * This function creates a copy of \p originalGraph and returns it in \p phGraphClone.
- * All parameters are copied into the cloned graph. The original graph may be modified
- * after this call without affecting the clone.
- *
- * Child graph nodes in the original graph are recursively copied into the clone.
- *
- * \param phGraphClone  - Returns newly created cloned graph
- * \param originalGraph - Graph to clone
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OUT_OF_MEMORY
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate,
- * ::cuGraphNodeFindInClone
- */
-CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph);
-
-/**
- * \brief Finds a cloned version of a node
- *
- * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode
- * in the original graph.
- *
- * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone.
- * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to
- * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have
- * been removed. The cloned node is then returned via \p phClonedNode.
- *
- * \param phNode  - Returns handle to the cloned node
- * \param hOriginalNode - Handle to the original node
- * \param hClonedGraph - Cloned graph to query
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphClone
- */
-CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
-
-/**
- * \brief Returns a node's type
- *
- * Returns the node type of \p hNode in \p type.
- *
- * \param hNode - Node to query
- * \param type  - Pointer to return the node type
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphGetNodes,
- * ::cuGraphGetRootNodes,
- * ::cuGraphChildGraphNodeGetGraph,
- * ::cuGraphKernelNodeGetParams,
- * ::cuGraphKernelNodeSetParams,
- * ::cuGraphHostNodeGetParams,
- * ::cuGraphHostNodeSetParams,
- * ::cuGraphMemcpyNodeGetParams,
- * ::cuGraphMemcpyNodeSetParams,
- * ::cuGraphMemsetNodeGetParams,
- * ::cuGraphMemsetNodeSetParams
- */
-CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type);
-
-/**
- * \brief Returns a graph's nodes
- *
- * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this
- * function will return the number of nodes in \p numNodes. Otherwise,
- * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
- * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
- * number of nodes actually obtained will be returned in \p numNodes.
- *
- * \param hGraph   - Graph to query
- * \param nodes    - Pointer to return the nodes
- * \param numNodes - See description
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate,
- * ::cuGraphGetRootNodes,
- * ::cuGraphGetEdges,
- * ::cuGraphNodeGetType,
- * ::cuGraphNodeGetDependencies,
- * ::cuGraphNodeGetDependentNodes
- */
-CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
-
-/**
- * \brief Returns a graph's root nodes
- *
- * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this
- * function will return the number of root nodes in \p numRootNodes. Otherwise,
- * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual
- * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the
- * number of nodes actually obtained will be returned in \p numRootNodes.
- *
- * \param hGraph       - Graph to query
- * \param rootNodes    - Pointer to return the root nodes
- * \param numRootNodes - See description
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate,
- * ::cuGraphGetNodes,
- * ::cuGraphGetEdges,
- * ::cuGraphNodeGetType,
- * ::cuGraphNodeGetDependencies,
- * ::cuGraphNodeGetDependentNodes
- */
-CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
-
-/**
- * \brief Returns a graph's dependency edges
- *
- * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
- * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
- * node in \p from[i]. \p from and \p to may both be NULL, in which
- * case this function only returns the number of edges in \p numEdges. Otherwise,
- * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
- * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
- * the number of edges actually returned will be written to \p numEdges.
- *
- * \param hGraph   - Graph to get the edges from
- * \param from     - Location to return edge endpoints
- * \param to       - Location to return edge endpoints
- * \param numEdges - See description
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphGetNodes,
- * ::cuGraphGetRootNodes,
- * ::cuGraphAddDependencies,
- * ::cuGraphRemoveDependencies,
- * ::cuGraphNodeGetDependencies,
- * ::cuGraphNodeGetDependentNodes
- */
-CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
-
-/**
- * \brief Returns a node's dependencies
- *
- * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this
- * function will return the number of dependencies in \p numDependencies. Otherwise,
- * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
- * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
- * number of nodes actually obtained will be returned in \p numDependencies.
- *
- * \param hNode           - Node to query
- * \param dependencies    - Pointer to return the dependencies
- * \param numDependencies - See description
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphNodeGetDependentNodes,
- * ::cuGraphGetNodes,
- * ::cuGraphGetRootNodes,
- * ::cuGraphGetEdges,
- * ::cuGraphAddDependencies,
- * ::cuGraphRemoveDependencies
- */
-CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
-
-/**
- * \brief Returns a node's dependent nodes
- *
- * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which
- * case this function will return the number of dependent nodes in \p numDependentNodes.
- * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
- * higher than the actual number of dependent nodes, the remaining entries in
- * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
- * be returned in \p numDependentNodes.
- *
- * \param hNode             - Node to query
- * \param dependentNodes    - Pointer to return the dependent nodes
- * \param numDependentNodes - See description
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphNodeGetDependencies,
- * ::cuGraphGetNodes,
- * ::cuGraphGetRootNodes,
- * ::cuGraphGetEdges,
- * ::cuGraphAddDependencies,
- * ::cuGraphRemoveDependencies
- */
-CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
-
-/**
- * \brief Adds dependency edges to a graph
- *
- * The number of dependencies to be added is defined by \p numDependencies
- * Elements in \p from and \p to at corresponding indices define a dependency.
- * Each node in \p from and \p to must belong to \p hGraph.
- *
- * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
- * Specifying an existing dependency will return an error.
- *
- * \param hGraph - Graph to which dependencies are added
- * \param from - Array of nodes that provide the dependencies
- * \param to - Array of dependent nodes
- * \param numDependencies - Number of dependencies to be added
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphRemoveDependencies,
- * ::cuGraphGetEdges,
- * ::cuGraphNodeGetDependencies,
- * ::cuGraphNodeGetDependentNodes
- */
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
-
-/**
- * \brief Removes dependency edges from a graph
- *
- * The number of \p dependencies to be removed is defined by \p numDependencies.
- * Elements in \p from and \p to at corresponding indices define a dependency.
- * Each node in \p from and \p to must belong to \p hGraph.
- *
- * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
- * Specifying a non-existing dependency will return an error.
- *
- * Dependencies cannot be removed from graphs which contain allocation or free nodes.
- * Any attempt to do so will return an error.
- *
- * \param hGraph - Graph from which to remove dependencies
- * \param from - Array of nodes that provide the dependencies
- * \param to - Array of dependent nodes
- * \param numDependencies - Number of dependencies to be removed
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddDependencies,
- * ::cuGraphGetEdges,
- * ::cuGraphNodeGetDependencies,
- * ::cuGraphNodeGetDependentNodes
- */
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
-
-/**
- * \brief Remove a node from the graph
- *
- * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes
- * on \p hNode and vice versa.
- *
- * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed.
- * Any attempt to do so will return an error.
- *
- * \param hNode  - Node to remove
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphAddEmptyNode,
- * ::cuGraphAddKernelNode,
- * ::cuGraphAddHostNode,
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphAddMemsetNode
- */
-CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode);
-
-/**
- * \brief Creates an executable graph from a graph
- *
- * Instantiates \p hGraph as an executable graph. The graph is validated for any
- * structural constraints or intra-node constraints which were not previously
- * validated. If instantiation is successful, a handle to the instantiated graph
- * is returned in \p phGraphExec.
- *
- * If there are any errors, diagnostic information may be returned in \p errorNode and
- * \p logBuffer. This is the primary way to inspect instantiation errors. The output
- * will be null terminated unless the diagnostics overflow
- * the buffer. In this case, they will be truncated, and the last byte can be
- * inspected to determine if truncation occurred.
- *
- * \param phGraphExec - Returns instantiated graph
- * \param hGraph      - Graph to instantiate
- * \param phErrorNode - In case of an instantiation error, this may be modified to
- *                      indicate a node contributing to the error
- * \param logBuffer   - A character buffer to store diagnostic messages
- * \param bufferSize  - Size of the log buffer in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiateWithFlags,
- * ::cuGraphCreate,
- * ::cuGraphUpload,
- * ::cuGraphLaunch,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
-
-/**
- * \brief Creates an executable graph from a graph
- *
- * Instantiates \p hGraph as an executable graph. The graph is validated for any
- * structural constraints or intra-node constraints which were not previously
- * validated. If instantiation is successful, a handle to the instantiated graph
- * is returned in \p phGraphExec.
- *
- * The \p flags parameter controls the behavior of instantiation and subsequent
- * graph launches.  Valid flags are:
- *
- * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
- * graph containing memory allocation nodes to automatically free any
- * unfreed memory allocations before the graph is relaunched.
- *
- * If \p hGraph contains any allocation or free nodes, there can be at most one
- * executable graph in existence for that graph at a time.
- *
- * An attempt to instantiate a second executable graph before destroying the first
- * with ::cuGraphExecDestroy will result in an error.
- *
- * \param phGraphExec - Returns instantiated graph
- * \param hGraph      - Graph to instantiate
- * \param flags       - Flags to control instantiation.  See ::CUgraphInstantiate_flags.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphCreate,
- * ::cuGraphUpload,
- * ::cuGraphLaunch,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
-
-/**
- * \brief Sets the parameters for a kernel node in the given graphExec
- *
- * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
- * The node is identified by the corresponding node \p hNode in the 
- * non-executable graph, from which the executable graph was instantiated. 
- *
- * \p hNode must not have been removed from the original graph. The \p func field 
- * of \p nodeParams cannot be modified and must match the original value.
- * All other values can be modified. 
- *
- * The modifications only affect future launches of \p hGraphExec. Already 
- * enqueued or running launches of \p hGraphExec are not affected by this call. 
- * \p hNode is also not modified by this call.
- * 
- * \param hGraphExec  - The executable graph in which to set the specified node
- * \param hNode       - kernel node from the graph from which graphExec was instantiated
- * \param nodeParams  - Updated Parameters to set
- * 
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddKernelNode,
- * ::cuGraphKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Sets the parameters for a memcpy node in the given graphExec.
- *
- * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
- * contained \p copyParams at instantiation.  hNode must remain in the graph which was 
- * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
- *
- * The source and destination memory in \p copyParams must be allocated from the same 
- * contexts as the original source and destination memory.  Both the instantiation-time 
- * memory operands and the memory operands in \p copyParams must be 1-dimensional.
- * Zero-length operations are not supported.
- *
- * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
- * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
- * not modified by this call.
- *
- * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or
- * either the original or new memory operands are multidimensional.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - Memcpy node from the graph which was used to instantiate graphExec
- * \param copyParams - The updated parameters to set
- * \param ctx        - Context on which to run the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddMemcpyNode,
- * ::cuGraphMemcpyNodeSetParams,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
-
-/**
- * \brief Sets the parameters for a memset node in the given graphExec.
- *
- * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
- * contained \p memsetParams at instantiation.  hNode must remain in the graph which was 
- * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
- *
- * The destination memory in \p memsetParams must be allocated from the same 
- * contexts as the original destination memory.  Both the instantiation-time 
- * memory operand and the memory operand in \p memsetParams must be 1-dimensional.
- * Zero-length operations are not supported.
- *
- * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
- * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
- * not modified by this call.
- *
- * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
- * either the original or new memory operand are multidimensional.
- *
- * \param hGraphExec   - The executable graph in which to set the specified node
- * \param hNode        - Memset node from the graph which was used to instantiate graphExec
- * \param memsetParams - The updated parameters to set
- * \param ctx          - Context on which to run the node
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddMemsetNode,
- * ::cuGraphMemsetNodeSetParams,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
-
-/**
- * \brief Sets the parameters for a host node in the given graphExec.
- *
- * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
- * contained \p nodeParams at instantiation.  hNode must remain in the graph which was 
- * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
- *
- * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
- * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
- * not modified by this call.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - Host node from the graph which was used to instantiate graphExec
- * \param nodeParams - The updated parameters to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddHostNode,
- * ::cuGraphHostNodeSetParams,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Updates node parameters in the child graph node in the given graphExec.
- *
- * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained
- * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation.
- * \p hNode must remain in the graph which was used to instantiate \p hGraphExec.
- * Changed edges to and from \p hNode are ignored.
- *
- * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
- * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also 
- * not modified by this call.
- *
- * The topology of \p childGraph, as well as the node insertion order,  must match that
- * of the graph contained in \p hNode.  See ::cuGraphExecUpdate() for a list of restrictions
- * on what can be updated in an instantiated graph.  The update is recursive, so child graph
- * nodes contained within the top level child graph will also be updated.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - Host node from the graph which was used to instantiate graphExec
- * \param childGraph - The graph supplying the updated parameters
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddChildGraphNode,
- * ::cuGraphChildGraphNodeGetGraph,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
-
-/**
- * \brief Sets the event for an event record node in the given graphExec
- *
- * Sets the event of an event record node in an executable graph \p hGraphExec.
- * The node is identified by the corresponding node \p hNode in the
- * non-executable graph, from which the executable graph was instantiated.
- *
- * The modifications only affect future launches of \p hGraphExec. Already
- * enqueued or running launches of \p hGraphExec are not affected by this call.
- * \p hNode is also not modified by this call.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - event record node from the graph from which graphExec was instantiated
- * \param event      - Updated event to use
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventRecordNode,
- * ::cuGraphEventRecordNodeGetEvent,
- * ::cuGraphEventWaitNodeSetEvent,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
-
-/**
- * \brief Sets the event for an event wait node in the given graphExec
- *
- * Sets the event of an event wait node in an executable graph \p hGraphExec.
- * The node is identified by the corresponding node \p hNode in the
- * non-executable graph, from which the executable graph was instantiated.
- *
- * The modifications only affect future launches of \p hGraphExec. Already
- * enqueued or running launches of \p hGraphExec are not affected by this call.
- * \p hNode is also not modified by this call.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - event wait node from the graph from which graphExec was instantiated
- * \param event      - Updated event to use
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddEventWaitNode,
- * ::cuGraphEventWaitNodeGetEvent,
- * ::cuGraphEventRecordNodeSetEvent,
- * ::cuEventRecordWithFlags,
- * ::cuStreamWaitEvent,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
-
-/**
- * \brief Sets the parameters for an external semaphore signal node in the given graphExec
- *
- * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
- * The node is identified by the corresponding node \p hNode in the
- * non-executable graph, from which the executable graph was instantiated.
- *
- * \p hNode must not have been removed from the original graph.
- *
- * The modifications only affect future launches of \p hGraphExec. Already
- * enqueued or running launches of \p hGraphExec are not affected by this call.
- * \p hNode is also not modified by this call.
- *
- * Changing \p nodeParams->numExtSems is not supported.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
- * \param nodeParams - Updated Parameters to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddExternalSemaphoresSignalNode,
- * ::cuImportExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Sets the parameters for an external semaphore wait node in the given graphExec
- *
- * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
- * The node is identified by the corresponding node \p hNode in the
- * non-executable graph, from which the executable graph was instantiated.
- *
- * \p hNode must not have been removed from the original graph.
- *
- * The modifications only affect future launches of \p hGraphExec. Already
- * enqueued or running launches of \p hGraphExec are not affected by this call.
- * \p hNode is also not modified by this call.
- *
- * Changing \p nodeParams->numExtSems is not supported.
- *
- * \param hGraphExec - The executable graph in which to set the specified node
- * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
- * \param nodeParams - Updated Parameters to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphAddExternalSemaphoresWaitNode,
- * ::cuImportExternalSemaphore,
- * ::cuSignalExternalSemaphoresAsync,
- * ::cuWaitExternalSemaphoresAsync,
- * ::cuGraphExecKernelNodeSetParams,
- * ::cuGraphExecMemcpyNodeSetParams,
- * ::cuGraphExecMemsetNodeSetParams,
- * ::cuGraphExecHostNodeSetParams,
- * ::cuGraphExecChildGraphNodeSetParams,
- * ::cuGraphExecEventRecordNodeSetEvent,
- * ::cuGraphExecEventWaitNodeSetEvent,
- * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
- * ::cuGraphExecUpdate,
- * ::cuGraphInstantiate
- */
-CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
-
-/**
- * \brief Uploads an executable graph in a stream
- *
- * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
- * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
- * previous work in \p hStream and any previous launches of \p hGraphExec.
- * Uses memory cached by \p stream to back the allocations owned by \p hGraphExec.
- *
- * \param hGraphExec - Executable graph to upload
- * \param hStream    - Stream in which to upload the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphLaunch,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
-
-/**
- * \brief Launches an executable graph in a stream
- *
- * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
- * at a time. Each launch is ordered behind both any previous work in \p hStream
- * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
- * instantiated multiple times into multiple executable graphs.
- *
- * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
- * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
- * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
- *
- * \param hGraphExec - Executable graph to launch
- * \param hStream    - Stream in which to launch the graph
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphUpload,
- * ::cuGraphExecDestroy
- */
-CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
-
-/**
- * \brief Destroys an executable graph
- *
- * Destroys the executable graph specified by \p hGraphExec, as well
- * as all of its executable nodes. If the executable graph is
- * in-flight, it will not be terminated, but rather freed
- * asynchronously on completion.
- *
- * \param hGraphExec - Executable graph to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- * ::cuGraphUpload,
- * ::cuGraphLaunch
- */
-CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
-
-/**
- * \brief Destroys a graph
- *
- * Destroys the graph specified by \p hGraph, as well as all of its nodes.
- *
- * \param hGraph - Graph to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
-
-/**
- * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
- *
- * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
- * node parameters in a topologically identical graph specified by \p hGraph.
- *
- * Limitations:
- *
- * - Kernel nodes:
- *   - The owning context of the function cannot change.
- *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
- *     to a function which uses CDP
- * - Memset and memcpy nodes:
- *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
- *   - The source/destination memory must be allocated from the same contexts as the original
- *     source/destination memory.
- *   - Only 1D memsets can be changed.
- * - Additional memcpy node restrictions:
- *   - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
- *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
- * - External semaphore wait nodes and record nodes:
- *   - Changing the number of semaphores is not supported.
- *
- * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
- *
- * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under
- * the following conditions:
- *
- * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out
- *   is NULL.
- * - A node is deleted in \p hGraph but not not its pair from \p hGraphExec, in which case \p hErrorNode_out
- *   is NULL.
- * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
- *   the pairless node from \p hGraph.
- * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
- *
- * cuGraphExecUpdate sets \p updateResult_out to:
- * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
- * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
- *   \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
- *   way(see note above), in which case \p hErrorNode_out is set to the node from \p hGraph
- * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way 
- *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
- * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like 
- *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph
- *
- * If \p updateResult_out isn't set in one of the situations described above, the update check passes
- * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph.  If an error happens
- * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise,
- * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
- *
- * cuGraphExecUpdate returns CUDA_SUCCESS when the updated was performed successfully.  It returns
- * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included 
- * changes which violated constraints specific to instantiated graph update.
- *
- * \param hGraphExec The instantiated graph to be updated
- * \param hGraph The graph containing the updated parameters
- * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
- * \param updateResult_out Whether the graph update was permitted.  If was forbidden, the reason why
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
- * \note_graph_thread_safety
- * \notefnerr
- *
- * \sa
- * ::cuGraphInstantiate,
- */
-CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
-
-/**
- * \brief Copies attributes from source node to destination node.
- *
- * Copies attributes from source node \p src to destination node \p dst.
- * Both node must have the same context.
- *
- * \param[out] dst Destination node
- * \param[in] src Source node
- * For list of attributes see ::CUkernelNodeAttrID
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
-
-/**
- * \brief Queries node attribute.
- * 
- * Queries attribute \p attr from node \p hNode and stores it in corresponding
- * member of \p value_out.
- *
- * \param[in] hNode
- * \param[in] attr
- * \param[out] value_out 
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *  
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                                      CUkernelNodeAttrValue *value_out);
- 
-/**
- * \brief Sets node attribute.
- * 
- * Sets attribute \p attr on node \p hNode from corresponding attribute of
- * \p value.
- *
- * \param[out] hNode
- * \param[in] attr
- * \param[out] value
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE
- * \notefnerr
- *
- * \sa
- * ::CUaccessPolicyWindow
- */
-CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
-                                      const CUkernelNodeAttrValue *value);
-
-/**
- * \brief Write a DOT file describing graph structure
- *
- * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
- * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
- * \p flags can be specified to write more detailed information about each node type such as
- * parameter values, kernel attributes, node and function handles.
- *
- * \param hGraph - The graph to create a DOT file from
- * \param path   - The path to write the DOT file to
- * \param flags  - Flags from CUgraphDebugDot_flags for specifying which additional node information to write
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_OPERATING_SYSTEM
- */
-CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags);
-
-/**
- * \brief Create a user object
- *
- * Create a user object with the specified destructor callback and initial reference count. The
- * initial references are owned by the caller.
- *
- * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
- * are executed by a shared internal thread. Another thread may be signaled to perform such
- * actions, if it does not block forward progress of tasks scheduled through CUDA.
- *
- * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
- *
- * \param object_out      - Location to return the user object handle
- * \param ptr             - The pointer to pass to the destroy function
- * \param destroy         - Callback to free the user object when it is no longer in use
- * \param initialRefcount - The initial refcount to create the object with, typically 1. The
- *                          initial references are owned by the calling thread.
- * \param flags           - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC,
- *                          which is the only defined flag. This indicates that the destroy
- *                          callback cannot be waited on by any CUDA API. Users requiring
- *                          synchronization of the callback should signal its completion
- *                          manually.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuUserObjectRetain,
- * ::cuUserObjectRelease,
- * ::cuGraphRetainUserObject,
- * ::cuGraphReleaseUserObject,
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy,
-                                    unsigned int initialRefcount, unsigned int flags);
-
-/**
- * \brief Retain a reference to a user object
- *
- * Retains new references to a user object. The new references are owned by the caller.
- *
- * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
- *
- * \param object - The object to retain
- * \param count  - The number of references to retain, typically 1. Must be nonzero
- *                 and not larger than INT_MAX.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuUserObjectCreate,
- * ::cuUserObjectRelease,
- * ::cuGraphRetainUserObject,
- * ::cuGraphReleaseUserObject,
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count);
-
-/**
- * \brief Release a reference to a user object
- *
- * Releases user object references owned by the caller. The object's destructor is invoked if
- * the reference count reaches zero.
- *
- * It is undefined behavior to release references not owned by the caller, or to use a user
- * object handle after all references are released.
- *
- * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
- *
- * \param object - The object to release
- * \param count  - The number of references to release, typically 1. Must be nonzero
- *                 and not larger than INT_MAX.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuUserObjectCreate,
- * ::cuUserObjectRetain,
- * ::cuGraphRetainUserObject,
- * ::cuGraphReleaseUserObject,
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count);
-
-/**
- * \brief Retain a reference to a user object from a graph
- *
- * Creates or moves user object references that will be owned by a CUDA graph.
- *
- * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
- *
- * \param graph  - The graph to associate the reference with
- * \param object - The user object to retain a reference for
- * \param count  - The number of references to add to the graph, typically 1. Must be
- *                 nonzero and not larger than INT_MAX.
- * \param flags  - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references
- *                 from the calling thread, rather than create new references. Pass 0
- *                 to create new references.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuUserObjectCreate,
- * ::cuUserObjectRetain,
- * ::cuUserObjectRelease,
- * ::cuGraphReleaseUserObject,
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
-
-/**
- * \brief Release a user object reference from a graph
- *
- * Releases user object references owned by a graph.
- *
- * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
- *
- * \param graph  - The graph that will release the reference
- * \param object - The user object to release a reference for
- * \param count  - The number of references to release, typically 1. Must be nonzero
- *                 and not larger than INT_MAX.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuUserObjectCreate,
- * ::cuUserObjectRetain,
- * ::cuUserObjectRelease,
- * ::cuGraphRetainUserObject,
- * ::cuGraphCreate
- */
-CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count);
-
-/** @} */ /* END CUDA_GRAPH */
-
-/**
- * \defgroup CUDA_OCCUPANCY Occupancy
- *
- * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver
- * API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the occupancy calculation functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Returns occupancy of a function
- *
- * Returns in \p *numBlocks the number of the maximum active blocks per
- * streaming multiprocessor.
- *
- * \param numBlocks       - Returned occupancy
- * \param func            - Kernel for which occupancy is calculated
- * \param blockSize       - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- */
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
-
-/**
- * \brief Returns occupancy of a function
- *
- * Returns in \p *numBlocks the number of the maximum active blocks per
- * streaming multiprocessor.
- *
- * The \p Flags parameter controls how special cases are handled. The
- * valid flags are:
- *
- * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
- *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
- *
- * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
- *   default behavior on platform where global caching affects
- *   occupancy. On such platforms, if caching is enabled, but
- *   per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching
- *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
- *   the occupancy calculator to return 0 in such cases. More information
- *   can be found about this feature in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
- *
- * \param numBlocks       - Returned occupancy
- * \param func            - Kernel for which occupancy is calculated
- * \param blockSize       - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param flags           - Requested behavior for the occupancy calculator
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- */
-CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
-
-/**
- * \brief Suggest a launch configuration with reasonable occupancy
- *
- * Returns in \p *blockSize a reasonable block size that can achieve
- * the maximum occupancy (or, the maximum number of active warps with
- * the fewest blocks per multiprocessor), and in \p *minGridSize the
- * minimum grid size to achieve the maximum occupancy.
- *
- * If \p blockSizeLimit is 0, the configurator will use the maximum
- * block size permitted by the device / function instead.
- *
- * If per-block dynamic shared memory allocation is not needed, the
- * user should leave both \p blockSizeToDynamicSMemSize and \p
- * dynamicSMemSize as 0.
- *
- * If per-block dynamic shared memory allocation is needed, then if
- * the dynamic shared memory size is constant regardless of block
- * size, the size should be passed through \p dynamicSMemSize, and \p
- * blockSizeToDynamicSMemSize should be NULL.
- *
- * Otherwise, if the per-block dynamic shared memory size varies with
- * different block sizes, the user needs to provide a unary function
- * through \p blockSizeToDynamicSMemSize that computes the dynamic
- * shared memory needed by \p func for any given block size. \p
- * dynamicSMemSize is ignored. An example signature is:
- *
- * \code
- *    // Take block size, returns dynamic shared memory needed
- *    size_t blockToSmem(int blockSize);
- * \endcode
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
- * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
- * \param func        - Kernel for which launch configuration is calculated
- * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
- * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
- * \param blockSizeLimit  - The maximum block size \p func is designed to handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxPotentialBlockSize
- */
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
-
-/**
- * \brief Suggest a launch configuration with reasonable occupancy
- *
- * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
- * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
- * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags
- * parameter.
- *
- * The \p Flags parameter controls how special cases are handled. The
- * valid flags are:
- *
- * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
- *   ::cuOccupancyMaxPotentialBlockSize;
- *
- * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
- *   default behavior on platform where global caching affects
- *   occupancy. On such platforms, the launch configurations that
- *   produces maximal occupancy might not support global
- *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
- *   guarantees that the the produced launch configuration is global
- *   caching compatible at a potential cost of occupancy. More information
- *   can be found about this feature in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
- * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
- * \param func        - Kernel for which launch configuration is calculated
- * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
- * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
- * \param blockSizeLimit  - The maximum block size \p func is designed to handle
- * \param flags       - Options
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cudaOccupancyMaxPotentialBlockSizeWithFlags
- */
-CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
-
-/**
- * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM 
- *
- * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
- *
- * \param dynamicSmemSize - Returned maximum dynamic shared memory 
- * \param func            - Kernel function for which occupancy is calculated
- * \param numBlocks       - Number of blocks to fit on SM 
- * \param blockSize       - Size of the blocks
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- */
-CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
-
-/** @} */ /* END CUDA_OCCUPANCY */
-
-/**
- * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated texture reference management functions of the
- * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the deprecated texture reference management
- * functions of the low-level CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Binds an array as a texture reference
- *
- * \deprecated
- *
- * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
- * previous address or CUDA array state associated with the texture reference
- * is superseded by this function. \p Flags must be set to
- * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
- * unbound.
- *
- * \param hTexRef - Texture reference to bind
- * \param hArray  - Array to bind
- * \param Flags   - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
-
-/**
- * \brief Binds a mipmapped array to a texture reference
- *
- * \deprecated
- *
- * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
- * Any previous address or CUDA array state associated with the texture reference
- * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT.
- * Any CUDA array previously bound to \p hTexRef is unbound.
- *
- * \param hTexRef         - Texture reference to bind
- * \param hMipmappedArray - Mipmapped array to bind
- * \param Flags           - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
-
-/**
- * \brief Binds an address as a texture reference
- *
- * \deprecated
- *
- * Binds a linear address range to the texture reference \p hTexRef. Any
- * previous address or CUDA array state associated with the texture reference
- * is superseded by this function. Any memory previously bound to \p hTexRef
- * is unbound.
- *
- * Since the hardware enforces an alignment requirement on texture base
- * addresses, ::cuTexRefSetAddress() passes back a byte offset in
- * \p *ByteOffset that must be applied to texture fetches in order to read from
- * the desired memory. This offset must be divided by the texel size and
- * passed to kernels that read from the texture so they can be applied to the
- * ::tex1Dfetch() function.
- *
- * If the device memory pointer was returned from ::cuMemAlloc(), the offset
- * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
- *
- * The total number of elements (or texels) in the linear address range
- * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
- * The number of elements is computed as (\p bytes / bytesPerElement),
- * where bytesPerElement is determined from the data format and number of
- * components set using ::cuTexRefSetFormat().
- *
- * \param ByteOffset - Returned byte offset
- * \param hTexRef    - Texture reference to bind
- * \param dptr       - Device pointer to bind
- * \param bytes      - Size of memory to bind in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTexture
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
-
-/**
- * \brief Binds an address as a 2D texture reference
- *
- * \deprecated
- *
- * Binds a linear address range to the texture reference \p hTexRef. Any
- * previous address or CUDA array state associated with the texture reference
- * is superseded by this function. Any memory previously bound to \p hTexRef
- * is unbound.
- *
- * Using a ::tex2D() function inside a kernel requires a call to either
- * ::cuTexRefSetArray() to bind the corresponding texture reference to an
- * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
- * memory.
- *
- * Function calls to ::cuTexRefSetFormat() cannot follow calls to
- * ::cuTexRefSetAddress2D() for the same texture reference.
- *
- * It is required that \p dptr be aligned to the appropriate hardware-specific
- * texture alignment. You can query this value using the device attribute
- * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
- * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
- * This value can be queried using the device attribute
- * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
- * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * Width and Height, which are specified in elements (or texels), cannot exceed
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
- * \p Pitch, which is specified in bytes, cannot exceed
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
- *
- * \param hTexRef - Texture reference to bind
- * \param desc    - Descriptor of CUDA array
- * \param dptr    - Device pointer to bind
- * \param Pitch   - Line pitch in bytes
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTexture2D
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
-
-/**
- * \brief Sets the format for a texture reference
- *
- * \deprecated
- *
- * Specifies the format of the data to be read by the texture reference
- * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
- * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
- * They specify the format of each component and the number of components per
- * array element.
- *
- * \param hTexRef             - Texture reference
- * \param fmt                 - Format to set
- * \param NumPackedComponents - Number of components per array element
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaCreateChannelDesc,
- * ::cudaBindTexture,
- * ::cudaBindTexture2D,
- * ::cudaBindTextureToArray,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
-
-/**
- * \brief Sets the addressing mode for a texture reference
- *
- * \deprecated
- *
- * Specifies the addressing mode \p am for the given dimension \p dim of the
- * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
- * applied to the first parameter of the functions used to fetch from the
- * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
- * as:
- * \code
-   typedef enum CUaddress_mode_enum {
-      CU_TR_ADDRESS_MODE_WRAP = 0,
-      CU_TR_ADDRESS_MODE_CLAMP = 1,
-      CU_TR_ADDRESS_MODE_MIRROR = 2,
-      CU_TR_ADDRESS_MODE_BORDER = 3
-   } CUaddress_mode;
- * \endcode
- *
- * Note that this call has no effect if \p hTexRef is bound to linear memory.
- * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only
- * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
- *
- * \param hTexRef - Texture reference
- * \param dim     - Dimension
- * \param am      - Addressing mode to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTexture,
- * ::cudaBindTexture2D,
- * ::cudaBindTextureToArray,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
-
-/**
- * \brief Sets the filtering mode for a texture reference
- *
- * \deprecated
- *
- * Specifies the filtering mode \p fm to be used when reading memory through
- * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
- *
- * \code
-   typedef enum CUfilter_mode_enum {
-      CU_TR_FILTER_MODE_POINT = 0,
-      CU_TR_FILTER_MODE_LINEAR = 1
-   } CUfilter_mode;
- * \endcode
- *
- * Note that this call has no effect if \p hTexRef is bound to linear memory.
- *
- * \param hTexRef - Texture reference
- * \param fm      - Filtering mode to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
-
-/**
- * \brief Sets the mipmap filtering mode for a texture reference
- *
- * \deprecated
- *
- * Specifies the mipmap filtering mode \p fm to be used when reading memory through
- * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
- *
- * \code
-   typedef enum CUfilter_mode_enum {
-      CU_TR_FILTER_MODE_POINT = 0,
-      CU_TR_FILTER_MODE_LINEAR = 1
-   } CUfilter_mode;
- * \endcode
- *
- * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
- *
- * \param hTexRef - Texture reference
- * \param fm      - Filtering mode to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
-
-/**
- * \brief Sets the mipmap level bias for a texture reference
- *
- * \deprecated
- *
- * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when
- * reading memory through the texture reference \p hTexRef.
- *
- * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
- *
- * \param hTexRef - Texture reference
- * \param bias    - Mipmap level bias
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
-
-/**
- * \brief Sets the mipmap min/max mipmap level clamps for a texture reference
- *
- * \deprecated
- *
- * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
- * respectively, to be used when reading memory through the texture reference
- * \p hTexRef.
- *
- * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
- *
- * \param hTexRef        - Texture reference
- * \param minMipmapLevelClamp - Mipmap min level clamp
- * \param maxMipmapLevelClamp - Mipmap max level clamp
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
-
-/**
- * \brief Sets the maximum anisotropy for a texture reference
- *
- * \deprecated
- *
- * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
- * the texture reference \p hTexRef.
- *
- * Note that this call has no effect if \p hTexRef is bound to linear memory.
- *
- * \param hTexRef  - Texture reference
- * \param maxAniso - Maximum anisotropy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTextureToArray,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
-
-/**
- * \brief Sets the border color for a texture reference
- *
- * \deprecated
- *
- * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference
- * \p hTexRef. The color value supports only float type and holds color components in
- * the following sequence:
- * pBorderColor[0] holds 'R' component
- * pBorderColor[1] holds 'G' component
- * pBorderColor[2] holds 'B' component
- * pBorderColor[3] holds 'A' component
- *
- * Note that the color values can be set only when the Address mode is set to
- * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode.
- * Applications using integer border color values have to "reinterpret_cast" their values to float.
- *
- * \param hTexRef       - Texture reference
- * \param pBorderColor  - RGBA color
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddressMode,
- * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor,
- * ::cudaBindTexture,
- * ::cudaBindTexture2D,
- * ::cudaBindTextureToArray,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
-
-/**
- * \brief Sets the flags for a texture reference
- *
- * \deprecated
- *
- * Specifies optional flags via \p Flags to specify the behavior of data
- * returned through the texture reference \p hTexRef. The valid flags are:
- *
- * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
- *   having the texture promote integer data to floating point data in the
- *   range [0, 1]. Note that texture with 32-bit integer format
- *   would not be promoted, regardless of whether or not this
- *   flag is specified;
- * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
- *   default behavior of having the texture coordinates range
- *   from [0, Dim) where Dim is the width or height of the CUDA
- *   array. Instead, the texture coordinates [0, 1.0) reference
- *   the entire breadth of the array dimension;
- * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
- *   filtering optimizations. Trilinear optimizations improve texture filtering
- *   performance by allowing bilinear filtering on textures in scenarios where
- *   it can closely approximate the expected results.
- *
- * \param hTexRef - Texture reference
- * \param Flags   - Optional flags to set
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
- * ::cudaBindTexture,
- * ::cudaBindTexture2D,
- * ::cudaBindTextureToArray,
- * ::cudaBindTextureToMipmappedArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
-
-/**
- * \brief Gets the address associated with a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pdptr the base address bound to the texture reference
- * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- * is not bound to any device memory range.
- *
- * \param pdptr   - Returned device address
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
-
-/**
- * \brief Gets the array bound to a texture reference
- *
- * \deprecated
- *
- * Returns in \p *phArray the CUDA array bound to the texture reference
- * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- * is not bound to any CUDA array.
- *
- * \param phArray - Returned array
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
-
-/**
- * \brief Gets the mipmapped array bound to a texture reference
- *
- * \deprecated
- *
- * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
- * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
- * is not bound to any CUDA mipmapped array.
- *
- * \param phMipmappedArray - Returned mipmapped array
- * \param hTexRef          - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
-
-/**
- * \brief Gets the addressing mode used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pam the addressing mode corresponding to the
- * dimension \p dim of the texture reference \p hTexRef. Currently, the only
- * valid value for \p dim are 0 and 1.
- *
- * \param pam     - Returned addressing mode
- * \param hTexRef - Texture reference
- * \param dim     - Dimension
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
-
-/**
- * \brief Gets the filter-mode used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pfm the filtering mode of the texture reference
- * \p hTexRef.
- *
- * \param pfm     - Returned filtering mode
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
-
-/**
- * \brief Gets the format used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pFormat and \p *pNumChannels the format and number
- * of components of the CUDA array bound to the texture reference \p hTexRef.
- * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
- *
- * \param pFormat      - Returned format
- * \param pNumChannels - Returned number of components
- * \param hTexRef      - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
-
-/**
- * \brief Gets the mipmap filtering mode for a texture reference
- *
- * \deprecated
- *
- * Returns the mipmap filtering mode in \p pfm that's used when reading memory through
- * the texture reference \p hTexRef.
- *
- * \param pfm     - Returned mipmap filtering mode
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
-
-/**
- * \brief Gets the mipmap level bias for a texture reference
- *
- * \deprecated
- *
- * Returns the mipmap level bias in \p pBias that's added to the specified mipmap
- * level when reading memory through the texture reference \p hTexRef.
- *
- * \param pbias   - Returned mipmap level bias
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
-
-/**
- * \brief Gets the min/max mipmap level clamps for a texture reference
- *
- * \deprecated
- *
- * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
- * that's used when reading memory through the texture reference \p hTexRef.
- *
- * \param pminMipmapLevelClamp - Returned mipmap min level clamp
- * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp
- * \param hTexRef              - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
-
-/**
- * \brief Gets the maximum anisotropy for a texture reference
- *
- * \deprecated
- *
- * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through
- * the texture reference \p hTexRef.
- *
- * \param pmaxAniso - Returned maximum anisotropy
- * \param hTexRef   - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
-
-/**
- * \brief Gets the border color used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p pBorderColor, values of the RGBA color used by
- * the texture reference \p hTexRef.
- * The color value is of type float and holds color components in
- * the following sequence:
- * pBorderColor[0] holds 'R' component
- * pBorderColor[1] holds 'G' component
- * pBorderColor[2] holds 'B' component
- * pBorderColor[3] holds 'A' component
- *
- * \param hTexRef  - Texture reference
- * \param pBorderColor   - Returned Type and Value of RGBA color
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddressMode,
- * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
-
-/**
- * \brief Gets the flags used by a texture reference
- *
- * \deprecated
- *
- * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
- *
- * \param pFlags  - Returned flags
- * \param hTexRef - Texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefSetAddress,
- * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
- * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
- * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
- * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
-
-/**
- * \brief Creates a texture reference
- *
- * \deprecated
- *
- * Creates a texture reference and returns its handle in \p *pTexRef. Once
- * created, the application must call ::cuTexRefSetArray() or
- * ::cuTexRefSetAddress() to associate the reference with allocated memory.
- * Other texture reference functions are used to specify the format and
- * interpretation (addressing, filtering, etc.) to be used when the memory is
- * read through this texture reference.
- *
- * \param pTexRef - Returned texture reference
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefDestroy
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
-
-/**
- * \brief Destroys a texture reference
- *
- * \deprecated
- *
- * Destroys the texture reference specified by \p hTexRef.
- *
- * \param hTexRef - Texture reference to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuTexRefCreate
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
-
-/** @} */ /* END CUDA_TEXREF_DEPRECATED */
-
-
-/**
- * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED]
- *
- * ___MANBRIEF___ surface reference management functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the surface reference management functions of the
- * low-level CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Sets the CUDA array for a surface reference.
- *
- * \deprecated
- *
- * Sets the CUDA array \p hArray to be read and written by the surface reference
- * \p hSurfRef.  Any previous CUDA array state associated with the surface
- * reference is superseded by this function.  \p Flags must be set to 0.
- * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
- * Any CUDA array previously bound to \p hSurfRef is unbound.
-
- * \param hSurfRef - Surface reference handle
- * \param hArray - CUDA array handle
- * \param Flags - set to 0
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuModuleGetSurfRef,
- * ::cuSurfRefGetArray,
- * ::cudaBindSurfaceToArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
-
-/**
- * \brief Passes back the CUDA array bound to a surface reference.
- *
- * \deprecated
- *
- * Returns in \p *phArray the CUDA array bound to the surface reference
- * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
- * is not bound to any CUDA array.
-
- * \param phArray - Surface reference handle
- * \param hSurfRef - Surface reference handle
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
- */
-__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
-
-/** @} */ /* END CUDA_SURFREF_DEPRECATED */
-
-/**
- * \defgroup CUDA_TEXOBJECT Texture Object Management
- *
- * ___MANBRIEF___ texture object management functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the texture object management functions of the
- * low-level CUDA driver application programming interface. The texture
- * object API is only supported on devices of compute capability 3.0 or higher.
- *
- * @{
- */
-
-/**
- * \brief Creates a texture object
- *
- * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
- * the data to texture from. \p pTexDesc describes how the data should be sampled.
- * \p pResViewDesc is an optional argument that specifies an alternate format for
- * the data described by \p pResDesc, and also describes the subresource region
- * to restrict access to when texturing. \p pResViewDesc can only be specified if
- * the type of resource is a CUDA array or a CUDA mipmapped array.
- *
- * Texture objects are only supported on devices of compute capability 3.0 or higher.
- * Additionally, a texture object is an opaque value, and, as such, should only be
- * accessed through CUDA API calls.
- *
- * The ::CUDA_RESOURCE_DESC structure is defined as:
- * \code
-        typedef struct CUDA_RESOURCE_DESC_st
-        {
-            CUresourcetype resType;
-
-            union {
-                struct {
-                    CUarray hArray;
-                } array;
-                struct {
-                    CUmipmappedArray hMipmappedArray;
-                } mipmap;
-                struct {
-                    CUdeviceptr devPtr;
-                    CUarray_format format;
-                    unsigned int numChannels;
-                    size_t sizeInBytes;
-                } linear;
-                struct {
-                    CUdeviceptr devPtr;
-                    CUarray_format format;
-                    unsigned int numChannels;
-                    size_t width;
-                    size_t height;
-                    size_t pitchInBytes;
-                } pitch2D;
-            } res;
-
-            unsigned int flags;
-        } CUDA_RESOURCE_DESC;
-
- * \endcode
- * where:
- * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from.
- * CUresourceType is defined as:
- * \code
-        typedef enum CUresourcetype_enum {
-            CU_RESOURCE_TYPE_ARRAY           = 0x00,
-            CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
-            CU_RESOURCE_TYPE_LINEAR          = 0x02,
-            CU_RESOURCE_TYPE_PITCH2D         = 0x03
-        } CUresourcetype;
- * \endcode
- *
- * \par
- * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray
- * must be set to a valid CUDA array handle.
- *
- * \par
- * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray
- * must be set to a valid CUDA mipmapped array handle.
- *
- * \par
- * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr
- * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
- * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels
- * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes
- * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)).
- *
- * \par
- * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr
- * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
- * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels
- * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width
- * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
- * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
- * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
- * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
- *
- * - ::flags must be set to zero.
- *
- *
- * The ::CUDA_TEXTURE_DESC struct is defined as
- * \code
-        typedef struct CUDA_TEXTURE_DESC_st {
-            CUaddress_mode addressMode[3];
-            CUfilter_mode filterMode;
-            unsigned int flags;
-            unsigned int maxAnisotropy;
-            CUfilter_mode mipmapFilterMode;
-            float mipmapLevelBias;
-            float minMipmapLevelClamp;
-            float maxMipmapLevelClamp;
-        } CUDA_TEXTURE_DESC;
- * \endcode
- * where
- * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as:
- *   \code
-        typedef enum CUaddress_mode_enum {
-            CU_TR_ADDRESS_MODE_WRAP = 0,
-            CU_TR_ADDRESS_MODE_CLAMP = 1,
-            CU_TR_ADDRESS_MODE_MIRROR = 2,
-            CU_TR_ADDRESS_MODE_BORDER = 3
-        } CUaddress_mode;
- *   \endcode
- *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES
- *   is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
- *
- * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as:
- *   \code
-        typedef enum CUfilter_mode_enum {
-            CU_TR_FILTER_MODE_POINT = 0,
-            CU_TR_FILTER_MODE_LINEAR = 1
-        } CUfilter_mode;
- *   \endcode
- *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
- *
- * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
- *   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
- *   having the texture promote integer data to floating point data in the
- *   range [0, 1]. Note that texture with 32-bit integer format would not be 
- *   promoted, regardless of whether or not this flag is specified.
- *   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior
- *   of having the texture coordinates range from [0, Dim) where Dim is the 
- *   width or height of the CUDA array. Instead, the texture coordinates 
- *   [0, 1.0) reference the entire breadth of the array dimension; Note that
- *   for CUDA mipmapped arrays, this flag has to be set.
- *   - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
- *   filtering optimizations. Trilinear optimizations improve texture filtering
- *   performance by allowing bilinear filtering on textures in scenarios where
- *   it can closely approximate the expected results.
- *
- * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
- *   clamped to the range [1,16].
- *
- * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
- *
- * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
- *
- * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
- *
- * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
- *
- *
- * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as
- * \code
-        typedef struct CUDA_RESOURCE_VIEW_DESC_st
-        {
-            CUresourceViewFormat format;
-            size_t width;
-            size_t height;
-            size_t depth;
-            unsigned int firstMipmapLevel;
-            unsigned int lastMipmapLevel;
-            unsigned int firstLayer;
-            unsigned int lastLayer;
-        } CUDA_RESOURCE_VIEW_DESC;
- * \endcode
- * where:
- * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
- *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
- *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32.
- *   with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have
- *   a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
- *   format but with 4 channels.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
- *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
- *   this value has to be equal to that of the original resource.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
- *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
- *   this value has to be equal to that of the original resource.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
- *   original resource.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
- *   For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
- *   will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
- *   then the actual minimum mipmap level clamp will be 3.2.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
- *   has to be zero.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
- *   For non-layered resources, this value has to be zero.
- *
- * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources,
- *   this value has to be zero.
- *
- *
- * \param pTexObject   - Texture object to create
- * \param pResDesc     - Resource descriptor
- * \param pTexDesc     - Texture descriptor
- * \param pResViewDesc - Resource view descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuTexObjectDestroy,
- * ::cudaCreateTextureObject
- */
-CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc);
-
-/**
- * \brief Destroys a texture object
- *
- * Destroys the texture object specified by \p texObject.
- *
- * \param texObject - Texture object to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuTexObjectCreate,
- * ::cudaDestroyTextureObject
- */
-CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject);
-
-/**
- * \brief Returns a texture object's resource descriptor
- *
- * Returns the resource descriptor for the texture object specified by \p texObject.
- *
- * \param pResDesc  - Resource descriptor
- * \param texObject - Texture object
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuTexObjectCreate,
- * ::cudaGetTextureObjectResourceDesc,
- */
-CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject);
-
-/**
- * \brief Returns a texture object's texture descriptor
- *
- * Returns the texture descriptor for the texture object specified by \p texObject.
- *
- * \param pTexDesc  - Texture descriptor
- * \param texObject - Texture object
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuTexObjectCreate,
- * ::cudaGetTextureObjectTextureDesc
- */
-CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
-
-/**
- * \brief Returns a texture object's resource view descriptor
- *
- * Returns the resource view descriptor for the texture object specified by \p texObject.
- * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * \param pResViewDesc - Resource view descriptor
- * \param texObject    - Texture object
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuTexObjectCreate,
- * ::cudaGetTextureObjectResourceViewDesc
- */
-CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject);
-
-/** @} */ /* END CUDA_TEXOBJECT */
-
-/**
- * \defgroup CUDA_SURFOBJECT Surface Object Management
- *
- * ___MANBRIEF___ surface object management functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the surface object management functions of the
- * low-level CUDA driver application programming interface. The surface
- * object API is only supported on devices of compute capability 3.0 or higher.
- *
- * @{
- */
-
-/**
- * \brief Creates a surface object
- *
- * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
- * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be
- * ::CU_RESOURCE_TYPE_ARRAY and  ::CUDA_RESOURCE_DESC::res::array::hArray
- * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero.
- *
- * Surface objects are only supported on devices of compute capability 3.0 or higher.
- * Additionally, a surface object is an opaque value, and, as such, should only be
- * accessed through CUDA API calls.
- *
- * \param pSurfObject - Surface object to create
- * \param pResDesc    - Resource descriptor
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuSurfObjectDestroy,
- * ::cudaCreateSurfaceObject
- */
-CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc);
-
-/**
- * \brief Destroys a surface object
- *
- * Destroys the surface object specified by \p surfObject.
- *
- * \param surfObject - Surface object to destroy
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuSurfObjectCreate,
- * ::cudaDestroySurfaceObject
- */
-CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject);
-
-/**
- * \brief Returns a surface object's resource descriptor
- *
- * Returns the resource descriptor for the surface object specified by \p surfObject.
- *
- * \param pResDesc   - Resource descriptor
- * \param surfObject - Surface object
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE
- *
- * \sa
- * ::cuSurfObjectCreate,
- * ::cudaGetSurfaceObjectResourceDesc
- */
-CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject);
-
-/** @} */ /* END CUDA_SURFOBJECT */
-
-/**
- * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
- *
- * ___MANBRIEF___ direct peer context memory access functions of the low-level
- * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the direct peer context memory access functions
- * of the low-level CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Queries if a device may directly access a peer device's memory.
- *
- * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
- * directly accessing memory from contexts on \p peerDev and 0 otherwise.
- * If direct access of \p peerDev from \p dev is possible, then access may be
- * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
- *
- * \param canAccessPeer - Returned access capability
- * \param dev           - Device from which allocations on \p peerDev are to
- *                        be directly accessed.
- * \param peerDev       - Device on which the allocations to be directly accessed
- *                        by \p dev reside.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE
- * \notefnerr
- *
- * \sa
- * ::cuCtxEnablePeerAccess,
- * ::cuCtxDisablePeerAccess,
- * ::cudaDeviceCanAccessPeer
- */
-CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
-
-/**
- * \brief Enables direct access to memory allocations in a peer context.
- *
- * If both the current context and \p peerContext are on devices which support unified
- * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same
- * major compute capability, then on success all allocations from \p peerContext will
- * immediately be accessible by the current context.  See \ref CUDA_UNIFIED for additional
- * details.
- *
- * Note that access granted by this call is unidirectional and that in order to access
- * memory from the current context in \p peerContext, a separate symmetric call
- * to ::cuCtxEnablePeerAccess() is required.
- *
- * Note that there are both device-wide and system-wide limitations per system
- * configuration, as noted in the CUDA Programming Guide under the section
- * "Peer-to-Peer Memory Access".
- *
- * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates
- * that the ::CUdevice of the current context cannot directly access memory
- * from the ::CUdevice of \p peerContext.
- *
- * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
- * \p peerContext from the current context has already been enabled.
- *
- * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
- * because hardware resources required for peer access have been exhausted.
- *
- * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
- * is not a valid context, or if the current context is \p peerContext.
- *
- * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
- *
- * \param peerContext - Peer context to enable direct access to from the current context
- * \param Flags       - Reserved for future use and must be set to 0
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
- * ::CUDA_ERROR_TOO_MANY_PEERS,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cuDeviceCanAccessPeer,
- * ::cuCtxDisablePeerAccess,
- * ::cudaDeviceEnablePeerAccess
- */
-CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
-
-/**
- * \brief Disables direct access to memory allocations in a peer context and
- * unregisters any registered allocations.
- *
-  Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
- * not yet been enabled from \p peerContext to the current context.
- *
- * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
- * \p peerContext is not a valid context.
- *
- * \param peerContext - Peer context to disable direct access to
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * \notefnerr
- *
- * \sa
- * ::cuDeviceCanAccessPeer,
- * ::cuCtxEnablePeerAccess,
- * ::cudaDeviceDisablePeerAccess
- */
-CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
-
-/**
- * \brief Queries attributes of the link between two devices.
- *
- * Returns in \p *value the value of the requested attribute \p attrib of the
- * link between \p srcDevice and \p dstDevice. The supported attributes are:
- * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the
- *   performance of the link between two devices.
- * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable.
- * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over
- *   the link are supported.
- * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can
- *   be accessed over the link.
- *
- * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
- * or if they represent the same device.
- *
- * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is
- * a null pointer.
- *
- * \param value         - Returned value of the requested attribute
- * \param attrib        - The requested attribute of the link between \p srcDevice and \p dstDevice.
- * \param srcDevice     - The source device of the target link.
- * \param dstDevice     - The destination device of the target link.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE,
- * ::CUDA_ERROR_INVALID_VALUE
- * \notefnerr
- *
- * \sa
- * ::cuCtxEnablePeerAccess,
- * ::cuCtxDisablePeerAccess,
- * ::cuDeviceCanAccessPeer,
- * ::cudaDeviceGetP2PAttribute
- */
-CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
-
-/** @} */ /* END CUDA_PEER_ACCESS */
-
-/**
- * \defgroup CUDA_GRAPHICS Graphics Interoperability
- *
- * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the graphics interoperability functions of the
- * low-level CUDA driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Unregisters a graphics resource for access by CUDA
- *
- * Unregisters the graphics resource \p resource so it is not accessible by
- * CUDA unless registered again.
- *
- * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
- * returned.
- *
- * \param resource - Resource to unregister
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_UNKNOWN
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsD3D9RegisterResource,
- * ::cuGraphicsD3D10RegisterResource,
- * ::cuGraphicsD3D11RegisterResource,
- * ::cuGraphicsGLRegisterBuffer,
- * ::cuGraphicsGLRegisterImage,
- * ::cudaGraphicsUnregisterResource
- */
-CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
-
-/**
- * \brief Get an array through which to access a subresource of a mapped graphics resource.
- *
- * Returns in \p *pArray an array through which the subresource of the mapped
- * graphics resource \p resource which corresponds to array index \p arrayIndex
- * and mipmap level \p mipLevel may be accessed.  The value set in \p *pArray may
- * change every time that \p resource is mapped.
- *
- * If \p resource is not a texture then it cannot be accessed via an array and
- * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
- * If \p arrayIndex is not a valid array index for \p resource then
- * ::CUDA_ERROR_INVALID_VALUE is returned.
- * If \p mipLevel is not a valid mipmap level for \p resource then
- * ::CUDA_ERROR_INVALID_VALUE is returned.
- * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
- *
- * \param pArray      - Returned array through which a subresource of \p resource may be accessed
- * \param resource    - Mapped resource to access
- * \param arrayIndex  - Array index for array textures or cubemap face
- *                      index as defined by ::CUarray_cubemap_face for
- *                      cubemap textures for the subresource to access
- * \param mipLevel    - Mipmap level for the subresource to access
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_MAPPED,
- * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsResourceGetMappedPointer,
- * ::cudaGraphicsSubResourceGetMappedArray
- */
-CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
-
-/**
- * \brief Get a mipmapped array through which to access a mapped graphics resource.
- *
- * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
- * resource \p resource. The value set in \p *pMipmappedArray may change every time
- * that \p resource is mapped.
- *
- * If \p resource is not a texture then it cannot be accessed via a mipmapped array and
- * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
- * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
- *
- * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
- * \param resource        - Mapped resource to access
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_MAPPED,
- * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsResourceGetMappedPointer,
- * ::cudaGraphicsResourceGetMappedMipmappedArray
- */
-CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
-
-/**
- * \brief Get a device pointer through which to access a mapped graphics resource.
- *
- * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
- * \p resource may be accessed.
- * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer.
- * The value set in \p pPointer may change every time that \p resource is mapped.
- *
- * If \p resource is not a buffer then it cannot be accessed via a pointer and
- * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
- * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
- * *
- * \param pDevPtr    - Returned pointer through which \p resource may be accessed
- * \param pSize      - Returned size of the buffer accessible starting at \p *pPointer
- * \param resource   - Mapped resource to access
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_MAPPED,
- * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsMapResources,
- * ::cuGraphicsSubResourceGetMappedArray,
- * ::cudaGraphicsResourceGetMappedPointer
- */
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
-
-/**
- * \brief Set usage flags for mapping a graphics resource
- *
- * Set \p flags for mapping the graphics resource \p resource.
- *
- * Changes to \p flags will take effect the next time \p resource is mapped.
- * The \p flags argument may be any of the following:
-
- * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
- *   resource will be used. It is therefore assumed that this resource will be
- *   read from and written to by CUDA kernels.  This is the default value.
- * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
- *   access this resource will not write to this resource.
- * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
- *   which access this resource will not read from this resource and will
- *   write over the entire contents of the resource, so none of the data
- *   previously stored in the resource will be preserved.
- *
- * If \p resource is presently mapped for access by CUDA then
- * ::CUDA_ERROR_ALREADY_MAPPED is returned.
- * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
- *
- * \param resource - Registered resource to set flags for
- * \param flags    - Parameters for resource mapping
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_ALREADY_MAPPED
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsMapResources,
- * ::cudaGraphicsResourceSetMapFlags
- */
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
-
-/**
- * \brief Map graphics resources for access by CUDA
- *
- * Maps the \p count graphics resources in \p resources for access by CUDA.
- *
- * The resources in \p resources may be accessed by CUDA until they
- * are unmapped. The graphics API from which \p resources were registered
- * should not access any resources while they are mapped by CUDA. If an
- * application does so, the results are undefined.
- *
- * This function provides the synchronization guarantee that any graphics calls
- * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
- * work issued in \p stream begins.
- *
- * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
- * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
- *
- * \param count      - Number of resources to map
- * \param resources  - Resources to map for CUDA usage
- * \param hStream    - Stream with which to synchronize
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_ALREADY_MAPPED,
- * ::CUDA_ERROR_UNKNOWN
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsResourceGetMappedPointer,
- * ::cuGraphicsSubResourceGetMappedArray,
- * ::cuGraphicsUnmapResources,
- * ::cudaGraphicsMapResources
- */
-CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-
-/**
- * \brief Unmap graphics resources.
- *
- * Unmaps the \p count graphics resources in \p resources.
- *
- * Once unmapped, the resources in \p resources may not be accessed by CUDA
- * until they are mapped again.
- *
- * This function provides the synchronization guarantee that any CUDA work issued
- * in \p stream before ::cuGraphicsUnmapResources() will complete before any
- * subsequently issued graphics work begins.
- *
- *
- * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
- * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
- *
- * \param count      - Number of resources to unmap
- * \param resources  - Resources to unmap
- * \param hStream    - Stream with which to synchronize
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_DEINITIALIZED,
- * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_CONTEXT,
- * ::CUDA_ERROR_INVALID_HANDLE,
- * ::CUDA_ERROR_NOT_MAPPED,
- * ::CUDA_ERROR_UNKNOWN
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * ::cuGraphicsMapResources,
- * ::cudaGraphicsUnmapResources
- */
-CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-
-/** @} */ /* END CUDA_GRAPHICS */
-
-/**
- * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access 
- *
- * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the driver entry point access functions of the low-level CUDA
- * driver application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Returns the requested driver API function pointer
- *
- * Returns in \p **pfn the address of the CUDA driver function for the requested
- * CUDA version and flags.
- *
- * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
- * should be specified as 11020. For a requested driver symbol, if the specified
- * CUDA version is greater than or equal to the CUDA version in which the driver symbol
- * was introduced, this API will return the function pointer to the corresponding
- * versioned function.
- *
- * The pointer returned by the API should be cast to a function pointer matching the
- * requested driver function's definition in the API header file. The function pointer
- * typedef can be picked up from the corresponding typedefs header file. For example,
- * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
- *
- * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not
- * supported on the platform, no ABI compatible driver function exists for the specified
- * \p cudaVersion or if the driver symbol is invalid.
- *
- * The requested flags can be:
- * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to
- *   ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with
- *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
- *   is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise.
- * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols
- *   that match the requested driver symbol name except the corresponding per-thread versions.
- * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all
- *   driver symbols that match the requested driver symbol name including the per-thread
- *   versions. If a per-thread version is not found, the API will return the legacy version
- *   of the driver function.
- *
- * \param symbol - The base name of the driver API function to look for. As an example,
- *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and
- *                 \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. 
- * \param pfn - Location to return the function pointer to the requested driver function
- * \param cudaVersion - The CUDA version to look for the requested driver symbol 
- * \param flags -  Flags to specify search options.
- *
- * \return
- * ::CUDA_SUCCESS,
- * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_NOT_SUPPORTED,
- * ::CUDA_ERROR_NOT_FOUND
- * \note_version_mixing
- *
- * \sa
- * ::cudaGetDriverEntryPoint
- */
-CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
-
-/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */
-
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
-
-/**
- * CUDA API versioning support
- */
-#if defined(__CUDA_API_VERSION_INTERNAL)
-    #undef cuMemHostRegister
-    #undef cuGraphicsResourceSetMapFlags
-    #undef cuLinkCreate
-    #undef cuLinkAddData
-    #undef cuLinkAddFile
-    #undef cuDeviceTotalMem
-    #undef cuCtxCreate
-    #undef cuModuleGetGlobal
-    #undef cuMemGetInfo
-    #undef cuMemAlloc
-    #undef cuMemAllocPitch
-    #undef cuMemFree
-    #undef cuMemGetAddressRange
-    #undef cuMemAllocHost
-    #undef cuMemHostGetDevicePointer
-    #undef cuMemcpyHtoD
-    #undef cuMemcpyDtoH
-    #undef cuMemcpyDtoD
-    #undef cuMemcpyDtoA
-    #undef cuMemcpyAtoD
-    #undef cuMemcpyHtoA
-    #undef cuMemcpyAtoH
-    #undef cuMemcpyAtoA
-    #undef cuMemcpyHtoAAsync
-    #undef cuMemcpyAtoHAsync
-    #undef cuMemcpy2D
-    #undef cuMemcpy2DUnaligned
-    #undef cuMemcpy3D
-    #undef cuMemcpyHtoDAsync
-    #undef cuMemcpyDtoHAsync
-    #undef cuMemcpyDtoDAsync
-    #undef cuMemcpy2DAsync
-    #undef cuMemcpy3DAsync
-    #undef cuMemsetD8
-    #undef cuMemsetD16
-    #undef cuMemsetD32
-    #undef cuMemsetD2D8
-    #undef cuMemsetD2D16
-    #undef cuMemsetD2D32
-    #undef cuArrayCreate
-    #undef cuArrayGetDescriptor
-    #undef cuArray3DCreate
-    #undef cuArray3DGetDescriptor
-    #undef cuTexRefSetAddress
-    #undef cuTexRefSetAddress2D
-    #undef cuTexRefGetAddress
-    #undef cuGraphicsResourceGetMappedPointer
-    #undef cuCtxDestroy
-    #undef cuCtxPopCurrent
-    #undef cuCtxPushCurrent
-    #undef cuStreamDestroy
-    #undef cuEventDestroy
-    #undef cuMemcpy
-    #undef cuMemcpyAsync
-    #undef cuMemcpyPeer
-    #undef cuMemcpyPeerAsync
-    #undef cuMemcpy3DPeer
-    #undef cuMemcpy3DPeerAsync
-    #undef cuMemsetD8Async
-    #undef cuMemsetD16Async
-    #undef cuMemsetD32Async
-    #undef cuMemsetD2D8Async
-    #undef cuMemsetD2D16Async
-    #undef cuMemsetD2D32Async
-    #undef cuStreamGetPriority
-    #undef cuStreamGetFlags
-    #undef cuStreamGetCtx
-    #undef cuStreamWaitEvent
-    #undef cuStreamAddCallback
-    #undef cuStreamAttachMemAsync
-    #undef cuStreamQuery
-    #undef cuStreamSynchronize
-    #undef cuEventRecord
-    #undef cuEventRecordWithFlags
-    #undef cuLaunchKernel
-    #undef cuLaunchHostFunc
-    #undef cuGraphicsMapResources
-    #undef cuGraphicsUnmapResources
-    #undef cuStreamWriteValue32
-    #undef cuStreamWaitValue32
-    #undef cuStreamWriteValue64
-    #undef cuStreamWaitValue64
-    #undef cuStreamBatchMemOp
-    #undef cuMemPrefetchAsync
-    #undef cuLaunchCooperativeKernel
-    #undef cuSignalExternalSemaphoresAsync
-    #undef cuWaitExternalSemaphoresAsync
-    #undef cuStreamBeginCapture
-    #undef cuStreamEndCapture
-    #undef cuStreamIsCapturing
-    #undef cuStreamGetCaptureInfo
-    #undef cuStreamGetCaptureInfo_v2
-    #undef cuGraphUpload
-    #undef cuGraphLaunch
-    #undef cuDevicePrimaryCtxRelease
-    #undef cuDevicePrimaryCtxReset
-    #undef cuDevicePrimaryCtxSetFlags
-    #undef cuIpcOpenMemHandle
-    #undef cuStreamCopyAttributes
-    #undef cuStreamSetAttribute
-    #undef cuStreamGetAttribute
-    #undef cuGraphInstantiate
-    #undef cuMemMapArrayAsync
-    #undef cuMemFreeAsync 
-    #undef cuMemAllocAsync 
-    #undef cuMemAllocFromPoolAsync 
-    #undef cuStreamUpdateCaptureDependencies
-
-    CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-    CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
-    CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
-    CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
-        unsigned int numOptions, CUjit_option *options, void **optionValues);
-    CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
-        unsigned int numOptions, CUjit_option *options, void **optionValues);
-    CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
-
-    typedef unsigned int CUdeviceptr_v1;
-
-    typedef struct CUDA_MEMCPY2D_v1_st
-    {
-        unsigned int srcXInBytes;   /**< Source X in bytes */
-        unsigned int srcY;          /**< Source Y */
-        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-        const void *srcHost;        /**< Source host pointer */
-        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
-        CUarray srcArray;           /**< Source array reference */
-        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
-
-        unsigned int dstXInBytes;   /**< Destination X in bytes */
-        unsigned int dstY;          /**< Destination Y */
-        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-        void *dstHost;              /**< Destination host pointer */
-        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
-        CUarray dstArray;           /**< Destination array reference */
-        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
-
-        unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
-        unsigned int Height;        /**< Height of 2D memory copy */
-    } CUDA_MEMCPY2D_v1;
-
-    typedef struct CUDA_MEMCPY3D_v1_st
-    {
-        unsigned int srcXInBytes;   /**< Source X in bytes */
-        unsigned int srcY;          /**< Source Y */
-        unsigned int srcZ;          /**< Source Z */
-        unsigned int srcLOD;        /**< Source LOD */
-        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-        const void *srcHost;        /**< Source host pointer */
-        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
-        CUarray srcArray;           /**< Source array reference */
-        void *reserved0;            /**< Must be NULL */
-        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
-        unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
-
-        unsigned int dstXInBytes;   /**< Destination X in bytes */
-        unsigned int dstY;          /**< Destination Y */
-        unsigned int dstZ;          /**< Destination Z */
-        unsigned int dstLOD;        /**< Destination LOD */
-        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-        void *dstHost;              /**< Destination host pointer */
-        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
-        CUarray dstArray;           /**< Destination array reference */
-        void *reserved1;            /**< Must be NULL */
-        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
-        unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
-
-        unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
-        unsigned int Height;        /**< Height of 3D memory copy */
-        unsigned int Depth;         /**< Depth of 3D memory copy */
-    } CUDA_MEMCPY3D_v1;
-
-    typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st
-    {
-        unsigned int Width;         /**< Width of array */
-        unsigned int Height;        /**< Height of array */
-
-        CUarray_format Format;      /**< Array format */
-        unsigned int NumChannels;   /**< Channels per array element */
-    } CUDA_ARRAY_DESCRIPTOR_v1;
-
-    typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st
-    {
-        unsigned int Width;         /**< Width of 3D array */
-        unsigned int Height;        /**< Height of 3D array */
-        unsigned int Depth;         /**< Depth of 3D array */
-
-        CUarray_format Format;      /**< Array format */
-        unsigned int NumChannels;   /**< Channels per array element */
-        unsigned int Flags;         /**< Flags */
-    } CUDA_ARRAY3D_DESCRIPTOR_v1;
-
-    CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
-    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
-    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
-    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
-    CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize);
-    CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
-    CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr);
-    CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
-    CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
-    CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
-    CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
-    CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy);
-    CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy);
-    CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy);
-    CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
-    CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
-    CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
-    CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
-    CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
-    CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
-    CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
-    CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
-    CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
-    CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
-    CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
-    CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
-    CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
-    CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
-
-    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
-    CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
-    CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
-    CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
-    CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
-    CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
-    CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
-    CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
-
-    CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy);
-    CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy);
-    CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy);
-    CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N);
-    CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N);
-    CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N);
-    CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
-    CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
-    CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
-    CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
-    CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
-    CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
-    CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
-
-    CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
-    CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
-
-    CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
-    CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
-    CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
-    CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
-    CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
-    CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
-    CUresult CUDAAPI cuStreamQuery(CUstream hStream);
-    CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
-    CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
-    CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
-    CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
-    CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
-    CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-    CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
-    CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
-    CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
-    CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
-    CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
-    CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
-    CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
-    CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
-    CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
-    CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
-    CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
-    CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream);
-    CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode);
-    CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
-    CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
-    CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
-    CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
-    CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream);
-    CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream);
-    CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream);
-    CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value);
-    CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param);
-
-    CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
-    CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
-    CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream);
-
-    CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
-    CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
-    CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
-
-    CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
-#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
-static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) {
-    const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM|
-                                 CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM);
-    if ((flags & procAddressMask) == 0) {
-        flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM;
-    }
-    return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); 
-}
-#define cuGetProcAddress cuGetProcAddress_ptsz
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#if defined(__GNUC__)
-  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
-    #pragma GCC visibility pop
-  #endif
-#endif
-
-#undef __CUDA_DEPRECATED
-
-#endif /* __cuda_cuda_h__ */
diff --git a/include/triton/external/CUDA/nvml.h b/include/triton/external/CUDA/nvml.h
deleted file mode 100755
index 0b38f5f8a..000000000
--- a/include/triton/external/CUDA/nvml.h
+++ /dev/null
@@ -1,6281 +0,0 @@
-/*
- * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO USER:   
- *
- * This source code is subject to NVIDIA ownership rights under U.S. and 
- * international Copyright laws.  Users and possessors of this source code 
- * are hereby granted a nonexclusive, royalty-free license to use this code 
- * in individual and commercial software.
- *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 
- * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 
- * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH 
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 
- * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 
- * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE 
- * OR PERFORMANCE OF THIS SOURCE CODE.  
- *
- * U.S. Government End Users.   This source code is a "commercial item" as 
- * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
- * "commercial computer  software"  and "commercial computer software 
- * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) 
- * and is provided to the U.S. Government only as a commercial end item.  
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 
- * source code with only those rights set forth herein. 
- *
- * Any use of this source code in individual and commercial software must 
- * include, in the user documentation and internal comments to the code,
- * the above Disclaimer and U.S. Government End Users Notice.
- */
-
-/* 
-NVML API Reference
-
-The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and 
-managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building
-3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi
-tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads.
-
-API Documentation
-
-Supported platforms:
-- Windows:     Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit
-- Linux:       32-bit and 64-bit
-- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5
-
-Supported products:
-- Full Support
-    - All Tesla products, starting with the Fermi architecture
-    - All Quadro products, starting with the Fermi architecture
-    - All GRID products, starting with the Kepler architecture
-    - Selected GeForce Titan products
-- Limited Support
-    - All Geforce products, starting with the Fermi architecture
-
-The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is
-not be added to the system path by default. To dynamically link to NVML, add this path to the PATH 
-environmental variable. To dynamically load NVML, call LoadLibrary with this path.
-
-On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit
-and 64 bit NVML libraries will be installed.
-
-Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html
-*/
-
-#ifndef __nvml_nvml_h__
-#define __nvml_nvml_h__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * On Windows, set up methods for DLL export
- * define NVML_STATIC_IMPORT when using nvml_loader library
- */
-#if defined _WINDOWS
-    #if !defined NVML_STATIC_IMPORT
-        #if defined NVML_LIB_EXPORT
-            #define DECLDIR __declspec(dllexport)
-        #else
-            #define DECLDIR __declspec(dllimport)
-        #endif
-    #else
-        #define DECLDIR
-    #endif
-#else
-    #define DECLDIR
-#endif
-
-/**
- * NVML API versioning support
- */
-#define NVML_API_VERSION            10
-#define NVML_API_VERSION_STR        "10"
-#define nvmlInit                    nvmlInit_v2
-#define nvmlDeviceGetPciInfo        nvmlDeviceGetPciInfo_v3
-#define nvmlDeviceGetCount          nvmlDeviceGetCount_v2
-#define nvmlDeviceGetHandleByIndex  nvmlDeviceGetHandleByIndex_v2
-#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2
-#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2
-#define nvmlDeviceRemoveGpu         nvmlDeviceRemoveGpu_v2
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceStructs Device Structs
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Special constant that some fields take when they are not available.
- * Used when only part of the struct is not available.
- *
- * Each structure explicitly states when to check for this value.
- */
-#define NVML_VALUE_NOT_AVAILABLE (-1)
-
-typedef struct nvmlDevice_st* nvmlDevice_t;
-
-/**
- * Buffer size guaranteed to be large enough for pci bus id
- */
-#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE      32
-
-/**
- * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy
- */
-#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE   16
-
-/**
- * PCI information about a GPU device.
- */
-typedef struct nvmlPciInfo_st
-{
-    char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
-    unsigned int domain;             //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff
-    unsigned int bus;                //!< The bus on which the device resides, 0 to 0xff
-    unsigned int device;             //!< The device's id on the bus, 0 to 31
-    unsigned int pciDeviceId;        //!< The combined 16-bit device id and 16-bit vendor id
-
-    // Added in NVML 2.285 API
-    unsigned int pciSubSystemId;     //!< The 32-bit Sub System Device ID
-
-    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
-} nvmlPciInfo_t;
-
-/**
- * PCI format string for ::busIdLegacy
- */
-#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT           "%04X:%02X:%02X.0"
-
-/**
- * PCI format string for ::busId
- */
-#define NVML_DEVICE_PCI_BUS_ID_FMT                  "%08X:%02X:%02X.0"
-
-/**
- * Utility macro for filling the pci bus id format from a nvmlPciInfo_t
- */
-#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo)    (pciInfo)->domain, \
-                                                    (pciInfo)->bus,    \
-                                                    (pciInfo)->device
-
-/**
- * Detailed ECC error counts for a device.
- *
- * @deprecated  Different GPU families can have different memory error counters
- *              See \ref nvmlDeviceGetMemoryErrorCounter
- */
-typedef struct nvmlEccErrorCounts_st 
-{
-    unsigned long long l1Cache;      //!< L1 cache errors
-    unsigned long long l2Cache;      //!< L2 cache errors
-    unsigned long long deviceMemory; //!< Device memory errors
-    unsigned long long registerFile; //!< Register file errors
-} nvmlEccErrorCounts_t;
-
-/** 
- * Utilization information for a device.
- * Each sample period may be between 1 second and 1/6 second, depending on the product being queried.
- */
-typedef struct nvmlUtilization_st 
-{
-    unsigned int gpu;                //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
-    unsigned int memory;             //!< Percent of time over the past sample period during which global (device) memory was being read or written
-} nvmlUtilization_t;
-
-/** 
- * Memory allocation information for a device.
- */
-typedef struct nvmlMemory_st 
-{
-    unsigned long long total;        //!< Total installed FB memory (in bytes)
-    unsigned long long free;         //!< Unallocated FB memory (in bytes)
-    unsigned long long used;         //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
-} nvmlMemory_t;
-
-/**
- * BAR1 Memory allocation Information for a device
- */
-typedef struct nvmlBAR1Memory_st
-{
-    unsigned long long bar1Total;    //!< Total BAR1 Memory (in bytes)
-    unsigned long long bar1Free;     //!< Unallocated BAR1 Memory (in bytes)
-    unsigned long long bar1Used;     //!< Allocated Used Memory (in bytes)
-}nvmlBAR1Memory_t;
-
-/**
- * Information about running compute processes on the GPU
- */
-typedef struct nvmlProcessInfo_st
-{
-    unsigned int pid;                 //!< Process ID
-    unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes.
-                                      //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
-                                      //! because Windows KMD manages all the memory and not the NVIDIA driver
-} nvmlProcessInfo_t;
-
-/**
- * Enum to represent type of bridge chip
- */
-typedef enum nvmlBridgeChipType_enum
-{
-    NVML_BRIDGE_CHIP_PLX = 0,
-    NVML_BRIDGE_CHIP_BRO4 = 1           
-}nvmlBridgeChipType_t;
-
-/**
- * Maximum number of NvLink links supported 
- */
-#define NVML_NVLINK_MAX_LINKS 6
-
-/**
- * Enum to represent the NvLink utilization counter packet units
- */
-typedef enum nvmlNvLinkUtilizationCountUnits_enum
-{
-    NVML_NVLINK_COUNTER_UNIT_CYCLES =  0,     // count by cycles
-    NVML_NVLINK_COUNTER_UNIT_PACKETS = 1,     // count by packets
-    NVML_NVLINK_COUNTER_UNIT_BYTES   = 2,     // count by bytes
-
-    // this must be last
-    NVML_NVLINK_COUNTER_UNIT_COUNT
-} nvmlNvLinkUtilizationCountUnits_t;
-
-/**
- * Enum to represent the NvLink utilization counter packet types to count
- *  ** this is ONLY applicable with the units as packets or bytes
- *  ** as specified in \a nvmlNvLinkUtilizationCountUnits_t
- *  ** all packet filter descriptions are target GPU centric
- *  ** these can be "OR'd" together 
- */
-typedef enum nvmlNvLinkUtilizationCountPktTypes_enum
-{
-    NVML_NVLINK_COUNTER_PKTFILTER_NOP        = 0x1,     // no operation packets
-    NVML_NVLINK_COUNTER_PKTFILTER_READ       = 0x2,     // read packets
-    NVML_NVLINK_COUNTER_PKTFILTER_WRITE      = 0x4,     // write packets
-    NVML_NVLINK_COUNTER_PKTFILTER_RATOM      = 0x8,     // reduction atomic requests
-    NVML_NVLINK_COUNTER_PKTFILTER_NRATOM     = 0x10,    // non-reduction atomic requests
-    NVML_NVLINK_COUNTER_PKTFILTER_FLUSH      = 0x20,    // flush requests
-    NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA   = 0x40,    // responses with data
-    NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80,    // responses without data
-    NVML_NVLINK_COUNTER_PKTFILTER_ALL        = 0xFF     // all packets
-} nvmlNvLinkUtilizationCountPktTypes_t;
-
-/** 
- * Struct to define the NVLINK counter controls
- */
-typedef struct nvmlNvLinkUtilizationControl_st
-{
-    nvmlNvLinkUtilizationCountUnits_t units;
-    nvmlNvLinkUtilizationCountPktTypes_t pktfilter;
-} nvmlNvLinkUtilizationControl_t;
-
-/**
- * Enum to represent NvLink queryable capabilities
- */
-typedef enum nvmlNvLinkCapability_enum
-{
-    NVML_NVLINK_CAP_P2P_SUPPORTED = 0,     // P2P over NVLink is supported
-    NVML_NVLINK_CAP_SYSMEM_ACCESS = 1,     // Access to system memory is supported
-    NVML_NVLINK_CAP_P2P_ATOMICS   = 2,     // P2P atomics are supported
-    NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3,     // System memory atomics are supported
-    NVML_NVLINK_CAP_SLI_BRIDGE    = 4,     // SLI is supported over this link
-    NVML_NVLINK_CAP_VALID         = 5,     // Link is supported on this device
-    // should be last
-    NVML_NVLINK_CAP_COUNT
-} nvmlNvLinkCapability_t;
-
-/**
- * Enum to represent NvLink queryable error counters
- */
-typedef enum nvmlNvLinkErrorCounter_enum
-{
-    NVML_NVLINK_ERROR_DL_REPLAY   = 0,     // Data link transmit replay error counter
-    NVML_NVLINK_ERROR_DL_RECOVERY = 1,     // Data link transmit recovery error counter
-    NVML_NVLINK_ERROR_DL_CRC_FLIT = 2,     // Data link receive flow control digit CRC error counter
-    NVML_NVLINK_ERROR_DL_CRC_DATA = 3,     // Data link receive data CRC error counter
-
-    // this must be last
-    NVML_NVLINK_ERROR_COUNT
-} nvmlNvLinkErrorCounter_t;
-
-/**
- * Represents level relationships within a system between two GPUs
- * The enums are spaced to allow for future relationships
- */
-typedef enum nvmlGpuLevel_enum
-{
-    NVML_TOPOLOGY_INTERNAL           = 0, // e.g. Tesla K80
-    NVML_TOPOLOGY_SINGLE             = 10, // all devices that only need traverse a single PCIe switch
-    NVML_TOPOLOGY_MULTIPLE           = 20, // all devices that need not traverse a host bridge
-    NVML_TOPOLOGY_HOSTBRIDGE         = 30, // all devices that are connected to the same host bridge
-    NVML_TOPOLOGY_NODE               = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges
-    NVML_TOPOLOGY_SYSTEM             = 50, // all devices in the system
-
-    // there is purposefully no COUNT here because of the need for spacing above
-} nvmlGpuTopologyLevel_t;
-
-/* Compatibility for CPU->NODE renaming */
-#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE
-
-/* P2P Capability Index Status*/
-typedef enum nvmlGpuP2PStatus_enum
-{
-    NVML_P2P_STATUS_OK     = 0,
-    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
-    NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
-    NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
-    NVML_P2P_STATUS_DISABLED_BY_REGKEY,
-    NVML_P2P_STATUS_NOT_SUPPORTED,
-    NVML_P2P_STATUS_UNKNOWN
-
-} nvmlGpuP2PStatus_t;
-
-/* P2P Capability Index*/
-typedef enum nvmlGpuP2PCapsIndex_enum
-{
-    NVML_P2P_CAPS_INDEX_READ = 0,
-    NVML_P2P_CAPS_INDEX_WRITE,
-    NVML_P2P_CAPS_INDEX_NVLINK,
-    NVML_P2P_CAPS_INDEX_ATOMICS,
-    NVML_P2P_CAPS_INDEX_PROP,
-    NVML_P2P_CAPS_INDEX_UNKNOWN
-}nvmlGpuP2PCapsIndex_t;
-
-/**
- * Maximum limit on Physical Bridges per Board
- */
-#define NVML_MAX_PHYSICAL_BRIDGE                         (128)
-
-/**
- * Information about the Bridge Chip Firmware
- */
-typedef struct nvmlBridgeChipInfo_st
-{
-    nvmlBridgeChipType_t type;                  //!< Type of Bridge Chip 
-    unsigned int fwVersion;                     //!< Firmware Version. 0=Version is unavailable
-}nvmlBridgeChipInfo_t;
-
-/**
- * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate 
- * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth.
- */
-typedef struct nvmlBridgeChipHierarchy_st
-{
-    unsigned char  bridgeCount;                 //!< Number of Bridge Chips on the Board
-    nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board
-}nvmlBridgeChipHierarchy_t;
-
-/**
- *  Represents Type of Sampling Event
- */
-typedef enum nvmlSamplingType_enum
-{
-    NVML_TOTAL_POWER_SAMPLES        = 0, //!< To represent total power drawn by GPU
-    NVML_GPU_UTILIZATION_SAMPLES    = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU
-    NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written
-    NVML_ENC_UTILIZATION_SAMPLES    = 3, //!< To represent percent of time during which NVENC remains busy
-    NVML_DEC_UTILIZATION_SAMPLES    = 4, //!< To represent percent of time during which NVDEC remains busy            
-    NVML_PROCESSOR_CLK_SAMPLES      = 5, //!< To represent processor clock samples
-    NVML_MEMORY_CLK_SAMPLES         = 6, //!< To represent memory clock samples
-            
-    // Keep this last
-    NVML_SAMPLINGTYPE_COUNT               
-}nvmlSamplingType_t;
-
-/**
- * Represents the queryable PCIe utilization counters
- */
-typedef enum nvmlPcieUtilCounter_enum
-{
-    NVML_PCIE_UTIL_TX_BYTES             = 0, // 1KB granularity
-    NVML_PCIE_UTIL_RX_BYTES             = 1, // 1KB granularity
-    
-    // Keep this last
-    NVML_PCIE_UTIL_COUNT
-} nvmlPcieUtilCounter_t;
-
-/**
- * Represents the type for sample value returned
- */
-typedef enum nvmlValueType_enum 
-{
-    NVML_VALUE_TYPE_DOUBLE = 0,
-    NVML_VALUE_TYPE_UNSIGNED_INT = 1,
-    NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
-    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
-    NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
-
-    // Keep this last
-    NVML_VALUE_TYPE_COUNT
-}nvmlValueType_t;
-
-
-/**
- * Union to represent different types of Value
- */
-typedef union nvmlValue_st
-{
-    double dVal;                    //!< If the value is double
-    unsigned int uiVal;             //!< If the value is unsigned int
-    unsigned long ulVal;            //!< If the value is unsigned long
-    unsigned long long ullVal;      //!< If the value is unsigned long long
-    signed long long sllVal;        //!< If the value is signed long long
-}nvmlValue_t;
-
-/**
- * Information for Sample
- */
-typedef struct nvmlSample_st 
-{
-    unsigned long long timeStamp;       //!< CPU Timestamp in microseconds
-    nvmlValue_t sampleValue;        //!< Sample Value
-}nvmlSample_t;
-
-/**
- * Represents type of perf policy for which violation times can be queried 
- */
-typedef enum nvmlPerfPolicyType_enum
-{
-    NVML_PERF_POLICY_POWER = 0,              //!< How long did power violations cause the GPU to be below application clocks
-    NVML_PERF_POLICY_THERMAL = 1,            //!< How long did thermal violations cause the GPU to be below application clocks
-    NVML_PERF_POLICY_SYNC_BOOST = 2,         //!< How long did sync boost cause the GPU to be below application clocks
-    NVML_PERF_POLICY_BOARD_LIMIT = 3,        //!< How long did the board limit cause the GPU to be below application clocks
-    NVML_PERF_POLICY_LOW_UTILIZATION = 4,    //!< How long did low utilization cause the GPU to be below application clocks
-    NVML_PERF_POLICY_RELIABILITY = 5,        //!< How long did the board reliability limit cause the GPU to be below application clocks
-
-    NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10,  //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above)
-    NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks
-
-    // Keep this last
-    NVML_PERF_POLICY_COUNT
-}nvmlPerfPolicyType_t;
-
-/**
- * Struct to hold perf policy violation status data
- */
-typedef struct nvmlViolationTime_st
-{
-    unsigned long long referenceTime;  //!< referenceTime represents CPU timestamp in microseconds
-    unsigned long long violationTime;  //!< violationTime in Nanoseconds
-}nvmlViolationTime_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceEnumvs Device Enums
- *  @{
- */
-/***************************************************************************************************/
-
-/** 
- * Generic enable/disable enum. 
- */
-typedef enum nvmlEnableState_enum 
-{
-    NVML_FEATURE_DISABLED    = 0,     //!< Feature disabled 
-    NVML_FEATURE_ENABLED     = 1      //!< Feature enabled
-} nvmlEnableState_t;
-
-//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details.
-#define nvmlFlagDefault     0x00      
-//! Generic flag used to force some behavior. See description of particular functions for details.
-#define nvmlFlagForce       0x01      
-
-/**
- *  * The Brand of the GPU
- *   */
-typedef enum nvmlBrandType_enum
-{
-    NVML_BRAND_UNKNOWN = 0, 
-    NVML_BRAND_QUADRO  = 1,
-    NVML_BRAND_TESLA   = 2,
-    NVML_BRAND_NVS     = 3,
-    NVML_BRAND_GRID    = 4,
-    NVML_BRAND_GEFORCE = 5,
-    NVML_BRAND_TITAN   = 6,
-
-    // Keep this last
-    NVML_BRAND_COUNT
-} nvmlBrandType_t;
-
-/**
- * Temperature thresholds.
- */
-typedef enum nvmlTemperatureThresholds_enum
-{
-    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0,    // Temperature at which the GPU will shut down
-                                                // for HW protection
-    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1,    // Temperature at which the GPU will begin HW slowdown
-    NVML_TEMPERATURE_THRESHOLD_MEM_MAX  = 2,    // Memory Temperature at which the GPU will begin SW slowdown
-    NVML_TEMPERATURE_THRESHOLD_GPU_MAX  = 3,    // GPU Temperature at which the GPU can be throttled below base clock
-    // Keep this last
-    NVML_TEMPERATURE_THRESHOLD_COUNT
-} nvmlTemperatureThresholds_t;
-
-/** 
- * Temperature sensors. 
- */
-typedef enum nvmlTemperatureSensors_enum 
-{
-    NVML_TEMPERATURE_GPU      = 0,    //!< Temperature sensor for the GPU die
-    
-    // Keep this last
-    NVML_TEMPERATURE_COUNT
-} nvmlTemperatureSensors_t;
-
-/** 
- * Compute mode. 
- *
- * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
- * Earlier CUDA versions supported a single exclusive mode, 
- * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
- */
-typedef enum nvmlComputeMode_enum 
-{
-    NVML_COMPUTEMODE_DEFAULT           = 0,  //!< Default compute mode -- multiple contexts per device
-    NVML_COMPUTEMODE_EXCLUSIVE_THREAD  = 1,  //!< Support Removed
-    NVML_COMPUTEMODE_PROHIBITED        = 2,  //!< Compute-prohibited mode -- no contexts per device
-    NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,  //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
-    
-    // Keep this last
-    NVML_COMPUTEMODE_COUNT
-} nvmlComputeMode_t;
-
-/** 
- * ECC bit types.
- *
- * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type
- */
-#define nvmlEccBitType_t nvmlMemoryErrorType_t
-
-/**
- * Single bit ECC errors
- *
- * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED
- */
-#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED
-
-/**
- * Double bit ECC errors
- *
- * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED
- */
-#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED
-
-/**
- * Memory error types
- */
-typedef enum nvmlMemoryErrorType_enum
-{
-    /**
-     * A memory error that was corrected
-     * 
-     * For ECC errors, these are single bit errors
-     * For Texture memory, these are errors fixed by resend
-     */
-    NVML_MEMORY_ERROR_TYPE_CORRECTED = 0,
-    /**
-     * A memory error that was not corrected
-     * 
-     * For ECC errors, these are double bit errors
-     * For Texture memory, these are errors where the resend fails
-     */
-    NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1,
-    
-    
-    // Keep this last
-    NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types
-
-} nvmlMemoryErrorType_t;
-
-/** 
- * ECC counter types. 
- *
- * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent.
- *       On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver 
- *       client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app
- *       is run.
- */
-typedef enum nvmlEccCounterType_enum 
-{
-    NVML_VOLATILE_ECC      = 0,      //!< Volatile counts are reset each time the driver loads.
-    NVML_AGGREGATE_ECC     = 1,      //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device)
-    
-    // Keep this last
-    NVML_ECC_COUNTER_TYPE_COUNT      //!< Count of memory counter types
-} nvmlEccCounterType_t;
-
-/** 
- * Clock types. 
- * 
- * All speeds are in Mhz.
- */
-typedef enum nvmlClockType_enum 
-{
-    NVML_CLOCK_GRAPHICS  = 0,        //!< Graphics clock domain
-    NVML_CLOCK_SM        = 1,        //!< SM clock domain
-    NVML_CLOCK_MEM       = 2,        //!< Memory clock domain
-    NVML_CLOCK_VIDEO     = 3,        //!< Video encoder/decoder clock domain
-    
-    // Keep this last
-    NVML_CLOCK_COUNT //!< Count of clock types
-} nvmlClockType_t;
-
-/**
- * Clock Ids.  These are used in combination with nvmlClockType_t
- * to specify a single clock value.
- */
-typedef enum nvmlClockId_enum
-{
-    NVML_CLOCK_ID_CURRENT            = 0,   //!< Current actual clock value
-    NVML_CLOCK_ID_APP_CLOCK_TARGET   = 1,   //!< Target application clock
-    NVML_CLOCK_ID_APP_CLOCK_DEFAULT  = 2,   //!< Default application clock target
-    NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3,   //!< OEM-defined maximum clock rate
-
-    //Keep this last
-    NVML_CLOCK_ID_COUNT //!< Count of Clock Ids.
-} nvmlClockId_t;
-
-/** 
- * Driver models. 
- *
- * Windows only.
- */
-typedef enum nvmlDriverModel_enum 
-{
-    NVML_DRIVER_WDDM      = 0,       //!< WDDM driver model -- GPU treated as a display device
-    NVML_DRIVER_WDM       = 1        //!< WDM (TCC) model (recommended) -- GPU treated as a generic device
-} nvmlDriverModel_t;
-
-/**
- * Allowed PStates.
- */
-typedef enum nvmlPStates_enum 
-{
-    NVML_PSTATE_0               = 0,       //!< Performance state 0 -- Maximum Performance
-    NVML_PSTATE_1               = 1,       //!< Performance state 1 
-    NVML_PSTATE_2               = 2,       //!< Performance state 2
-    NVML_PSTATE_3               = 3,       //!< Performance state 3
-    NVML_PSTATE_4               = 4,       //!< Performance state 4
-    NVML_PSTATE_5               = 5,       //!< Performance state 5
-    NVML_PSTATE_6               = 6,       //!< Performance state 6
-    NVML_PSTATE_7               = 7,       //!< Performance state 7
-    NVML_PSTATE_8               = 8,       //!< Performance state 8
-    NVML_PSTATE_9               = 9,       //!< Performance state 9
-    NVML_PSTATE_10              = 10,      //!< Performance state 10
-    NVML_PSTATE_11              = 11,      //!< Performance state 11
-    NVML_PSTATE_12              = 12,      //!< Performance state 12
-    NVML_PSTATE_13              = 13,      //!< Performance state 13
-    NVML_PSTATE_14              = 14,      //!< Performance state 14
-    NVML_PSTATE_15              = 15,      //!< Performance state 15 -- Minimum Performance 
-    NVML_PSTATE_UNKNOWN         = 32       //!< Unknown performance state
-} nvmlPstates_t;
-
-/**
- * GPU Operation Mode
- *
- * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features.
- *
- * Each GOM is designed to meet specific user needs.
- */
-typedef enum nvmlGom_enum
-{
-    NVML_GOM_ALL_ON                    = 0, //!< Everything is enabled and running at full speed
-
-    NVML_GOM_COMPUTE                   = 1, //!< Designed for running only compute tasks. Graphics operations
-                                            //!< are not allowed
-
-    NVML_GOM_LOW_DP                    = 2  //!< Designed for running graphics applications that don't require
-                                            //!< high bandwidth double precision
-} nvmlGpuOperationMode_t;
-
-/** 
- * Available infoROM objects.
- */
-typedef enum nvmlInforomObject_enum 
-{
-    NVML_INFOROM_OEM            = 0,       //!< An object defined by OEM
-    NVML_INFOROM_ECC            = 1,       //!< The ECC object determining the level of ECC support
-    NVML_INFOROM_POWER          = 2,       //!< The power management object
-
-    // Keep this last
-    NVML_INFOROM_COUNT                     //!< This counts the number of infoROM objects the driver knows about
-} nvmlInforomObject_t;
-
-/** 
- * Return values for NVML API calls. 
- */
-typedef enum nvmlReturn_enum 
-{
-    NVML_SUCCESS = 0,                   //!< The operation was successful
-    NVML_ERROR_UNINITIALIZED = 1,       //!< NVML was not first initialized with nvmlInit()
-    NVML_ERROR_INVALID_ARGUMENT = 2,    //!< A supplied argument is invalid
-    NVML_ERROR_NOT_SUPPORTED = 3,       //!< The requested operation is not available on target device
-    NVML_ERROR_NO_PERMISSION = 4,       //!< The current user does not have permission for operation
-    NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
-    NVML_ERROR_NOT_FOUND = 6,           //!< A query to find an object was unsuccessful
-    NVML_ERROR_INSUFFICIENT_SIZE = 7,   //!< An input argument is not large enough
-    NVML_ERROR_INSUFFICIENT_POWER = 8,  //!< A device's external power cables are not properly attached
-    NVML_ERROR_DRIVER_NOT_LOADED = 9,   //!< NVIDIA driver is not loaded
-    NVML_ERROR_TIMEOUT = 10,            //!< User provided timeout passed
-    NVML_ERROR_IRQ_ISSUE = 11,          //!< NVIDIA Kernel detected an interrupt issue with a GPU
-    NVML_ERROR_LIBRARY_NOT_FOUND = 12,  //!< NVML Shared Library couldn't be found or loaded
-    NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
-    NVML_ERROR_CORRUPTED_INFOROM = 14,  //!< infoROM is corrupted
-    NVML_ERROR_GPU_IS_LOST = 15,        //!< The GPU has fallen off the bus or has otherwise become inaccessible
-    NVML_ERROR_RESET_REQUIRED = 16,     //!< The GPU requires a reset before it can be used again
-    NVML_ERROR_OPERATING_SYSTEM = 17,   //!< The GPU control device has been blocked by the operating system/cgroups
-    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
-    NVML_ERROR_IN_USE = 19,             //!< An operation cannot be performed because the GPU is currently in use
-    NVML_ERROR_MEMORY = 20,             //!< Insufficient memory
-    NVML_ERROR_NO_DATA = 21,            //!<No data
-    NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22,    //!< The requested vgpu operation is not available on target device, becasue ECC is enabled
-    NVML_ERROR_UNKNOWN = 999            //!< An internal driver error occurred
-} nvmlReturn_t;
-
-/**
- * See \ref nvmlDeviceGetMemoryErrorCounter
- */
-typedef enum nvmlMemoryLocation_enum
-{
-    NVML_MEMORY_LOCATION_L1_CACHE        = 0,    //!< GPU L1 Cache
-    NVML_MEMORY_LOCATION_L2_CACHE        = 1,    //!< GPU L2 Cache
-    NVML_MEMORY_LOCATION_DRAM            = 2,    //!< Turing+ DRAM
-    NVML_MEMORY_LOCATION_DEVICE_MEMORY   = 2,    //!< GPU Device Memory
-    NVML_MEMORY_LOCATION_REGISTER_FILE   = 3,    //!< GPU Register File
-    NVML_MEMORY_LOCATION_TEXTURE_MEMORY  = 4,    //!< GPU Texture Memory
-    NVML_MEMORY_LOCATION_TEXTURE_SHM     = 5,    //!< Shared memory
-    NVML_MEMORY_LOCATION_CBU             = 6,    //!< CBU
-    NVML_MEMORY_LOCATION_SRAM            = 7,    //!< Turing+ SRAM
-    // Keep this last
-    NVML_MEMORY_LOCATION_COUNT              //!< This counts the number of memory locations the driver knows about
-} nvmlMemoryLocation_t;
-
-/**
- * Causes for page retirement
- */
-typedef enum nvmlPageRetirementCause_enum
-{
-    NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0, //!< Page was retired due to multiple single bit ECC error
-    NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1,           //!< Page was retired due to double bit ECC error
-
-    // Keep this last
-    NVML_PAGE_RETIREMENT_CAUSE_COUNT
-} nvmlPageRetirementCause_t;
-
-/**
- * API types that allow changes to default permission restrictions
- */
-typedef enum nvmlRestrictedAPI_enum
-{
-    NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0,   //!< APIs that change application clocks, see nvmlDeviceSetApplicationsClocks 
-                                                      //!< and see nvmlDeviceResetApplicationsClocks
-    NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1,  //!< APIs that enable/disable Auto Boosted clocks
-                                                      //!< see nvmlDeviceSetAutoBoostedClocksEnabled
-    // Keep this last
-    NVML_RESTRICTED_API_COUNT
-} nvmlRestrictedAPI_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlGridEnums GRID Enums
- *  @{
- */
-/***************************************************************************************************/
-
-/*!
- * GPU virtualization mode types.
- */
-typedef enum nvmlGpuVirtualizationMode {
-    NVML_GPU_VIRTUALIZATION_MODE_NONE = 0,  //!< Represents Bare Metal GPU
-    NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1,  //!< Device is associated with GPU-Passthorugh
-    NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2,  //!< Device is associated with vGPU inside virtual machine.
-    NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3,  //!< Device is associated with VGX hypervisor in vGPU mode
-    NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4,  //!< Device is associated with VGX hypervisor in vSGA mode
-} nvmlGpuVirtualizationMode_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlFieldValueEnums Field Value Enums
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Field Identifiers.
- *
- * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
- */
-#define NVML_FI_DEV_ECC_CURRENT           1   //!< Current ECC mode. 1=Active. 0=Inactive
-#define NVML_FI_DEV_ECC_PENDING           2   //!< Pending ECC mode. 1=Active. 0=Inactive
-/* ECC Count Totals */
-#define NVML_FI_DEV_ECC_SBE_VOL_TOTAL     3   //!< Total single bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_TOTAL     4   //!< Total double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_SBE_AGG_TOTAL     5   //!< Total single bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_TOTAL     6   //!< Total double bit aggregate (persistent) ECC errors
-/* Individual ECC locations */
-#define NVML_FI_DEV_ECC_SBE_VOL_L1        7   //!< L1 cache single bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_L1        8   //!< L1 cache double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_SBE_VOL_L2        9   //!< L2 cache single bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_L2        10  //!< L2 cache double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_SBE_VOL_DEV       11  //!< Device memory single bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_DEV       12  //!< Device memory double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_SBE_VOL_REG       13  //!< Register file single bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_REG       14  //!< Register file double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_SBE_VOL_TEX       15  //!< Texture memory single bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_TEX       16  //!< Texture memory double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_DBE_VOL_CBU       17  //!< CBU double bit volatile ECC errors
-#define NVML_FI_DEV_ECC_SBE_AGG_L1        18  //!< L1 cache single bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_L1        19  //!< L1 cache double bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_SBE_AGG_L2        20  //!< L2 cache single bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_L2        21  //!< L2 cache double bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_SBE_AGG_DEV       22  //!< Device memory single bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_DEV       23  //!< Device memory double bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_SBE_AGG_REG       24  //!< Register File single bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_REG       25  //!< Register File double bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_SBE_AGG_TEX       26  //!< Texture memory single bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_TEX       27  //!< Texture memory double bit aggregate (persistent) ECC errors
-#define NVML_FI_DEV_ECC_DBE_AGG_CBU       28  //!< CBU double bit aggregate ECC errors
-
-/* Page Retirement */
-#define NVML_FI_DEV_RETIRED_SBE           29  //!< Number of retired pages because of single bit errors
-#define NVML_FI_DEV_RETIRED_DBE           30  //!< Number of retired pages because of double bit errors
-#define NVML_FI_DEV_RETIRED_PENDING       31  //!< If any pages are pending retirement. 1=yes. 0=no.
-
-/* NvLink Flit Error Counters */
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0    32 //!< NVLink flow control CRC  Error Counter for Lane 0
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1    33 //!< NVLink flow control CRC  Error Counter for Lane 1
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2    34 //!< NVLink flow control CRC  Error Counter for Lane 2
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3    35 //!< NVLink flow control CRC  Error Counter for Lane 3
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4    36 //!< NVLink flow control CRC  Error Counter for Lane 4
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5    37 //!< NVLink flow control CRC  Error Counter for Lane 5
-#define NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 38 //!< NVLink flow control CRC  Error Counter total for all Lanes
-
-/* NvLink CRC Data Error Counters */
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0    39 //!< NVLink data CRC Error Counter for Lane 0
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1    40 //!< NVLink data CRC Error Counter for Lane 1
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2    41 //!< NVLink data CRC Error Counter for Lane 2
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3    42 //!< NVLink data CRC Error Counter for Lane 3
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4    43 //!< NVLink data CRC Error Counter for Lane 4
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5    44 //!< NVLink data CRC Error Counter for Lane 5
-#define NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 45 //!< NvLink data CRC Error Counter total for all Lanes
-
-/* NvLink Replay Error Counters */
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0      46 //!< NVLink Replay Error Counter for Lane 0
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1      47 //!< NVLink Replay Error Counter for Lane 1
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2      48 //!< NVLink Replay Error Counter for Lane 2
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3      49 //!< NVLink Replay Error Counter for Lane 3
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4      50 //!< NVLink Replay Error Counter for Lane 4
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5      51 //!< NVLink Replay Error Counter for Lane 5
-#define NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL   52 //!< NVLink Replay Error Counter total for all Lanes
-
-/* NvLink Recovery Error Counters */
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0    53 //!< NVLink Recovery Error Counter for Lane 0
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1    54 //!< NVLink Recovery Error Counter for Lane 1
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2    55 //!< NVLink Recovery Error Counter for Lane 2
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3    56 //!< NVLink Recovery Error Counter for Lane 3
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4    57 //!< NVLink Recovery Error Counter for Lane 4
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5    58 //!< NVLink Recovery Error Counter for Lane 5
-#define NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 59 //!< NVLink Recovery Error Counter total for all Lanes
-
-/* NvLink Bandwidth Counters */
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0     60 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 0
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1     61 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 1
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2     62 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 2
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3     63 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 3
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4     64 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 4
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5     65 //!< NVLink Bandwidth Counter for Counter Set 0, Lane 5
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL  66 //!< NVLink Bandwidth Counter Total for Counter Set 0, All Lanes
-
-/* NvLink Bandwidth Counters */
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0     67 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 0
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1     68 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 1
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2     69 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 2
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3     70 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 3
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4     71 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 4
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5     72 //!< NVLink Bandwidth Counter for Counter Set 1, Lane 5
-#define NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL  73 //!< NVLink Bandwidth Counter Total for Counter Set 1, All Lanes
-
-/* NVML Perf Policy Counters */
-#define NVML_FI_DEV_PERF_POLICY_POWER              74   //!< Perf Policy Counter for Power Policy
-#define NVML_FI_DEV_PERF_POLICY_THERMAL            75   //!< Perf Policy Counter for Thermal Policy
-#define NVML_FI_DEV_PERF_POLICY_SYNC_BOOST         76   //!< Perf Policy Counter for Sync boost Policy
-#define NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT        77   //!< Perf Policy Counter for Board Limit
-#define NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION    78   //!< Perf Policy Counter for Low GPU Utilization Policy
-#define NVML_FI_DEV_PERF_POLICY_RELIABILITY        79   //!< Perf Policy Counter for Reliability Policy
-#define NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS   80   //!< Perf Policy Counter for Total App Clock Policy
-#define NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS  81   //!< Perf Policy Counter for Total Base Clocks Policy
-
-/* Memory temperatures */
-#define NVML_FI_DEV_MEMORY_TEMP  82 //!< Memory temperature for the device
-
-/* Energy Counter */
-#define NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION 83 //!< Total energy consumption for the GPU in mJ since the driver was last reloaded
-
-/* NVLink Speed */
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L0     84  //!< NVLink Speed in MBps for Link 0
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L1     85  //!< NVLink Speed in MBps for Link 1
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L2     86  //!< NVLink Speed in MBps for Link 2
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L3     87  //!< NVLink Speed in MBps for Link 3
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L4     88  //!< NVLink Speed in MBps for Link 4
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_L5     89  //!< NVLink Speed in MBps for Link 5
-#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90  //!< Common NVLink Speed in MBps for active links
-
-#define NVML_FI_DEV_NVLINK_LINK_COUNT        91  //!< Number of NVLinks present on the device
-
-#define NVML_FI_DEV_RETIRED_PENDING_SBE      92  //!< If any pages are pending retirement due to SBE. 1=yes. 0=no.
-#define NVML_FI_DEV_RETIRED_PENDING_DBE      93  //!< If any pages are pending retirement due to DBE. 1=yes. 0=no.
-
-#define NVML_FI_DEV_PCIE_REPLAY_COUNTER             94  //!< PCIe replay counter
-#define NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER    95  //!< PCIe replay rollover counter
-
-#define NVML_FI_MAX 96 //!< One greater than the largest field ID defined above
-
-/**
- * Information for a Field Value Sample
- */
-typedef struct nvmlFieldValue_st
-{
-    unsigned int fieldId;       //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
-    unsigned int unused;        //!< Currently unused. This should be initialized to 0 by the caller before any API call
-    long long timestamp;        //!< CPU Timestamp of this value in microseconds since 1970
-    long long latencyUsec;      //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call.
-    nvmlValueType_t valueType;  //!< Type of the value stored in value
-    nvmlReturn_t nvmlReturn;    //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS
-    nvmlValue_t value;          //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
-} nvmlFieldValue_t;
-
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlUnitStructs Unit Structs
- *  @{
- */
-/***************************************************************************************************/
-
-typedef struct nvmlUnit_st* nvmlUnit_t;
-
-/** 
- * Description of HWBC entry 
- */
-typedef struct nvmlHwbcEntry_st 
-{
-    unsigned int hwbcId;
-    char firmwareVersion[32];
-} nvmlHwbcEntry_t;
-
-/** 
- * Fan state enum. 
- */
-typedef enum nvmlFanState_enum 
-{
-    NVML_FAN_NORMAL       = 0,     //!< Fan is working properly
-    NVML_FAN_FAILED       = 1      //!< Fan has failed
-} nvmlFanState_t;
-
-/** 
- * Led color enum. 
- */
-typedef enum nvmlLedColor_enum 
-{
-    NVML_LED_COLOR_GREEN       = 0,     //!< GREEN, indicates good health
-    NVML_LED_COLOR_AMBER       = 1      //!< AMBER, indicates problem
-} nvmlLedColor_t;
-
-
-/** 
- * LED states for an S-class unit.
- */
-typedef struct nvmlLedState_st 
-{
-    char cause[256];               //!< If amber, a text description of the cause
-    nvmlLedColor_t color;          //!< GREEN or AMBER
-} nvmlLedState_t;
-
-/** 
- * Static S-class unit info.
- */
-typedef struct nvmlUnitInfo_st 
-{
-    char name[96];                      //!< Product name
-    char id[96];                        //!< Product identifier
-    char serial[96];                    //!< Product serial number
-    char firmwareVersion[96];           //!< Firmware version
-} nvmlUnitInfo_t;
-
-/** 
- * Power usage information for an S-class unit.
- * The power supply state is a human readable string that equals "Normal" or contains
- * a combination of "Abnormal" plus one or more of the following:
- *    
- *    - High voltage
- *    - Fan failure
- *    - Heatsink temperature
- *    - Current limit
- *    - Voltage below UV alarm threshold
- *    - Low-voltage
- *    - SI2C remote off command
- *    - MOD_DISABLE input
- *    - Short pin transition 
-*/
-typedef struct nvmlPSUInfo_st 
-{
-    char state[256];                 //!< The power supply state
-    unsigned int current;            //!< PSU current (A)
-    unsigned int voltage;            //!< PSU voltage (V)
-    unsigned int power;              //!< PSU power draw (W)
-} nvmlPSUInfo_t;
-
-/** 
- * Fan speed reading for a single fan in an S-class unit.
- */
-typedef struct nvmlUnitFanInfo_st 
-{
-    unsigned int speed;              //!< Fan speed (RPM)
-    nvmlFanState_t state;            //!< Flag that indicates whether fan is working properly
-} nvmlUnitFanInfo_t;
-
-/** 
- * Fan speed readings for an entire S-class unit.
- */
-typedef struct nvmlUnitFanSpeeds_st 
-{
-    nvmlUnitFanInfo_t fans[24];      //!< Fan speed data for each fan
-    unsigned int count;              //!< Number of fans in unit
-} nvmlUnitFanSpeeds_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @addtogroup nvmlEvents 
- *  @{
- */
-/***************************************************************************************************/
-
-/** 
- * Handle to an event set
- */
-typedef struct nvmlEventSet_st* nvmlEventSet_t;
-
-/** @defgroup nvmlEventType Event Types
- * @{
- * Event Types which user can be notified about.
- * See description of particular functions for details.
- *
- * See \ref nvmlDeviceRegisterEvents and \ref nvmlDeviceGetSupportedEventTypes to check which devices 
- * support each event.
- *
- * Types can be combined with bitwise or operator '|' when passed to \ref nvmlDeviceRegisterEvents
- */
-//! Event about single bit ECC errors
-/**
- * \note A corrected texture memory error is not an ECC error, so it does not generate a single bit event
- */
-#define nvmlEventTypeSingleBitEccError     0x0000000000000001LL
-
-//! Event about double bit ECC errors
-/**
- * \note An uncorrected texture memory error is not an ECC error, so it does not generate a double bit event
- */
-#define nvmlEventTypeDoubleBitEccError     0x0000000000000002LL
-
-//! Event about PState changes
-/**
- *  \note On Fermi architecture PState changes are also an indicator that GPU is throttling down due to
- *  no work being executed on the GPU, power capping or thermal capping. In a typical situation,
- *  Fermi-based GPU should stay in P0 for the duration of the execution of the compute process.
- */
-#define nvmlEventTypePState                0x0000000000000004LL
-
-//! Event that Xid critical error occurred
-#define nvmlEventTypeXidCriticalError      0x0000000000000008LL
-
-//! Event about clock changes
-/**
- * Kepler only
- */
-#define nvmlEventTypeClock                 0x0000000000000010LL
-
-//! Mask with no events
-#define nvmlEventTypeNone                  0x0000000000000000LL
-//! Mask of all events
-#define nvmlEventTypeAll (nvmlEventTypeNone    \
-        | nvmlEventTypeSingleBitEccError       \
-        | nvmlEventTypeDoubleBitEccError       \
-        | nvmlEventTypePState                  \
-        | nvmlEventTypeClock                   \
-        | nvmlEventTypeXidCriticalError        \
-        )
-/** @} */
-
-/** 
- * Information about occurred event
- */
-typedef struct nvmlEventData_st
-{
-    nvmlDevice_t        device;         //!< Specific device where the event occurred
-    unsigned long long  eventType;      //!< Information about what specific event occurred
-    unsigned long long  eventData;      //!< Stores last XID error for the device in the event of nvmlEventTypeXidCriticalError, 
-                                        //  eventData is 0 for any other event. eventData is set as 999 for unknown xid error.
-} nvmlEventData_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @addtogroup nvmlClocksThrottleReasons
- *  @{
- */
-/***************************************************************************************************/
-
-/** Nothing is running on the GPU and the clocks are dropping to Idle state
- * \note This limiter may be removed in a later release
- */
-#define nvmlClocksThrottleReasonGpuIdle                   0x0000000000000001LL
-
-/** GPU clocks are limited by current setting of applications clocks
- *
- * @see nvmlDeviceSetApplicationsClocks
- * @see nvmlDeviceGetApplicationsClock
- */
-#define nvmlClocksThrottleReasonApplicationsClocksSetting 0x0000000000000002LL
-
-/** 
- * @deprecated Renamed to \ref nvmlClocksThrottleReasonApplicationsClocksSetting 
- *             as the name describes the situation more accurately.
- */
-#define nvmlClocksThrottleReasonUserDefinedClocks         nvmlClocksThrottleReasonApplicationsClocksSetting 
-
-/** SW Power Scaling algorithm is reducing the clocks below requested clocks 
- *
- * @see nvmlDeviceGetPowerUsage
- * @see nvmlDeviceSetPowerManagementLimit
- * @see nvmlDeviceGetPowerManagementLimit
- */
-#define nvmlClocksThrottleReasonSwPowerCap                0x0000000000000004LL
-
-/** HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
- * 
- * This is an indicator of:
- *   - temperature being too high
- *   - External Power Brake Assertion is triggered (e.g. by the system power supply)
- *   - Power draw is too high and Fast Trigger protection is reducing the clocks
- *   - May be also reported during PState or clock change
- *      - This behavior may be removed in a later release.
- *
- * @see nvmlDeviceGetTemperature
- * @see nvmlDeviceGetTemperatureThreshold
- * @see nvmlDeviceGetPowerUsage
- */
-#define nvmlClocksThrottleReasonHwSlowdown                0x0000000000000008LL
-
-/** Sync Boost
- *
- * This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
- * order to maximize performance per watt. All GPUs in the sync boost group
- * will boost to the minimum possible clocks across the entire group. Look at
- * the throttle reasons for other GPUs in the system to see why those GPUs are
- * holding this one at lower clocks.
- *
- */
-#define nvmlClocksThrottleReasonSyncBoost                 0x0000000000000010LL
-
-/** SW Thermal Slowdown
- *
- * This is an indicator of one or more of the following:
- *  - Current GPU temperature above the GPU Max Operating Temperature
- *  - Current memory temperature above the Memory Max Operating Temperature
- *
- */
-#define nvmlClocksThrottleReasonSwThermalSlowdown         0x0000000000000020LL
-
-/** HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
- * 
- * This is an indicator of:
- *   - temperature being too high
- *
- * @see nvmlDeviceGetTemperature
- * @see nvmlDeviceGetTemperatureThreshold
- * @see nvmlDeviceGetPowerUsage
- */
-#define nvmlClocksThrottleReasonHwThermalSlowdown         0x0000000000000040LL
-
-/** HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
- * 
- * This is an indicator of:
- *   - External Power Brake Assertion being triggered (e.g. by the system power supply)
- *
- * @see nvmlDeviceGetTemperature
- * @see nvmlDeviceGetTemperatureThreshold
- * @see nvmlDeviceGetPowerUsage
- */
-#define nvmlClocksThrottleReasonHwPowerBrakeSlowdown      0x0000000000000080LL
-
-/** GPU clocks are limited by current setting of Display clocks
- *
- * @see bug 1997531
- */
-#define nvmlClocksThrottleReasonDisplayClockSetting       0x0000000000000100LL
-
-/** Bit mask representing no clocks throttling
- *
- * Clocks are as high as possible.
- * */
-#define nvmlClocksThrottleReasonNone                      0x0000000000000000LL
-
-/** Bit mask representing all supported clocks throttling reasons 
- * New reasons might be added to this list in the future
- */
-#define nvmlClocksThrottleReasonAll (nvmlClocksThrottleReasonNone \
-      | nvmlClocksThrottleReasonGpuIdle                           \
-      | nvmlClocksThrottleReasonApplicationsClocksSetting         \
-      | nvmlClocksThrottleReasonSwPowerCap                        \
-      | nvmlClocksThrottleReasonHwSlowdown                        \
-      | nvmlClocksThrottleReasonSyncBoost                         \
-      | nvmlClocksThrottleReasonSwThermalSlowdown                 \
-      | nvmlClocksThrottleReasonHwThermalSlowdown                 \
-      | nvmlClocksThrottleReasonHwPowerBrakeSlowdown              \
-      | nvmlClocksThrottleReasonDisplayClockSetting               \
-)
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlAccountingStats Accounting Statistics
- *  @{
- *
- *  Set of APIs designed to provide per process information about usage of GPU.
- *
- *  @note All accounting statistics and accounting mode live in nvidia driver and reset 
- *        to default (Disabled) when driver unloads.
- *        It is advised to run with persistence mode enabled.
- *
- *  @note Enabling accounting mode has no negative impact on the GPU performance.
- */
-/***************************************************************************************************/
-
-/**
- * Describes accounting statistics of a process.
- */
-typedef struct nvmlAccountingStats_st {
-    unsigned int gpuUtilization;                //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU.
-                                                //! Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the life time of a
-                                                //! process (not just the last sample period).
-                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported
-    
-    unsigned int memoryUtilization;             //!< Percent of time over the process's lifetime during which global (device) memory was being read or written.
-                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported
-    
-    unsigned long long maxMemoryUsage;          //!< Maximum total memory in bytes that was ever allocated by the process.
-                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported
-    
-
-    unsigned long long time;                    //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if 
-                                                //!< the process is not terminated
-    
-    unsigned long long startTime;               //!< CPU Timestamp in usec representing start time for the process
-    
-    unsigned int isRunning;                     //!< Flag to represent if the process is running (1 for running, 0 for terminated)
-
-    unsigned int reserved[5];                   //!< Reserved for future use
-} nvmlAccountingStats_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpuConstants Vgpu Constants
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense
- */
-#define NVML_GRID_LICENSE_BUFFER_SIZE       128
-
-#define NVML_VGPU_NAME_BUFFER_SIZE          64
-
-#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3
-
-/*!
- * Macros for pGPU's virtualization capabilities bitfield.
- */
-#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION         0:0
-#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO      0x0
-#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES     0x1
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpuEnum Vgpu Enum
- *  @{
- */
-/***************************************************************************************************/
-
-/*!
- * Types of VM identifiers
- */
-typedef enum nvmlVgpuVmIdType {
-    NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID
-    NVML_VGPU_VM_ID_UUID = 1,      //!< VM ID represents UUID
-} nvmlVgpuVmIdType_t;
-
-/**
- * vGPU GUEST info state.
- */
-typedef enum nvmlVgpuGuestInfoState_enum
-{
-    NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0,  //!< Guest-dependent fields uninitialized
-    NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED   = 1,  //!< Guest-dependent fields initialized
-} nvmlVgpuGuestInfoState_t;
-
-/**
- * GRID license feature code
- */
-typedef enum {
-    NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1,         //!< Virtual GPU
-    NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2  //!< Virtual Workstation
-} nvmlGridLicenseFeatureCode_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpuStructs Vgpu Structs
- *  @{
- */
-/***************************************************************************************************/
-
-typedef unsigned int nvmlVgpuTypeId_t;
-
-typedef unsigned int nvmlVgpuInstance_t;
-
-/**
- * Structure to store Utilization Value and vgpuInstance
- */
-typedef struct nvmlVgpuInstanceUtilizationSample_st
-{
-    nvmlVgpuInstance_t vgpuInstance;    //!< vGPU Instance
-    unsigned long long timeStamp;       //!< CPU Timestamp in microseconds
-    nvmlValue_t smUtil;                 //!< SM (3D/Compute) Util Value
-    nvmlValue_t memUtil;                //!< Frame Buffer Memory Util Value
-    nvmlValue_t encUtil;                //!< Encoder Util Value
-    nvmlValue_t decUtil;                //!< Decoder Util Value
-} nvmlVgpuInstanceUtilizationSample_t;
-
-/**
- * Structure to store Utilization Value, vgpuInstance and subprocess information
- */
-typedef struct nvmlVgpuProcessUtilizationSample_st
-{
-    nvmlVgpuInstance_t vgpuInstance;                //!< vGPU Instance
-    unsigned int pid;                               //!< PID of process running within the vGPU VM
-    char processName[NVML_VGPU_NAME_BUFFER_SIZE];   //!< Name of process running within the vGPU VM
-    unsigned long long timeStamp;                   //!< CPU Timestamp in microseconds
-    unsigned int smUtil;                            //!< SM (3D/Compute) Util Value
-    unsigned int memUtil;                           //!< Frame Buffer Memory Util Value
-    unsigned int encUtil;                           //!< Encoder Util Value
-    unsigned int decUtil;                           //!< Decoder Util Value
-} nvmlVgpuProcessUtilizationSample_t;
-
-/**
- * Structure to store utilization value and process Id
- */
-typedef struct nvmlProcessUtilizationSample_st
-{
-    unsigned int pid;                   //!< PID of process
-    unsigned long long timeStamp;       //!< CPU Timestamp in microseconds
-    unsigned int smUtil;                //!< SM (3D/Compute) Util Value
-    unsigned int memUtil;               //!< Frame Buffer Memory Util Value
-    unsigned int encUtil;               //!< Encoder Util Value
-    unsigned int decUtil;               //!< Decoder Util Value
-} nvmlProcessUtilizationSample_t;
-
-/**
- * Structure containing GRID licensable feature information
- */
-typedef struct nvmlGridLicensableFeature_st
-{
-    nvmlGridLicenseFeatureCode_t    featureCode;                                 //!< Licensed feature code
-    unsigned int                    featureState;                                //!< Non-zero if feature is currently licensed, otherwise zero
-    char                            licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE];
-} nvmlGridLicensableFeature_t;
-
-/**
- * Structure to store GRID licensable features
- */
-typedef struct nvmlGridLicensableFeatures_st
-{
-    int                         isGridLicenseSupported;                                       //!< Non-zero if GRID Software Licensing is supported on the system, otherwise zero
-    unsigned int                licensableFeaturesCount;                                      //!< Entries returned in \a gridLicensableFeatures array
-    nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT];  //!< Array of GRID licensable features.
-} nvmlGridLicensableFeatures_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlEncoderStructs Encoder Structs
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Represents type of encoder for capacity can be queried
- */
-typedef enum nvmlEncoderQueryType_enum
-{
-    NVML_ENCODER_QUERY_H264 = 0,        //!< H264 encoder
-    NVML_ENCODER_QUERY_HEVC = 1,        //!< HEVC encoder
-}nvmlEncoderType_t;
-
-/**
- * Structure to hold encoder session data
- */
-typedef struct nvmlEncoderSessionInfo_st
-{
-    unsigned int       sessionId;       //!< Unique session ID
-    unsigned int       pid;             //!< Owning process ID
-    nvmlVgpuInstance_t vgpuInstance;    //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero)
-    nvmlEncoderType_t  codecType;       //!< Video encoder type
-    unsigned int       hResolution;     //!< Current encode horizontal resolution
-    unsigned int       vResolution;     //!< Current encode vertical resolution
-    unsigned int       averageFps;      //!< Moving average encode frames per second
-    unsigned int       averageLatency;  //!< Moving average encode latency in microseconds
-}nvmlEncoderSessionInfo_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures
-*  @{
-*/
-/***************************************************************************************************/
-
-/**
- * Represents frame buffer capture session type
- */
-typedef enum nvmlFBCSessionType_enum
-{
-    NVML_FBC_SESSION_TYPE_UNKNOWN = 0,     //!< Unknwon
-    NVML_FBC_SESSION_TYPE_TOSYS,           //!< ToSys
-    NVML_FBC_SESSION_TYPE_CUDA,            //!< Cuda
-    NVML_FBC_SESSION_TYPE_VID,             //!< Vid
-    NVML_FBC_SESSION_TYPE_HWENC,           //!< HEnc
-} nvmlFBCSessionType_t;
-
-/**
- * Structure to hold frame buffer capture sessions stats
- */
-typedef struct nvmlFBCStats_st
-{
-    unsigned int      sessionsCount;    //!< Total no of sessions
-    unsigned int      averageFPS;       //!< Moving average new frames captured per second
-    unsigned int      averageLatency;   //!< Moving average new frame capture latency in microseconds
-} nvmlFBCStats_t;
-
-#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED                0x00000001    //!< Bit specifying differential map state.
-#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED      0x00000002    //!< Bit specifying classification map state.
-#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT      0x00000004    //!< Bit specifying if capture was requested as non-blocking call.
-#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE     0x00000008    //!< Bit specifying if capture was requested as blocking call.
-#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT      0x00000010    //!< Bit specifying if capture was requested as blocking call with timeout period.
-
-/**
- * Structure to hold FBC session data
- */
-typedef struct nvmlFBCSessionInfo_st
-{
-    unsigned int          sessionId;                           //!< Unique session ID
-    unsigned int          pid;                                 //!< Owning process ID
-    nvmlVgpuInstance_t    vgpuInstance;                        //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero)
-    unsigned int          displayOrdinal;                      //!< Display identifier
-    nvmlFBCSessionType_t  sessionType;                         //!< Type of frame buffer capture session
-    unsigned int          sessionFlags;                        //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX).
-    unsigned int          hMaxResolution;                      //!< Max horizontal resolution supported by the capture session
-    unsigned int          vMaxResolution;                      //!< Max vertical resolution supported by the capture session
-    unsigned int          hResolution;                         //!< Horizontal resolution requested by caller in capture call
-    unsigned int          vResolution;                         //!< Vertical resolution requested by caller in capture call
-    unsigned int          averageFPS;                          //!< Moving average new frames captured per second
-    unsigned int          averageLatency;                      //!< Moving average new frame capture latency in microseconds
-} nvmlFBCSessionInfo_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDrainDefs definitions related to the drain state
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- *  Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu()
- */
-typedef enum nvmlDetachGpuState_enum
-{
-    NVML_DETACH_GPU_KEEP         = 0,
-    NVML_DETACH_GPU_REMOVE,
-} nvmlDetachGpuState_t;
-
-/**
- *  Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu()
- */
-typedef enum nvmlPcieLinkState_enum
-{
-    NVML_PCIE_LINK_KEEP         = 0,
-    NVML_PCIE_LINK_SHUT_DOWN,
-} nvmlPcieLinkState_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup
- * This chapter describes the methods that handle NVML initialization and cleanup.
- * It is the user's responsibility to call \ref nvmlInit() before calling any other methods, and 
- * nvmlShutdown() once NVML is no longer being used.
- *  @{
- */
-/***************************************************************************************************/
-
-#define NVML_INIT_FLAG_NO_GPUS      1   //!< Don't fail nvmlInit() when no GPUs are found
-#define NVML_INIT_FLAG_NO_ATTACH    2   //!< Don't attach GPUs
-
-/**
- * Initialize NVML, but don't initialize any GPUs yet.
- *
- * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values
- *       modifying the behaviour of nvmlInit().
- * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that
- *       did initialize all GPU devices in the system.
- *       
- * This allows NVML to communicate with a GPU
- * when other GPUs in the system are unstable or in a bad state.  When using this API, GPUs are
- * discovered and initialized in nvmlDeviceGetHandleBy* functions instead.
- * 
- * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in
- *       a bad or unstable state.
- * 
- * For all products.
- *
- * This method, should be called once before invoking any other methods in the library.
- * A reference count of the number of initializations is maintained.  Shutdown only occurs
- * when the reference count reaches zero.
- * 
- * @return 
- *         - \ref NVML_SUCCESS                   if NVML has been properly initialized
- *         - \ref NVML_ERROR_DRIVER_NOT_LOADED   if NVIDIA driver is not running
- *         - \ref NVML_ERROR_NO_PERMISSION       if NVML does not have permission to talk to the driver
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlInit(void);
-
-/**
- * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values
- *       modifying the behaviour of nvmlInit().
- *       Other than the "flags" parameter it is completely similar to \ref nvmlInit.
- *       
- * For all products.
- *
- * @param flags                                 behaviour modifier flags
- *
- * @return 
- *         - \ref NVML_SUCCESS                   if NVML has been properly initialized
- *         - \ref NVML_ERROR_DRIVER_NOT_LOADED   if NVIDIA driver is not running
- *         - \ref NVML_ERROR_NO_PERMISSION       if NVML does not have permission to talk to the driver
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags);
-
-/**
- * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit().
- * 
- * For all products.
- *
- * This method should be called after NVML work is done, once for each call to \ref nvmlInit()
- * A reference count of the number of initializations is maintained.  Shutdown only occurs
- * when the reference count reaches zero.  For backwards compatibility, no error is reported if
- * nvmlShutdown() is called more times than nvmlInit().
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if NVML has been properly shut down
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlShutdown(void);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlErrorReporting Error reporting
- * This chapter describes helper functions for error reporting routines.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Helper method for converting NVML error codes into readable strings.
- *
- * For all products.
- *
- * @param result                               NVML error code to convert
- *
- * @return String representation of the error.
- *
- */
-const DECLDIR char* nvmlErrorString(nvmlReturn_t result);
-/** @} */
-
-
-/***************************************************************************************************/
-/** @defgroup nvmlConstants Constants
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion
- */
-#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE       16
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID
- */
-#define NVML_DEVICE_UUID_BUFFER_SIZE                  80
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber
- */
-#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE           80
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion
- */
-#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE        80
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion
- */
-#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE          80
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName
- */
-#define NVML_DEVICE_NAME_BUFFER_SIZE                  64
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial
- */
-#define NVML_DEVICE_SERIAL_BUFFER_SIZE                30
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion
- */
-#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE         32
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlSystemQueries System Queries
- * This chapter describes the queries that NVML can perform against the local system. These queries
- * are not device-specific.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Retrieves the version of the system's graphics driver.
- * 
- * For all products.
- *
- * The version identifier is an alphanumeric string.  It will not exceed 80 characters in length
- * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
- *
- * @param version                              Reference in which to return the version identifier
- * @param length                               The maximum allowed length of the string returned in \a version
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
- */
-nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length);
-
-/**
- * Retrieves the version of the NVML library.
- * 
- * For all products.
- *
- * The version identifier is an alphanumeric string.  It will not exceed 80 characters in length
- * (including the NULL terminator).  See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE.
- *
- * @param version                              Reference in which to return the version identifier
- * @param length                               The maximum allowed length of the string returned in \a version
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
- */
-nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length);
-
-/**
- * Retrieves the version of the CUDA driver.
- *
- * For all products.
- *
- * The returned CUDA driver version is the same as the CUDA API
- * cuDriverGetVersion() would return on the system.
- *
- * @param cudaDriverVersion                    Reference in which to return the version identifier
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a cudaDriverVersion has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a cudaDriverVersion is NULL
- */
-nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion);
-
-/**
- * Gets name of the process with provided process id
- *
- * For all products.
- *
- * Returned process name is cropped to provided length.
- * name string is encoded in ANSI.
- *
- * @param pid                                  The identifier of the process
- * @param name                                 Reference in which to return the process name
- * @param length                               The maximum allowed length of the string returned in \a name
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a name has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a name is NULL or \a length is 0.
- *         - \ref NVML_ERROR_NOT_FOUND         if process doesn't exists
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlUnitQueries Unit Queries
- * This chapter describes that queries that NVML can perform against each unit. For S-class systems only.
- * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by 
- * calling \ref nvmlUnitGetHandleByIndex().
- *  @{
- */
-/***************************************************************************************************/
-
- /**
- * Retrieves the number of units in the system.
- *
- * For S-class products.
- *
- * @param unitCount                            Reference in which to return the number of units
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a unitCount has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unitCount is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount);
-
-/**
- * Acquire the handle for a particular unit, based on its index.
- *
- * For S-class products.
- *
- * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). 
- *   For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1.
- *
- * The order in which NVML enumerates units has no guarantees of consistency between reboots.
- *
- * @param index                                The index of the target unit, >= 0 and < \a unitCount
- * @param unit                                 Reference in which to return the unit handle
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a unit has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a index is invalid or \a unit is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit);
-
-/**
- * Retrieves the static information associated with a unit.
- *
- * For S-class products.
- *
- * See \ref nvmlUnitInfo_t for details on available unit info.
- *
- * @param unit                                 The identifier of the target unit
- * @param info                                 Reference in which to return the unit information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a info has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a info is NULL
- */
-nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info);
-
-/**
- * Retrieves the LED state associated with this unit.
- *
- * For S-class products.
- *
- * See \ref nvmlLedState_t for details on allowed states.
- *
- * @param unit                                 The identifier of the target unit
- * @param state                                Reference in which to return the current LED state
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a state has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a state is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlUnitSetLedState()
- */
-nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state);
-
-/**
- * Retrieves the PSU stats for the unit.
- *
- * For S-class products.
- *
- * See \ref nvmlPSUInfo_t for details on available PSU info.
- *
- * @param unit                                 The identifier of the target unit
- * @param psu                                  Reference in which to return the PSU information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a psu has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a psu is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu);
-
-/**
- * Retrieves the temperature readings for the unit, in degrees C.
- *
- * For S-class products.
- *
- * Depending on the product, readings may be available for intake (type=0), 
- * exhaust (type=1) and board (type=2).
- *
- * @param unit                                 The identifier of the target unit
- * @param type                                 The type of reading to take
- * @param temp                                 Reference in which to return the intake temperature
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a temp has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit or \a type is invalid or \a temp is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp);
-
-/**
- * Retrieves the fan speed readings for the unit.
- *
- * For S-class products.
- *
- * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info.
- *
- * @param unit                                 The identifier of the target unit
- * @param fanSpeeds                            Reference in which to return the fan speed information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a fanSpeeds has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid or \a fanSpeeds is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds);
-
-/**
- * Retrieves the set of GPU devices that are attached to the specified unit.
- *
- * For S-class products.
- *
- * The \a deviceCount argument is expected to be set to the size of the input \a devices array.
- *
- * @param unit                                 The identifier of the target unit
- * @param deviceCount                          Reference in which to provide the \a devices array size, and
- *                                             to return the number of attached GPU devices
- * @param devices                              Reference in which to return the references to the attached GPU devices
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a deviceCount and \a devices have been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit is invalid, either of \a deviceCount or \a devices is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices);
-
-/**
- * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
- * 
- * For S-class products.
- *
- * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array.
- * The HIC must be connected to an S-class system for it to be reported by this function.
- *
- * @param hwbcCount                            Size of hwbcEntries array
- * @param hwbcEntries                          Array holding information about hwbc
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a hwbcCount and \a hwbcEntries have been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if either \a hwbcCount or \a hwbcEntries is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small
- */
-nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries);
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceQueries Device Queries
- * This chapter describes that queries that NVML can perform against each device.
- * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by  
- * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(),
- * \ref nvmlDeviceGetHandleByPciBusId(). or \ref nvmlDeviceGetHandleByUUID(). 
- *  @{
- */
-/***************************************************************************************************/
-
- /**
- * Retrieves the number of compute devices in the system. A compute device is a single GPU.
- * 
- * For all products.
- *
- * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
- *       even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
- *       Update your code to handle this error, or use NVML 4.304 or older nvml header file.
- *       For backward binary compatibility reasons _v1 version of the API is still present in the shared
- *       library.
- *       Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
- *
- * @param deviceCount                          Reference in which to return the number of accessible devices
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a deviceCount has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a deviceCount is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount);
-
-/**
- * Acquire the handle for a particular device, based on its index.
- * 
- * For all products.
- *
- * Valid indices are derived from the \a accessibleDevices count returned by 
- *   \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices  
- *   are 0 and 1, corresponding to GPU 0 and GPU 1.
- *
- * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
- *   is recommended that devices be looked up by their PCI ids or UUID. See 
- *   \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId().
- *
- * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs if:
- *  - The target GPU is an SLI slave
- * 
- * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
- *       even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
- *       Update your code to handle this error, or use NVML 4.304 or older nvml header file.
- *       For backward binary compatibility reasons _v1 version of the API is still present in the shared
- *       library.
- *       Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
- *
- *       This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index.
- *       If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't
- *       need to worry about that.
- *
- * @param index                                The index of the target GPU, >= 0 and < \a accessibleDevices
- * @param device                               Reference in which to return the device handle
- * 
- * @return 
- *         - \ref NVML_SUCCESS                  if \a device has been set
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a index is invalid or \a device is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
- *         - \ref NVML_ERROR_NO_PERMISSION      if the user doesn't have permission to talk to this device
- *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- *
- * @see nvmlDeviceGetIndex
- * @see nvmlDeviceGetCount
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-
-/**
- * Acquire the handle for a particular device, based on its board serial number.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * This number corresponds to the value printed directly on the board, and to the value returned by
- *   \ref nvmlDeviceGetSerial().
- *
- * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor 
- *             of \ref nvmlDeviceGetHandleByUUID.
- *             For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT.
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs as it searches for the target GPU
- *
- * @param serial                               The board serial number of the target GPU
- * @param device                               Reference in which to return the device handle
- * 
- * @return 
- *         - \ref NVML_SUCCESS                  if \a device has been set
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a serial is invalid, \a device is NULL or more than one
- *                                              device has the same serial (dual GPU boards)
- *         - \ref NVML_ERROR_NOT_FOUND          if \a serial does not match a valid device on the system
- *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
- *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
- *         - \ref NVML_ERROR_GPU_IS_LOST        if any GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- *
- * @see nvmlDeviceGetSerial
- * @see nvmlDeviceGetHandleByUUID
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device);
-
-/**
- * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device.
- *
- * For all products.
- *
- * @param uuid                                 The UUID of the target GPU
- * @param device                               Reference in which to return the device handle
- * 
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs as it searches for the target GPU
- *
- * @return 
- *         - \ref NVML_SUCCESS                  if \a device has been set
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a uuid is invalid or \a device is null
- *         - \ref NVML_ERROR_NOT_FOUND          if \a uuid does not match a valid device on the system
- *         - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
- *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
- *         - \ref NVML_ERROR_GPU_IS_LOST        if any GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- *
- * @see nvmlDeviceGetUUID
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device);
-
-/**
- * Acquire the handle for a particular device, based on its PCI bus id.
- * 
- * For all products.
- *
- * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo().
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs if:
- *  - The target GPU is an SLI slave
- *
- * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND 
- *       instead of NVML_ERROR_NO_PERMISSION.
- *
- * @param pciBusId                             The PCI bus id of the target GPU
- * @param device                               Reference in which to return the device handle
- * 
- * @return 
- *         - \ref NVML_SUCCESS                  if \a device has been set
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a pciBusId is invalid or \a device is NULL
- *         - \ref NVML_ERROR_NOT_FOUND          if \a pciBusId does not match a valid device on the system
- *         - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables
- *         - \ref NVML_ERROR_NO_PERMISSION      if the user doesn't have permission to talk to this device
- *         - \ref NVML_ERROR_IRQ_ISSUE          if NVIDIA kernel detected an interrupt issue with the attached GPUs
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device);
-
-/**
- * Retrieves the name of this device. 
- * 
- * For all products.
- *
- * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not
- * exceed 64 characters in length (including the NULL terminator).  See \ref
- * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
- *
- * @param device                               The identifier of the target device
- * @param name                                 Reference in which to return the product name
- * @param length                               The maximum allowed length of the string returned in \a name
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a name has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a name is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length);
-
-/**
- * Retrieves the brand of this device.
- *
- * For all products.
- *
- * The type is a member of \ref nvmlBrandType_t defined above.
- *
- * @param device                               The identifier of the target device
- * @param type                                 Reference in which to return the product brand type
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a name has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a type is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type);
-
-/**
- * Retrieves the NVML index of this device.
- *
- * For all products.
- * 
- * Valid indices are derived from the \a accessibleDevices count returned by 
- *   \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices  
- *   are 0 and 1, corresponding to GPU 0 and GPU 1.
- *
- * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
- *   is recommended that devices be looked up by their PCI ids or GPU UUID. See 
- *   \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID().
- *
- * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
- *
- * @param device                               The identifier of the target device
- * @param index                                Reference in which to return the NVML index of the device
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a index has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a index is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetHandleByIndex()
- * @see nvmlDeviceGetCount()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index);
-
-/**
- * Retrieves the globally unique board serial number associated with this device's board.
- *
- * For all products with an inforom.
- *
- * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator).
- * This number matches the serial number tag that is physically attached to the board.  See \ref
- * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE.
- *
- * @param device                               The identifier of the target device
- * @param serial                               Reference in which to return the board/module serial number
- * @param length                               The maximum allowed length of the string returned in \a serial
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a serial has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a serial is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length);
-
-/**
- * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device
- * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
- *     result[0] = 0x3, result[1] = 0x3
- *
- * For Kepler &tm; or newer fully supported devices.
- * Supported on Linux only.
- *
- * @param device                               The identifier of the target device
- * @param cpuSetSize                           The size of the cpuSet array that is safe to access
- * @param cpuSet                               Array reference in which to return a bitmask of CPUs, 64 CPUs per 
- *                                                 unsigned long on 64-bit machines, 32 on 32-bit machines
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a cpuAffinity has been filled
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet);
-
-/**
- * Sets the ideal affinity for the calling thread and device using the guidelines 
- * given in nvmlDeviceGetCpuAffinity().  Note, this is a change as of version 8.0.  
- * Older versions set the affinity for a calling process and all children.
- * Currently supports up to 64 processors.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Supported on Linux only.
- *
- * @param device                               The identifier of the target device
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the calling process has been successfully bound
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-
-/**
- * Clear all affinity bindings for the calling thread.  Note, this is a change as of version
- * 8.0 as older versions cleared the affinity for a calling process and all children.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Supported on Linux only.
- *
- * @param device                               The identifier of the target device
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the calling process has been successfully unbound
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device);
-
-/**
- * Retrieve the common ancestor for two devices
- * For all products.
- * Supported on Linux only.
- *
- * @param device1                              The identifier of the first device
- * @param device2                              The identifier of the second device
- * @param pathInfo                             A \ref nvmlGpuTopologyLevel_t that gives the path type
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a pathInfo has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device1, or \a device2 is invalid, or \a pathInfo is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
- *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo);
-
-/**
- * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level
- * For all products.
- * Supported on Linux only.
- *
- * @param device                               The identifier of the first device
- * @param level                                The \ref nvmlGpuTopologyLevel_t level to search for other GPUs
- * @param count                                When zero, is set to the number of matching GPUs such that \a deviceArray 
- *                                             can be malloc'd.  When non-zero, \a deviceArray will be filled with \a count
- *                                             number of device handles.
- * @param deviceArray                          An array of device handles for GPUs found at \a level
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a deviceArray or \a count (if initially zero) has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
- *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray);
-
-/**
- * Retrieve the set of GPUs that have a CPU affinity with the given CPU number
- * For all products.
- * Supported on Linux only.
- *
- * @param cpuNumber                            The CPU number
- * @param count                                When zero, is set to the number of matching GPUs such that \a deviceArray 
- *                                             can be malloc'd.  When non-zero, \a deviceArray will be filled with \a count
- *                                             number of device handles.
- * @param deviceArray                          An array of device handles for GPUs found with affinity to \a cpuNumber
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a deviceArray or \a count (if initially zero) has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or OS does not support this feature
- *         - \ref NVML_ERROR_UNKNOWN           an error has occurred in underlying topology discovery
- */
-nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray);
-
-/**
- * Retrieve the status for a given p2p capability index between a given pair of GPU 
- * 
- * @param device1                              The first device 
- * @param device2                              The second device
- * @param p2pIndex                             p2p Capability Index being looked for between \a device1 and \a device2
- * @param p2pStatus                            Reference in which to return the status of the \a p2pIndex 
- *                                             between \a device1 and \a device2
- * @return 
- *         - \ref NVML_SUCCESS         if \a p2pStatus has been populated
- *         - \ref NVML_ERROR_INVALID_ARGUMENT     if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL
- *         - \ref NVML_ERROR_UNKNOWN              on any unexpected error
- */ 
-nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus);
-
-/**
- * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string,
- * that augments the immutable, board serial identifier.
- *
- * For all products.
- *
- * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products.
- * It does NOT correspond to any identifier printed on the board.  It will not exceed 80 characters in length
- * (including the NULL terminator).  See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
- *
- * @param device                               The identifier of the target device
- * @param uuid                                 Reference in which to return the GPU UUID
- * @param length                               The maximum allowed length of the string returned in \a uuid
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a uuid has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a uuid is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);
-
-/**
- * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for 
- * each GPU will have the form /dev/nvidia[minor number].
- *
- * For all products.
- * Supported only for Linux
- *
- * @param device                                The identifier of the target device
- * @param minorNumber                           Reference in which to return the minor number for the device
- * @return
- *         - \ref NVML_SUCCESS                 if the minor number is successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minorNumber is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber);
-
-/**
- * Retrieves the the device board part number which is programmed into the board's InfoROM
- *
- * For all products.
- *
- * @param device                                Identifier of the target device
- * @param partNumber                            Reference to the buffer to return
- * @param length                                Length of the buffer reference
- *
- * @return
- *         - \ref NVML_SUCCESS                  if \a partNumber has been set
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_NOT_SUPPORTED      if the needed VBIOS fields have not been filled
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid or \a serial is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length);
-
-/**
- * Retrieves the version information for the device's infoROM object.
- *
- * For all products with an inforom.
- *
- * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate 
- * ECC counts. The version of the data structures in this memory may change from time to time. It will not
- * exceed 16 characters in length (including the NULL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
- *
- * See \ref nvmlInforomObject_t for details on the available infoROM objects.
- *
- * @param device                               The identifier of the target device
- * @param object                               The target infoROM object
- * @param version                              Reference in which to return the infoROM version
- * @param length                               The maximum allowed length of the string returned in \a version
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetInforomImageVersion
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length);
-
-/**
- * Retrieves the global infoROM image version
- *
- * For all products with an inforom.
- *
- * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board 
- * in contrast to infoROM object version which is only an indicator of supported features.
- * Version string will not exceed 16 characters in length (including the NULL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
- *
- * @param device                               The identifier of the target device
- * @param version                              Reference in which to return the infoROM image version
- * @param length                               The maximum allowed length of the string returned in \a version
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a version is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have an infoROM
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetInforomVersion
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length);
-
-/**
- * Retrieves the checksum of the configuration stored in the device's infoROM.
- *
- * For all products with an inforom.
- *
- * Can be used to make sure that two GPUs have the exact same configuration.
- * Current checksum takes into account configuration stored in PWR and ECC infoROM objects.
- * Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC)
- *
- * @param device                               The identifier of the target device
- * @param checksum                             Reference in which to return the infoROM configuration checksum
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a checksum has been set
- *         - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a checksum is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error 
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum);
-
-/**
- * Reads the infoROM from the flash and verifies the checksums.
- *
- * For all products with an inforom.
- *
- * @param device                               The identifier of the target device
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if infoROM is not corrupted
- *         - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error 
- */
-nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device);
-
-/**
- * Retrieves the display mode for the device.
- *
- * For all products.
- *
- * This method indicates whether a physical display (e.g. monitor) is currently connected to
- * any of the device's connectors.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device                               The identifier of the target device
- * @param display                              Reference in which to return the display mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a display has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a display is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display);
-
-/**
- * Retrieves the display active state for the device.
- *
- * For all products.
- *
- * This method indicates whether a display is initialized on the device.
- * For example whether X Server is attached to this device and has allocated memory for the screen.
- *
- * Display can be active even when no monitor is physically attached.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device                               The identifier of the target device
- * @param isActive                             Reference in which to return the display active state
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a isActive has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isActive is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive);
-
-/**
- * Retrieves the persistence mode associated with this device.
- *
- * For all products.
- * For Linux only.
- *
- * When driver persistence mode is enabled the driver software state is not torn down when the last 
- * client disconnects. By default this feature is disabled. 
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device                               The identifier of the target device
- * @param mode                                 Reference in which to return the current driver persistence mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a mode has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceSetPersistenceMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode);
-
-/**
- * Retrieves the PCI attributes of this device.
- * 
- * For all products.
- *
- * See \ref nvmlPciInfo_t for details on the available PCI info.
- *
- * @param device                               The identifier of the target device
- * @param pci                                  Reference in which to return the PCI info
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a pci has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pci is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
-
-/**
- * Retrieves the maximum PCIe link generation possible with this device and system
- *
- * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will
- * report is generation 1.
- * 
- * For Fermi &tm; or newer fully supported devices.
- * 
- * @param device                               The identifier of the target device
- * @param maxLinkGen                           Reference in which to return the max PCIe link generation
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a maxLinkGen has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkGen is null
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen);
-
-/**
- * Retrieves the maximum PCIe link width possible with this device and system
- *
- * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report
- * a max link width of 8.
- * 
- * For Fermi &tm; or newer fully supported devices.
- * 
- * @param device                               The identifier of the target device
- * @param maxLinkWidth                         Reference in which to return the max PCIe link generation
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a maxLinkWidth has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a maxLinkWidth is null
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth);
-
-/**
- * Retrieves the current PCIe link generation
- * 
- * For Fermi &tm; or newer fully supported devices.
- * 
- * @param device                               The identifier of the target device
- * @param currLinkGen                          Reference in which to return the current PCIe link generation
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a currLinkGen has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a currLinkGen is null
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen);
-
-/**
- * Retrieves the current PCIe link width
- * 
- * For Fermi &tm; or newer fully supported devices.
- * 
- * @param device                               The identifier of the target device
- * @param currLinkWidth                        Reference in which to return the current PCIe link generation
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a currLinkWidth has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a currLinkWidth is null
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if PCIe link information is not available
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth);
-
-/**
- * Retrieve PCIe utilization information.
- * This function is querying a byte counter over a 20ms interval and thus is the 
- *   PCIe throughput over that interval.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * This method is not supported in virtual machines running virtual GPU (vGPU).
- *
- * @param device                               The identifier of the target device
- * @param counter                              The specific counter that should be queried \ref nvmlPcieUtilCounter_t
- * @param value                                Reference in which to return throughput in KB/s
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a value has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a counter is invalid, or \a value is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value);
-
-/**  
- * Retrieve the PCIe replay counter.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param value                                Reference in which to return the counter's value
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a value has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a value is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value);
-
-/**
- * Retrieves the current clock speeds for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlClockType_t for details on available clock information.
- *
- * @param device                               The identifier of the target device
- * @param type                                 Identify which clock domain to query
- * @param clock                                Reference in which to return the clock speed in MHz
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a clock has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device cannot report the specified clock
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-
-/**
- * Retrieves the maximum clock speeds for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlClockType_t for details on available clock information.
- *
- * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks
- *       by few MHz.
- *
- * @param device                               The identifier of the target device
- * @param type                                 Identify which clock domain to query
- * @param clock                                Reference in which to return the clock speed in MHz
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a clock has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device cannot report the specified clock
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-
-/**
- * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs.
- * Can be changed using \ref nvmlDeviceSetApplicationsClocks.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param clockType                            Identify which clock domain to query
- * @param clockMHz                             Reference in which to return the clock in MHz
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
-
-/**
- * Retrieves the default applications clock that GPU boots with or 
- * defaults to after \ref nvmlDeviceResetApplicationsClocks call.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param clockType                            Identify which clock domain to query
- * @param clockMHz                             Reference in which to return the default clock in MHz
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * \see nvmlDeviceGetApplicationsClock
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
-
-/**
- * Resets the application clock to the default value
- *
- * This is the applications clock that will be used after system reboot or driver reload.
- * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks.
- *
- * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks,
- * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above
- * base clocks as thermal limits allow.
- *
- * @see nvmlDeviceGetApplicationsClock
- * @see nvmlDeviceSetApplicationsClocks
- *
- * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- *
- * @param device                               The identifier of the target device
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if new settings were successfully set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device);
-
-/**
- * Retrieves the clock speed for the clock specified by the clock type and clock ID.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param clockType                            Identify which clock domain to query
- * @param clockId                              Identify which clock in the domain to query
- * @param clockMHz                             Reference in which to return the clock in MHz
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz);
-
-/**
- * Retrieves the customer defined maximum boost clock speed specified by the given clock type.
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param clockType                            Identify which clock domain to query
- * @param clockMHz                             Reference in which to return the clock in MHz
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a clockMHz has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device or the \a clockType on this device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
-
-/**
- * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param count                                Reference in which to provide the \a clocksMHz array size, and
- *                                             to return the number of elements
- * @param clocksMHz                            Reference in which to return the clock in MHz
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a count and \a clocksMHz have been populated 
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of
- *                                                required elements)
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceSetApplicationsClocks
- * @see nvmlDeviceGetSupportedGraphicsClocks
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz);
-
-/**
- * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param memoryClockMHz                       Memory clock for which to return possible graphics clocks
- * @param count                                Reference in which to provide the \a clocksMHz array size, and
- *                                             to return the number of elements
- * @param clocksMHz                            Reference in which to return the clocks in MHz
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a count and \a clocksMHz have been populated 
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_NOT_FOUND         if the specified \a memoryClockMHz is not a supported frequency
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clock is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small 
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceSetApplicationsClocks
- * @see nvmlDeviceGetSupportedMemoryClocks
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz);
-
-/**
- * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
- * to maximize performance as thermal limits allow.
- *
- * On Pascal and newer hardware, Auto Aoosted clocks are controlled through application clocks.
- * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
- * behavior.
- *
- * @param device                               The identifier of the target device
- * @param isEnabled                            Where to store the current state of Auto Boosted clocks of the target device
- * @param defaultIsEnabled                     Where to store the default Auto Boosted clocks behavior of the target device that the device will
- *                                                 revert to when no applications are using the GPU
- *
- * @return
- *         - \ref NVML_SUCCESS                 If \a isEnabled has been been set with the Auto Boosted clocks state of \a device
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isEnabled is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support Auto Boosted clocks
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled);
-
-/**
- * Try to set the current state of Auto Boosted clocks on a device.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
- * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
- * rates are desired.
- *
- * Non-root users may use this API by default but can be restricted by root from using this API by calling
- * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS.
- * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled.
- *
- * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
- * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
- * behavior.
- *
- * @param device                               The identifier of the target device
- * @param enabled                              What state to try to set Auto Boosted clocks of the target device to
- *
- * @return
- *         - \ref NVML_SUCCESS                 If the Auto Boosted clocks were successfully set to the state specified by \a enabled
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support Auto Boosted clocks
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled);
-
-/**
- * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will
- * return to when no compute running processes (e.g. CUDA application which have an active context) are running
- *
- * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- * Requires root/admin permissions.
- *
- * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
- * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
- * rates are desired.
- *
- * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
- * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
- * behavior.
- *
- * @param device                               The identifier of the target device
- * @param enabled                              What state to try to set default Auto Boosted clocks of the target device to
- * @param flags                                Flags that change the default behavior. Currently Unused.
- *
- * @return
- *         - \ref NVML_SUCCESS                 If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_NO_PERMISSION     If the calling user does not have permission to change Auto Boosted clock's default state.
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support Auto Boosted clocks
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags);
-
-
-/**
- * Retrieves the intended operating speed of the device's fan.
- *
- * Note: The reported speed is the intended fan speed.  If the fan is physically blocked and unable to spin, the
- * output will not match the actual fan speed.
- * 
- * For all discrete products with dedicated fans.
- *
- * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%.
- *
- * @param device                               The identifier of the target device
- * @param speed                                Reference in which to return the fan speed percentage
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a speed has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a speed is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a fan
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed);
-
-/**
- * Retrieves the current temperature readings for the device, in degrees C. 
- * 
- * For all products.
- *
- * See \ref nvmlTemperatureSensors_t for details on available temperature sensors.
- *
- * @param device                               The identifier of the target device
- * @param sensorType                           Flag that indicates which sensor reading to retrieve
- * @param temp                                 Reference in which to return the temperature reading
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a temp has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a sensorType is invalid or \a temp is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have the specified sensor
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp);
-
-/**
- * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
- *
- * @param device                               The identifier of the target device
- * @param thresholdType                        The type of threshold value queried
- * @param temp                                 Reference in which to return the temperature reading
- * @return
- *         - \ref NVML_SUCCESS                 if \a temp has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not have a temperature sensor or is unsupported
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
-
-/**
- * Retrieves the current performance state for the device. 
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlPstates_t for details on allowed performance states.
- *
- * @param device                               The identifier of the target device
- * @param pState                               Reference in which to return the performance state reading
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a pState has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pState is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState);
-
-/**
- * Retrieves current clocks throttling reasons.
- *
- * For all fully supported products.
- *
- * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once.
- *
- * @param device                                The identifier of the target device
- * @param clocksThrottleReasons                 Reference in which to return bitmask of active clocks throttle
- *                                                  reasons
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a clocksThrottleReasons has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a clocksThrottleReasons is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlClocksThrottleReasons
- * @see nvmlDeviceGetSupportedClocksThrottleReasons
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
-
-/**
- * Retrieves bitmask of supported clocks throttle reasons that can be returned by 
- * \ref nvmlDeviceGetCurrentClocksThrottleReasons
- *
- * For all fully supported products.
- *
- * This method is not supported in virtual machines running virtual GPU (vGPU).
- *
- * @param device                               The identifier of the target device
- * @param supportedClocksThrottleReasons       Reference in which to return bitmask of supported
- *                                              clocks throttle reasons
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a supportedClocksThrottleReasons has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a supportedClocksThrottleReasons is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlClocksThrottleReasons
- * @see nvmlDeviceGetCurrentClocksThrottleReasons
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
-
-/**
- * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization.
- *
- * Retrieve the current performance state for the device. 
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlPstates_t for details on allowed performance states.
- *
- * @param device                               The identifier of the target device
- * @param pState                               Reference in which to return the performance state reading
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a pState has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pState is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState);
-
-/**
- * This API has been deprecated.
- *
- * Retrieves the power management mode associated with this device.
- *
- * For products from the Fermi family.
- *     - Requires \a NVML_INFOROM_POWER version 3.0 or higher.
- *
- * For from the Kepler or newer families.
- *     - Does not require \a NVML_INFOROM_POWER object.
- *
- * This flag indicates whether any power management algorithm is currently active on the device. An 
- * enabled state does not necessarily mean the device is being actively throttled -- only that 
- * that the driver will do so if the appropriate conditions are met.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device                               The identifier of the target device
- * @param mode                                 Reference in which to return the current power management mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a mode has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode);
-
-/**
- * Retrieves the power management limit associated with this device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * The power limit defines the upper boundary for the card's power draw. If
- * the card's total power draw reaches this limit the power management algorithm kicks in.
- *
- * This reading is only available if power management mode is supported. 
- * See \ref nvmlDeviceGetPowerManagementMode.
- *
- * @param device                               The identifier of the target device
- * @param limit                                Reference in which to return the power management limit in milliwatts
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a limit has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit);
-
-/**
- * Retrieves information about possible values of power management limits on this device.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param minLimit                             Reference in which to return the minimum power management limit in milliwatts
- * @param maxLimit                             Reference in which to return the maximum power management limit in milliwatts
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a minLimit and \a maxLimit have been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minLimit or \a maxLimit is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceSetPowerManagementLimit
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
-
-/**
- * Retrieves default power management limit on this device, in milliwatts.
- * Default power management limit is a power management limit that the device boots with.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param defaultLimit                         Reference in which to return the default power management limit in milliwatts
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a defaultLimit has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a defaultLimit is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit);
-
-/**
- * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
- *
- * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode.
- *
- * @param device                               The identifier of the target device
- * @param power                                Reference in which to return the power usage information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a power has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a power is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support power readings
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power);
-
-/**
- * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
- *
- * For newer than Pascal &tm; fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param energy                               Reference in which to return the energy consumption information
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a energy has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a energy is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support energy readings
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy);
-
-/**
- * Get the effective power limit that the driver enforces after taking into account all limiters
- *
- * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere
- * This includes the out of band power limit interface
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                           The device to communicate with
- * @param limit                            Reference in which to return the power management limit in milliwatts
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a limit has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit);
-
-/**
- * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot).
- *
- * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
- * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
- * Not supported on Quadro &reg; and Tesla &tm; C-class products.
- *
- * @param device                               The identifier of the target device
- * @param current                              Reference in which to return the current GOM
- * @param pending                              Reference in which to return the pending GOM
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a mode has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a current or \a pending is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlGpuOperationMode_t
- * @see nvmlDeviceSetGpuOperationMode
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending);
-
-/**
- * Retrieves the amount of used, free and total memory available on the device, in bytes.
- * 
- * For all products.
- *
- * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
- * Under WDDM most device memory is allocated and managed on startup by Windows.
- *
- * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated 
- * by all active channels on the device.
- *
- * See \ref nvmlMemory_t for details on available memory info.
- *
- * @param device                               The identifier of the target device
- * @param memory                               Reference in which to return the memory information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a memory has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memory is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
-
-/**
- * Retrieves the current compute mode for the device.
- *
- * For all products.
- *
- * See \ref nvmlComputeMode_t for details on allowed compute modes.
- *
- * @param device                               The identifier of the target device
- * @param mode                                 Reference in which to return the current compute mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a mode has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceSetComputeMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode);
-
-/**
- * Retrieves the CUDA compute capability of the device.
- *
- * For all products.
- *
- * Returns the major and minor compute capability version numbers of the
- * device.  The major and minor versions are equivalent to the
- * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and
- * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be
- * returned by CUDA's cuDeviceGetAttribute().
- *
- * @param device                               The identifier of the target device
- * @param major                                Reference in which to return the major CUDA compute capability
- * @param minor                                Reference in which to return the minor CUDA compute capability
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a major and \a minor have been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a major or \a minor are NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor);
-
-/**
- * Retrieves the current and pending ECC modes for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
- *
- * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following
- * the next reboot.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device                               The identifier of the target device
- * @param current                              Reference in which to return the current ECC mode
- * @param pending                              Reference in which to return the pending ECC mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a current and \a pending have been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or either \a current or \a pending is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceSetEccMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending);
-
-/**
- * Retrieves the device boardId from 0-N.
- * Devices with the same boardId indicate GPUs connected to the same PLX.  Use in conjunction with 
- *  \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well.
- *  The boardId returned is a unique ID for the current configuration.  Uniqueness and ordering across 
- *  reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and
- *  the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will 
- *  always return those values but they will always be different from each other).
- *  
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param boardId                              Reference in which to return the device's board ID
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a boardId has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a boardId is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId);
-
-/**
- * Retrieves whether the device is on a Multi-GPU Board
- * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param multiGpuBool                         Reference in which to return a zero or non-zero value
- *                                                 to indicate whether the device is on a multi GPU board
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a multiGpuBool has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a multiGpuBool is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool);
-
-/**
- * Retrieves the total ECC error counts for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
- * Requires ECC Mode to be enabled.
- *
- * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of 
- * errors across the entire device.
- *
- * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
- * See \ref nvmlEccCounterType_t for a description of available counter types.
- *
- * @param device                               The identifier of the target device
- * @param errorType                            Flag that specifies the type of the errors. 
- * @param counterType                          Flag that specifies the counter-type of the errors. 
- * @param eccCounts                            Reference in which to return the specified ECC errors
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a eccCounts has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceClearEccErrorCounts()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts);
-
-/**
- * Retrieves the detailed ECC error counts for the device.
- *
- * @deprecated   This API supports only a fixed set of ECC error locations
- *               On different GPU architectures different locations are supported
- *               See \ref nvmlDeviceGetMemoryErrorCounter
- *
- * For Fermi &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts.
- * Requires ECC Mode to be enabled.
- *
- * Detailed errors provide separate ECC counts for specific parts of the memory system.
- *
- * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported.
- *
- * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n
- * See \ref nvmlEccCounterType_t for a description of available counter types.\n
- * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts.
- *
- * @param device                               The identifier of the target device
- * @param errorType                            Flag that specifies the type of the errors. 
- * @param counterType                          Flag that specifies the counter-type of the errors. 
- * @param eccCounts                            Reference in which to return the specified ECC errors
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a eccCounts has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceClearEccErrorCounts()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts);
-
-/**
- * Retrieves the requested memory error counter for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts.
- *
- * Only applicable to devices with ECC.
- *
- * Requires ECC Mode to be enabled.
- *
- * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
- * See \ref nvmlEccCounterType_t for a description of available counter types.\n
- * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n
- * 
- * @param device                               The identifier of the target device
- * @param errorType                            Flag that specifies the type of error.
- * @param counterType                          Flag that specifies the counter-type of the errors. 
- * @param locationType                         Specifies the location of the counter. 
- * @param count                                Reference in which to return the ECC counter
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a count has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a bitTyp,e \a counterType or \a locationType is
- *                                             invalid, or \a count is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support ECC error reporting in the specified memory
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
-                                                   nvmlEccCounterType_t counterType,
-                                                   nvmlMemoryLocation_t locationType, unsigned long long *count);
-
-/**
- * Retrieves the current utilization rates for the device's major subsystems.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlUtilization_t for details on available utilization rates.
- *
- * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
- *       This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
- *
- * @param device                               The identifier of the target device
- * @param utilization                          Reference in which to return the utilization information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a utilization has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a utilization is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
-
-/**
- * Retrieves the current utilization and sampling size in microseconds for the Encoder
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param utilization                          Reference to an unsigned int for encoder utilization info
- * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in US
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a utilization has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
-
-/**
- * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param device                            The identifier of the target device
- * @param encoderQueryType                  Type of encoder to query
- * @param encoderCapacity                   Reference to an unsigned int for the encoder capacity
- * 
- * @return
- *         - \ref NVML_SUCCESS                  if \a encoderCapacity is fetched
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a encoderCapacity is NULL, or \a device or \a encoderQueryType
- *                                              are invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED      if device does not support the encoder specified in \a encodeQueryType
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity);
-
-/**
- * Retrieves the current encoder statistics for a given device.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param device                            The identifier of the target device
- * @param sessionCount                      Reference to an unsigned int for count of active encoder sessions
- * @param averageFps                        Reference to an unsigned int for trailing average FPS of all active sessions
- * @param averageLatency                    Reference to an unsigned int for encode latency in microseconds
- * 
- * @return
- *         - \ref NVML_SUCCESS                  if \a sessionCount, \a averageFps and \a averageLatency is fetched
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount, or \a device or \a averageFps,
- *                                              or \a averageLatency is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount,
-                                                unsigned int *averageFps, unsigned int *averageLatency);
-
-/**
- * Retrieves information about active encoder sessions on a target device.
- *
- * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The
- * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
- * written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the active session array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
- * To query the number of active encoder sessions, call this function with *sessionCount = 0.  The code will return
- * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param device                            The identifier of the target device
- * @param sessionCount                      Reference to caller supplied array size, and returns the number of sessions.
- * @param sessionInfos                      Reference in which to return the session information
- * 
- * @return
- *         - \ref NVML_SUCCESS                  if \a sessionInfos is fetched
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is returned in \a sessionCount
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount is NULL.
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos);
-
-/**
- * Retrieves the current utilization and sampling size in microseconds for the Decoder
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param utilization                          Reference to an unsigned int for decoder utilization info
- * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in US
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a utilization has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
-
-/**
-* Retrieves the active frame buffer capture sessions statistics for a given device.
-*
-* For Maxwell &tm; or newer fully supported devices.
-*
-* @param device                            The identifier of the target device
-* @param fbcStats                          Reference to nvmlFBCStats_t structure contianing NvFBC stats
-*
-* @return
-*         - \ref NVML_SUCCESS                  if \a fbcStats is fetched
-*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
-*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a fbcStats is NULL
-*         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
-*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
-*/
-nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats);
-
-/**
-* Retrieves information about active frame buffer capture sessions on a target device.
-*
-* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
-* array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
-* written to the buffer.
-*
-* If the supplied buffer is not large enough to accomodate the active session array, the function returns
-* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
-* To query the number of active FBC sessions, call this function with *sessionCount = 0.  The code will return
-* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
-*
-* For Maxwell &tm; or newer fully supported devices.
-*
-* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
-*       be zero if there are no new frames captured since the session started.
-*
-* @param device                            The identifier of the target device
-* @param sessionCount                      Reference to caller supplied array size, and returns the number of sessions.
-* @param sessionInfo                       Reference in which to return the session information
-*
-* @return
-*         - \ref NVML_SUCCESS                  if \a sessionInfo is fetched
-*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
-*         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is returned in \a sessionCount
-*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount is NULL.
-*         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
-*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
-*/
-nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
-
-/**
- * Retrieves the current and pending driver model for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * For windows only.
- *
- * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
- * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached.
- *
- * See \ref nvmlDriverModel_t for details on available driver models.
- *
- * @param device                               The identifier of the target device
- * @param current                              Reference in which to return the current driver model
- * @param pending                              Reference in which to return the pending driver model
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if either \a current and/or \a pending have been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or both \a current and \a pending are NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not windows
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlDeviceSetDriverModel()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
-
-/**
- * Get VBIOS version of the device.
- *
- * For all products.
- *
- * The VBIOS version may change from time to time. It will not exceed 32 characters in length 
- * (including the NULL terminator).  See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE.
- *
- * @param device                               The identifier of the target device
- * @param version                              Reference to which to return the VBIOS version
- * @param length                               The maximum allowed length of the string returned in \a version
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a version is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small 
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length);
-
-/**
- * Get Bridge Chip Information for all the bridge chips on the board.
- * 
- * For all fully supported products.
- * Only applicable to multi-GPU products.
- * 
- * @param device                                The identifier of the target device
- * @param bridgeHierarchy                       Reference to the returned bridge chip Hierarchy
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if bridge chip exists
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a bridgeInfo is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if bridge chip not supported on the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy);
-
-/**
- * Get information about processes with a compute context on a device
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * This function returns information only about compute running processes (e.g. CUDA application which have
- * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
- *
- * To query the current number of running compute processes, call this function with *infoCount = 0. The
- * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
- * \a infos is allowed to be NULL.
- *
- * The usedGpuMemory field returned is all of the memory used by the application.
- *
- * Keep in mind that information returned by this call is dynamic and the number of elements might change in
- * time. Allocate more space for \a infos table in case new compute processes are spawned.
- *
- * @param device                               The identifier of the target device
- * @param infoCount                            Reference in which to provide the \a infos array size, and
- *                                             to return the number of returned elements
- * @param infos                                Reference in which to return the process information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
- *                                             \a infoCount will contain minimal amount of space necessary for
- *                                             the call to complete
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see \ref nvmlSystemGetProcessName
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
-
-/**
- * Get information about processes with a graphics context on a device
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * This function returns information only about graphics based processes 
- * (eg. applications using OpenGL, DirectX)
- *
- * To query the current number of running graphics processes, call this function with *infoCount = 0. The
- * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
- * \a infos is allowed to be NULL.
- *
- * The usedGpuMemory field returned is all of the memory used by the application.
- *
- * Keep in mind that information returned by this call is dynamic and the number of elements might change in
- * time. Allocate more space for \a infos table in case new graphics processes are spawned.
- *
- * @param device                               The identifier of the target device
- * @param infoCount                            Reference in which to provide the \a infos array size, and
- *                                             to return the number of returned elements
- * @param infos                                Reference in which to return the process information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
- *                                             \a infoCount will contain minimal amount of space necessary for
- *                                             the call to complete
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, either of \a infoCount or \a infos is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see \ref nvmlSystemGetProcessName
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
-
-/**
- * Check if the GPU devices are on the same physical board.
- *
- * For all fully supported products.
- *
- * @param device1                               The first GPU device
- * @param device2                               The second GPU device
- * @param onSameBoard                           Reference in which to return the status.
- *                                              Non-zero indicates that the GPUs are on the same board.
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a onSameBoard has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this check is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the either GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard);
-
-/**
- * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs.
- * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions.
- *
- * For all fully supported products.
- *
- * @param device                               The identifier of the target device
- * @param apiType                              Target API type for this operation
- * @param isRestricted                         Reference in which to return the current restriction 
- *                                             NVML_FEATURE_ENABLED indicates that the API is root-only
- *                                             NVML_FEATURE_DISABLED indicates that the API is accessible to all users
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device or the device does not support
- *                                                 the feature that is being queried (E.G. Enabling/disabling Auto Boosted clocks is
- *                                                 not supported by the device)
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlRestrictedAPI_t
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted);
-
-/**
- * Gets recent samples for the GPU.
- * 
- * For Kepler &tm; or newer fully supported devices.
- * 
- * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by 
- * the driver.
- * 
- * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t.
- * 
- * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. 
- * The returned samplesCount will provide the number of samples that can be queried. The user needs to 
- * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t).
- * 
- * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the 
- * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query 
- * to get more recent samples.
- * 
- * This method fetches the number of entries which can be accommodated in the provided samples array, and the 
- * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this 
- * method for samples in contrast to polling via existing methods is to get get higher frequency data at lower polling cost.
- * 
- * @param device                        The identifier for the target device
- * @param type                          Type of sampling event
- * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType                 Output parameter to represent the type of sample value as described in nvmlSampleVal_t
- * @param sampleCount                   Reference to provide the number of elements which can be queried in samples array
- * @param samples                       Reference in which samples are returned
- 
- * @return 
- *         - \ref NVML_SUCCESS                 if samples are successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a samplesCount is NULL or 
- *                                             reference to \a sampleCount is 0 for non null \a samples
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp,
-        nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
-
-/**
- * Gets Total, Available and Used size of BAR1 memory.
- * 
- * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party 
- * devices (peer-to-peer on the PCIE bus). 
- * 
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param bar1Memory                           Reference in which BAR1 memory
- *                                             information is returned.
- *
- * @return
- *         - \ref NVML_SUCCESS                 if BAR1 memory is successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a bar1Memory is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory);
-
-
-/**
- * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power 
- * or thermal constraints.
- *
- * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The
- * difference in violation times at two different reference times gives the indication of GPU throttling event. 
- *
- * Violation for thermal capping is not supported at this time.
- * 
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param perfPolicyType                       Represents Performance policy which can trigger GPU throttling
- * @param violTime                             Reference to which violation time related information is returned 
- *                                         
- *
- * @return
- *         - \ref NVML_SUCCESS                 if violation time is successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime);
-
-/**
- * @}
- */
-
-/** @addtogroup nvmlAccountingStats
- *  @{
- */
-
-/**
- * Queries the state of per process accounting mode.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * See \ref nvmlDeviceGetAccountingStats for more details.
- * See \ref nvmlDeviceSetAccountingMode
- *
- * @param device                               The identifier of the target device
- * @param mode                                 Reference in which to return the current accounting mode
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the mode has been successfully retrieved 
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode are NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode);
-
-/**
- * Queries process's accounting stats.
- *
- * For Kepler &tm; or newer fully supported devices.
- * 
- * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
- * Accounting stats can be queried during life time of the process and after its termination.
- * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and 
- * updated to actual running time after its termination.
- * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
- * processes.
- *
- * See \ref nvmlAccountingStats_t for description of each returned metric.
- * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids.
- *
- * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode.
- * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
- *         queried since they don't contribute to GPU utilization.
- * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
- *
- * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
- * 
- * @param device                               The identifier of the target device
- * @param pid                                  Process Id of the target process to query stats for
- * @param stats                                Reference in which to return the process's accounting stats
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if stats have been successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a stats are NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if process stats were not found
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetAccountingBufferSize
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
-
-/**
- * Queries list of processes that can be queried for accounting stats. The list of processes returned 
- * can be in running or terminated state.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * To just query the number of processes ready to be queried, call this function with *count = 0 and
- * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
- * 
- * For more details see \ref nvmlDeviceGetAccountingStats.
- *
- * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
- *
- * @param device                               The identifier of the target device
- * @param count                                Reference in which to provide the \a pids array size, and
- *                                               to return the number of elements ready to be queried
- * @param pids                                 Reference in which to return list of process ids
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if pids were successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to
- *                                                 expected value)
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetAccountingBufferSize
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
-
-/**
- * Returns the number of processes that the circular buffer with accounting pids can hold.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * This is the maximum number of processes that accounting information will be stored for before information
- * about oldest processes will get overwritten by information about new processes.
- *
- * @param device                               The identifier of the target device
- * @param bufferSize                           Reference in which to provide the size (in number of elements)
- *                                               of the circular buffer for accounting stats.
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if buffer size was successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a bufferSize is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlDeviceGetAccountingStats
- * @see nvmlDeviceGetAccountingPids
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize);
-
-/** @} */
-
-/** @addtogroup nvmlDeviceQueries
- *  @{
- */
-
-/**
- * Returns the list of retired pages by source, including pages that are pending retirement
- * The address information provided from this API is the hardware address of the page that was retired.  Note
- * that this does not match the virtual address used in CUDA, but will match the address information in XID 63
- * 
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                            The identifier of the target device
- * @param cause                             Filter page addresses by cause of retirement
- * @param pageCount                         Reference in which to provide the \a addresses buffer size, and
- *                                          to return the number of retired pages that match \a cause
- *                                          Set to 0 to query the size without allocating an \a addresses buffer
- * @param addresses                         Buffer to write the page addresses into
- * 
- * @return
- *         - \ref NVML_SUCCESS                 if \a pageCount was populated and \a addresses was filled
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
- *                                             matching page addresses.  \a pageCount is set to the needed size.
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or 
- *                                             \a addresses is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
-    unsigned int *pageCount, unsigned long long *addresses);
-
-/**
- * Returns the list of retired pages by source, including pages that are pending retirement
- * The address information provided from this API is the hardware address of the page that was retired.  Note
- * that this does not match the virtual address used in CUDA, but will match the address information in XID 63
- *
- * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's
- *       retirement.
- * 
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                            The identifier of the target device
- * @param cause                             Filter page addresses by cause of retirement
- * @param pageCount                         Reference in which to provide the \a addresses buffer size, and
- *                                          to return the number of retired pages that match \a cause
- *                                          Set to 0 to query the size without allocating an \a addresses buffer
- * @param addresses                         Buffer to write the page addresses into
- * @param timestamps                        Buffer to write the timestamps of page retirement, additional for _v2
- * 
- * @return
- *         - \ref NVML_SUCCESS                 if \a pageCount was populated and \a addresses was filled
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
- *                                             matching page addresses.  \a pageCount is set to the needed size.
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or 
- *                                             \a addresses is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
-    unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps);
-
-/**
- * Check if any pages are pending retirement and need a reboot to fully retire.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                            The identifier of the target device
- * @param isPending                         Reference in which to return the pending status
- * 
- * @return
- *         - \ref NVML_SUCCESS                 if \a isPending was populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isPending is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlUnitCommands Unit Commands
- *  This chapter describes NVML operations that change the state of the unit. For S-class products.
- *  Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
- *  error code when invoking any of these methods.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Set the LED state for the unit. The LED can be either green (0) or amber (1).
- *
- * For S-class products.
- * Requires root/admin permissions.
- *
- * This operation takes effect immediately.
- * 
- *
- * <b>Current S-Class products don't provide unique LEDs for each unit. As such, both front 
- * and back LEDs will be toggled in unison regardless of which unit is specified with this command.</b>
- *
- * See \ref nvmlLedColor_t for available colors.
- *
- * @param unit                                 The identifier of the target unit
- * @param color                                The target LED color
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the LED color has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a unit or \a color is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if this is not an S-class product
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlUnitGetLedState()
- */
-nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceCommands Device Commands
- *  This chapter describes NVML operations that change the state of the device.
- *  Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
- *  error code when invoking any of these methods.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Set the persistence mode for the device.
- *
- * For all products.
- * For Linux only.
- * Requires root/admin permissions.
- *
- * The persistence mode determines whether the GPU driver software is torn down after the last client
- * exits.
- *
- * This operation takes effect immediately. It is not persistent across reboots. After each reboot the
- * persistence mode is reset to "Disabled".
- *
- * See \ref nvmlEnableState_t for available modes.
- *
- * @param device                               The identifier of the target device
- * @param mode                                 The target persistence mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the persistence mode was set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetPersistenceMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode);
-
-/**
- * Set the compute mode for the device.
- *
- * For all products.
- * Requires root/admin permissions.
- *
- * The compute mode determines whether a GPU can be used for compute operations and whether it can
- * be shared across contexts.
- *
- * This operation takes effect immediately. Under Linux it is not persistent across reboots and
- * always resets to "Default". Under windows it is persistent.
- *
- * Under windows compute mode may only be set to DEFAULT when running in WDDM
- *
- * See \ref nvmlComputeMode_t for details on available compute modes.
- *
- * @param device                               The identifier of the target device
- * @param mode                                 The target compute mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the compute mode was set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetComputeMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode);
-
-/**
- * Set the ECC mode for the device.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
- * Requires root/admin permissions.
- *
- * The ECC mode determines whether the GPU enables its ECC support.
- *
- * This operation takes effect after the next reboot.
- *
- * See \ref nvmlEnableState_t for details on available modes.
- *
- * @param device                               The identifier of the target device
- * @param ecc                                  The target ECC mode
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the ECC mode was set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a ecc is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetEccMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc);  
-
-/**
- * Clear the ECC error and other memory error counts for the device.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts.
- * Requires root/admin permissions.
- * Requires ECC Mode to be enabled.
- *
- * Sets all of the specified ECC counters to 0, including both detailed and total counts.
- *
- * This operation takes effect immediately.
- *
- * See \ref nvmlMemoryErrorType_t for details on available counter types.
- *
- * @param device                               The identifier of the target device
- * @param counterType                          Flag that indicates which type of errors should be cleared.
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the error counts were cleared
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a counterType is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see 
- *      - nvmlDeviceGetDetailedEccErrors()
- *      - nvmlDeviceGetTotalEccErrors()
- */
-nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType);
-
-/**
- * Set the driver model for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * For windows only.
- * Requires root/admin permissions.
- *
- * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
- * to the device it must run in WDDM mode.  
- *
- * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce).
- * This should only be done if the host is subsequently powered down and the display is detached from the device
- * before the next reboot. 
- *
- * This operation takes effect after the next reboot.
- * 
- * Windows driver model may only be set to WDDM when running in DEFAULT compute mode.
- *
- * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or 
- * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode.
- *
- * See \ref nvmlDriverModel_t for details on available driver models.
- * See \ref nvmlFlagDefault and \ref nvmlFlagForce
- *
- * @param device                               The identifier of the target device
- * @param driverModel                          The target driver model
- * @param flags                                Flags that change the default behavior
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the driver model has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a driverModel is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not windows or the device does not support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlDeviceGetDriverModel()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
-
-/**
- * Set clocks that device will lock to.
- *
- * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz.
- * Setting this will supercede application clock values and take effect regardless if a cuda app is running.
- * See /ref nvmlDeviceSetApplicationsClocks
- *
- * Can be used as a setting to request constant performance.
- *
- * Requires root/admin permissions.
- *
- * After system reboot or driver reload applications clocks go back to their default value.
- * See \ref nvmlDeviceResetGpuLockedClocks.
- *
- * For newer than Pascal &tm; fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param minGpuClockMHz                       Requested minimum gpu clock in MHz
- * @param maxGpuClockMHz                       Requested maximum gpu clock in MHz
- *
- * @return
- *         - \ref NVML_SUCCESS                 if new settings were successfully set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz
- *                                                 is not a valid clock combination
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz);
-
-/**
- * Resets the gpu clock to the default value
- *
- * This is the gpu clock that will be used after system reboot or driver reload.
- * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks.
- *
- * @see nvmlDeviceSetGpuLockedClocks
- *
- * For newer than Pascal &tm; fully supported devices.
- *
- * @param device                               The identifier of the target device
- *
- * @return
- *         - \ref NVML_SUCCESS                 if new settings were successfully set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device);
-
-/**
- * Set clocks that applications will lock to.
- *
- * Sets the clocks that compute and graphics applications will be running at.
- * e.g. CUDA driver requests these clocks during context creation which means this property
- * defines clocks at which CUDA applications will be running unless some overspec event
- * occurs (e.g. over power, over thermal or external HW brake).
- *
- * Can be used as a setting to request constant performance.
- *
- * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks.
- *
- * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call
- * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting
- * above the clock value being set.
- *
- * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- * Requires root/admin permissions.
- *
- * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks
- * for details on how to list available clocks combinations.
- *
- * After system reboot or driver reload applications clocks go back to their default value.
- * See \ref nvmlDeviceResetApplicationsClocks.
- *
- * @param device                               The identifier of the target device
- * @param memClockMHz                          Requested memory clock in MHz
- * @param graphicsClockMHz                     Requested graphics clock in MHz
- *
- * @return
- *         - \ref NVML_SUCCESS                 if new settings were successfully set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memClockMHz and \a graphicsClockMHz
- *                                                 is not a valid clock combination
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
-
-/**
- * Set new power limit of this device.
- * 
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
- *
- * \note Limit is not persistent across reboots or driver unloads.
- * Enable persistent mode to prevent driver from unloading when no application is using the device.
- *
- * @param device                               The identifier of the target device
- * @param limit                                Power management limit in milliwatts to set
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a limit has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a defaultLimit is out of range
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlDeviceGetPowerManagementLimitConstraints
- * @see nvmlDeviceGetPowerManagementDefaultLimit
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit);
-
-/**
- * Sets new GOM. See \a nvmlGpuOperationMode_t for details.
- *
- * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
- * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
- * Not supported on Quadro &reg; and Tesla &tm; C-class products.
- * Requires root/admin permissions.
- * 
- * Changing GOMs requires a reboot. 
- * The reboot requirement might be removed in the future.
- *
- * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when
- * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
- * 
- * @param device                               The identifier of the target device
- * @param mode                                 Target GOM
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a mode has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode incorrect
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support GOM or specific mode
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlGpuOperationMode_t
- * @see nvmlDeviceGetGpuOperationMode
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode);
-
-/**
- * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs.
- * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs.
- * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction
- * to query the current restriction settings.
- * 
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * @param device                               The identifier of the target device
- * @param apiType                              Target API type for this operation
- * @param isRestricted                         The target restriction
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a apiType incorrect
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support changing API restrictions or the device does not support
- *                                                 the feature that api restrictions are being set for (E.G. Enabling/disabling auto 
- *                                                 boosted clocks is not supported by the device)
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlRestrictedAPI_t
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted);
-
-/**
- * @}
- */
- 
-/** @addtogroup nvmlAccountingStats
- *  @{
- */
-
-/**
- * Enables or disables per process accounting.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * @note This setting is not persistent and will default to disabled after driver unloads.
- *       Enable persistence mode to be sure the setting doesn't switch off to disabled.
- * 
- * @note Enabling accounting mode has no negative impact on the GPU performance.
- *
- * @note Disabling accounting clears all accounting pids information.
- *
- * See \ref nvmlDeviceGetAccountingMode
- * See \ref nvmlDeviceGetAccountingStats
- * See \ref nvmlDeviceClearAccountingPids
- *
- * @param device                               The identifier of the target device
- * @param mode                                 The target accounting mode
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the new mode has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a mode are invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode);
-
-/**
- * Clears accounting information about all processes that have already terminated.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * See \ref nvmlDeviceGetAccountingMode
- * See \ref nvmlDeviceGetAccountingStats
- * See \ref nvmlDeviceSetAccountingMode
- *
- * @param device                               The identifier of the target device
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if accounting information has been cleared 
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device are invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup NvLink NvLink Methods
- * This chapter describes methods that NVML can perform on NVLINK enabled devices.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Retrieves the state of the device's NvLink for the link specified
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param isActive                             \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that
- *                                             the link is active and NVML_FEATURE_DISABLED indicates it 
- *                                             is inactive
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a isActive has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a isActive is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-
-/**
- * Retrieves the version of the device's NvLink for the link specified
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param version                              Requested NvLink version
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a version is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version);
-
-/**
- * Retrieves the requested capability from the device's NvLink for the link specified
- * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried
- * The return value should be treated as a boolean.
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param capability                           Specifies the \a nvmlNvLinkCapability_t to be queried
- * @param capResult                            A boolean for the queried capability indicating that feature is available
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a capResult has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a capability is invalid or \a capResult is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
-                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult); 
-
-/**
- * Retrieves the PCI information for the remote node on a NvLink link 
- * Note: pciSubSystemId is not filled in this function and is indeterminate
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param pci                                  \a nvmlPciInfo_t of the remote node for the specified link                            
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a pci has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a pci is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-
-/**
- * Retrieves the specified error counter value
- * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param counter                              Specifies the NvLink counter to be queried
- * @param counterValue                         Returned counter value
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a counter has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid or \a counterValue is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link,
-                                                     nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue);
-
-/**
- * Resets all error counters to zero
- * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the reset is successful
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link);
-
-/**
- * Set the NVLINK utilization counter control information for the specified counter, 0 or 1.
- * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition.  Performs a reset
- * of the counters if the reset parameter is non-zero.
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param counter                              Specifies the counter that should be set (0 or 1).
- * @param link                                 Specifies the NvLink link to be queried
- * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to set
- * @param reset                                Resets the counters on set if non-zero
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the control has been set successfully
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid 
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
-                                                           nvmlNvLinkUtilizationControl_t *control, unsigned int reset);
-
-/**
- * Get the NVLINK utilization counter control information for the specified counter, 0 or 1.
- * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param counter                              Specifies the counter that should be set (0 or 1).
- * @param link                                 Specifies the NvLink link to be queried
- * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to place information
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the control has been set successfully
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid 
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
-                                                           nvmlNvLinkUtilizationControl_t *control);
-
-
-/**
- * Retrieve the NVLINK utilization counter based on the current control for a specified counter.
- * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl
- *  before reading the utilization counters as they have no default state
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param counter                              Specifies the counter that should be read (0 or 1).
- * @param rxcounter                            Receive counter return value
- * @param txcounter                            Transmit counter return value
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if \a rxcounter and \a txcounter have been successfully set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, 
-                                                           unsigned long long *rxcounter, unsigned long long *txcounter);
-
-/**
- * Freeze the NVLINK utilization counters 
- * Both the receive and transmit counters are operated on by this function
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be queried
- * @param counter                              Specifies the counter that should be frozen (0 or 1).
- * @param freeze                               NVML_FEATURE_ENABLED = freeze the receive and transmit counters
- *                                             NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if counters were successfully frozen or unfrozen
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, \a counter, or \a freeze is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, 
-                                            unsigned int counter, nvmlEnableState_t freeze);
-
-/**
- * Reset the NVLINK utilization counters 
- * Both the receive and transmit counters are operated on by this function
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device                               The identifier of the target device
- * @param link                                 Specifies the NvLink link to be reset
- * @param counter                              Specifies the counter that should be reset (0 or 1)
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if counters were successfully reset
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlEvents Event Handling Methods
- * This chapter describes methods that NVML can perform against each device to register and wait for 
- * some event to occur.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Create an empty set of events.
- * Event set should be freed by \ref nvmlEventSetFree
- *
- * For Fermi &tm; or newer fully supported devices.
- * @param set                                  Reference in which to return the event handle
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the event has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a set is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlEventSetFree
- */
-nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set);
-
-/**
- * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t
- *
- * For Fermi &tm; or newer fully supported devices.
- * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors)
- * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode)
- *
- * For Linux only.
- *
- * \b IMPORTANT: Operations on \a set are not thread safe
- *
- * This call starts recording of events on specific device.
- * All events that occurred before this call are not recorded.
- * Checking if some event occurred can be done with \ref nvmlEventSetWait
- *
- * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed.
- * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes
- *     are registered in that case.
- *
- * @param device                               The identifier of the target device
- * @param eventTypes                           Bitmask of \ref nvmlEventType to record
- * @param set                                  Set to which add new event types
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the event has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is invalid or \a set is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform does not support this feature or some of requested event types
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlEventType
- * @see nvmlDeviceGetSupportedEventTypes
- * @see nvmlEventSetWait
- * @see nvmlEventSetFree
- */
-nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set);
-
-/**
- * Returns information about events supported on device
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows.
- *
- * @param device                               The identifier of the target device
- * @param eventTypes                           Reference in which to return bitmask of supported events
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the eventTypes has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventType is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlEventType
- * @see nvmlDeviceRegisterEvents
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes);
-
-/**
- * Waits on events and delivers events
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * If some events are ready to be delivered at the time of the call, function returns immediately.
- * If there are no events ready to be delivered, function sleeps till event arrives 
- * but not longer than specified timeout. This function in certain conditions can return before
- * specified timeout passes (e.g. when interrupt arrives)
- * 
- * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple
- * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all
- * xid error events.
- * 
- * @param set                                  Reference to set of events to wait on
- * @param data                                 Reference in which to return event data
- * @param timeoutms                            Maximum amount of wait time in milliseconds for registered event
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the data has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a data is NULL
- *         - \ref NVML_ERROR_TIMEOUT           if no event arrived in specified timeout or interrupt arrived
- *         - \ref NVML_ERROR_GPU_IS_LOST       if a GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlEventType
- * @see nvmlDeviceRegisterEvents
- */
-nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
-
-/**
- * Releases events in the set
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param set                                  Reference to events to be released 
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if the event has been successfully released
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- * 
- * @see nvmlDeviceRegisterEvents
- */
-nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlZPI Drain states 
- * This chapter describes methods that NVML can perform against each device to control their drain state
- * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to
- * power on/off GPUs, enable robust reset scenarios, etc.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Modify the drain state of a GPU.  This method forces a GPU to no longer accept new incoming requests.
- * Any new NVML process will no longer see this GPU.  Persistence mode for this GPU must be turned off before
- * this call is made.
- * Must be called as administrator.
- * For Linux only.
- * 
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo                              The PCI address of the GPU drain state to be modified
- * @param newState                             The drain state that should be entered, see \ref nvmlEnableState_t
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if counters were successfully reset
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a nvmlIndex or \a newState is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
- *         - \ref NVML_ERROR_IN_USE            if the device has persistence mode turned on
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
-
-/**
- * Query the drain state of a GPU.  This method is used to check if a GPU is in a currently draining
- * state.
- * For Linux only.
- * 
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo                              The PCI address of the GPU drain state to be queried
- * @param currentState                         The current drain state for this GPU, see \ref nvmlEnableState_t
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if counters were successfully reset
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a nvmlIndex or \a currentState is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
-
-/**
- * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver
- * as long as no other processes are attached. If other processes are attached, this call will return
- * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the
- * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called
- * to initiate the draining state is if that process was using, and is still using, a GPU before the 
- * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled
- * prior to this call.
- *
- * For long-running NVML processes please note that this will change the enumeration of current GPUs.
- * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2.
- * Also, device handles after the removed GPU will not be valid and must be re-established.
- * Must be run as administrator. 
- * For Linux only.
- *
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo                              The PCI address of the GPU to be removed
- * @param gpuState                             Whether the GPU is to be removed, from the OS
- *                                             see \ref nvmlDetachGpuState_t
- * @param linkState                            Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t
- *
- * @return
- *         - \ref NVML_SUCCESS                 if counters were successfully reset
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a nvmlIndex is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
- *         - \ref NVML_ERROR_IN_USE            if the device is still in use and cannot be removed
- */
-nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
-
-/**
- * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
- * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device.  
- * If all are zeroes then the entire PCI tree will be searched.  Please note that for long-running NVML processes
- * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order.
- *
- * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds
- * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery.
- *
- * Must be run as administrator.
- * For Linux only.
- * 
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo                              The PCI tree to be searched.  Only the domain, bus, and device
- *                                             fields are used in this call.
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if counters were successfully reset
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the operating system does not support this feature
- *         - \ref NVML_ERROR_OPERATING_SYSTEM  if the operating system is denying this feature
- *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlFieldValueQueries Field Value Queries
- *  This chapter describes NVML operations that are associated with retrieving Field Values from NVML
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Request values for a list of fields for a device. This API allows multiple fields to be queried at once.
- * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs
- * will be populated from a single call rather than making a driver call for each fieldId.
- *
- * @param device                               The device handle of the GPU to request field values for
- * @param valuesCount                          Number of entries in values that should be retrieved
- * @param values                               Array of \a valuesCount structures to hold field values.
- *                                             Each value's fieldId must be populated prior to this call
- *
- * @return
- *         - \ref NVML_SUCCESS                 if any values in \a values were populated. Note that you must
- *                                             check the nvmlReturn field of each value for each individual
- *                                             status
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a values is NULL
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
-
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlGridQueries Grid Queries
- *  This chapter describes NVML operations that are associated with NVIDIA GRID products.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * This method is used to get the virtualization mode corresponding to the GPU.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                    Identifier of the target device
- * @param pVirtualMode              Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
- * 
- * @return 
- *         - \ref NVML_SUCCESS                  if \a pVirtualMode is fetched
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid or \a pVirtualMode is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlGridCommands Grid Commands
- *  This chapter describes NVML operations that are associated with NVIDIA GRID products.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * This method is used to set the virtualization mode corresponding to the GPU.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                    Identifier of the target device
- * @param virtualMode               virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
- *
- * @return 
- *         - \ref NVML_SUCCESS                  if \a pVirtualMode is set
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid or \a pVirtualMode is NULL
- *         - \ref NVML_ERROR_GPU_IS_LOST        if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_NOT_SUPPORTED      if setting of virtualization mode is not supported.
- *         - \ref NVML_ERROR_NO_PERMISSION      if setting of virtualization mode is not allowed for this client.
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpu vGPU Management
- * @{
- *
- * Set of APIs supporting GRID vGPU
- */
-/***************************************************************************************************/
-
-/**
- * Retrieve the supported vGPU types on a physical GPU (device).
- *
- * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
- * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
- * is used to return the number of vGPU types written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
- * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0.
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
- *
- * @param device                   The identifier of the target device
- * @param vgpuCount                Pointer to caller-supplied array size, and returns number of vGPU types
- * @param vgpuTypeIds              Pointer to caller-supplied array in which to return list of vGPU types
- *
- * @return
- *         - \ref NVML_SUCCESS                      successful completion
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE      \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
- *         - \ref NVML_ERROR_INVALID_ARGUMENT       if \a vgpuCount is NULL or \a device is invalid
- *         - \ref NVML_ERROR_NOT_SUPPORTED          if vGPU is not supported by the device
- *         - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
-
-/**
- * Retrieve the currently creatable vGPU types on a physical GPU (device).
- *
- * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
- * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
- * is used to return the number of vGPU types written to the buffer.
- *
- * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types
- * can concurrently run on a device.  For example, if only one vGPU type is allowed at a time on a device, then the creatable
- * list will be restricted to whatever vGPU type is already running on the device.
- *
- * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
- * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0.
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable.
- *
- * @param device                   The identifier of the target device
- * @param vgpuCount                Pointer to caller-supplied array size, and returns number of vGPU types
- * @param vgpuTypeIds              Pointer to caller-supplied array in which to return list of vGPU types
- *
- * @return
- *         - \ref NVML_SUCCESS                      successful completion
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE      \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
- *         - \ref NVML_ERROR_INVALID_ARGUMENT       if \a vgpuCount is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED          if vGPU is not supported by the device
- *         - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
- *         - \ref NVML_ERROR_UNKNOWN                on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
-
-/**
- * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param vgpuTypeClass            Pointer to string array to return class in
- * @param size                     Size of string
- *
- * @return
- *         - \ref NVML_SUCCESS                   successful completion
- *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   if \a size is too small
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size);
-
-/**
- * Retrieve the vGPU type name.
- *
- * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not
- * exceed 64 characters in length (including the NUL terminator).  See \ref
- * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param vgpuTypeName             Pointer to buffer to return name
- * @param size                     Size of buffer
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a name is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size);
-
-/**
- * Retrieve the device ID of a vGPU type.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param deviceID                 Device ID and vendor ID of the device contained in single 32 bit value
- * @param subsystemID              Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID);
-
-/**
- * Retrieve the vGPU framebuffer size in bytes.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param fbSize                   Pointer to framebuffer size in bytes
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a fbSize is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize);
-
-/**
- * Retrieve count of vGPU's supported display heads.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param numDisplayHeads          Pointer to number of display heads
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads);
-
-/**
- * Retrieve vGPU display head's maximum supported resolution.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param displayIndex             Zero-based index of display head
- * @param xdim                     Pointer to maximum number of pixels in X dimension
- * @param ydim                     Pointer to maximum number of pixels in Y dimension
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex
- *                                             is out of range.
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim);
-
-/**
- * Retrieve license requirements for a vGPU type
- *
- * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form
- * "<license name>,<version>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license,
- * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0".
- *
- * The total length of the returned string will not exceed 128 characters, including the NUL terminator.
- * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param vgpuTypeLicenseString    Pointer to buffer to return license info
- * @param size                     Size of \a vgpuTypeLicenseString buffer
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size);
-
-/**
- * Retrieve the static frame rate limit value of the vGPU type
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId               Handle to vGPU type
- * @param frameRateLimit           Reference to return the frame rate limit value
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if frame rate limiter is turned off for the vGPU type
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a frameRateLimit is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit);
-
-/**
- * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                   The identifier of the target device
- * @param vgpuTypeId               Handle to vGPU type
- * @param vgpuInstanceCount        Pointer to get the max number of vGPU instances
- *                                 that can be created on a deicve for given vgpuTypeId
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid or is not supported on target device,
- *                                             or \a vgpuInstanceCount is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount);
-
-/**
- * Retrieve the active vGPU instances on a device.
- *
- * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
- * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
- * written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
- * To query the number of active vGPU instances, call this function with *vgpuCount = 0.  The code will return
- * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device                   The identifier of the target device
- * @param vgpuCount                Pointer which passes in the array size as well as get
- *                                 back the number of types
- * @param vgpuInstances            Pointer to array in which to return list of vGPU instances
- *
- * @return
- *         - \ref NVML_SUCCESS                  successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a device is invalid, or \a vgpuCount is NULL
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a size is too small
- *         - \ref NVML_ERROR_NOT_SUPPORTED      if vGPU is not supported by the device
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances);
-
-/**
- * Retrieve the VM ID associated with a vGPU instance.
- *
- * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
- *
- * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param vmId                     Pointer to caller-supplied buffer to hold VM ID
- * @param size                     Size of buffer in bytes
- * @param vmIdType                 Pointer to hold VM ID type
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType);
-
-/**
- * Retrieve the UUID of a vGPU instance.
- *
- * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string,
- * not exceeding 80 characters in length (including the NULL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param uuid                     Pointer to caller-supplied buffer to hold vGPU UUID
- * @param size                     Size of buffer in bytes
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a uuid is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size);
-
-/**
- * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU.
- *
- * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version
- * string will not exceed 80 characters in length (including the NUL terminator).
- * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
- *
- * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is
- * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the
- * NVIDIA driver is loaded and initialized.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param version                  Caller-supplied buffer to return driver version string
- * @param length                   Size of \a version buffer
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a version has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length);
-
-/**
- * Retrieve the framebuffer usage in bytes.
- *
- * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             The identifier of the target instance
- * @param fbUsage                  Pointer to framebuffer usage in bytes
- *
- * @return
- *         - \ref NVML_SUCCESS                 successful completion
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a fbUsage is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
-
-/**
- * Retrieve the current licensing state of the vGPU instance.
- *
- * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param licensed                 Reference to return the licensing status
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a licensed has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a licensed is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed);
-
-/**
- * Retrieve the vGPU type of a vGPU instance.
- *
- * Returns the vGPU type ID of vgpu assigned to the vGPU instance.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param vgpuTypeId               Reference to return the vgpuTypeId
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a vgpuTypeId has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a vgpuTypeId is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId);
-
-/**
- * Retrieve the frame rate limit set for the vGPU instance.
- *
- * Returns the value of the frame rate limit set for the vGPU instance
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param frameRateLimit           Reference to return the frame rate limit
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a frameRateLimit has been set
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if frame rate limiter is turned off for the vGPU type
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a frameRateLimit is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit);
-
-/**
- * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param encoderCapacity          Reference to an unsigned int for the encoder capacity
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a encoderCapacity has been retrived
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a encoderQueryType is invalid
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
-
-/**
- * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance             Identifier of the target vGPU instance
- * @param encoderCapacity          Unsigned int for the encoder capacity value
- *
- * @return
- *         - \ref NVML_SUCCESS                 if \a encoderCapacity has been set
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int  encoderCapacity);
-
-/**
- * Retrieves current utilization for vGPUs on a physical GPU (device).
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running
- * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer
- * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the
- * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values
- * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to
- * indicate the returned value type.
- *
- * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
- * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
- * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate
- * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with
- * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the
- * buffer is sized for.
- *
- * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample
- * structures that were actually written. This may differ from a previously read value as vGPU instances are created or
- * destroyed.
- *
- * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
- * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
- * to a timeStamp retrieved from a previous query to read utilization since the previous query.
- *
- * @param device                        The identifier for the target device
- * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp.
- * @param sampleValType                 Pointer to caller-supplied buffer to hold the type of returned sample values
- * @param vgpuInstanceSamplesCount      Pointer to caller-supplied array size, and returns number of vGPU instances
- * @param utilizationSamples            Pointer to caller-supplied buffer in which vGPU utilization samples are returned
-
- * @return
- *         - \ref NVML_SUCCESS                 if utilization samples are successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is
- *                                             NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all
- *                                             vGPU instances currently executing on the device
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
-                                                  nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount,
-                                                  nvmlVgpuInstanceUtilizationSample_t *utilizationSamples);
-
-/**
- * Retrieves current utilization for processes running on vGPUs on a physical GPU (device).
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on
- * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the
- * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running
- * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which
- * the samples were recorded. Individual utilization values are returned as "unsigned int" values.
- *
- * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
- * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
- * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size
- * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with
- * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the
- * buffer is sized for.
- *
- * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample
- * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active
- * in any given sample period.
- *
- * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
- * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
- * to a timeStamp retrieved from a previous query to read utilization since the previous query.
- *
- * @param device                        The identifier for the target device
- * @param lastSeenTimeStamp             Return only samples with timestamp greater than lastSeenTimeStamp.
- * @param vgpuProcessSamplesCount       Pointer to caller-supplied array size, and returns number of processes running on vGPU instances
- * @param utilizationSamples            Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned
-
- * @return
- *         - \ref NVML_SUCCESS                 if utilization samples are successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is
- *                                             passed with a non-NULL \a utilizationSamples
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all
- *                                             vGPU instances currently executing on the device
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
-                                                         unsigned int *vgpuProcessSamplesCount,
-                                                         nvmlVgpuProcessUtilizationSample_t *utilizationSamples);
-/**
- * Retrieve the GRID licensable features.
- *
- * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s)
- * and their current license status.
- *
- * @param device                    Identifier of the target device
- * @param pGridLicensableFeatures   Pointer to structure in which GRID licensable features are returned
- *
- * @return
- *         - \ref NVML_SUCCESS                 if licensable features are successfully retrieved
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pGridLicensableFeatures is NULL
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
-
-/**
- * Retrieves the current encoder statistics of a vGPU Instance
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance                      Identifier of the target vGPU instance
- * @param sessionCount                      Reference to an unsigned int for count of active encoder sessions
- * @param averageFps                        Reference to an unsigned int for trailing average FPS of all active sessions
- * @param averageLatency                    Reference to an unsigned int for encode latency in microseconds
- *
- * @return
- *         - \ref NVML_SUCCESS                  if \a sessionCount, \a averageFps and \a averageLatency is fetched
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount , or \a averageFps or \a averageLatency is NULL
- *                                              or \a vgpuInstance is 0.
- *         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount,
-                                                     unsigned int *averageFps, unsigned int *averageLatency);
-
-/**
- * Retrieves information about all active encoder sessions on a vGPU Instance.
- *
- * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
- * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
- * written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the active session array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
- * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
- * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance                      Identifier of the target vGPU instance
- * @param sessionCount                      Reference to caller supplied array size, and returns
- *                                          the number of sessions.
- * @param sessionInfo                       Reference to caller supplied array in which the list
- *                                          of session information us returned.
- *
- * @return
- *         - \ref NVML_SUCCESS                  if \a sessionInfo is fetched
- *         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is
-                                                returned in \a sessionCount
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a sessionCount is NULL, or \a vgpuInstance is 0.
- *         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo);
-
-/**
-* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance
-*
-* For Maxwell &tm; or newer fully supported devices.
-*
-* @param vgpuInstance                      Identifier of the target vGPU instance
-* @param fbcStats                          Reference to nvmlFBCStats_t structure contianing NvFBC stats
-*
-* @return
-*         - \ref NVML_SUCCESS                  if \a fbcStats is fetched
-*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
-*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a vgpuInstance is 0, or \a fbcStats is NULL
-*         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
-*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
-*/
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats);
-
-/**
-* Retrieves information about active frame buffer capture sessions on a vGPU Instance.
-*
-* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
-* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
-* written to the buffer.
-*
-* If the supplied buffer is not large enough to accomodate the active session array, the function returns
-* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
-* To query the number of active FBC sessions, call this function with *sessionCount = 0.  The code will return
-* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
-*
-* For Maxwell &tm; or newer fully supported devices.
-*
-* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
-*       be zero if there are no new frames captured since the session started.
-*
-* @param vgpuInstance                      Identifier of the target vGPU instance
-* @param sessionCount                      Reference to caller supplied array size, and returns the number of sessions.
-* @param sessionInfo                       Reference in which to return the session information
-*
-* @return
-*         - \ref NVML_SUCCESS                  if \a sessionInfo is fetched
-*         - \ref NVML_ERROR_UNINITIALIZED      if the library has not been successfully initialized
-*         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a vgpuInstance is 0, or \a sessionCount is NULL.
-*         - \ref NVML_ERROR_NOT_FOUND          if \a vgpuInstance does not match a valid active vGPU instance on the system
-*         - \ref NVML_ERROR_INSUFFICIENT_SIZE  if \a sessionCount is too small, array element count is returned in \a sessionCount
-*         - \ref NVML_ERROR_UNKNOWN            on any unexpected error
-*/
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
-
-/**
- * Retrieves the current utilization and process ID
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running.
- * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at
- * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization
- * during the last sample period. It includes the CPU timestamp at which  the samples were recorded. Individual utilization values
- * are returned as "unsigned int" values.
- *
- * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
- * \a utilization set to NULL. The caller should allocate a buffer of size
- * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed
- * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for.
- *
- * On successful return, the function updates \a processSamplesCount with the number of process utilization sample
- * structures that were actually written. This may differ from a previously read value as instances are created or
- * destroyed.
- *
- * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
- * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
- * to a timeStamp retrieved from a previous query to read utilization since the previous query.
- *
- * @param device                    The identifier of the target device
- * @param utilization               Pointer to caller-supplied buffer in which guest process utilization samples are returned
- * @param processSamplesCount       Pointer to caller-supplied array size, and returns number of processes running
- * @param lastSeenTimeStamp         Return only samples with timestamp greater than lastSeenTimeStamp.
-
- * @return
- *         - \ref NVML_SUCCESS                 if \a utilization has been populated
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
- *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
-                                              unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
-
-/**
- * Queries the state of per process accounting mode on vGPU.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance            The identifier of the target vGPU VM
- * @param mode                    Reference in which to return the current accounting mode
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if the mode has been successfully retrieved 
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a mode is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode);
-
-/**
- * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes 
- * returned can be in running or terminated state.
- *
- * For Maxwell &tm; or newer fully supported devices.
- * 
- * To just query the maximum number of processes that can be queried, call this function with *count = 0 and
- * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
- * 
- * For more details see \ref nvmlVgpuInstanceGetAccountingStats.
- *
- * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
- *
- * @param vgpuInstance            The identifier of the target vGPU VM
- * @param count                   Reference in which to provide the \a pids array size, and
- *                                to return the number of elements ready to be queried
- * @param pids                    Reference in which to return list of process ids
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if pids were successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a count is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature or accounting mode is disabled
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value)
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- *
- * @see nvmlVgpuInstanceGetAccountingPids
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids);
-
-/**
- * Queries process's accounting stats.
- *
- * For Maxwell &tm; or newer fully supported devices.
- * 
- * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and
- * can be queried during life time of the process or after its termination.
- * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and 
- * updated to actual running time after its termination.
- * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
- * processes.
- *
- * See \ref nvmlAccountingStats_t for description of each returned metric.
- * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids.
- *
- * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode.
- * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
- *         queried since they don't contribute to GPU utilization.
- * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
- *
- * @param vgpuInstance            The identifier of the target vGPU VM
- * @param pid                     Process Id of the target process to query stats for
- * @param stats                   Reference in which to return the process's accounting stats
- *
- * @return 
- *         - \ref NVML_SUCCESS                 if stats have been successfully retrieved
- *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is 0, or \a stats is NULL
- *         - \ref NVML_ERROR_NOT_FOUND         if \a vgpuInstance does not match a valid active vGPU instance on the system
- *                                             or \a stats is not found
- *         - \ref NVML_ERROR_NOT_SUPPORTED     if the vGPU doesn't support this feature or accounting mode is disabled
- *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvml vGPU Migration
- * This chapter describes NVML operations that are associated with vGPU Migration.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * vGPU metadata structure.
- */
-typedef struct nvmlVgpuMetadata_st
-{
-    unsigned int             version;                                                    //!< Current version of the structure
-    unsigned int             revision;                                                   //!< Current revision of the structure
-    nvmlVgpuGuestInfoState_t guestInfoState;                                             //!< Current state of Guest-dependent fields
-    char                     guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest
-    char                     hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];  //!< Version of driver installed in host
-    unsigned int             reserved[8];                                                //!< Reserved for internal use
-    unsigned int             opaqueDataSize;                                             //!< Size of opaque data field in bytes
-    char                     opaqueData[4];                                              //!< Opaque data
-} nvmlVgpuMetadata_t;
-
-/**
- * Physical GPU metadata structure
- */
-typedef struct nvmlVgpuPgpuMetadata_st
-{
-    unsigned int            version;                                                    //!< Current version of the structure
-    unsigned int            revision;                                                   //!< Current revision of the structure
-    char                    hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];  //!< Host driver version
-    unsigned int            pgpuVirtualizationCaps;                                     //!< Pgpu virtualizaion capabilities bitfileld
-    unsigned int            reserved[7];                                                //!< Reserved for internal use
-    unsigned int            opaqueDataSize;                                             //!< Size of opaque data field in bytes
-    char                    opaqueData[4];                                              //!< Opaque data
-} nvmlVgpuPgpuMetadata_t;
-
-/**
- * vGPU VM compatibility codes
- */
-typedef enum nvmlVgpuVmCompatibility_enum
-{
-    NVML_VGPU_VM_COMPATIBILITY_NONE         = 0x0,    //!< vGPU is not runnable
-    NVML_VGPU_VM_COMPATIBILITY_COLD         = 0x1,    //!< vGPU is runnable from a cold / powered-off state (ACPI S5)
-    NVML_VGPU_VM_COMPATIBILITY_HIBERNATE    = 0x2,    //!< vGPU is runnable from a hibernated state (ACPI S4)
-    NVML_VGPU_VM_COMPATIBILITY_SLEEP        = 0x4,    //!< vGPU is runnable from a sleeped state (ACPI S3)
-    NVML_VGPU_VM_COMPATIBILITY_LIVE         = 0x8,    //!< vGPU is runnable from a live/paused (ACPI S0)
-} nvmlVgpuVmCompatibility_t;
-
-/**
- *  vGPU-pGPU compatibility limit codes
- */
-typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum
-{
-    NVML_VGPU_COMPATIBILITY_LIMIT_NONE          = 0x0,           //!< Compatibility is not limited.
-    NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER   = 0x1,           //!< Compatibility is limited by host driver version.
-    NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER  = 0x2,           //!< Compatibility is limited by guest driver version.
-    NVML_VGPU_COMPATIBILITY_LIMIT_GPU           = 0x4,           //!< Compatibility is limited by GPU hardware.
-    NVML_VGPU_COMPATIBILITY_LIMIT_OTHER         = 0x80000000,    //!< Compatibility is limited by an undefined factor.
-} nvmlVgpuPgpuCompatibilityLimitCode_t;
-
-/**
- * vGPU-pGPU compatibility structure
- */
-typedef struct nvmlVgpuPgpuCompatibility_st
-{
-    nvmlVgpuVmCompatibility_t               vgpuVmCompatibility;    //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t
-    nvmlVgpuPgpuCompatibilityLimitCode_t    compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t
-} nvmlVgpuPgpuCompatibility_t;
-
-/**
- * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM
- * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section
- * containing internal state.
- *
- * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are
- * dependent on information obtained from the guest VM, which may not yet have reached a state where that information
- * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field.
- *
- * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide
- * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM.
- *
- * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure
- * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
- * in \a bufferSize.
- *
- * @param vgpuInstance             vGPU instance handle
- * @param vgpuMetadata             Pointer to caller-supplied buffer into which vGPU metadata is written
- * @param bufferSize               Size of vgpuMetadata buffer
- *
- * @return
- *         - \ref NVML_SUCCESS                   vGPU metadata structure was successfully returned
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   vgpuMetadata buffer is too small, required size is returned in \a bufferSize
- *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0.
- *         - \ref NVML_ERROR_NOT_FOUND           if \a vgpuInstance does not match a valid active vGPU instance on the system
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize);
-
-/**
- * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about
- * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section
- * containing internal state.
- *
- * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata
- * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
- * in \a bufferSize.
- *
- * @param device                The identifier of the target device
- * @param pgpuMetadata          Pointer to caller-supplied buffer into which \a pgpuMetadata is written
- * @param bufferSize            Pointer to size of \a pgpuMetadata buffer
- *
- * @return
- *         - \ref NVML_SUCCESS                   GPU metadata structure was successfully returned
- *         - \ref NVML_ERROR_INSUFFICIENT_SIZE   pgpuMetadata buffer is too small, required size is returned in \a bufferSize
- *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0.
- *         - \ref NVML_ERROR_NOT_SUPPORTED       vGPU is not supported by the system
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize);
-
-/**
- * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a
- * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the
- * physical GPU.
- *
- * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The
- * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility
- * with the physical GPU is limited, a limit code indicates the factor limiting compability.
- * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details).
- *
- * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to
- *       boot a given vGPU or associated VM.
- *
- * @param vgpuMetadata          Pointer to caller-supplied vGPU metadata structure
- * @param pgpuMetadata          Pointer to caller-supplied GPU metadata structure
- * @param compatibilityInfo     Pointer to caller-supplied buffer to hold compatibility info
- *
- * @return
- *         - \ref NVML_SUCCESS                   vGPU metadata structure was successfully returned
- *         - \ref NVML_ERROR_INVALID_ARGUMENT    if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL
- *         - \ref NVML_ERROR_UNKNOWN             on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlGpuBlacklistQueries GPU Blacklist Queries
- * This chapter describes NVML operations that are associated with blacklisted GPUs.
- *  @{
- */
-/***************************************************************************************************/
-
-/**
- * Blacklist GPU device information
- **/
-typedef struct nvmlBlacklistDeviceInfo_st
-{
-    nvmlPciInfo_t pciInfo;                   //!< The PCI information for the blacklisted GPU
-    char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the blacklisted GPU
-} nvmlBlacklistDeviceInfo_t;
-
- /**
- * Retrieves the number of blacklisted GPU devices in the system.
- * 
- * For all products.
- *
- * @param deviceCount                          Reference in which to return the number of blacklisted devices
- * 
- * @return 
- *         - \ref NVML_SUCCESS                 if \a deviceCount has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a deviceCount is NULL
- */
-nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceCount(unsigned int *deviceCount);
-
-/**
- * Acquire the device information for a blacklisted device, based on its index.
- * 
- * For all products.
- *
- * Valid indices are derived from the \a deviceCount returned by 
- *   \ref nvmlGetBlacklistDeviceCount(). For example, if \a deviceCount is 2 the valid indices  
- *   are 0 and 1, corresponding to GPU 0 and GPU 1.
- *
- * @param index                                The index of the target GPU, >= 0 and < \a deviceCount
- * @param info                                 Reference in which to return the device information
- * 
- * @return 
- *         - \ref NVML_SUCCESS                  if \a device has been set
- *         - \ref NVML_ERROR_INVALID_ARGUMENT   if \a index is invalid or \a info is NULL
- *
- * @see nvmlGetBlacklistDeviceCount
- */
-nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceInfoByIndex(unsigned int index, nvmlBlacklistDeviceInfo_t *info);
-
-/** @} */
-
-/**
- * NVML API versioning support
- */
-#if defined(__NVML_API_VERSION_INTERNAL)
-#undef nvmlDeviceRemoveGpu
-#undef nvmlDeviceGetNvLinkRemotePciInfo
-#undef nvmlDeviceGetPciInfo
-#undef nvmlDeviceGetCount
-#undef nvmlDeviceGetHandleByIndex
-#undef nvmlDeviceGetHandleByPciBusId
-#undef nvmlInit
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/include/triton/external/half.hpp b/include/triton/external/half.hpp
deleted file mode 100644
index 625cce7cb..000000000
--- a/include/triton/external/half.hpp
+++ /dev/null
@@ -1,3067 +0,0 @@
-// half - IEEE 754-based half-precision floating point library.
-//
-// Copyright (c) 2012-2017 Christian Rau <rauy@users.sourceforge.net>
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation 
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
-// Software is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 
-// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 
-// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-// Version 1.12.0
-
-/// \file
-/// Main header file for half precision functionality.
-
-#ifndef HALF_HALF_HPP
-#define HALF_HALF_HPP
-
-/// Combined gcc version number.
-#define HALF_GNUC_VERSION (__GNUC__*100+__GNUC_MINOR__)
-
-//check C++11 language features
-#if defined(__clang__)										//clang
-	#if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
-		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
-	#endif
-	#if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
-		#define HALF_ENABLE_CPP11_CONSTEXPR 1
-	#endif
-	#if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
-		#define HALF_ENABLE_CPP11_NOEXCEPT 1
-	#endif
-	#if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
-		#define HALF_ENABLE_CPP11_USER_LITERALS 1
-	#endif
-	#if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG)
-		#define HALF_ENABLE_CPP11_LONG_LONG 1
-	#endif
-/*#elif defined(__INTEL_COMPILER)								//Intel C++
-	#if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)		????????
-		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
-	#endif
-	#if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)			????????
-		#define HALF_ENABLE_CPP11_CONSTEXPR 1
-	#endif
-	#if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)			????????
-		#define HALF_ENABLE_CPP11_NOEXCEPT 1
-	#endif
-	#if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG)			????????
-		#define HALF_ENABLE_CPP11_LONG_LONG 1
-	#endif*/
-#elif defined(__GNUC__)										//gcc
-	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
-		#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
-			#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
-		#endif
-		#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
-			#define HALF_ENABLE_CPP11_CONSTEXPR 1
-		#endif
-		#if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
-			#define HALF_ENABLE_CPP11_NOEXCEPT 1
-		#endif
-		#if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
-			#define HALF_ENABLE_CPP11_USER_LITERALS 1
-		#endif
-		#if !defined(HALF_ENABLE_CPP11_LONG_LONG)
-			#define HALF_ENABLE_CPP11_LONG_LONG 1
-		#endif
-	#endif
-#elif defined(_MSC_VER)										//Visual C++
-	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
-		#define HALF_ENABLE_CPP11_CONSTEXPR 1
-	#endif
-	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
-		#define HALF_ENABLE_CPP11_NOEXCEPT 1
-	#endif
-	#if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
-		#define HALF_ENABLE_CPP11_USER_LITERALS 1
-	#endif
-	#if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
-		#define HALF_ENABLE_CPP11_STATIC_ASSERT 1
-	#endif
-	#if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
-		#define HALF_ENABLE_CPP11_LONG_LONG 1
-	#endif
-	#define HALF_POP_WARNINGS 1
-	#pragma warning(push)
-	#pragma warning(disable : 4099 4127 4146)	//struct vs class, constant in if, negative unsigned
-#endif
-
-//check C++11 library features
-#include <utility>
-#if defined(_LIBCPP_VERSION)								//libc++
-	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
-		#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
-			#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
-		#endif
-		#ifndef HALF_ENABLE_CPP11_CSTDINT
-			#define HALF_ENABLE_CPP11_CSTDINT 1
-		#endif
-		#ifndef HALF_ENABLE_CPP11_CMATH
-			#define HALF_ENABLE_CPP11_CMATH 1
-		#endif
-		#ifndef HALF_ENABLE_CPP11_HASH
-			#define HALF_ENABLE_CPP11_HASH 1
-		#endif
-	#endif
-#elif defined(__GLIBCXX__)									//libstdc++
-	#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
-		#ifdef __clang__
-			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
-				#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
-			#endif
-			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT)
-				#define HALF_ENABLE_CPP11_CSTDINT 1
-			#endif
-			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH)
-				#define HALF_ENABLE_CPP11_CMATH 1
-			#endif
-			#if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH)
-				#define HALF_ENABLE_CPP11_HASH 1
-			#endif
-		#else
-			#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT)
-				#define HALF_ENABLE_CPP11_CSTDINT 1
-			#endif
-			#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH)
-				#define HALF_ENABLE_CPP11_CMATH 1
-			#endif
-			#if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH)
-				#define HALF_ENABLE_CPP11_HASH 1
-			#endif
-		#endif
-	#endif
-#elif defined(_CPPLIB_VER)									//Dinkumware/Visual C++
-	#if _CPPLIB_VER >= 520
-		#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
-			#define HALF_ENABLE_CPP11_TYPE_TRAITS 1
-		#endif
-		#ifndef HALF_ENABLE_CPP11_CSTDINT
-			#define HALF_ENABLE_CPP11_CSTDINT 1
-		#endif
-		#ifndef HALF_ENABLE_CPP11_HASH
-			#define HALF_ENABLE_CPP11_HASH 1
-		#endif
-	#endif
-	#if _CPPLIB_VER >= 610
-		#ifndef HALF_ENABLE_CPP11_CMATH
-			#define HALF_ENABLE_CPP11_CMATH 1
-		#endif
-	#endif
-#endif
-#undef HALF_GNUC_VERSION
-
-//support constexpr
-#if HALF_ENABLE_CPP11_CONSTEXPR
-	#define HALF_CONSTEXPR			constexpr
-	#define HALF_CONSTEXPR_CONST	constexpr
-#else
-	#define HALF_CONSTEXPR
-	#define HALF_CONSTEXPR_CONST	const
-#endif
-
-//support noexcept
-#if HALF_ENABLE_CPP11_NOEXCEPT
-	#define HALF_NOEXCEPT	noexcept
-	#define HALF_NOTHROW	noexcept
-#else
-	#define HALF_NOEXCEPT
-	#define HALF_NOTHROW	throw()
-#endif
-
-#include <algorithm>
-#include <iostream>
-#include <limits>
-#include <climits>
-#include <cmath>
-#include <cstring>
-#if HALF_ENABLE_CPP11_TYPE_TRAITS
-	#include <type_traits>
-#endif
-#if HALF_ENABLE_CPP11_CSTDINT
-	#include <cstdint>
-#endif
-#if HALF_ENABLE_CPP11_HASH
-	#include <functional>
-#endif
-
-
-/// Default rounding mode.
-/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as well as 
-/// for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including half.hpp) to one 
-/// of the standard rounding modes using their respective constants or the equivalent values of `std::float_round_style`:
-///
-/// `std::float_round_style`         | value | rounding
-/// ---------------------------------|-------|-------------------------
-/// `std::round_indeterminate`       | -1    | fastest (default)
-/// `std::round_toward_zero`         | 0     | toward zero
-/// `std::round_to_nearest`          | 1     | to nearest
-/// `std::round_toward_infinity`     | 2     | toward positive infinity
-/// `std::round_toward_neg_infinity` | 3     | toward negative infinity
-///
-/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with overflows 
-/// set to infinity) and is the fastest rounding mode possible. It can even be set to `std::numeric_limits<float>::round_style` 
-/// to synchronize the rounding mode with that of the underlying single-precision implementation.
-#ifndef HALF_ROUND_STYLE
-	#define HALF_ROUND_STYLE	-1			// = std::round_indeterminate
-#endif
-
-/// Tie-breaking behaviour for round to nearest.
-/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this is 
-/// defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way cases (and 
-/// thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more IEEE-conformant 
-/// behaviour is needed.
-#ifndef HALF_ROUND_TIES_TO_EVEN
-	#define HALF_ROUND_TIES_TO_EVEN	0		// ties away from zero
-#endif
-
-/// Value signaling overflow.
-/// In correspondence with `HUGE_VAL[F|L]` from `<cmath>` this symbol expands to a positive value signaling the overflow of an 
-/// operation, in particular it just evaluates to positive infinity.
-#define HUGE_VALH	std::numeric_limits<half_float::half>::infinity()
-
-/// Fast half-precision fma function.
-/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate 
-/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all 
-/// arithmetic operations, this is in fact always the case.
-#define FP_FAST_FMAH	1
-
-#ifndef FP_ILOGB0
-	#define FP_ILOGB0		INT_MIN
-#endif
-#ifndef FP_ILOGBNAN
-	#define FP_ILOGBNAN		INT_MAX
-#endif
-#ifndef FP_SUBNORMAL
-	#define FP_SUBNORMAL	0
-#endif
-#ifndef FP_ZERO
-	#define FP_ZERO			1
-#endif
-#ifndef FP_NAN
-	#define FP_NAN			2
-#endif
-#ifndef FP_INFINITE
-	#define FP_INFINITE		3
-#endif
-#ifndef FP_NORMAL
-	#define FP_NORMAL		4
-#endif
-
-
-/// Main namespace for half precision functionality.
-/// This namespace contains all the functionality provided by the library.
-namespace half_float
-{
-	class half;
-
-#if HALF_ENABLE_CPP11_USER_LITERALS
-	/// Library-defined half-precision literals.
-	/// Import this namespace to enable half-precision floating point literals:
-	/// ~~~~{.cpp}
-	/// using namespace half_float::literal;
-	/// half_float::half = 4.2_h;
-	/// ~~~~
-	namespace literal
-	{
-		half operator""_h(long double);
-	}
-#endif
-
-	/// \internal
-	/// \brief Implementation details.
-	namespace detail
-	{
-	#if HALF_ENABLE_CPP11_TYPE_TRAITS
-		/// Conditional type.
-		template<bool B,typename T,typename F> struct conditional : std::conditional<B,T,F> {};
-
-		/// Helper for tag dispatching.
-		template<bool B> struct bool_type : std::integral_constant<bool,B> {};
-		using std::true_type;
-		using std::false_type;
-
-		/// Type traits for floating point types.
-		template<typename T> struct is_float : std::is_floating_point<T> {};
-	#else
-		/// Conditional type.
-		template<bool,typename T,typename> struct conditional { typedef T type; };
-		template<typename T,typename F> struct conditional<false,T,F> { typedef F type; };
-
-		/// Helper for tag dispatching.
-		template<bool> struct bool_type {};
-		typedef bool_type<true> true_type;
-		typedef bool_type<false> false_type;
-
-		/// Type traits for floating point types.
-		template<typename> struct is_float : false_type {};
-		template<typename T> struct is_float<const T> : is_float<T> {};
-		template<typename T> struct is_float<volatile T> : is_float<T> {};
-		template<typename T> struct is_float<const volatile T> : is_float<T> {};
-		template<> struct is_float<float> : true_type {};
-		template<> struct is_float<double> : true_type {};
-		template<> struct is_float<long double> : true_type {};
-	#endif
-
-		/// Type traits for floating point bits.
-		template<typename T> struct bits { typedef unsigned char type; };
-		template<typename T> struct bits<const T> : bits<T> {};
-		template<typename T> struct bits<volatile T> : bits<T> {};
-		template<typename T> struct bits<const volatile T> : bits<T> {};
-
-	#if HALF_ENABLE_CPP11_CSTDINT
-		/// Unsigned integer of (at least) 16 bits width.
-		typedef std::uint_least16_t uint16;
-
-		/// Unsigned integer of (at least) 32 bits width.
-		template<> struct bits<float> { typedef std::uint_least32_t type; };
-
-		/// Unsigned integer of (at least) 64 bits width.
-		template<> struct bits<double> { typedef std::uint_least64_t type; };
-	#else
-		/// Unsigned integer of (at least) 16 bits width.
-		typedef unsigned short uint16;
-
-		/// Unsigned integer of (at least) 32 bits width.
-		template<> struct bits<float> : conditional<std::numeric_limits<unsigned int>::digits>=32,unsigned int,unsigned long> {};
-
-		#if HALF_ENABLE_CPP11_LONG_LONG
-			/// Unsigned integer of (at least) 64 bits width.
-			template<> struct bits<double> : conditional<std::numeric_limits<unsigned long>::digits>=64,unsigned long,unsigned long long> {};
-		#else
-			/// Unsigned integer of (at least) 64 bits width.
-			template<> struct bits<double> { typedef unsigned long type; };
-		#endif
-	#endif
-
-		/// Tag type for binary construction.
-		struct binary_t {};
-
-		/// Tag for binary construction.
-		HALF_CONSTEXPR_CONST binary_t binary = binary_t();
-
-		/// Temporary half-precision expression.
-		/// This class represents a half-precision expression which just stores a single-precision value internally.
-		struct expr
-		{
-			/// Conversion constructor.
-			/// \param f single-precision value to convert
-			explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {}
-
-			/// Conversion to single-precision.
-			/// \return single precision value representing expression value
-			HALF_CONSTEXPR operator float() const HALF_NOEXCEPT { return value_; }
-
-		private:
-			/// Internal expression value stored in single-precision.
-			float value_;
-		};
-
-		/// SFINAE helper for generic half-precision functions.
-		/// This class template has to be specialized for each valid combination of argument types to provide a corresponding 
-		/// `type` member equivalent to \a T.
-		/// \tparam T type to return
-		template<typename T,typename,typename=void,typename=void> struct enable {};
-		template<typename T> struct enable<T,half,void,void> { typedef T type; };
-		template<typename T> struct enable<T,expr,void,void> { typedef T type; };
-		template<typename T> struct enable<T,half,half,void> { typedef T type; };
-		template<typename T> struct enable<T,half,expr,void> { typedef T type; };
-		template<typename T> struct enable<T,expr,half,void> { typedef T type; };
-		template<typename T> struct enable<T,expr,expr,void> { typedef T type; };
-		template<typename T> struct enable<T,half,half,half> { typedef T type; };
-		template<typename T> struct enable<T,half,half,expr> { typedef T type; };
-		template<typename T> struct enable<T,half,expr,half> { typedef T type; };
-		template<typename T> struct enable<T,half,expr,expr> { typedef T type; };
-		template<typename T> struct enable<T,expr,half,half> { typedef T type; };
-		template<typename T> struct enable<T,expr,half,expr> { typedef T type; };
-		template<typename T> struct enable<T,expr,expr,half> { typedef T type; };
-		template<typename T> struct enable<T,expr,expr,expr> { typedef T type; };
-
-		/// Return type for specialized generic 2-argument half-precision functions.
-		/// This class template has to be specialized for each valid combination of argument types to provide a corresponding 
-		/// `type` member denoting the appropriate return type.
-		/// \tparam T first argument type
-		/// \tparam U first argument type
-		template<typename T,typename U> struct result : enable<expr,T,U> {};
-		template<> struct result<half,half> { typedef half type; };
-
-		/// \name Classification helpers
-		/// \{
-
-		/// Check for infinity.
-		/// \tparam T argument type (builtin floating point type)
-		/// \param arg value to query
-		/// \retval true if infinity
-		/// \retval false else
-		template<typename T> bool builtin_isinf(T arg)
-		{
-		#if HALF_ENABLE_CPP11_CMATH
-			return std::isinf(arg);
-		#elif defined(_MSC_VER)
-			return !::_finite(static_cast<double>(arg)) && !::_isnan(static_cast<double>(arg));
-		#else
-			return arg == std::numeric_limits<T>::infinity() || arg == -std::numeric_limits<T>::infinity();
-		#endif
-		}
-
-		/// Check for NaN.
-		/// \tparam T argument type (builtin floating point type)
-		/// \param arg value to query
-		/// \retval true if not a number
-		/// \retval false else
-		template<typename T> bool builtin_isnan(T arg)
-		{
-		#if HALF_ENABLE_CPP11_CMATH
-			return std::isnan(arg);
-		#elif defined(_MSC_VER)
-			return ::_isnan(static_cast<double>(arg)) != 0;
-		#else
-			return arg != arg;
-		#endif
-		}
-
-		/// Check sign.
-		/// \tparam T argument type (builtin floating point type)
-		/// \param arg value to query
-		/// \retval true if signbit set
-		/// \retval false else
-		template<typename T> bool builtin_signbit(T arg)
-		{
-		#if HALF_ENABLE_CPP11_CMATH
-			return std::signbit(arg);
-		#else
-			return arg < T() || (arg == T() && T(1)/arg < T());
-		#endif
-		}
-
-		/// \}
-		/// \name Conversion
-		/// \{
-
-		/// Convert IEEE single-precision to half-precision.
-		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \param value single-precision value
-		/// \return binary representation of half-precision value
-		template<std::float_round_style R> uint16 float2half_impl(float value, true_type)
-		{
-			typedef bits<float>::type uint32;
-			uint32 bits;// = *reinterpret_cast<uint32*>(&value);		//violating strict aliasing!
-			std::memcpy(&bits, &value, sizeof(float));
-/*			uint16 hbits = (bits>>16) & 0x8000;
-			bits &= 0x7FFFFFFF;
-			int exp = bits >> 23;
-			if(exp == 255)
-				return hbits | 0x7C00 | (0x3FF&-static_cast<unsigned>((bits&0x7FFFFF)!=0));
-			if(exp > 142)
-			{
-				if(R == std::round_toward_infinity)
-					return hbits | 0x7C00 - (hbits>>15);
-				if(R == std::round_toward_neg_infinity)
-					return hbits | 0x7BFF + (hbits>>15);
-				return hbits | 0x7BFF + (R!=std::round_toward_zero);
-			}
-			int g, s;
-			if(exp > 112)
-			{
-				g = (bits>>12) & 1;
-				s = (bits&0xFFF) != 0;
-				hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF);
-			}
-			else if(exp > 101)
-			{
-				int i = 125 - exp;
-				bits = (bits&0x7FFFFF) | 0x800000;
-				g = (bits>>i) & 1;
-				s = (bits&((1L<<i)-1)) != 0;
-				hbits |= bits >> (i+1);
-			}
-			else
-			{
-				g = 0;
-				s = bits != 0;
-			}
-			if(R == std::round_to_nearest)
-				#if HALF_ROUND_TIES_TO_EVEN
-					hbits += g & (s|hbits);
-				#else
-					hbits += g;
-				#endif
-			else if(R == std::round_toward_infinity)
-				hbits += ~(hbits>>15) & (s|g);
-			else if(R == std::round_toward_neg_infinity)
-				hbits += (hbits>>15) & (g|s);
-*/			static const uint16 base_table[512] = { 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 
-				0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 
-				0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, 
-				0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
-				0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 
-				0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, 
-				0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 
-				0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 };
-			static const unsigned char shift_table[512] = { 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
-				13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 
-				13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
-				24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 };
-			uint16 hbits = base_table[bits>>23] + static_cast<uint16>((bits&0x7FFFFF)>>shift_table[bits>>23]);
-			if(R == std::round_to_nearest)
-				hbits += (((bits&0x7FFFFF)>>(shift_table[bits>>23]-1))|(((bits>>23)&0xFF)==102)) & ((hbits&0x7C00)!=0x7C00)
-				#if HALF_ROUND_TIES_TO_EVEN
-					& (((((static_cast<uint32>(1)<<(shift_table[bits>>23]-1))-1)&bits)!=0)|hbits)
-				#endif
-				;
-			else if(R == std::round_toward_zero)
-				hbits -= ((hbits&0x7FFF)==0x7C00) & ~shift_table[bits>>23];
-			else if(R == std::round_toward_infinity)
-				hbits += ((((bits&0x7FFFFF&((static_cast<uint32>(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=102)&
-					((bits>>23)!=0)))&(hbits<0x7C00)) - ((hbits==0xFC00)&((bits>>23)!=511));
-			else if(R == std::round_toward_neg_infinity)
-				hbits += ((((bits&0x7FFFFF&((static_cast<uint32>(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=358)&
-					((bits>>23)!=256)))&(hbits<0xFC00)&(hbits>>15)) - ((hbits==0x7C00)&((bits>>23)!=255));
-			return hbits;
-		}
-
-		/// Convert IEEE double-precision to half-precision.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \param value double-precision value
-		/// \return binary representation of half-precision value
-		template<std::float_round_style R> uint16 float2half_impl(double value, true_type)
-		{
-			typedef bits<float>::type uint32;
-			typedef bits<double>::type uint64;
-			uint64 bits;// = *reinterpret_cast<uint64*>(&value);		//violating strict aliasing!
-			std::memcpy(&bits, &value, sizeof(double));
-			uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF;
-			uint16 hbits = (hi>>16) & 0x8000;
-			hi &= 0x7FFFFFFF;
-			int exp = hi >> 20;
-			if(exp == 2047)
-				return hbits | 0x7C00 | (0x3FF&-static_cast<unsigned>((bits&0xFFFFFFFFFFFFF)!=0));
-			if(exp > 1038)
-			{
-				if(R == std::round_toward_infinity)
-					return hbits | 0x7C00 - (hbits>>15);
-				if(R == std::round_toward_neg_infinity)
-					return hbits | 0x7BFF + (hbits>>15);
-				return hbits | 0x7BFF + (R!=std::round_toward_zero);
-			}
-			int g, s = lo != 0;
-			if(exp > 1008)
-			{
-				g = (hi>>9) & 1;
-				s |= (hi&0x1FF) != 0;
-				hbits |= ((exp-1008)<<10) | ((hi>>10)&0x3FF);
-			}
-			else if(exp > 997)
-			{
-				int i = 1018 - exp;
-				hi = (hi&0xFFFFF) | 0x100000;
-				g = (hi>>i) & 1;
-				s |= (hi&((1L<<i)-1)) != 0;
-				hbits |= hi >> (i+1);
-			}
-			else
-			{
-				g = 0;
-				s |= hi != 0;
-			}
-			if(R == std::round_to_nearest)
-				#if HALF_ROUND_TIES_TO_EVEN
-					hbits += g & (s|hbits);
-				#else
-					hbits += g;
-				#endif
-			else if(R == std::round_toward_infinity)
-				hbits += ~(hbits>>15) & (s|g);
-			else if(R == std::round_toward_neg_infinity)
-				hbits += (hbits>>15) & (g|s);
-			return hbits;
-		}
-
-		/// Convert non-IEEE floating point to half-precision.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam T source type (builtin floating point type)
-		/// \param value floating point value
-		/// \return binary representation of half-precision value
-		template<std::float_round_style R,typename T> uint16 float2half_impl(T value, ...)
-		{
-			uint16 hbits = static_cast<unsigned>(builtin_signbit(value)) << 15;
-			if(value == T())
-				return hbits;
-			if(builtin_isnan(value))
-				return hbits | 0x7FFF;
-			if(builtin_isinf(value))
-				return hbits | 0x7C00;
-			int exp;
-			std::frexp(value, &exp);
-			if(exp > 16)
-			{
-				if(R == std::round_toward_infinity)
-					return hbits | 0x7C00 - (hbits>>15);
-				else if(R == std::round_toward_neg_infinity)
-					return hbits | 0x7BFF + (hbits>>15);
-				return hbits | 0x7BFF + (R!=std::round_toward_zero);
-			}
-			if(exp < -13)
-				value = std::ldexp(value, 24);
-			else
-			{
-				value = std::ldexp(value, 11-exp);
-				hbits |= ((exp+13)<<10);
-			}
-			T ival, frac = std::modf(value, &ival);
-			hbits += static_cast<uint16>(std::abs(static_cast<int>(ival)));
-			if(R == std::round_to_nearest)
-			{
-				frac = std::abs(frac);
-				#if HALF_ROUND_TIES_TO_EVEN
-					hbits += (frac>T(0.5)) | ((frac==T(0.5))&hbits);
-				#else
-					hbits += frac >= T(0.5);
-				#endif
-			}
-			else if(R == std::round_toward_infinity)
-				hbits += frac > T();
-			else if(R == std::round_toward_neg_infinity)
-				hbits += frac < T();
-			return hbits;
-		}
-
-		/// Convert floating point to half-precision.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam T source type (builtin floating point type)
-		/// \param value floating point value
-		/// \return binary representation of half-precision value
-		template<std::float_round_style R,typename T> uint16 float2half(T value)
-		{
-			return float2half_impl<R>(value, bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>());
-		}
-
-		/// Convert integer to half-precision floating point.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam S `true` if value negative, `false` else
-		/// \tparam T type to convert (builtin integer type)
-		/// \param value non-negative integral value
-		/// \return binary representation of half-precision value
-		template<std::float_round_style R,bool S,typename T> uint16 int2half_impl(T value)
-		{
-		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
-			static_assert(std::is_integral<T>::value, "int to half conversion only supports builtin integer types");
-		#endif
-			if(S)
-				value = -value;
-			uint16 bits = S << 15;
-			if(value > 0xFFFF)
-			{
-				if(R == std::round_toward_infinity)
-					bits |= 0x7C00 - S;
-				else if(R == std::round_toward_neg_infinity)
-					bits |= 0x7BFF + S;
-				else
-					bits |= 0x7BFF + (R!=std::round_toward_zero);
-			}
-			else if(value)
-			{
-				unsigned int m = value, exp = 24;
-				for(; m<0x400; m<<=1,--exp) ;
-				for(; m>0x7FF; m>>=1,++exp) ;
-				bits |= (exp<<10) + m;
-				if(exp > 24)
-				{
-					if(R == std::round_to_nearest)
-						bits += (value>>(exp-25)) & 1
-						#if HALF_ROUND_TIES_TO_EVEN
-							& (((((1<<(exp-25))-1)&value)!=0)|bits)
-						#endif
-						;
-					else if(R == std::round_toward_infinity)
-						bits += ((value&((1<<(exp-24))-1))!=0) & !S;
-					else if(R == std::round_toward_neg_infinity)
-						bits += ((value&((1<<(exp-24))-1))!=0) & S;
-				}
-			}
-			return bits;
-		}
-
-		/// Convert integer to half-precision floating point.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam T type to convert (builtin integer type)
-		/// \param value integral value
-		/// \return binary representation of half-precision value
-		template<std::float_round_style R,typename T> uint16 int2half(T value)
-		{
-			return (value<0) ? int2half_impl<R,true>(value) : int2half_impl<R,false>(value);
-		}
-
-		/// Convert half-precision to IEEE single-precision.
-		/// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf).
-		/// \param value binary representation of half-precision value
-		/// \return single-precision value
-		inline float half2float_impl(uint16 value, float, true_type)
-		{
-			typedef bits<float>::type uint32;
-/*			uint32 bits = static_cast<uint32>(value&0x8000) << 16;
-			int abs = value & 0x7FFF;
-			if(abs)
-			{
-				bits |= 0x38000000 << static_cast<unsigned>(abs>=0x7C00);
-				for(; abs<0x400; abs<<=1,bits-=0x800000) ;
-				bits += static_cast<uint32>(abs) << 13;
-			}
-*/			static const uint32 mantissa_table[2048] = { 
-				0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, 
-				0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, 
-				0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, 
-				0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, 
-				0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, 
-				0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, 
-				0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, 
-				0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, 
-				0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, 
-				0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, 
-				0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, 
-				0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, 
-				0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, 
-				0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, 
-				0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, 
-				0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, 
-				0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, 
-				0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, 
-				0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, 
-				0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, 
-				0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, 
-				0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, 
-				0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, 
-				0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, 
-				0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, 
-				0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, 
-				0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, 
-				0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, 
-				0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, 
-				0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, 
-				0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, 
-				0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, 
-				0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, 
-				0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, 
-				0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, 
-				0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, 
-				0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, 
-				0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, 
-				0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, 
-				0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, 
-				0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, 
-				0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, 
-				0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, 
-				0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, 
-				0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, 
-				0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, 
-				0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, 
-				0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, 
-				0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 
-				0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, 
-				0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, 
-				0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, 
-				0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, 
-				0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, 
-				0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, 
-				0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, 
-				0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, 
-				0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, 
-				0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, 
-				0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, 
-				0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, 
-				0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, 
-				0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, 
-				0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, 
-				0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, 
-				0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, 
-				0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, 
-				0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, 
-				0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, 
-				0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, 
-				0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, 
-				0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, 
-				0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, 
-				0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, 
-				0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, 
-				0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, 
-				0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, 
-				0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, 
-				0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, 
-				0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, 
-				0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, 
-				0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, 
-				0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, 
-				0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, 
-				0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, 
-				0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, 
-				0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, 
-				0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, 
-				0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, 
-				0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, 
-				0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, 
-				0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, 
-				0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, 
-				0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, 
-				0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, 
-				0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, 
-				0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, 
-				0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, 
-				0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, 
-				0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, 
-				0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, 
-				0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, 
-				0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, 
-				0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, 
-				0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, 
-				0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, 
-				0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, 
-				0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, 
-				0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, 
-				0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, 
-				0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, 
-				0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, 
-				0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, 
-				0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, 
-				0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, 
-				0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, 
-				0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, 
-				0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, 
-				0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, 
-				0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, 
-				0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, 
-				0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, 
-				0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, 
-				0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, 
-				0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, 
-				0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, 
-				0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, 
-				0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 };
-			static const uint32 exponent_table[64] = { 
-				0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, 
-				0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, 
-				0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, 
-				0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 };
-			static const unsigned short offset_table[64] = { 
-				   0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 
-				   0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 };
-			uint32 bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10];
-//			return *reinterpret_cast<float*>(&bits);			//violating strict aliasing!
-			float out;
-			std::memcpy(&out, &bits, sizeof(float));
-			return out;
-		}
-
-		/// Convert half-precision to IEEE double-precision.
-		/// \param value binary representation of half-precision value
-		/// \return double-precision value
-		inline double half2float_impl(uint16 value, double, true_type)
-		{
-			typedef bits<float>::type uint32;
-			typedef bits<double>::type uint64;
-			uint32 hi = static_cast<uint32>(value&0x8000) << 16;
-			int abs = value & 0x7FFF;
-			if(abs)
-			{
-				hi |= 0x3F000000 << static_cast<unsigned>(abs>=0x7C00);
-				for(; abs<0x400; abs<<=1,hi-=0x100000) ;
-				hi += static_cast<uint32>(abs) << 10;
-			}
-			uint64 bits = static_cast<uint64>(hi) << 32;
-//			return *reinterpret_cast<double*>(&bits);			//violating strict aliasing!
-			double out;
-			std::memcpy(&out, &bits, sizeof(double));
-			return out;
-		}
-
-		/// Convert half-precision to non-IEEE floating point.
-		/// \tparam T type to convert to (builtin integer type)
-		/// \param value binary representation of half-precision value
-		/// \return floating point value
-		template<typename T> T half2float_impl(uint16 value, T, ...)
-		{
-			T out;
-			int abs = value & 0x7FFF;
-			if(abs > 0x7C00)
-				out = std::numeric_limits<T>::has_quiet_NaN ? std::numeric_limits<T>::quiet_NaN() : T();
-			else if(abs == 0x7C00)
-				out = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
-			else if(abs > 0x3FF)
-				out = std::ldexp(static_cast<T>((abs&0x3FF)|0x400), (abs>>10)-25);
-			else
-				out = std::ldexp(static_cast<T>(abs), -24);
-			return (value&0x8000) ? -out : out;
-		}
-
-		/// Convert half-precision to floating point.
-		/// \tparam T type to convert to (builtin integer type)
-		/// \param value binary representation of half-precision value
-		/// \return floating point value
-		template<typename T> T half2float(uint16 value)
-		{
-			return half2float_impl(value, T(), bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>());
-		}
-
-		/// Convert half-precision floating point to integer.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam E `true` for round to even, `false` for round away from zero
-		/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits)
-		/// \param value binary representation of half-precision value
-		/// \return integral value
-		template<std::float_round_style R,bool E,typename T> T half2int_impl(uint16 value)
-		{
-		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
-			static_assert(std::is_integral<T>::value, "half to int conversion only supports builtin integer types");
-		#endif
-			unsigned int e = value & 0x7FFF;
-			if(e >= 0x7C00)
-				return (value&0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max();
-			if(e < 0x3800)
-			{
-				if(R == std::round_toward_infinity)
-					return T(~(value>>15)&(e!=0));
-				else if(R == std::round_toward_neg_infinity)
-					return -T(value>0x8000);
-				return T();
-			}
-			unsigned int m = (value&0x3FF) | 0x400;
-			e >>= 10;
-			if(e < 25)
-			{
-				if(R == std::round_to_nearest)
-					m += (1<<(24-e)) - (~(m>>(25-e))&E);
-				else if(R == std::round_toward_infinity)
-					m += ((value>>15)-1) & ((1<<(25-e))-1U);
-				else if(R == std::round_toward_neg_infinity)
-					m += -(value>>15) & ((1<<(25-e))-1U);
-				m >>= 25 - e;
-			}
-			else
-				m <<= e - 25;
-			return (value&0x8000) ? -static_cast<T>(m) : static_cast<T>(m);
-		}
-
-		/// Convert half-precision floating point to integer.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits)
-		/// \param value binary representation of half-precision value
-		/// \return integral value
-		template<std::float_round_style R,typename T> T half2int(uint16 value) { return half2int_impl<R,HALF_ROUND_TIES_TO_EVEN,T>(value); }
-
-		/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
-		/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits)
-		/// \param value binary representation of half-precision value
-		/// \return integral value
-		template<typename T> T half2int_up(uint16 value) { return half2int_impl<std::round_to_nearest,0,T>(value); }
-
-		/// Round half-precision number to nearest integer value.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \tparam E `true` for round to even, `false` for round away from zero
-		/// \param value binary representation of half-precision value
-		/// \return half-precision bits for nearest integral value
-		template<std::float_round_style R,bool E> uint16 round_half_impl(uint16 value)
-		{
-			unsigned int e = value & 0x7FFF;
-			uint16 result = value;
-			if(e < 0x3C00)
-			{
-				result &= 0x8000;
-				if(R == std::round_to_nearest)
-					result |= 0x3C00U & -(e>=(0x3800+E));
-				else if(R == std::round_toward_infinity)
-					result |= 0x3C00U & -(~(value>>15)&(e!=0));
-				else if(R == std::round_toward_neg_infinity)
-					result |= 0x3C00U & -(value>0x8000);
-			}
-			else if(e < 0x6400)
-			{
-				e = 25 - (e>>10);
-				unsigned int mask = (1<<e) - 1;
-				if(R == std::round_to_nearest)
-					result += (1<<(e-1)) - (~(result>>e)&E);
-				else if(R == std::round_toward_infinity)
-					result += mask & ((value>>15)-1);
-				else if(R == std::round_toward_neg_infinity)
-					result += mask & -(value>>15);
-				result &= ~mask;
-			}
-			return result;
-		}
-
-		/// Round half-precision number to nearest integer value.
-		/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-		/// \param value binary representation of half-precision value
-		/// \return half-precision bits for nearest integral value
-		template<std::float_round_style R> uint16 round_half(uint16 value) { return round_half_impl<R,HALF_ROUND_TIES_TO_EVEN>(value); }
-
-		/// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero.
-		/// \param value binary representation of half-precision value
-		/// \return half-precision bits for nearest integral value
-		inline uint16 round_half_up(uint16 value) { return round_half_impl<std::round_to_nearest,0>(value); }
-		/// \}
-
-		struct functions;
-		template<typename> struct unary_specialized;
-		template<typename,typename> struct binary_specialized;
-		template<typename,typename,std::float_round_style> struct half_caster;
-	}
-
-	/// Half-precision floating point type.
-	/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and 
-	/// conversions. It is implicitly convertible to single-precision floating point, which makes artihmetic expressions and 
-	/// functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic operations 
-	/// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to 
-	/// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic 
-	/// expressions are kept in single-precision as long as possible (while of course still maintaining a strong half-precision type).
-	///
-	/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and 
-	/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which 
-	/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the 
-	/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not neccessarily have to be of 
-	/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most 
-	/// probably not ivolve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit 
-	/// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if 
-	/// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on 
-	/// nearly any reasonable platform.
-	///
-	/// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable 
-	/// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation.
-	class half
-	{
-		friend struct detail::functions;
-		friend struct detail::unary_specialized<half>;
-		friend struct detail::binary_specialized<half,half>;
-		template<typename,typename,std::float_round_style> friend struct detail::half_caster;
-		friend class std::numeric_limits<half>;
-	#if HALF_ENABLE_CPP11_HASH
-		friend struct std::hash<half>;
-	#endif
-	#if HALF_ENABLE_CPP11_USER_LITERALS
-		friend half literal::operator""_h(long double);
-	#endif
-
-	public:
-		/// Default constructor.
-		/// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics 
-		/// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
-		HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {}
-
-		/// Copy constructor.
-		/// \tparam T type of concrete half expression
-		/// \param rhs half expression to copy from
-		half(detail::expr rhs) : data_(detail::float2half<round_style>(static_cast<float>(rhs))) {}
-
-		/// Conversion constructor.
-		/// \param rhs float to convert
-		explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
-	
-		/// Conversion to single-precision.
-		/// \return single precision value representing expression value
-		operator float() const { return detail::half2float<float>(data_); }
-
-		/// Assignment operator.
-		/// \tparam T type of concrete half expression
-		/// \param rhs half expression to copy from
-		/// \return reference to this half
-		half& operator=(detail::expr rhs) { return *this = static_cast<float>(rhs); }
-
-		/// Arithmetic assignment.
-		/// \tparam T type of concrete half expression
-		/// \param rhs half expression to add
-		/// \return reference to this half
-		template<typename T> typename detail::enable<half&,T>::type operator+=(T rhs) { return *this += static_cast<float>(rhs); }
-
-		/// Arithmetic assignment.
-		/// \tparam T type of concrete half expression
-		/// \param rhs half expression to subtract
-		/// \return reference to this half
-		template<typename T> typename detail::enable<half&,T>::type operator-=(T rhs) { return *this -= static_cast<float>(rhs); }
-
-		/// Arithmetic assignment.
-		/// \tparam T type of concrete half expression
-		/// \param rhs half expression to multiply with
-		/// \return reference to this half
-		template<typename T> typename detail::enable<half&,T>::type operator*=(T rhs) { return *this *= static_cast<float>(rhs); }
-
-		/// Arithmetic assignment.
-		/// \tparam T type of concrete half expression
-		/// \param rhs half expression to divide by
-		/// \return reference to this half
-		template<typename T> typename detail::enable<half&,T>::type operator/=(T rhs) { return *this /= static_cast<float>(rhs); }
-
-		/// Assignment operator.
-		/// \param rhs single-precision value to copy from
-		/// \return reference to this half
-		half& operator=(float rhs) { data_ = detail::float2half<round_style>(rhs); return *this; }
-
-		/// Arithmetic assignment.
-		/// \param rhs single-precision value to add
-		/// \return reference to this half
-		half& operator+=(float rhs) { data_ = detail::float2half<round_style>(detail::half2float<float>(data_)+rhs); return *this; }
-
-		/// Arithmetic assignment.
-		/// \param rhs single-precision value to subtract
-		/// \return reference to this half
-		half& operator-=(float rhs) { data_ = detail::float2half<round_style>(detail::half2float<float>(data_)-rhs); return *this; }
-
-		/// Arithmetic assignment.
-		/// \param rhs single-precision value to multiply with
-		/// \return reference to this half
-		half& operator*=(float rhs) { data_ = detail::float2half<round_style>(detail::half2float<float>(data_)*rhs); return *this; }
-
-		/// Arithmetic assignment.
-		/// \param rhs single-precision value to divide by
-		/// \return reference to this half
-		half& operator/=(float rhs) { data_ = detail::float2half<round_style>(detail::half2float<float>(data_)/rhs); return *this; }
-
-		/// Prefix increment.
-		/// \return incremented half value
-		half& operator++() { return *this += 1.0f; }
-
-		/// Prefix decrement.
-		/// \return decremented half value
-		half& operator--() { return *this -= 1.0f; }
-
-		/// Postfix increment.
-		/// \return non-incremented half value
-		half operator++(int) { half out(*this); ++*this; return out; }
-
-		/// Postfix decrement.
-		/// \return non-decremented half value
-		half operator--(int) { half out(*this); --*this; return out; }
-	
-	private:
-		/// Rounding mode to use
-		static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE);
-
-		/// Constructor.
-		/// \param bits binary representation to set half to
-		HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {}
-
-		/// Internal binary representation
-		detail::uint16 data_;
-	};
-
-#if HALF_ENABLE_CPP11_USER_LITERALS
-	namespace literal
-	{
-		/// Half literal.
-		/// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due 
-		/// to rather involved conversions.
-		/// \param value literal value
-		/// \return half with given value (if representable)
-		inline half operator""_h(long double value) { return half(detail::binary, detail::float2half<half::round_style>(value)); }
-	}
-#endif
-
-	namespace detail
-	{
-		/// Wrapper implementing unspecialized half-precision functions.
-		struct functions
-		{
-			/// Addition implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Half-precision sum stored in single-precision
-			static expr plus(float x, float y) { return expr(x+y); }
-
-			/// Subtraction implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Half-precision difference stored in single-precision
-			static expr minus(float x, float y) { return expr(x-y); }
-
-			/// Multiplication implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Half-precision product stored in single-precision
-			static expr multiplies(float x, float y) { return expr(x*y); }
-
-			/// Division implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Half-precision quotient stored in single-precision
-			static expr divides(float x, float y) { return expr(x/y); }
-
-			/// Output implementation.
-			/// \param out stream to write to
-			/// \param arg value to write
-			/// \return reference to stream
-			template<typename charT,typename traits> static std::basic_ostream<charT,traits>& write(std::basic_ostream<charT,traits> &out, float arg) { return out << arg; }
-
-			/// Input implementation.
-			/// \param in stream to read from
-			/// \param arg half to read into
-			/// \return reference to stream
-			template<typename charT,typename traits> static std::basic_istream<charT,traits>& read(std::basic_istream<charT,traits> &in, half &arg)
-			{
-				float f;
-				if(in >> f)
-					arg = f;
-				return in;
-			}
-
-			/// Modulo implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Half-precision division remainder stored in single-precision
-			static expr fmod(float x, float y) { return expr(std::fmod(x, y)); }
-
-			/// Remainder implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Half-precision division remainder stored in single-precision
-			static expr remainder(float x, float y)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::remainder(x, y));
-			#else
-				if(builtin_isnan(x) || builtin_isnan(y))
-					return expr(std::numeric_limits<float>::quiet_NaN());
-				float ax = std::fabs(x), ay = std::fabs(y);
-				if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24))
-					return expr(std::numeric_limits<float>::quiet_NaN());
-				if(ay >= 65536.0f)
-					return expr(x);
-				if(ax == ay)
-					return expr(builtin_signbit(x) ? -0.0f : 0.0f);
-				ax = std::fmod(ax, ay+ay);
-				float y2 = 0.5f * ay;
-				if(ax > y2)
-				{
-					ax -= ay;
-					if(ax >= y2)
-						ax -= ay;
-				}
-				return expr(builtin_signbit(x) ? -ax : ax);
-			#endif
-			}
-
-			/// Remainder implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \param quo address to store quotient bits at
-			/// \return Half-precision division remainder stored in single-precision
-			static expr remquo(float x, float y, int *quo)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::remquo(x, y, quo));
-			#else
-				if(builtin_isnan(x) || builtin_isnan(y))
-					return expr(std::numeric_limits<float>::quiet_NaN());
-				bool sign = builtin_signbit(x), qsign = static_cast<bool>(sign^builtin_signbit(y));
-				float ax = std::fabs(x), ay = std::fabs(y);
-				if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24))
-					return expr(std::numeric_limits<float>::quiet_NaN());
-				if(ay >= 65536.0f)
-					return expr(x);
-				if(ax == ay)
-					return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f);
-				ax = std::fmod(ax, 8.0f*ay);
-				int cquo = 0;
-				if(ax >= 4.0f * ay)
-				{
-					ax -= 4.0f * ay;
-					cquo += 4;
-				}
-				if(ax >= 2.0f * ay)
-				{
-					ax -= 2.0f * ay;
-					cquo += 2;
-				}
-				float y2 = 0.5f * ay;
-				if(ax > y2)
-				{
-					ax -= ay;
-					++cquo;
-					if(ax >= y2)
-					{
-						ax -= ay;
-						++cquo;
-					}
-				}
-				return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax);
-			#endif
-			}
-
-			/// Positive difference implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return Positive difference stored in single-precision
-			static expr fdim(float x, float y)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::fdim(x, y));
-			#else
-				return expr((x<=y) ? 0.0f : (x-y));
-			#endif
-			}
-
-			/// Fused multiply-add implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \param z third operand
-			/// \return \a x * \a y + \a z stored in single-precision
-			static expr fma(float x, float y, float z)
-			{
-			#if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF)
-				return expr(std::fma(x, y, z));
-			#else
-				return expr(x*y+z);
-			#endif
-			}
-
-			/// Get NaN.
-			/// \return Half-precision quiet NaN
-			static half nanh() { return half(binary, 0x7FFF); }
-
-			/// Exponential implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr exp(float arg) { return expr(std::exp(arg)); }
-
-			/// Exponential implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr expm1(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::expm1(arg));
-			#else
-				return expr(static_cast<float>(std::exp(static_cast<double>(arg))-1.0));
-			#endif
-			}
-
-			/// Binary exponential implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr exp2(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::exp2(arg));
-			#else
-				return expr(static_cast<float>(std::exp(arg*0.69314718055994530941723212145818)));
-			#endif
-			}
-
-			/// Logarithm implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr log(float arg) { return expr(std::log(arg)); }
-
-			/// Common logarithm implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr log10(float arg) { return expr(std::log10(arg)); }
-
-			/// Logarithm implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr log1p(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::log1p(arg));
-			#else
-				return expr(static_cast<float>(std::log(1.0+arg)));
-			#endif
-			}
-
-			/// Binary logarithm implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr log2(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::log2(arg));
-			#else
-				return expr(static_cast<float>(std::log(static_cast<double>(arg))*1.4426950408889634073599246810019));
-			#endif
-			}
-
-			/// Square root implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr sqrt(float arg) { return expr(std::sqrt(arg)); }
-
-			/// Cubic root implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr cbrt(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::cbrt(arg));
-			#else
-				if(builtin_isnan(arg) || builtin_isinf(arg))
-					return expr(arg);
-				return expr(builtin_signbit(arg) ? -static_cast<float>(std::pow(-static_cast<double>(arg), 1.0/3.0)) : 
-					static_cast<float>(std::pow(static_cast<double>(arg), 1.0/3.0)));
-			#endif
-			}
-
-			/// Hypotenuse implementation.
-			/// \param x first argument
-			/// \param y second argument
-			/// \return function value stored in single-preicision
-			static expr hypot(float x, float y)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::hypot(x, y));
-			#else
-				return expr((builtin_isinf(x) || builtin_isinf(y)) ? std::numeric_limits<float>::infinity() : 
-					static_cast<float>(std::sqrt(static_cast<double>(x)*x+static_cast<double>(y)*y)));
-			#endif
-			}
-
-			/// Power implementation.
-			/// \param base value to exponentiate
-			/// \param exp power to expontiate to
-			/// \return function value stored in single-preicision
-			static expr pow(float base, float exp) { return expr(std::pow(base, exp)); }
-
-			/// Sine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr sin(float arg) { return expr(std::sin(arg)); }
-
-			/// Cosine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr cos(float arg) { return expr(std::cos(arg)); }
-
-			/// Tan implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr tan(float arg) { return expr(std::tan(arg)); }
-
-			/// Arc sine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr asin(float arg) { return expr(std::asin(arg)); }
-
-			/// Arc cosine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr acos(float arg) { return expr(std::acos(arg)); }
-
-			/// Arc tangent implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr atan(float arg) { return expr(std::atan(arg)); }
-
-			/// Arc tangent implementation.
-			/// \param x first argument
-			/// \param y second argument
-			/// \return function value stored in single-preicision
-			static expr atan2(float x, float y) { return expr(std::atan2(x, y)); }
-
-			/// Hyperbolic sine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr sinh(float arg) { return expr(std::sinh(arg)); }
-
-			/// Hyperbolic cosine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr cosh(float arg) { return expr(std::cosh(arg)); }
-
-			/// Hyperbolic tangent implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr tanh(float arg) { return expr(std::tanh(arg)); }
-
-			/// Hyperbolic area sine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr asinh(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::asinh(arg));
-			#else
-				return expr((arg==-std::numeric_limits<float>::infinity()) ? arg : static_cast<float>(std::log(arg+std::sqrt(arg*arg+1.0))));
-			#endif
-			}
-
-			/// Hyperbolic area cosine implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr acosh(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::acosh(arg));
-			#else
-				return expr((arg<-1.0f) ? std::numeric_limits<float>::quiet_NaN() : static_cast<float>(std::log(arg+std::sqrt(arg*arg-1.0))));
-			#endif
-			}
-
-			/// Hyperbolic area tangent implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr atanh(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::atanh(arg));
-			#else
-				return expr(static_cast<float>(0.5*std::log((1.0+arg)/(1.0-arg))));
-			#endif
-			}
-
-			/// Error function implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr erf(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::erf(arg));
-			#else
-				return expr(static_cast<float>(erf(static_cast<double>(arg))));
-			#endif
-			}
-
-			/// Complementary implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr erfc(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::erfc(arg));
-			#else
-				return expr(static_cast<float>(1.0-erf(static_cast<double>(arg))));
-			#endif
-			}
-
-			/// Gamma logarithm implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr lgamma(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::lgamma(arg));
-			#else
-				if(builtin_isinf(arg))
-					return expr(std::numeric_limits<float>::infinity());
-				if(arg < 0.0f)
-				{
-					float i, f = std::modf(-arg, &i);
-					if(f == 0.0f)
-						return expr(std::numeric_limits<float>::infinity());
-					return expr(static_cast<float>(1.1447298858494001741434273513531-
-						std::log(std::abs(std::sin(3.1415926535897932384626433832795*f)))-lgamma(1.0-arg)));
-				}
-				return expr(static_cast<float>(lgamma(static_cast<double>(arg))));
-			#endif
-			}
-
-			/// Gamma implementation.
-			/// \param arg function argument
-			/// \return function value stored in single-preicision
-			static expr tgamma(float arg)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::tgamma(arg));
-			#else
-				if(arg == 0.0f)
-					return builtin_signbit(arg) ? expr(-std::numeric_limits<float>::infinity()) : expr(std::numeric_limits<float>::infinity());
-				if(arg < 0.0f)
-				{
-					float i, f = std::modf(-arg, &i);
-					if(f == 0.0f)
-						return expr(std::numeric_limits<float>::quiet_NaN());
-					double value = 3.1415926535897932384626433832795 / (std::sin(3.1415926535897932384626433832795*f)*std::exp(lgamma(1.0-arg)));
-					return expr(static_cast<float>((std::fmod(i, 2.0f)==0.0f) ? -value : value));
-				}
-				if(builtin_isinf(arg))
-					return expr(arg);
-				return expr(static_cast<float>(std::exp(lgamma(static_cast<double>(arg)))));
-			#endif
-			}
-
-			/// Floor implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static half floor(half arg) { return half(binary, round_half<std::round_toward_neg_infinity>(arg.data_)); }
-
-			/// Ceiling implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static half ceil(half arg) { return half(binary, round_half<std::round_toward_infinity>(arg.data_)); }
-
-			/// Truncation implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static half trunc(half arg) { return half(binary, round_half<std::round_toward_zero>(arg.data_)); }
-
-			/// Nearest integer implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static half round(half arg) { return half(binary, round_half_up(arg.data_)); }
-
-			/// Nearest integer implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static long lround(half arg) { return detail::half2int_up<long>(arg.data_); }
-
-			/// Nearest integer implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static half rint(half arg) { return half(binary, round_half<half::round_style>(arg.data_)); }
-
-			/// Nearest integer implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static long lrint(half arg) { return detail::half2int<half::round_style,long>(arg.data_); }
-
-		#if HALF_ENABLE_CPP11_LONG_LONG
-			/// Nearest integer implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static long long llround(half arg) { return detail::half2int_up<long long>(arg.data_); }
-
-			/// Nearest integer implementation.
-			/// \param arg value to round
-			/// \return rounded value
-			static long long llrint(half arg) { return detail::half2int<half::round_style,long long>(arg.data_); }
-		#endif
-
-			/// Decompression implementation.
-			/// \param arg number to decompress
-			/// \param exp address to store exponent at
-			/// \return normalized significant
-			static half frexp(half arg, int *exp)
-			{
-				int m = arg.data_ & 0x7FFF, e = -14;
-				if(m >= 0x7C00 || !m)
-					return *exp = 0, arg;
-				for(; m<0x400; m<<=1,--e) ;
-				return *exp = e+(m>>10), half(binary, (arg.data_&0x8000)|0x3800|(m&0x3FF));
-			}
-
-			/// Decompression implementation.
-			/// \param arg number to decompress
-			/// \param iptr address to store integer part at
-			/// \return fractional part
-			static half modf(half arg, half *iptr)
-			{
-				unsigned int e = arg.data_ & 0x7FFF;
-				if(e >= 0x6400)
-					return *iptr = arg, half(binary, arg.data_&(0x8000U|-(e>0x7C00)));
-				if(e < 0x3C00)
-					return iptr->data_ = arg.data_ & 0x8000, arg;
-				e >>= 10;
-				unsigned int mask = (1<<(25-e)) - 1, m = arg.data_ & mask;
-				iptr->data_ = arg.data_ & ~mask;
-				if(!m)
-					return half(binary, arg.data_&0x8000);
-				for(; m<0x400; m<<=1,--e) ;
-				return half(binary, static_cast<uint16>((arg.data_&0x8000)|(e<<10)|(m&0x3FF)));
-			}
-
-			/// Scaling implementation.
-			/// \param arg number to scale
-			/// \param exp power of two to scale by
-			/// \return scaled number
-			static half scalbln(half arg, long exp)
-			{
-				unsigned int m = arg.data_ & 0x7FFF;
-				if(m >= 0x7C00 || !m)
-					return arg;
-				for(; m<0x400; m<<=1,--exp) ;
-				exp += m >> 10;
-				uint16 value = arg.data_ & 0x8000;
-				if(exp > 30)
-				{
-					if(half::round_style == std::round_toward_zero)
-						value |= 0x7BFF;
-					else if(half::round_style == std::round_toward_infinity)
-						value |= 0x7C00 - (value>>15);
-					else if(half::round_style == std::round_toward_neg_infinity)
-						value |= 0x7BFF + (value>>15);
-					else
-						value |= 0x7C00;
-				}
-				else if(exp > 0)
-					value |= (exp<<10) | (m&0x3FF);
-				else if(exp > -11)
-				{
-					m = (m&0x3FF) | 0x400;
-					if(half::round_style == std::round_to_nearest)
-					{
-						m += 1 << -exp;
-					#if HALF_ROUND_TIES_TO_EVEN
-						m -= (m>>(1-exp)) & 1;
-					#endif
-					}
-					else if(half::round_style == std::round_toward_infinity)
-						m += ((value>>15)-1) & ((1<<(1-exp))-1U);
-					else if(half::round_style == std::round_toward_neg_infinity)
-						m += -(value>>15) & ((1<<(1-exp))-1U);
-					value |= m >> (1-exp);
-				}
-				else if(half::round_style == std::round_toward_infinity)
-					value -= (value>>15) - 1;
-				else if(half::round_style == std::round_toward_neg_infinity)
-					value += value >> 15;
-				return half(binary, value);
-			}
-
-			/// Exponent implementation.
-			/// \param arg number to query
-			/// \return floating point exponent
-			static int ilogb(half arg)
-			{
-				int abs = arg.data_ & 0x7FFF;
-				if(!abs)
-					return FP_ILOGB0;
-				if(abs < 0x7C00)
-				{
-					int exp = (abs>>10) - 15;
-					if(abs < 0x400)
-						for(; abs<0x200; abs<<=1,--exp) ;
-					return exp;
-				}
-				if(abs > 0x7C00)
-					return FP_ILOGBNAN;
-				return INT_MAX;
-			}
-
-			/// Exponent implementation.
-			/// \param arg number to query
-			/// \return floating point exponent
-			static half logb(half arg)
-			{
-				int abs = arg.data_ & 0x7FFF;
-				if(!abs)
-					return half(binary, 0xFC00);
-				if(abs < 0x7C00)
-				{
-					int exp = (abs>>10) - 15;
-					if(abs < 0x400)
-						for(; abs<0x200; abs<<=1,--exp) ;
-					uint16 bits = (exp<0) << 15;
-					if(exp)
-					{
-						unsigned int m = std::abs(exp) << 6, e = 18;
-						for(; m<0x400; m<<=1,--e) ;
-						bits |= (e<<10) + m;
-					}
-					return half(binary, bits);
-				}
-				if(abs > 0x7C00)
-					return arg;
-				return half(binary, 0x7C00);
-			}
-
-			/// Enumeration implementation.
-			/// \param from number to increase/decrease
-			/// \param to direction to enumerate into
-			/// \return next representable number
-			static half nextafter(half from, half to)
-			{
-				uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF;
-				if(fabs > 0x7C00)
-					return from;
-				if(tabs > 0x7C00 || from.data_ == to.data_ || !(fabs|tabs))
-					return to;
-				if(!fabs)
-					return half(binary, (to.data_&0x8000)+1);
-				bool lt = ((fabs==from.data_) ? static_cast<int>(fabs) : -static_cast<int>(fabs)) < 
-					((tabs==to.data_) ? static_cast<int>(tabs) : -static_cast<int>(tabs));
-				return half(binary, from.data_+(((from.data_>>15)^static_cast<unsigned>(lt))<<1)-1);
-			}
-
-			/// Enumeration implementation.
-			/// \param from number to increase/decrease
-			/// \param to direction to enumerate into
-			/// \return next representable number
-			static half nexttoward(half from, long double to)
-			{
-				if(isnan(from))
-					return from;
-				long double lfrom = static_cast<long double>(from);
-				if(builtin_isnan(to) || lfrom == to)
-					return half(static_cast<float>(to));
-				if(!(from.data_&0x7FFF))
-					return half(binary, (static_cast<detail::uint16>(builtin_signbit(to))<<15)+1);
-				return half(binary, from.data_+(((from.data_>>15)^static_cast<unsigned>(lfrom<to))<<1)-1);
-			}
-
-			/// Sign implementation
-			/// \param x first operand
-			/// \param y second operand
-			/// \return composed value
-			static half copysign(half x, half y) { return half(binary, x.data_^((x.data_^y.data_)&0x8000)); }
-
-			/// Classification implementation.
-			/// \param arg value to classify
-			/// \retval true if infinite number
-			/// \retval false else
-			static int fpclassify(half arg)
-			{
-				unsigned int abs = arg.data_ & 0x7FFF;
-				return abs ? ((abs>0x3FF) ? ((abs>=0x7C00) ? ((abs>0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) :FP_SUBNORMAL) : FP_ZERO;
-			}
-
-			/// Classification implementation.
-			/// \param arg value to classify
-			/// \retval true if finite number
-			/// \retval false else
-			static bool isfinite(half arg) { return (arg.data_&0x7C00) != 0x7C00; }
-
-			/// Classification implementation.
-			/// \param arg value to classify
-			/// \retval true if infinite number
-			/// \retval false else
-			static bool isinf(half arg) { return (arg.data_&0x7FFF) == 0x7C00; }
-
-			/// Classification implementation.
-			/// \param arg value to classify
-			/// \retval true if not a number
-			/// \retval false else
-			static bool isnan(half arg) { return (arg.data_&0x7FFF) > 0x7C00; }
-
-			/// Classification implementation.
-			/// \param arg value to classify
-			/// \retval true if normal number
-			/// \retval false else
-			static bool isnormal(half arg) { return ((arg.data_&0x7C00)!=0) & ((arg.data_&0x7C00)!=0x7C00); }
-
-			/// Sign bit implementation.
-			/// \param arg value to check
-			/// \retval true if signed
-			/// \retval false if unsigned
-			static bool signbit(half arg) { return (arg.data_&0x8000) != 0; }
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if operands equal
-			/// \retval false else
-			static bool isequal(half x, half y) { return (x.data_==y.data_ || !((x.data_|y.data_)&0x7FFF)) && !isnan(x); }
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if operands not equal
-			/// \retval false else
-			static bool isnotequal(half x, half y) { return (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)) || isnan(x); }
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if \a x > \a y
-			/// \retval false else
-			static bool isgreater(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) > ((yabs==y.data_) ? yabs : -yabs));
-			}
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if \a x >= \a y
-			/// \retval false else
-			static bool isgreaterequal(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) >= ((yabs==y.data_) ? yabs : -yabs));
-			}
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if \a x < \a y
-			/// \retval false else
-			static bool isless(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) < ((yabs==y.data_) ? yabs : -yabs));
-			}
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if \a x <= \a y
-			/// \retval false else
-			static bool islessequal(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) <= ((yabs==y.data_) ? yabs : -yabs));
-			}
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if either \a x > \a y nor \a x < \a y
-			/// \retval false else
-			static bool islessgreater(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				if(xabs > 0x7C00 || yabs > 0x7C00)
-					return false;
-				int a = (xabs==x.data_) ? xabs : -xabs, b = (yabs==y.data_) ? yabs : -yabs;
-				return a < b || a > b;
-			}
-
-			/// Comparison implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \retval true if operand unordered
-			/// \retval false else
-			static bool isunordered(half x, half y) { return isnan(x) || isnan(y); }
-
-		private:
-			static double erf(double arg)
-			{
-				if(builtin_isinf(arg))
-					return (arg<0.0) ? -1.0 : 1.0;
-				double x2 = arg * arg, ax2 = 0.147 * x2, value = std::sqrt(1.0-std::exp(-x2*(1.2732395447351626861510701069801+ax2)/(1.0+ax2)));
-				return builtin_signbit(arg) ? -value : value;
-			}
-
-			static double lgamma(double arg)
-			{
-				double v = 1.0;
-				for(; arg<8.0; ++arg) v *= arg;
-				double w = 1.0 / (arg*arg);
-				return (((((((-0.02955065359477124183006535947712*w+0.00641025641025641025641025641026)*w+
-					-0.00191752691752691752691752691753)*w+8.4175084175084175084175084175084e-4)*w+
-					-5.952380952380952380952380952381e-4)*w+7.9365079365079365079365079365079e-4)*w+
-					-0.00277777777777777777777777777778)*w+0.08333333333333333333333333333333)/arg + 
-					0.91893853320467274178032973640562 - std::log(v) - arg + (arg-0.5) * std::log(arg);
-			}
-		};
-
-		/// Wrapper for unary half-precision functions needing specialization for individual argument types.
-		/// \tparam T argument type
-		template<typename T> struct unary_specialized
-		{
-			/// Negation implementation.
-			/// \param arg value to negate
-			/// \return negated value
-			static HALF_CONSTEXPR half negate(half arg) { return half(binary, arg.data_^0x8000); }
-
-			/// Absolute value implementation.
-			/// \param arg function argument
-			/// \return absolute value
-			static half fabs(half arg) { return half(binary, arg.data_&0x7FFF); }
-		};
-		template<> struct unary_specialized<expr>
-		{
-			static HALF_CONSTEXPR expr negate(float arg) { return expr(-arg); }
-			static expr fabs(float arg) { return expr(std::fabs(arg)); }
-		};
-
-		/// Wrapper for binary half-precision functions needing specialization for individual argument types.
-		/// \tparam T first argument type
-		/// \tparam U first argument type
-		template<typename T,typename U> struct binary_specialized
-		{
-			/// Minimum implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return minimum value
-			static expr fmin(float x, float y)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::fmin(x, y));
-			#else
-				if(builtin_isnan(x))
-					return expr(y);
-				if(builtin_isnan(y))
-					return expr(x);
-				return expr(std::min(x, y));
-			#endif
-			}
-
-			/// Maximum implementation.
-			/// \param x first operand
-			/// \param y second operand
-			/// \return maximum value
-			static expr fmax(float x, float y)
-			{
-			#if HALF_ENABLE_CPP11_CMATH
-				return expr(std::fmax(x, y));
-			#else
-				if(builtin_isnan(x))
-					return expr(y);
-				if(builtin_isnan(y))
-					return expr(x);
-				return expr(std::max(x, y));
-			#endif
-			}
-		};
-		template<> struct binary_specialized<half,half>
-		{
-			static half fmin(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				if(xabs > 0x7C00)
-					return y;
-				if(yabs > 0x7C00)
-					return x;
-				return (((xabs==x.data_) ? xabs : -xabs) > ((yabs==y.data_) ? yabs : -yabs)) ? y : x;
-			}
-			static half fmax(half x, half y)
-			{
-				int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF;
-				if(xabs > 0x7C00)
-					return y;
-				if(yabs > 0x7C00)
-					return x;
-				return (((xabs==x.data_) ? xabs : -xabs) < ((yabs==y.data_) ? yabs : -yabs)) ? y : x;
-			}
-		};
-
-		/// Helper class for half casts.
-		/// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member 
-		/// function and a corresponding `type` member denoting its return type.
-		/// \tparam T destination type
-		/// \tparam U source type
-		/// \tparam R rounding mode to use
-		template<typename T,typename U,std::float_round_style R=(std::float_round_style)(HALF_ROUND_STYLE)> struct half_caster {};
-		template<typename U,std::float_round_style R> struct half_caster<half,U,R>
-		{
-		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
-			static_assert(std::is_arithmetic<U>::value, "half_cast from non-arithmetic type unsupported");
-		#endif
-
-			static half cast(U arg) { return cast_impl(arg, is_float<U>()); };
-
-		private:
-			static half cast_impl(U arg, true_type) { return half(binary, float2half<R>(arg)); }
-			static half cast_impl(U arg, false_type) { return half(binary, int2half<R>(arg)); }
-		};
-		template<typename T,std::float_round_style R> struct half_caster<T,half,R>
-		{
-		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
-			static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
-		#endif
-
-			static T cast(half arg) { return cast_impl(arg, is_float<T>()); }
-
-		private:
-			static T cast_impl(half arg, true_type) { return half2float<T>(arg.data_); }
-			static T cast_impl(half arg, false_type) { return half2int<R,T>(arg.data_); }
-		};
-		template<typename T,std::float_round_style R> struct half_caster<T,expr,R>
-		{
-		#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS
-			static_assert(std::is_arithmetic<T>::value, "half_cast to non-arithmetic type unsupported");
-		#endif
-
-			static T cast(expr arg) { return cast_impl(arg, is_float<T>()); }
-
-		private:
-			static T cast_impl(float arg, true_type) { return static_cast<T>(arg); }
-			static T cast_impl(half arg, false_type) { return half2int<R,T>(arg.data_); }
-		};
-		template<std::float_round_style R> struct half_caster<half,half,R>
-		{
-			static half cast(half arg) { return arg; }
-		};
-		template<std::float_round_style R> struct half_caster<half,expr,R> : half_caster<half,half,R> {};
-
-		/// \name Comparison operators
-		/// \{
-
-		/// Comparison for equality.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if operands equal
-		/// \retval false else
-		template<typename T,typename U> typename enable<bool,T,U>::type operator==(T x, U y) { return functions::isequal(x, y); }
-
-		/// Comparison for inequality.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if operands not equal
-		/// \retval false else
-		template<typename T,typename U> typename enable<bool,T,U>::type operator!=(T x, U y) { return functions::isnotequal(x, y); }
-
-		/// Comparison for less than.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x less than \a y
-		/// \retval false else
-		template<typename T,typename U> typename enable<bool,T,U>::type operator<(T x, U y) { return functions::isless(x, y); }
-
-		/// Comparison for greater than.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x greater than \a y
-		/// \retval false else
-		template<typename T,typename U> typename enable<bool,T,U>::type operator>(T x, U y) { return functions::isgreater(x, y); }
-
-		/// Comparison for less equal.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x less equal \a y
-		/// \retval false else
-		template<typename T,typename U> typename enable<bool,T,U>::type operator<=(T x, U y) { return functions::islessequal(x, y); }
-
-		/// Comparison for greater equal.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x greater equal \a y
-		/// \retval false else
-		template<typename T,typename U> typename enable<bool,T,U>::type operator>=(T x, U y) { return functions::isgreaterequal(x, y); }
-
-		/// \}
-		/// \name Arithmetic operators
-		/// \{
-
-		/// Add halfs.
-		/// \param x left operand
-		/// \param y right operand
-		/// \return sum of half expressions
-		template<typename T,typename U> typename enable<expr,T,U>::type operator+(T x, U y) { return functions::plus(x, y); }
-
-		/// Subtract halfs.
-		/// \param x left operand
-		/// \param y right operand
-		/// \return difference of half expressions
-		template<typename T,typename U> typename enable<expr,T,U>::type operator-(T x, U y) { return functions::minus(x, y); }
-
-		/// Multiply halfs.
-		/// \param x left operand
-		/// \param y right operand
-		/// \return product of half expressions
-		template<typename T,typename U> typename enable<expr,T,U>::type operator*(T x, U y) { return functions::multiplies(x, y); }
-
-		/// Divide halfs.
-		/// \param x left operand
-		/// \param y right operand
-		/// \return quotient of half expressions
-		template<typename T,typename U> typename enable<expr,T,U>::type operator/(T x, U y) { return functions::divides(x, y); }
-
-		/// Identity.
-		/// \param arg operand
-		/// \return uncahnged operand
-		template<typename T> HALF_CONSTEXPR typename enable<T,T>::type operator+(T arg) { return arg; }
-
-		/// Negation.
-		/// \param arg operand
-		/// \return negated operand
-		template<typename T> HALF_CONSTEXPR typename enable<T,T>::type operator-(T arg) { return unary_specialized<T>::negate(arg); }
-
-		/// \}
-		/// \name Input and output
-		/// \{
-
-		/// Output operator.
-		/// \param out output stream to write into
-		/// \param arg half expression to write
-		/// \return reference to output stream
-		template<typename T,typename charT,typename traits> typename enable<std::basic_ostream<charT,traits>&,T>::type
-			operator<<(std::basic_ostream<charT,traits> &out, T arg) { return functions::write(out, arg); }
-
-		/// Input operator.
-		/// \param in input stream to read from
-		/// \param arg half to read into
-		/// \return reference to input stream
-		template<typename charT,typename traits> std::basic_istream<charT,traits>&
-			operator>>(std::basic_istream<charT,traits> &in, half &arg) { return functions::read(in, arg); }
-
-		/// \}
-		/// \name Basic mathematical operations
-		/// \{
-
-		/// Absolute value.
-		/// \param arg operand
-		/// \return absolute value of \a arg
-//		template<typename T> typename enable<T,T>::type abs(T arg) { return unary_specialized<T>::fabs(arg); }
-		inline half abs(half arg) { return unary_specialized<half>::fabs(arg); }
-		inline expr abs(expr arg) { return unary_specialized<expr>::fabs(arg); }
-
-		/// Absolute value.
-		/// \param arg operand
-		/// \return absolute value of \a arg
-//		template<typename T> typename enable<T,T>::type fabs(T arg) { return unary_specialized<T>::fabs(arg); }
-		inline half fabs(half arg) { return unary_specialized<half>::fabs(arg); }
-		inline expr fabs(expr arg) { return unary_specialized<expr>::fabs(arg); }
-
-		/// Remainder of division.
-		/// \param x first operand
-		/// \param y second operand
-		/// \return remainder of floating point division.
-//		template<typename T,typename U> typename enable<expr,T,U>::type fmod(T x, U y) { return functions::fmod(x, y); }
-		inline expr fmod(half x, half y) { return functions::fmod(x, y); }
-		inline expr fmod(half x, expr y) { return functions::fmod(x, y); }
-		inline expr fmod(expr x, half y) { return functions::fmod(x, y); }
-		inline expr fmod(expr x, expr y) { return functions::fmod(x, y); }
-
-		/// Remainder of division.
-		/// \param x first operand
-		/// \param y second operand
-		/// \return remainder of floating point division.
-//		template<typename T,typename U> typename enable<expr,T,U>::type remainder(T x, U y) { return functions::remainder(x, y); }
-		inline expr remainder(half x, half y) { return functions::remainder(x, y); }
-		inline expr remainder(half x, expr y) { return functions::remainder(x, y); }
-		inline expr remainder(expr x, half y) { return functions::remainder(x, y); }
-		inline expr remainder(expr x, expr y) { return functions::remainder(x, y); }
-
-		/// Remainder of division.
-		/// \param x first operand
-		/// \param y second operand
-		/// \param quo address to store some bits of quotient at
-		/// \return remainder of floating point division.
-//		template<typename T,typename U> typename enable<expr,T,U>::type remquo(T x, U y, int *quo) { return functions::remquo(x, y, quo); }
-		inline expr remquo(half x, half y, int *quo) { return functions::remquo(x, y, quo); }
-		inline expr remquo(half x, expr y, int *quo) { return functions::remquo(x, y, quo); }
-		inline expr remquo(expr x, half y, int *quo) { return functions::remquo(x, y, quo); }
-		inline expr remquo(expr x, expr y, int *quo) { return functions::remquo(x, y, quo); }
-
-		/// Fused multiply add.
-		/// \param x first operand
-		/// \param y second operand
-		/// \param z third operand
-		/// \return ( \a x * \a y ) + \a z rounded as one operation.
-//		template<typename T,typename U,typename V> typename enable<expr,T,U,V>::type fma(T x, U y, V z) { return functions::fma(x, y, z); }
-		inline expr fma(half x, half y, half z) { return functions::fma(x, y, z); }
-		inline expr fma(half x, half y, expr z) { return functions::fma(x, y, z); }
-		inline expr fma(half x, expr y, half z) { return functions::fma(x, y, z); }
-		inline expr fma(half x, expr y, expr z) { return functions::fma(x, y, z); }
-		inline expr fma(expr x, half y, half z) { return functions::fma(x, y, z); }
-		inline expr fma(expr x, half y, expr z) { return functions::fma(x, y, z); }
-		inline expr fma(expr x, expr y, half z) { return functions::fma(x, y, z); }
-		inline expr fma(expr x, expr y, expr z) { return functions::fma(x, y, z); }
-
-		/// Maximum of half expressions.
-		/// \param x first operand
-		/// \param y second operand
-		/// \return maximum of operands
-//		template<typename T,typename U> typename result<T,U>::type fmax(T x, U y) { return binary_specialized<T,U>::fmax(x, y); }
-		inline half fmax(half x, half y) { return binary_specialized<half,half>::fmax(x, y); }
-		inline expr fmax(half x, expr y) { return binary_specialized<half,expr>::fmax(x, y); }
-		inline expr fmax(expr x, half y) { return binary_specialized<expr,half>::fmax(x, y); }
-		inline expr fmax(expr x, expr y) { return binary_specialized<expr,expr>::fmax(x, y); }
-
-		/// Minimum of half expressions.
-		/// \param x first operand
-		/// \param y second operand
-		/// \return minimum of operands
-//		template<typename T,typename U> typename result<T,U>::type fmin(T x, U y) { return binary_specialized<T,U>::fmin(x, y); }
-		inline half fmin(half x, half y) { return binary_specialized<half,half>::fmin(x, y); }
-		inline expr fmin(half x, expr y) { return binary_specialized<half,expr>::fmin(x, y); }
-		inline expr fmin(expr x, half y) { return binary_specialized<expr,half>::fmin(x, y); }
-		inline expr fmin(expr x, expr y) { return binary_specialized<expr,expr>::fmin(x, y); }
-
-		/// Positive difference.
-		/// \param x first operand
-		/// \param y second operand
-		/// \return \a x - \a y or 0 if difference negative
-//		template<typename T,typename U> typename enable<expr,T,U>::type fdim(T x, U y) { return functions::fdim(x, y); }
-		inline expr fdim(half x, half y) { return functions::fdim(x, y); }
-		inline expr fdim(half x, expr y) { return functions::fdim(x, y); }
-		inline expr fdim(expr x, half y) { return functions::fdim(x, y); }
-		inline expr fdim(expr x, expr y) { return functions::fdim(x, y); }
-
-		/// Get NaN value.
-		/// \return quiet NaN
-		inline half nanh(const char*) { return functions::nanh(); }
-
-		/// \}
-		/// \name Exponential functions
-		/// \{
-
-		/// Exponential function.
-		/// \param arg function argument
-		/// \return e raised to \a arg
-//		template<typename T> typename enable<expr,T>::type exp(T arg) { return functions::exp(arg); }
-		inline expr exp(half arg) { return functions::exp(arg); }
-		inline expr exp(expr arg) { return functions::exp(arg); }
-
-		/// Exponential minus one.
-		/// \param arg function argument
-		/// \return e raised to \a arg subtracted by 1
-//		template<typename T> typename enable<expr,T>::type expm1(T arg) { return functions::expm1(arg); }
-		inline expr expm1(half arg) { return functions::expm1(arg); }
-		inline expr expm1(expr arg) { return functions::expm1(arg); }
-
-		/// Binary exponential.
-		/// \param arg function argument
-		/// \return 2 raised to \a arg
-//		template<typename T> typename enable<expr,T>::type exp2(T arg) { return functions::exp2(arg); }
-		inline expr exp2(half arg) { return functions::exp2(arg); }
-		inline expr exp2(expr arg) { return functions::exp2(arg); }
-
-		/// Natural logorithm.
-		/// \param arg function argument
-		/// \return logarithm of \a arg to base e
-//		template<typename T> typename enable<expr,T>::type log(T arg) { return functions::log(arg); }
-		inline expr log(half arg) { return functions::log(arg); }
-		inline expr log(expr arg) { return functions::log(arg); }
-
-		/// Common logorithm.
-		/// \param arg function argument
-		/// \return logarithm of \a arg to base 10
-//		template<typename T> typename enable<expr,T>::type log10(T arg) { return functions::log10(arg); }
-		inline expr log10(half arg) { return functions::log10(arg); }
-		inline expr log10(expr arg) { return functions::log10(arg); }
-
-		/// Natural logorithm.
-		/// \param arg function argument
-		/// \return logarithm of \a arg plus 1 to base e
-//		template<typename T> typename enable<expr,T>::type log1p(T arg) { return functions::log1p(arg); }
-		inline expr log1p(half arg) { return functions::log1p(arg); }
-		inline expr log1p(expr arg) { return functions::log1p(arg); }
-
-		/// Binary logorithm.
-		/// \param arg function argument
-		/// \return logarithm of \a arg to base 2
-//		template<typename T> typename enable<expr,T>::type log2(T arg) { return functions::log2(arg); }
-		inline expr log2(half arg) { return functions::log2(arg); }
-		inline expr log2(expr arg) { return functions::log2(arg); }
-
-		/// \}
-		/// \name Power functions
-		/// \{
-
-		/// Square root.
-		/// \param arg function argument
-		/// \return square root of \a arg
-//		template<typename T> typename enable<expr,T>::type sqrt(T arg) { return functions::sqrt(arg); }
-		inline expr sqrt(half arg) { return functions::sqrt(arg); }
-		inline expr sqrt(expr arg) { return functions::sqrt(arg); }
-
-		/// Cubic root.
-		/// \param arg function argument
-		/// \return cubic root of \a arg
-//		template<typename T> typename enable<expr,T>::type cbrt(T arg) { return functions::cbrt(arg); }
-		inline expr cbrt(half arg) { return functions::cbrt(arg); }
-		inline expr cbrt(expr arg) { return functions::cbrt(arg); }
-
-		/// Hypotenuse function.
-		/// \param x first argument
-		/// \param y second argument
-		/// \return square root of sum of squares without internal over- or underflows
-//		template<typename T,typename U> typename enable<expr,T,U>::type hypot(T x, U y) { return functions::hypot(x, y); }
-		inline expr hypot(half x, half y) { return functions::hypot(x, y); }
-		inline expr hypot(half x, expr y) { return functions::hypot(x, y); }
-		inline expr hypot(expr x, half y) { return functions::hypot(x, y); }
-		inline expr hypot(expr x, expr y) { return functions::hypot(x, y); }
-
-		/// Power function.
-		/// \param base first argument
-		/// \param exp second argument
-		/// \return \a base raised to \a exp
-//		template<typename T,typename U> typename enable<expr,T,U>::type pow(T base, U exp) { return functions::pow(base, exp); }
-		inline expr pow(half base, half exp) { return functions::pow(base, exp); }
-		inline expr pow(half base, expr exp) { return functions::pow(base, exp); }
-		inline expr pow(expr base, half exp) { return functions::pow(base, exp); }
-		inline expr pow(expr base, expr exp) { return functions::pow(base, exp); }
-
-		/// \}
-		/// \name Trigonometric functions
-		/// \{
-
-		/// Sine function.
-		/// \param arg function argument
-		/// \return sine value of \a arg
-//		template<typename T> typename enable<expr,T>::type sin(T arg) { return functions::sin(arg); }
-		inline expr sin(half arg) { return functions::sin(arg); }
-		inline expr sin(expr arg) { return functions::sin(arg); }
-
-		/// Cosine function.
-		/// \param arg function argument
-		/// \return cosine value of \a arg
-//		template<typename T> typename enable<expr,T>::type cos(T arg) { return functions::cos(arg); }
-		inline expr cos(half arg) { return functions::cos(arg); }
-		inline expr cos(expr arg) { return functions::cos(arg); }
-
-		/// Tangent function.
-		/// \param arg function argument
-		/// \return tangent value of \a arg
-//		template<typename T> typename enable<expr,T>::type tan(T arg) { return functions::tan(arg); }
-		inline expr tan(half arg) { return functions::tan(arg); }
-		inline expr tan(expr arg) { return functions::tan(arg); }
-
-		/// Arc sine.
-		/// \param arg function argument
-		/// \return arc sine value of \a arg
-//		template<typename T> typename enable<expr,T>::type asin(T arg) { return functions::asin(arg); }
-		inline expr asin(half arg) { return functions::asin(arg); }
-		inline expr asin(expr arg) { return functions::asin(arg); }
-
-		/// Arc cosine function.
-		/// \param arg function argument
-		/// \return arc cosine value of \a arg
-//		template<typename T> typename enable<expr,T>::type acos(T arg) { return functions::acos(arg); }
-		inline expr acos(half arg) { return functions::acos(arg); }
-		inline expr acos(expr arg) { return functions::acos(arg); }
-
-		/// Arc tangent function.
-		/// \param arg function argument
-		/// \return arc tangent value of \a arg
-//		template<typename T> typename enable<expr,T>::type atan(T arg) { return functions::atan(arg); }
-		inline expr atan(half arg) { return functions::atan(arg); }
-		inline expr atan(expr arg) { return functions::atan(arg); }
-
-		/// Arc tangent function.
-		/// \param x first argument
-		/// \param y second argument
-		/// \return arc tangent value
-//		template<typename T,typename U> typename enable<expr,T,U>::type atan2(T x, U y) { return functions::atan2(x, y); }
-		inline expr atan2(half x, half y) { return functions::atan2(x, y); }
-		inline expr atan2(half x, expr y) { return functions::atan2(x, y); }
-		inline expr atan2(expr x, half y) { return functions::atan2(x, y); }
-		inline expr atan2(expr x, expr y) { return functions::atan2(x, y); }
-
-		/// \}
-		/// \name Hyperbolic functions
-		/// \{
-
-		/// Hyperbolic sine.
-		/// \param arg function argument
-		/// \return hyperbolic sine value of \a arg
-//		template<typename T> typename enable<expr,T>::type sinh(T arg) { return functions::sinh(arg); }
-		inline expr sinh(half arg) { return functions::sinh(arg); }
-		inline expr sinh(expr arg) { return functions::sinh(arg); }
-
-		/// Hyperbolic cosine.
-		/// \param arg function argument
-		/// \return hyperbolic cosine value of \a arg
-//		template<typename T> typename enable<expr,T>::type cosh(T arg) { return functions::cosh(arg); }
-		inline expr cosh(half arg) { return functions::cosh(arg); }
-		inline expr cosh(expr arg) { return functions::cosh(arg); }
-
-		/// Hyperbolic tangent.
-		/// \param arg function argument
-		/// \return hyperbolic tangent value of \a arg
-//		template<typename T> typename enable<expr,T>::type tanh(T arg) { return functions::tanh(arg); }
-		inline expr tanh(half arg) { return functions::tanh(arg); }
-		inline expr tanh(expr arg) { return functions::tanh(arg); }
-
-		/// Hyperbolic area sine.
-		/// \param arg function argument
-		/// \return area sine value of \a arg
-//		template<typename T> typename enable<expr,T>::type asinh(T arg) { return functions::asinh(arg); }
-		inline expr asinh(half arg) { return functions::asinh(arg); }
-		inline expr asinh(expr arg) { return functions::asinh(arg); }
-
-		/// Hyperbolic area cosine.
-		/// \param arg function argument
-		/// \return area cosine value of \a arg
-//		template<typename T> typename enable<expr,T>::type acosh(T arg) { return functions::acosh(arg); }
-		inline expr acosh(half arg) { return functions::acosh(arg); }
-		inline expr acosh(expr arg) { return functions::acosh(arg); }
-
-		/// Hyperbolic area tangent.
-		/// \param arg function argument
-		/// \return area tangent value of \a arg
-//		template<typename T> typename enable<expr,T>::type atanh(T arg) { return functions::atanh(arg); }
-		inline expr atanh(half arg) { return functions::atanh(arg); }
-		inline expr atanh(expr arg) { return functions::atanh(arg); }
-
-		/// \}
-		/// \name Error and gamma functions
-		/// \{
-
-		/// Error function.
-		/// \param arg function argument
-		/// \return error function value of \a arg
-//		template<typename T> typename enable<expr,T>::type erf(T arg) { return functions::erf(arg); }
-		inline expr erf(half arg) { return functions::erf(arg); }
-		inline expr erf(expr arg) { return functions::erf(arg); }
-
-		/// Complementary error function.
-		/// \param arg function argument
-		/// \return 1 minus error function value of \a arg
-//		template<typename T> typename enable<expr,T>::type erfc(T arg) { return functions::erfc(arg); }
-		inline expr erfc(half arg) { return functions::erfc(arg); }
-		inline expr erfc(expr arg) { return functions::erfc(arg); }
-
-		/// Natural logarithm of gamma function.
-		/// \param arg function argument
-		/// \return natural logarith of gamma function for \a arg
-//		template<typename T> typename enable<expr,T>::type lgamma(T arg) { return functions::lgamma(arg); }
-		inline expr lgamma(half arg) { return functions::lgamma(arg); }
-		inline expr lgamma(expr arg) { return functions::lgamma(arg); }
-
-		/// Gamma function.
-		/// \param arg function argument
-		/// \return gamma function value of \a arg
-//		template<typename T> typename enable<expr,T>::type tgamma(T arg) { return functions::tgamma(arg); }
-		inline expr tgamma(half arg) { return functions::tgamma(arg); }
-		inline expr tgamma(expr arg) { return functions::tgamma(arg); }
-
-		/// \}
-		/// \name Rounding
-		/// \{
-
-		/// Nearest integer not less than half value.
-		/// \param arg half to round
-		/// \return nearest integer not less than \a arg
-//		template<typename T> typename enable<half,T>::type ceil(T arg) { return functions::ceil(arg); }
-		inline half ceil(half arg) { return functions::ceil(arg); }
-		inline half ceil(expr arg) { return functions::ceil(arg); }
-
-		/// Nearest integer not greater than half value.
-		/// \param arg half to round
-		/// \return nearest integer not greater than \a arg
-//		template<typename T> typename enable<half,T>::type floor(T arg) { return functions::floor(arg); }
-		inline half floor(half arg) { return functions::floor(arg); }
-		inline half floor(expr arg) { return functions::floor(arg); }
-
-		/// Nearest integer not greater in magnitude than half value.
-		/// \param arg half to round
-		/// \return nearest integer not greater in magnitude than \a arg
-//		template<typename T> typename enable<half,T>::type trunc(T arg) { return functions::trunc(arg); }
-		inline half trunc(half arg) { return functions::trunc(arg); }
-		inline half trunc(expr arg) { return functions::trunc(arg); }
-
-		/// Nearest integer.
-		/// \param arg half to round
-		/// \return nearest integer, rounded away from zero in half-way cases
-//		template<typename T> typename enable<half,T>::type round(T arg) { return functions::round(arg); }
-		inline half round(half arg) { return functions::round(arg); }
-		inline half round(expr arg) { return functions::round(arg); }
-
-		/// Nearest integer.
-		/// \param arg half to round
-		/// \return nearest integer, rounded away from zero in half-way cases
-//		template<typename T> typename enable<long,T>::type lround(T arg) { return functions::lround(arg); }
-		inline long lround(half arg) { return functions::lround(arg); }
-		inline long lround(expr arg) { return functions::lround(arg); }
-
-		/// Nearest integer using half's internal rounding mode.
-		/// \param arg half expression to round
-		/// \return nearest integer using default rounding mode
-//		template<typename T> typename enable<half,T>::type nearbyint(T arg) { return functions::nearbyint(arg); }
-		inline half nearbyint(half arg) { return functions::rint(arg); }
-		inline half nearbyint(expr arg) { return functions::rint(arg); }
-
-		/// Nearest integer using half's internal rounding mode.
-		/// \param arg half expression to round
-		/// \return nearest integer using default rounding mode
-//		template<typename T> typename enable<half,T>::type rint(T arg) { return functions::rint(arg); }
-		inline half rint(half arg) { return functions::rint(arg); }
-		inline half rint(expr arg) { return functions::rint(arg); }
-
-		/// Nearest integer using half's internal rounding mode.
-		/// \param arg half expression to round
-		/// \return nearest integer using default rounding mode
-//		template<typename T> typename enable<long,T>::type lrint(T arg) { return functions::lrint(arg); }
-		inline long lrint(half arg) { return functions::lrint(arg); }
-		inline long lrint(expr arg) { return functions::lrint(arg); }
-	#if HALF_ENABLE_CPP11_LONG_LONG
-		/// Nearest integer.
-		/// \param arg half to round
-		/// \return nearest integer, rounded away from zero in half-way cases
-//		template<typename T> typename enable<long long,T>::type llround(T arg) { return functions::llround(arg); }
-		inline long long llround(half arg) { return functions::llround(arg); }
-		inline long long llround(expr arg) { return functions::llround(arg); }
-
-		/// Nearest integer using half's internal rounding mode.
-		/// \param arg half expression to round
-		/// \return nearest integer using default rounding mode
-//		template<typename T> typename enable<long long,T>::type llrint(T arg) { return functions::llrint(arg); }
-		inline long long llrint(half arg) { return functions::llrint(arg); }
-		inline long long llrint(expr arg) { return functions::llrint(arg); }
-	#endif
-
-		/// \}
-		/// \name Floating point manipulation
-		/// \{
-
-		/// Decompress floating point number.
-		/// \param arg number to decompress
-		/// \param exp address to store exponent at
-		/// \return significant in range [0.5, 1)
-//		template<typename T> typename enable<half,T>::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); }
-		inline half frexp(half arg, int *exp) { return functions::frexp(arg, exp); }
-		inline half frexp(expr arg, int *exp) { return functions::frexp(arg, exp); }
-
-		/// Multiply by power of two.
-		/// \param arg number to modify
-		/// \param exp power of two to multiply with
-		/// \return \a arg multplied by 2 raised to \a exp
-//		template<typename T> typename enable<half,T>::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); }
-		inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); }
-		inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); }
-
-		/// Extract integer and fractional parts.
-		/// \param arg number to decompress
-		/// \param iptr address to store integer part at
-		/// \return fractional part
-//		template<typename T> typename enable<half,T>::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); }
-		inline half modf(half arg, half *iptr) { return functions::modf(arg, iptr); }
-		inline half modf(expr arg, half *iptr) { return functions::modf(arg, iptr); }
-
-		/// Multiply by power of two.
-		/// \param arg number to modify
-		/// \param exp power of two to multiply with
-		/// \return \a arg multplied by 2 raised to \a exp
-//		template<typename T> typename enable<half,T>::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); }
-		inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); }
-		inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); }
-
-		/// Multiply by power of two.
-		/// \param arg number to modify
-		/// \param exp power of two to multiply with
-		/// \return \a arg multplied by 2 raised to \a exp	
-//		template<typename T> typename enable<half,T>::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); }
-		inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); }
-		inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); }
-
-		/// Extract exponent.
-		/// \param arg number to query
-		/// \return floating point exponent
-		/// \retval FP_ILOGB0 for zero
-		/// \retval FP_ILOGBNAN for NaN
-		/// \retval MAX_INT for infinity
-//		template<typename T> typename enable<int,T>::type ilogb(T arg) { return functions::ilogb(arg); }
-		inline int ilogb(half arg) { return functions::ilogb(arg); }
-		inline int ilogb(expr arg) { return functions::ilogb(arg); }
-
-		/// Extract exponent.
-		/// \param arg number to query
-		/// \return floating point exponent
-//		template<typename T> typename enable<half,T>::type logb(T arg) { return functions::logb(arg); }
-		inline half logb(half arg) { return functions::logb(arg); }
-		inline half logb(expr arg) { return functions::logb(arg); }
-
-		/// Next representable value.
-		/// \param from value to compute next representable value for
-		/// \param to direction towards which to compute next value
-		/// \return next representable value after \a from in direction towards \a to
-//		template<typename T,typename U> typename enable<half,T,U>::type nextafter(T from, U to) { return functions::nextafter(from, to); }
-		inline half nextafter(half from, half to) { return functions::nextafter(from, to); }
-		inline half nextafter(half from, expr to) { return functions::nextafter(from, to); }
-		inline half nextafter(expr from, half to) { return functions::nextafter(from, to); }
-		inline half nextafter(expr from, expr to) { return functions::nextafter(from, to); }
-
-		/// Next representable value.
-		/// \param from value to compute next representable value for
-		/// \param to direction towards which to compute next value
-		/// \return next representable value after \a from in direction towards \a to
-//		template<typename T> typename enable<half,T>::type nexttoward(T from, long double to) { return functions::nexttoward(from, to); }
-		inline half nexttoward(half from, long double to) { return functions::nexttoward(from, to); }
-		inline half nexttoward(expr from, long double to) { return functions::nexttoward(from, to); }
-
-		/// Take sign.
-		/// \param x value to change sign for
-		/// \param y value to take sign from
-		/// \return value equal to \a x in magnitude and to \a y in sign
-//		template<typename T,typename U> typename enable<half,T,U>::type copysign(T x, U y) { return functions::copysign(x, y); }
-		inline half copysign(half x, half y) { return functions::copysign(x, y); }
-		inline half copysign(half x, expr y) { return functions::copysign(x, y); }
-		inline half copysign(expr x, half y) { return functions::copysign(x, y); }
-		inline half copysign(expr x, expr y) { return functions::copysign(x, y); }
-
-		/// \}
-		/// \name Floating point classification
-		/// \{
-
-
-		/// Classify floating point value.
-		/// \param arg number to classify
-		/// \retval FP_ZERO for positive and negative zero
-		/// \retval FP_SUBNORMAL for subnormal numbers
-		/// \retval FP_INFINITY for positive and negative infinity
-		/// \retval FP_NAN for NaNs
-		/// \retval FP_NORMAL for all other (normal) values
-//		template<typename T> typename enable<int,T>::type fpclassify(T arg) { return functions::fpclassify(arg); }
-		inline int fpclassify(half arg) { return functions::fpclassify(arg); }
-		inline int fpclassify(expr arg) { return functions::fpclassify(arg); }
-
-		/// Check if finite number.
-		/// \param arg number to check
-		/// \retval true if neither infinity nor NaN
-		/// \retval false else
-//		template<typename T> typename enable<bool,T>::type isfinite(T arg) { return functions::isfinite(arg); }
-		inline bool isfinite(half arg) { return functions::isfinite(arg); }
-		inline bool isfinite(expr arg) { return functions::isfinite(arg); }
-
-		/// Check for infinity.
-		/// \param arg number to check
-		/// \retval true for positive or negative infinity
-		/// \retval false else
-//		template<typename T> typename enable<bool,T>::type isinf(T arg) { return functions::isinf(arg); }
-		inline bool isinf(half arg) { return functions::isinf(arg); }
-		inline bool isinf(expr arg) { return functions::isinf(arg); }
-
-		/// Check for NaN.
-		/// \param arg number to check
-		/// \retval true for NaNs
-		/// \retval false else
-//		template<typename T> typename enable<bool,T>::type isnan(T arg) { return functions::isnan(arg); }
-		inline bool isnan(half arg) { return functions::isnan(arg); }
-		inline bool isnan(expr arg) { return functions::isnan(arg); }
-
-		/// Check if normal number.
-		/// \param arg number to check
-		/// \retval true if normal number
-		/// \retval false if either subnormal, zero, infinity or NaN
-//		template<typename T> typename enable<bool,T>::type isnormal(T arg) { return functions::isnormal(arg); }
-		inline bool isnormal(half arg) { return functions::isnormal(arg); }
-		inline bool isnormal(expr arg) { return functions::isnormal(arg); }
-
-		/// Check sign.
-		/// \param arg number to check
-		/// \retval true for negative number
-		/// \retval false for positive number
-//		template<typename T> typename enable<bool,T>::type signbit(T arg) { return functions::signbit(arg); }
-		inline bool signbit(half arg) { return functions::signbit(arg); }
-		inline bool signbit(expr arg) { return functions::signbit(arg); }
-
-		/// \}
-		/// \name Comparison
-		/// \{
-
-		/// Comparison for greater than.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x greater than \a y
-		/// \retval false else
-//		template<typename T,typename U> typename enable<bool,T,U>::type isgreater(T x, U y) { return functions::isgreater(x, y); }
-		inline bool isgreater(half x, half y) { return functions::isgreater(x, y); }
-		inline bool isgreater(half x, expr y) { return functions::isgreater(x, y); }
-		inline bool isgreater(expr x, half y) { return functions::isgreater(x, y); }
-		inline bool isgreater(expr x, expr y) { return functions::isgreater(x, y); }
-
-		/// Comparison for greater equal.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x greater equal \a y
-		/// \retval false else
-//		template<typename T,typename U> typename enable<bool,T,U>::type isgreaterequal(T x, U y) { return functions::isgreaterequal(x, y); }
-		inline bool isgreaterequal(half x, half y) { return functions::isgreaterequal(x, y); }
-		inline bool isgreaterequal(half x, expr y) { return functions::isgreaterequal(x, y); }
-		inline bool isgreaterequal(expr x, half y) { return functions::isgreaterequal(x, y); }
-		inline bool isgreaterequal(expr x, expr y) { return functions::isgreaterequal(x, y); }
-
-		/// Comparison for less than.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x less than \a y
-		/// \retval false else
-//		template<typename T,typename U> typename enable<bool,T,U>::type isless(T x, U y) { return functions::isless(x, y); }
-		inline bool isless(half x, half y) { return functions::isless(x, y); }
-		inline bool isless(half x, expr y) { return functions::isless(x, y); }
-		inline bool isless(expr x, half y) { return functions::isless(x, y); }
-		inline bool isless(expr x, expr y) { return functions::isless(x, y); }
-
-		/// Comparison for less equal.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if \a x less equal \a y
-		/// \retval false else
-//		template<typename T,typename U> typename enable<bool,T,U>::type islessequal(T x, U y) { return functions::islessequal(x, y); }
-		inline bool islessequal(half x, half y) { return functions::islessequal(x, y); }
-		inline bool islessequal(half x, expr y) { return functions::islessequal(x, y); }
-		inline bool islessequal(expr x, half y) { return functions::islessequal(x, y); }
-		inline bool islessequal(expr x, expr y) { return functions::islessequal(x, y); }
-
-		/// Comarison for less or greater.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if either less or greater
-		/// \retval false else
-//		template<typename T,typename U> typename enable<bool,T,U>::type islessgreater(T x, U y) { return functions::islessgreater(x, y); }
-		inline bool islessgreater(half x, half y) { return functions::islessgreater(x, y); }
-		inline bool islessgreater(half x, expr y) { return functions::islessgreater(x, y); }
-		inline bool islessgreater(expr x, half y) { return functions::islessgreater(x, y); }
-		inline bool islessgreater(expr x, expr y) { return functions::islessgreater(x, y); }
-
-		/// Check if unordered.
-		/// \param x first operand
-		/// \param y second operand
-		/// \retval true if unordered (one or two NaN operands)
-		/// \retval false else
-//		template<typename T,typename U> typename enable<bool,T,U>::type isunordered(T x, U y) { return functions::isunordered(x, y); }
-		inline bool isunordered(half x, half y) { return functions::isunordered(x, y); }
-		inline bool isunordered(half x, expr y) { return functions::isunordered(x, y); }
-		inline bool isunordered(expr x, half y) { return functions::isunordered(x, y); }
-		inline bool isunordered(expr x, expr y) { return functions::isunordered(x, y); }
-
-		/// \name Casting
-		/// \{
-
-		/// Cast to or from half-precision floating point number.
-		/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
-		/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. 
-		/// It uses the default rounding mode.
-		///
-		/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
-		/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
-		/// error and casting between [half](\ref half_float::half)s is just a no-op.
-		/// \tparam T destination type (half or built-in arithmetic type)
-		/// \tparam U source type (half or built-in arithmetic type)
-		/// \param arg value to cast
-		/// \return \a arg converted to destination type
-		template<typename T,typename U> T half_cast(U arg) { return half_caster<T,U>::cast(arg); }
-
-		/// Cast to or from half-precision floating point number.
-		/// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted 
-		/// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do.
-		///
-		/// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types 
-		/// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler 
-		/// error and casting between [half](\ref half_float::half)s is just a no-op.
-		/// \tparam T destination type (half or built-in arithmetic type)
-		/// \tparam R rounding mode to use.
-		/// \tparam U source type (half or built-in arithmetic type)
-		/// \param arg value to cast
-		/// \return \a arg converted to destination type
-		template<typename T,std::float_round_style R,typename U> T half_cast(U arg) { return half_caster<T,U,R>::cast(arg); }
-		/// \}
-	}
-
-	using detail::operator==;
-	using detail::operator!=;
-	using detail::operator<;
-	using detail::operator>;
-	using detail::operator<=;
-	using detail::operator>=;
-	using detail::operator+;
-	using detail::operator-;
-	using detail::operator*;
-	using detail::operator/;
-	using detail::operator<<;
-	using detail::operator>>;
-
-	using detail::abs;
-	using detail::fabs;
-	using detail::fmod;
-	using detail::remainder;
-	using detail::remquo;
-	using detail::fma;
-	using detail::fmax;
-	using detail::fmin;
-	using detail::fdim;
-	using detail::nanh;
-	using detail::exp;
-	using detail::expm1;
-	using detail::exp2;
-	using detail::log;
-	using detail::log10;
-	using detail::log1p;
-	using detail::log2;
-	using detail::sqrt;
-	using detail::cbrt;
-	using detail::hypot;
-	using detail::pow;
-	using detail::sin;
-	using detail::cos;
-	using detail::tan;
-	using detail::asin;
-	using detail::acos;
-	using detail::atan;
-	using detail::atan2;
-	using detail::sinh;
-	using detail::cosh;
-	using detail::tanh;
-	using detail::asinh;
-	using detail::acosh;
-	using detail::atanh;
-	using detail::erf;
-	using detail::erfc;
-	using detail::lgamma;
-	using detail::tgamma;
-	using detail::ceil;
-	using detail::floor;
-	using detail::trunc;
-	using detail::round;
-	using detail::lround;
-	using detail::nearbyint;
-	using detail::rint;
-	using detail::lrint;
-#if HALF_ENABLE_CPP11_LONG_LONG
-	using detail::llround;
-	using detail::llrint;
-#endif
-	using detail::frexp;
-	using detail::ldexp;
-	using detail::modf;
-	using detail::scalbn;
-	using detail::scalbln;
-	using detail::ilogb;
-	using detail::logb;
-	using detail::nextafter;
-	using detail::nexttoward;
-	using detail::copysign;
-	using detail::fpclassify;
-	using detail::isfinite;
-	using detail::isinf;
-	using detail::isnan;
-	using detail::isnormal;
-	using detail::signbit;
-	using detail::isgreater;
-	using detail::isgreaterequal;
-	using detail::isless;
-	using detail::islessequal;
-	using detail::islessgreater;
-	using detail::isunordered;
-
-	using detail::half_cast;
-}
-
-
-/// Extensions to the C++ standard library.
-namespace std
-{
-	/// Numeric limits for half-precision floats.
-	/// Because of the underlying single-precision implementation of many operations, it inherits some properties from 
-	/// `std::numeric_limits<float>`.
-	template<> class numeric_limits<half_float::half> : public numeric_limits<float>
-	{
-	public:
-		/// Supports signed values.
-		static HALF_CONSTEXPR_CONST bool is_signed = true;
-
-		/// Is not exact.
-		static HALF_CONSTEXPR_CONST bool is_exact = false;
-
-		/// Doesn't provide modulo arithmetic.
-		static HALF_CONSTEXPR_CONST bool is_modulo = false;
-
-		/// IEEE conformant.
-		static HALF_CONSTEXPR_CONST bool is_iec559 = true;
-
-		/// Supports infinity.
-		static HALF_CONSTEXPR_CONST bool has_infinity = true;
-
-		/// Supports quiet NaNs.
-		static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true;
-
-		/// Supports subnormal values.
-		static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present;
-
-		/// Rounding mode.
-		/// Due to the mix of internal single-precision computations (using the rounding mode of the underlying 
-		/// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding 
-		/// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the 
-		/// single-precision rounding mode.
-		static HALF_CONSTEXPR_CONST float_round_style round_style = (std::numeric_limits<float>::round_style==
-			half_float::half::round_style) ? half_float::half::round_style : round_indeterminate;
-
-		/// Significant digits.
-		static HALF_CONSTEXPR_CONST int digits = 11;
-
-		/// Significant decimal digits.
-		static HALF_CONSTEXPR_CONST int digits10 = 3;
-
-		/// Required decimal digits to represent all possible values.
-		static HALF_CONSTEXPR_CONST int max_digits10 = 5;
-
-		/// Number base.
-		static HALF_CONSTEXPR_CONST int radix = 2;
-
-		/// One more than smallest exponent.
-		static HALF_CONSTEXPR_CONST int min_exponent = -13;
-
-		/// Smallest normalized representable power of 10.
-		static HALF_CONSTEXPR_CONST int min_exponent10 = -4;
-
-		/// One more than largest exponent
-		static HALF_CONSTEXPR_CONST int max_exponent = 16;
-
-		/// Largest finitely representable power of 10.
-		static HALF_CONSTEXPR_CONST int max_exponent10 = 4;
-
-		/// Smallest positive normal value.
-		static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); }
-
-		/// Smallest finite value.
-		static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); }
-
-		/// Largest finite value.
-		static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); }
-
-		/// Difference between one and next representable value.
-		static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); }
-
-		/// Maximum rounding error.
-		static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW
-			{ return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); }
-
-		/// Positive infinity.
-		static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); }
-
-		/// Quiet NaN.
-		static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); }
-
-		/// Signalling NaN.
-		static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); }
-
-		/// Smallest positive subnormal value.
-		static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); }
-	};
-
-#if HALF_ENABLE_CPP11_HASH
-	/// Hash function for half-precision floats.
-	/// This is only defined if C++11 `std::hash` is supported and enabled.
-	template<> struct hash<half_float::half> //: unary_function<half_float::half,size_t>
-	{
-		/// Type of function argument.
-		typedef half_float::half argument_type;
-
-		/// Function return type.
-		typedef size_t result_type;
-
-		/// Compute hash function.
-		/// \param arg half to hash
-		/// \return hash value
-		result_type operator()(argument_type arg) const
-			{ return hash<half_float::detail::uint16>()(static_cast<unsigned>(arg.data_)&-(arg.data_!=0x8000)); }
-	};
-#endif
-}
-
-
-#undef HALF_CONSTEXPR
-#undef HALF_CONSTEXPR_CONST
-#undef HALF_NOEXCEPT
-#undef HALF_NOTHROW
-#ifdef HALF_POP_WARNINGS
-	#pragma warning(pop)
-	#undef HALF_POP_WARNINGS
-#endif
-
-#endif
diff --git a/include/triton/external/hip.h b/include/triton/external/hip.h
deleted file mode 100644
index a295eed68..000000000
--- a/include/triton/external/hip.h
+++ /dev/null
@@ -1,293 +0,0 @@
-#ifndef __external_hip_h__
-#define __external_hip_h__
-
-/*
- * @brief hipError_t
- * @enum
- * @ingroup Enumerations
- */
-// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
-// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path.
-
-// Ignoring error-code return values from hip APIs is discouraged. On C++17,
-// we can make that yield a warning
-
-/*
- * @brief hipError_t
- * @enum
- * @ingroup Enumerations
- */
-// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
-// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path.
-
-#include <cstddef>
-
-typedef enum hipError_t {
-    hipSuccess = 0,  ///< Successful completion.
-    hipErrorInvalidValue = 1,  ///< One or more of the parameters passed to the API call is NULL
-                               ///< or not in an acceptable range.
-    hipErrorOutOfMemory = 2,
-    // Deprecated
-    hipErrorMemoryAllocation = 2,  ///< Memory allocation error.
-    hipErrorNotInitialized = 3,
-    // Deprecated
-    hipErrorInitializationError = 3,
-    hipErrorDeinitialized = 4,
-    hipErrorProfilerDisabled = 5,
-    hipErrorProfilerNotInitialized = 6,
-    hipErrorProfilerAlreadyStarted = 7,
-    hipErrorProfilerAlreadyStopped = 8,
-    hipErrorInvalidConfiguration = 9,
-    hipErrorInvalidPitchValue = 12,
-    hipErrorInvalidSymbol = 13,
-    hipErrorInvalidDevicePointer = 17,  ///< Invalid Device Pointer
-    hipErrorInvalidMemcpyDirection = 21,  ///< Invalid memory copy direction
-    hipErrorInsufficientDriver = 35,
-    hipErrorMissingConfiguration = 52,
-    hipErrorPriorLaunchFailure = 53,
-    hipErrorInvalidDeviceFunction = 98,
-    hipErrorNoDevice = 100,  ///< Call to hipGetDeviceCount returned 0 devices
-    hipErrorInvalidDevice = 101,  ///< DeviceID must be in range 0...#compute-devices.
-    hipErrorInvalidImage = 200,
-    hipErrorInvalidContext = 201,  ///< Produced when input context is invalid.
-    hipErrorContextAlreadyCurrent = 202,
-    hipErrorMapFailed = 205,
-    // Deprecated
-    hipErrorMapBufferObjectFailed = 205,  ///< Produced when the IPC memory attach failed from ROCr.
-    hipErrorUnmapFailed = 206,
-    hipErrorArrayIsMapped = 207,
-    hipErrorAlreadyMapped = 208,
-    hipErrorNoBinaryForGpu = 209,
-    hipErrorAlreadyAcquired = 210,
-    hipErrorNotMapped = 211,
-    hipErrorNotMappedAsArray = 212,
-    hipErrorNotMappedAsPointer = 213,
-    hipErrorECCNotCorrectable = 214,
-    hipErrorUnsupportedLimit = 215,
-    hipErrorContextAlreadyInUse = 216,
-    hipErrorPeerAccessUnsupported = 217,
-    hipErrorInvalidKernelFile = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
-    hipErrorInvalidGraphicsContext = 219,
-    hipErrorInvalidSource = 300,
-    hipErrorFileNotFound = 301,
-    hipErrorSharedObjectSymbolNotFound = 302,
-    hipErrorSharedObjectInitFailed = 303,
-    hipErrorOperatingSystem = 304,
-    hipErrorInvalidHandle = 400,
-    // Deprecated
-    hipErrorInvalidResourceHandle = 400,  ///< Resource handle (hipEvent_t or hipStream_t) invalid.
-    hipErrorNotFound = 500,
-    hipErrorNotReady = 600,  ///< Indicates that asynchronous operations enqueued earlier are not
-                             ///< ready.  This is not actually an error, but is used to distinguish
-                             ///< from hipSuccess (which indicates completion).  APIs that return
-                             ///< this error include hipEventQuery and hipStreamQuery.
-    hipErrorIllegalAddress = 700,
-    hipErrorLaunchOutOfResources = 701,  ///< Out of resources error.
-    hipErrorLaunchTimeOut = 702,
-    hipErrorPeerAccessAlreadyEnabled =
-        704,  ///< Peer access was already enabled from the current device.
-    hipErrorPeerAccessNotEnabled =
-        705,  ///< Peer access was never enabled from the current device.
-    hipErrorSetOnActiveProcess = 708,
-    hipErrorAssert = 710,  ///< Produced when the kernel calls assert.
-    hipErrorHostMemoryAlreadyRegistered =
-        712,  ///< Produced when trying to lock a page-locked memory.
-    hipErrorHostMemoryNotRegistered =
-        713,  ///< Produced when trying to unlock a non-page-locked memory.
-    hipErrorLaunchFailure =
-        719,  ///< An exception occurred on the device while executing a kernel.
-    hipErrorCooperativeLaunchTooLarge =
-        720,  ///< This error indicates that the number of blocks launched per grid for a kernel
-              ///< that was launched via cooperative launch APIs exceeds the maximum number of
-              ///< allowed blocks for the current device
-    hipErrorNotSupported = 801,  ///< Produced when the hip API is not supported/implemented
-    hipErrorUnknown = 999,  //< Unknown error.
-    // HSA Runtime Error Codes start here.
-    hipErrorRuntimeMemory = 1052,  ///< HSA runtime memory call returned error.  Typically not seen
-                                   ///< in production systems.
-    hipErrorRuntimeOther = 1053,  ///< HSA runtime call other than memory returned error.  Typically
-                                  ///< not seen in production systems.
-    hipErrorTbd  ///< Marker that more error codes are needed.
-} hipError_t;
-
-
-typedef struct ihipCtx_t* hipCtx_t;
-
-// Note many APIs also use integer deviceIds as an alternative to the device pointer:
-typedef int hipDevice_t;
-
-typedef enum hipDeviceP2PAttr {
-  hipDevP2PAttrPerformanceRank = 0,
-  hipDevP2PAttrAccessSupported,
-  hipDevP2PAttrNativeAtomicSupported,
-  hipDevP2PAttrHipArrayAccessSupported
-} hipDeviceP2PAttr;
-
-typedef struct ihipStream_t* hipStream_t;
-
-#define hipIpcMemLazyEnablePeerAccess 0
-
-#define HIP_IPC_HANDLE_SIZE 64
-
-typedef struct hipIpcMemHandle_st {
-    char reserved[HIP_IPC_HANDLE_SIZE];
-} hipIpcMemHandle_t;
-
-typedef struct hipIpcEventHandle_st {
-    char reserved[HIP_IPC_HANDLE_SIZE];
-} hipIpcEventHandle_t;
-
-typedef struct ihipModule_t* hipModule_t;
-
-typedef struct ihipModuleSymbol_t* hipFunction_t;
-
-typedef struct hipFuncAttributes {
-    int binaryVersion;
-    int cacheModeCA;
-    size_t constSizeBytes;
-    size_t localSizeBytes;
-    int maxDynamicSharedSizeBytes;
-    int maxThreadsPerBlock;
-    int numRegs;
-    int preferredShmemCarveout;
-    int ptxVersion;
-    size_t sharedSizeBytes;
-} hipFuncAttributes;
-
-typedef struct ihipEvent_t* hipEvent_t;
-
-/*
- * @brief hipDeviceAttribute_t
- * @enum
- * @ingroup Enumerations
- */
-typedef enum hipDeviceAttribute_t {
-    hipDeviceAttributeMaxThreadsPerBlock,       ///< Maximum number of threads per block.
-    hipDeviceAttributeMaxBlockDimX,             ///< Maximum x-dimension of a block.
-    hipDeviceAttributeMaxBlockDimY,             ///< Maximum y-dimension of a block.
-    hipDeviceAttributeMaxBlockDimZ,             ///< Maximum z-dimension of a block.
-    hipDeviceAttributeMaxGridDimX,              ///< Maximum x-dimension of a grid.
-    hipDeviceAttributeMaxGridDimY,              ///< Maximum y-dimension of a grid.
-    hipDeviceAttributeMaxGridDimZ,              ///< Maximum z-dimension of a grid.
-    hipDeviceAttributeMaxSharedMemoryPerBlock,  ///< Maximum shared memory available per block in
-                                                ///< bytes.
-    hipDeviceAttributeTotalConstantMemory,      ///< Constant memory size in bytes.
-    hipDeviceAttributeWarpSize,                 ///< Warp size in threads.
-    hipDeviceAttributeMaxRegistersPerBlock,  ///< Maximum number of 32-bit registers available to a
-                                             ///< thread block. This number is shared by all thread
-                                             ///< blocks simultaneously resident on a
-                                             ///< multiprocessor.
-    hipDeviceAttributeClockRate,             ///< Peak clock frequency in kilohertz.
-    hipDeviceAttributeMemoryClockRate,       ///< Peak memory clock frequency in kilohertz.
-    hipDeviceAttributeMemoryBusWidth,        ///< Global memory bus width in bits.
-    hipDeviceAttributeMultiprocessorCount,   ///< Number of multiprocessors on the device.
-    hipDeviceAttributeComputeMode,           ///< Compute mode that device is currently in.
-    hipDeviceAttributeL2CacheSize,  ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
-                                    ///< cache.
-    hipDeviceAttributeMaxThreadsPerMultiProcessor,  ///< Maximum resident threads per
-                                                    ///< multiprocessor.
-    hipDeviceAttributeComputeCapabilityMajor,       ///< Major compute capability version number.
-    hipDeviceAttributeComputeCapabilityMinor,       ///< Minor compute capability version number.
-    hipDeviceAttributeConcurrentKernels,  ///< Device can possibly execute multiple kernels
-                                          ///< concurrently.
-    hipDeviceAttributePciBusId,           ///< PCI Bus ID.
-    hipDeviceAttributePciDeviceId,        ///< PCI Device ID.
-    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,  ///< Maximum Shared Memory Per
-                                                         ///< Multiprocessor.
-    hipDeviceAttributeIsMultiGpuBoard,                   ///< Multiple GPU devices.
-    hipDeviceAttributeIntegrated,                        ///< iGPU
-    hipDeviceAttributeCooperativeLaunch,                 ///< Support cooperative launch
-    hipDeviceAttributeCooperativeMultiDeviceLaunch,      ///< Support cooperative launch on multiple devices
-    hipDeviceAttributeMaxTexture1DWidth,    ///< Maximum number of elements in 1D images
-    hipDeviceAttributeMaxTexture2DWidth,    ///< Maximum dimension width of 2D images in image elements
-    hipDeviceAttributeMaxTexture2DHeight,   ///< Maximum dimension height of 2D images in image elements
-    hipDeviceAttributeMaxTexture3DWidth,    ///< Maximum dimension width of 3D images in image elements
-    hipDeviceAttributeMaxTexture3DHeight,   ///< Maximum dimensions height of 3D images in image elements
-    hipDeviceAttributeMaxTexture3DDepth,    ///< Maximum dimensions depth of 3D images in image elements
-
-    hipDeviceAttributeHdpMemFlushCntl,      ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
-    hipDeviceAttributeHdpRegFlushCntl,      ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
-
-    hipDeviceAttributeMaxPitch,             ///< Maximum pitch in bytes allowed by memory copies
-    hipDeviceAttributeTextureAlignment,     ///<Alignment requirement for textures
-    hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
-    hipDeviceAttributeKernelExecTimeout,    ///<Run time limit for kernels executed on the device
-    hipDeviceAttributeCanMapHostMemory,     ///<Device can map host memory into device address space
-    hipDeviceAttributeEccEnabled,           ///<Device has ECC support enabled
-
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,        ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched functions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,     ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched grid dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,    ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched block dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,   ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched shared memories
-    hipDeviceAttributeAsicRevision,         ///< Revision of the GPU in this device
-    hipDeviceAttributeManagedMemory,        ///< Device supports allocating managed memory on this system
-    hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
-                                                      /// the device without migration
-    hipDeviceAttributeConcurrentManagedAccess,  ///< Device can coherently access managed memory
-                                                /// concurrently with the CPU
-    hipDeviceAttributePageableMemoryAccess,     ///< Device supports coherently accessing pageable memory
-                                                /// without calling hipHostRegister on it
-    hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
-                                                              /// the host's page tables
-    hipDeviceAttributeCanUseStreamWaitValue ///< '1' if Device supports hipStreamWaitValue32() and
-                                            ///< hipStreamWaitValue64() , '0' otherwise.
-
-} hipDeviceAttribute_t;
-
-typedef void* hipDeviceptr_t;
-
-/*
- * @brief hipJitOption
- * @enum
- * @ingroup Enumerations
- */
-typedef enum hipJitOption {
-    hipJitOptionMaxRegisters = 0,
-    hipJitOptionThreadsPerBlock,
-    hipJitOptionWallTime,
-    hipJitOptionInfoLogBuffer,
-    hipJitOptionInfoLogBufferSizeBytes,
-    hipJitOptionErrorLogBuffer,
-    hipJitOptionErrorLogBufferSizeBytes,
-    hipJitOptionOptimizationLevel,
-    hipJitOptionTargetFromContext,
-    hipJitOptionTarget,
-    hipJitOptionFallbackStrategy,
-    hipJitOptionGenerateDebugInfo,
-    hipJitOptionLogVerbose,
-    hipJitOptionGenerateLineInfo,
-    hipJitOptionCacheMode,
-    hipJitOptionSm3xOpt,
-    hipJitOptionFastCompile,
-    hipJitOptionNumOptions
-} hipJitOption;
-
-/**
- * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
- */
-typedef enum hipFuncAttribute {
-    hipFuncAttributeMaxDynamicSharedMemorySize = 8,
-    hipFuncAttributePreferredSharedMemoryCarveout = 9,
-    hipFuncAttributeMax
-} hipFuncAttribute;
-
-/**
- * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
- */
-typedef enum hipFuncCache_t {
-    hipFuncCachePreferNone,    ///< no preference for shared memory or L1 (default)
-    hipFuncCachePreferShared,  ///< prefer larger shared memory and smaller L1 cache
-    hipFuncCachePreferL1,      ///< prefer larger L1 cache and smaller shared memory
-    hipFuncCachePreferEqual,   ///< prefer equal size L1 cache and shared memory
-} hipFuncCache_t;
-
-
-#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
-#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
-#define HIP_LAUNCH_PARAM_END ((void*)0x03)
-
-#endif
\ No newline at end of file
diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp
deleted file mode 100644
index 258e06933..000000000
--- a/include/triton/tools/bench.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_TOOLS_BENCH_H_
-#define _TRITON_TOOLS_BENCH_H_
-
-#include "triton/driver/device.h"
-#include "triton/driver/stream.h"
-#include <algorithm>
-#include <chrono>
-#include <functional>
-
-namespace triton {
-namespace tools {
-
-class timer {
-  typedef std::chrono::high_resolution_clock high_resolution_clock;
-  typedef std::chrono::nanoseconds nanoseconds;
-
-public:
-  explicit timer(bool run = false) {
-    if (run)
-      start();
-  }
-
-  void start() { _start = high_resolution_clock::now(); }
-
-  nanoseconds get() const {
-    return std::chrono::duration_cast<nanoseconds>(
-        high_resolution_clock::now() - _start);
-  }
-
-private:
-  high_resolution_clock::time_point _start;
-};
-
-inline double bench(std::function<void()> const &op, driver::stream *stream,
-                    size_t warmup = 10, size_t repeat = 200) {
-  timer tmr;
-  std::vector<size_t> times;
-  double total_time = 0;
-  for (size_t i = 0; i < warmup; i++)
-    op();
-  stream->synchronize();
-  tmr.start();
-  for (size_t i = 0; i < repeat; i++) {
-    op();
-  }
-  stream->synchronize();
-  return (float)tmr.get().count() / repeat;
-
-  //  return *std::min_element(times.begin(), times.end());
-}
-
-} // namespace tools
-} // namespace triton
-
-#endif
diff --git a/include/triton/tools/graph.h b/include/triton/tools/graph.h
deleted file mode 100644
index 3725eb091..000000000
--- a/include/triton/tools/graph.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_TOOLS_THREAD_GRAPH_H_
-#define _TRITON_TOOLS_THREAD_GRAPH_H_
-
-#include <iostream>
-#include <map>
-#include <set>
-#include <vector>
-
-namespace triton {
-namespace tools {
-
-template <class node_t> class graph {
-  typedef std::map<node_t, std::set<node_t>> edges_t;
-
-public:
-  typedef std::map<size_t, std::vector<node_t>> cmap_t;
-  typedef std::map<node_t, size_t> nmap_t;
-
-private:
-  void connected_components_impl(node_t x, std::set<node_t> &nodes,
-                                 nmap_t *nmap, cmap_t *cmap, int id) const {
-    if (nmap)
-      (*nmap)[x] = id;
-    if (cmap)
-      (*cmap)[id].push_back(x);
-    if (nodes.find(x) != nodes.end()) {
-      nodes.erase(x);
-      for (const node_t &y : edges_.at(x))
-        connected_components_impl(y, nodes, nmap, cmap, id);
-    }
-  }
-
-public:
-  void connected_components(cmap_t *cmap, nmap_t *nmap) const {
-    if (cmap)
-      cmap->clear();
-    if (nmap)
-      nmap->clear();
-    std::set<node_t> nodes = nodes_;
-    unsigned id = 0;
-    while (!nodes.empty()) {
-      connected_components_impl(*nodes.begin(), nodes, nmap, cmap, id++);
-    }
-  }
-
-  void add_edge(node_t x, node_t y) {
-    nodes_.insert(x);
-    nodes_.insert(y);
-    edges_[x].insert(y);
-    edges_[y].insert(x);
-  }
-
-  void clear() {
-    nodes_.clear();
-    edges_.clear();
-  }
-
-private:
-  std::set<node_t> nodes_;
-  edges_t edges_;
-};
-
-} // namespace tools
-} // namespace triton
-
-#endif
diff --git a/include/triton/tools/sha1.hpp b/include/triton/tools/sha1.hpp
deleted file mode 100644
index 1e71034de..000000000
--- a/include/triton/tools/sha1.hpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- Copyright (c) 2011, Micael Hildenborg
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of Micael Hildenborg nor the
-      names of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY Micael Hildenborg ''AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL Micael Hildenborg BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- Contributors:
- Gustav
- Several members in the gamedev.se forum.
- Gregory Petrosyan
- */
-
-#ifndef _TRITON_TOOLS_SHA1_HPP_
-#define _TRITON_TOOLS_SHA1_HPP_
-
-namespace sha1 {
-namespace // local
-{
-// Rotate an integer value to left.
-inline unsigned int rol(const unsigned int value, const unsigned int steps) {
-  return ((value << steps) | (value >> (32 - steps)));
-}
-
-// Sets the first 16 integers in the buffert to zero.
-// Used for clearing the W buffert.
-inline void clearWBuffert(unsigned int *buffert) {
-  for (int pos = 16; --pos >= 0;) {
-    buffert[pos] = 0;
-  }
-}
-
-inline void innerHash(unsigned int *result, unsigned int *w) {
-  unsigned int a = result[0];
-  unsigned int b = result[1];
-  unsigned int c = result[2];
-  unsigned int d = result[3];
-  unsigned int e = result[4];
-
-  int round = 0;
-
-#define sha1macro(func, val)                                                   \
-  {                                                                            \
-    const unsigned int t = rol(a, 5) + (func) + e + val + w[round];            \
-    e = d;                                                                     \
-    d = c;                                                                     \
-    c = rol(b, 30);                                                            \
-    b = a;                                                                     \
-    a = t;                                                                     \
-  }
-
-  while (round < 16) {
-    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
-  }
-  while (round < 20) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
-  }
-  while (round < 40) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro(b ^ c ^ d, 0x6ed9eba1)++ round;
-  }
-  while (round < 60) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)++ round;
-  }
-  while (round < 80) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro(b ^ c ^ d, 0xca62c1d6)++ round;
-  }
-
-#undef sha1macro
-
-  result[0] += a;
-  result[1] += b;
-  result[2] += c;
-  result[3] += d;
-  result[4] += e;
-}
-} // namespace
-
-inline void calc(const void *src, const int bytelength, unsigned char *hash) {
-  // Init the result array.
-  unsigned int result[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
-                            0xc3d2e1f0};
-
-  // Cast the void src pointer to be the byte array we can work with.
-  const unsigned char *sarray = (const unsigned char *)src;
-
-  // The reusable round buffer
-  unsigned int w[80];
-
-  // Loop through all complete 64byte blocks.
-  const int endOfFullBlocks = bytelength - 64;
-  int endCurrentBlock;
-  int currentBlock = 0;
-
-  while (currentBlock <= endOfFullBlocks) {
-    endCurrentBlock = currentBlock + 64;
-
-    // Init the round buffer with the 64 byte block data.
-    for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4) {
-      // This line will swap endian on big endian and keep endian on little
-      // endian.
-      w[roundPos++] = (unsigned int)sarray[currentBlock + 3] |
-                      (((unsigned int)sarray[currentBlock + 2]) << 8) |
-                      (((unsigned int)sarray[currentBlock + 1]) << 16) |
-                      (((unsigned int)sarray[currentBlock]) << 24);
-    }
-    innerHash(result, w);
-  }
-
-  // Handle the last and not full 64 byte block if existing.
-  endCurrentBlock = bytelength - currentBlock;
-  clearWBuffert(w);
-  int lastBlockBytes = 0;
-  for (; lastBlockBytes < endCurrentBlock; ++lastBlockBytes) {
-    w[lastBlockBytes >> 2] |=
-        (unsigned int)sarray[lastBlockBytes + currentBlock]
-        << ((3 - (lastBlockBytes & 3)) << 3);
-  }
-  w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3);
-  if (endCurrentBlock >= 56) {
-    innerHash(result, w);
-    clearWBuffert(w);
-  }
-  w[15] = bytelength << 3;
-  innerHash(result, w);
-
-  // Store hash in result pointer, and make sure we get in in the correct order
-  // on both endian models.
-  for (int hashByte = 20; --hashByte >= 0;) {
-    hash[hashByte] =
-        (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff;
-  }
-}
-
-inline void toHexString(const unsigned char *hash, char *hexstring) {
-  const char hexDigits[] = {"0123456789abcdef"};
-
-  for (int hashByte = 20; --hashByte >= 0;) {
-    hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf];
-    hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf];
-  }
-  hexstring[40] = 0;
-}
-} // namespace sha1
-
-#endif
diff --git a/include/triton/tools/sys/exec.hpp b/include/triton/tools/sys/exec.hpp
deleted file mode 100644
index e96a04314..000000000
--- a/include/triton/tools/sys/exec.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef TRITON_TOOLS_SYS_EXEC_HPP
-#define TRITON_TOOLS_SYS_EXEC_HPP
-
-#include <cstdio>
-#include <iostream>
-#include <memory>
-#include <stdexcept>
-#include <string>
-
-namespace triton {
-namespace tools {
-
-#ifdef _WIN32
-#define popen _popen
-#define pclose _pclose
-#endif
-
-#ifndef WEXITSTATUS
-#define WEXITSTATUS(stat_val) ((unsigned)(stat_val)&255)
-#endif
-
-int exec(const std::string &cmd, std::string &result) {
-  char buffer[128];
-  FILE *pipe = popen(cmd.c_str(), "r");
-  if (!pipe)
-    return 0;
-  result.clear();
-  try {
-    while (fgets(buffer, sizeof buffer, pipe) != NULL)
-      result += buffer;
-  } catch (...) {
-    pclose(pipe);
-    return 0;
-  }
-  int status = pclose(pipe);
-  return WEXITSTATUS(status);
-}
-
-} // namespace tools
-} // namespace triton
-
-#endif
diff --git a/include/triton/tools/sys/mkdir.hpp b/include/triton/tools/sys/mkdir.hpp
deleted file mode 100644
index 10cb0da6a..000000000
--- a/include/triton/tools/sys/mkdir.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
- *
- * This file is part of ISAAC.
- *
- * ISAAC is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301  USA
- */
-
-#ifndef TDL_TOOLS_SYS_MKDIR_HPP
-#define TDL_TOOLS_SYS_MKDIR_HPP
-
-#include <cstdlib>
-#include <cstring>
-#include <errno.h>
-#include <string>
-#include <sys/stat.h>
-#if defined(_WIN32)
-#include <direct.h>
-#endif
-
-namespace triton {
-
-namespace tools {
-
-inline int mkdir(std::string const &path) {
-#if defined(_WIN32)
-  return _mkdir(path.c_str());
-#else
-  return ::mkdir(path.c_str(), 0777);
-#endif
-}
-
-inline int mkpath(std::string const &path) {
-  int status = 0;
-  size_t pp = 0;
-  size_t sp;
-  while ((sp = path.find('/', pp)) != std::string::npos) {
-    if (sp != pp) {
-      status = mkdir(path.substr(0, sp));
-    }
-    pp = sp + 1;
-  }
-  return (status == 0 || errno == EEXIST) ? 0 : -1;
-}
-
-inline int mtime(std::string const &path) {
-  struct stat st;
-  if (stat(path.c_str(), &st) != 0)
-    return 0;
-  return st.st_mtime;
-}
-
-} // namespace tools
-
-} // namespace triton
-
-#endif
diff --git a/include/triton/tools/thread_pool.h b/include/triton/tools/thread_pool.h
deleted file mode 100644
index 045e983f8..000000000
--- a/include/triton/tools/thread_pool.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_TOOLS_THREAD_POOL_H_
-#define _TRITON_TOOLS_THREAD_POOL_H_
-
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <memory>
-#include <mutex>
-#include <queue>
-#include <stdexcept>
-#include <thread>
-#include <vector>
-
-class ThreadPool {
-public:
-  ThreadPool(size_t threads) : stop(false) {
-    for (size_t i = 0; i < threads; ++i)
-      workers.emplace_back([this] {
-        for (;;) {
-          std::function<void()> task;
-          {
-            std::unique_lock<std::mutex> lock(this->queue_mutex);
-            this->condition.wait(
-                lock, [this] { return this->stop || !this->tasks.empty(); });
-            if (this->stop && this->tasks.empty())
-              return;
-            task = std::move(this->tasks.front());
-            this->tasks.pop();
-          }
-          task();
-        }
-      });
-  }
-
-  template <class F, class... Args>
-  auto enqueue(F &&f, Args &&...args)
-      -> std::future<typename std::result_of<F(Args...)>::type> {
-    using return_type = typename std::result_of<F(Args...)>::type;
-
-    auto task = std::make_shared<std::packaged_task<return_type()>>(
-        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-
-    std::future<return_type> res = task->get_future();
-    {
-      std::unique_lock<std::mutex> lock(queue_mutex);
-
-      // don't allow enqueueing after stopping the pool
-      if (stop)
-        throw std::runtime_error("enqueue on stopped ThreadPool");
-
-      tasks.emplace([task]() { (*task)(); });
-    }
-    condition.notify_one();
-    return res;
-  }
-
-  ~ThreadPool() {
-    {
-      std::unique_lock<std::mutex> lock(queue_mutex);
-      stop = true;
-    }
-    condition.notify_all();
-    for (std::thread &worker : workers)
-      worker.join();
-  }
-
-private:
-  // need to keep track of threads so we can join them
-  std::vector<std::thread> workers;
-  // the task queue
-  std::queue<std::function<void()>> tasks;
-
-  // synchronization
-  std::mutex queue_mutex;
-  std::condition_variable condition;
-  bool stop;
-};
-
-#endif
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 5a6ba8951..ab1d31a76 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,5 +1,4 @@
 # add_subdirectory(codegen)
-add_subdirectory(driver)
 add_subdirectory(Analysis)
 add_subdirectory(Conversion)
 add_subdirectory(Dialect)
diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
index 179c9391a..5837b0973 100644
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -13,7 +13,6 @@
 #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
 #include "mlir/Transforms/Passes.h"
 #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h"
-#include "triton/driver/llvm.h"
 #include "triton/tools/sys/getenv.hpp"
 #include "llvm/IR/Constants.h"
 
@@ -99,7 +98,6 @@ translateLLVMToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module) {
   }
 
   // Initialize LLVM targets.
-  ::triton::driver::init_llvm();
   mlir::ExecutionEngine::setupTargetTriple(llvmModule.get());
 
   auto optPipeline = mlir::makeOptimizingTransformer(
diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp
index b286e612a..631af81cc 100644
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -11,31 +11,129 @@
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
 #include "triton/Target/LLVMIR/LLVMIRTranslation.h"
-#include "triton/driver/dispatch.h"
-#include "triton/driver/llvm.h"
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <regex>
 
 namespace triton {
 
-void getCuCCAndVersionFromDevice(uint64_t device, int *cc, int *version,
-                                 std::string *ptxasPath) {
-  CUdevice dev = (CUdevice)device;
-  size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
-  size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
-  *cc = major * 10 + minor;
-  *ptxasPath = driver::path_to_ptxas(*version); // assign version
+extern "C" { // no-op stubs returning 0; presumably terminfo shims (confirm)
+int set_curterm(char *) { return 0; }
+int del_curterm(char *) { return 0; }
+int tigetnum(char *) { return 0; }
+int setupterm(char *, int, int *) { return 0; }
 }
 
-std::tuple<std::string, size_t, int, std::string>
-translateTritonGPUToPTX(mlir::ModuleOp module, uint64_t device) {
-  int cc;
-  int version;
-  std::string ptxasPath;
-  getCuCCAndVersionFromDevice(device, &cc, &version, &ptxasPath);
+static void init_llvm() { // runs the four LLVMInitializeNVPTX* entry points
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXTargetMC();
+  LLVMInitializeNVPTXAsmPrinter();
+}
 
-  llvm::LLVMContext ctx;
-  auto llModule = mlir::triton::translateTritonGPUToLLVMIR(&ctx, module);
-  auto ptxCode = driver::llir_to_ptx(llModule.get(), cc, version);
-  return std::make_tuple(ptxCode, cc, version, ptxasPath);
+// Replaces the first `begin`...`end` span of `str` with `target`; only one
+// character is consumed at `end`. Returns false, `str` untouched, if no match.
+static bool find_and_replace(std::string &str, const std::string &begin,
+                             const std::string &end,
+                             const std::string &target) {
+  const size_t head = str.find(begin);
+  const size_t tail = head == std::string::npos ? head : str.find(end, head);
+  if (tail == std::string::npos)
+    return false;
+  str.replace(head, tail - head + 1, target);
+  return true;
+}
+
+// Lowers `module` to PTX text through LLVM's NVPTX backend. `capability` is
+// written verbatim into `.target` (sm_<capability>) but capped at max_nvvm_cc
+// for codegen; `ptx` is the PTX version as major * 10 + minor for `.version`.
+// Mutates `module`: sets its triple and data layout and marks every function
+// alwaysinline. Logs to llvm::errs() and returns "" when the NVPTX target or
+// its target machine cannot be created.
+static std::string llir_to_ptx(llvm::Module *module, int capability, int ptx) {
+  // LLVM version in use may not officially support target hardware
+  const int max_nvvm_cc = 75;
+  // switch on the backend's `nvptx-short-ptr` command-line option
+  auto options = llvm::cl::getRegisteredOptions();
+  auto *short_ptr =
+      static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
+  assert(short_ptr);
+  short_ptr->setValue(true);
+  const std::string triple = "nvptx64-nvidia-cuda";
+  const std::string sm = "sm_" + std::to_string(capability);
+  const std::string proc =
+      "sm_" + std::to_string(std::min(capability, max_nvvm_cc));
+  init_llvm();
+  // run the IR verifier over the incoming module
+  llvm::legacy::PassManager pm;
+  pm.add(llvm::createVerifierPass());
+  pm.run(*module);
+
+  // create machine
+  module->setTargetTriple(triple);
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  if (!target) {
+    llvm::errs() << "llir_to_ptx: " << error << "\n";
+    return "";
+  }
+  llvm::TargetOptions opt;
+  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  opt.UnsafeFPMath = false;
+  opt.NoInfsFPMath = false;
+  opt.NoNaNsFPMath = true;
+  // owned here so the machine is released on every return path
+  std::unique_ptr<llvm::TargetMachine> machine(target->createTargetMachine(
+      module->getTargetTriple(), proc, /*Features=*/"", opt, llvm::Reloc::PIC_,
+      llvm::None, llvm::CodeGenOpt::Aggressive));
+  if (!machine) {
+    llvm::errs() << "llir_to_ptx: cannot create NVPTX target machine\n";
+    return "";
+  }
+  module->setDataLayout(machine->createDataLayout());
+  // emit machine code
+  for (llvm::Function &f : module->functions())
+    f.addFnAttr(llvm::Attribute::AlwaysInline);
+  llvm::SmallVector<char, 0> buffer;
+  llvm::raw_svector_ostream stream(buffer);
+  llvm::legacy::PassManager pass;
+  machine->addPassesToEmitFile(pass, stream, nullptr,
+                               llvm::CodeGenFileType::CGFT_AssemblyFile);
+  pass.run(*module);
+
+  // post-process: stamp .version/.target, strip inline-asm marker comments
+  std::string result(buffer.begin(), buffer.end());
+  find_and_replace(result, ".version", "\n",
+                   ".version " + std::to_string(ptx / 10) + "." +
+                       std::to_string(ptx % 10) + "\n");
+  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
+  while (find_and_replace(result, "\t// begin inline asm", "\n", ""))
+    ;
+  while (find_and_replace(result, "\t// end inline asm", "\n", ""))
+    ;
+  return result;
+}
+
+// Thin wrapper: forwards `module`, `cc` and `version` to llir_to_ptx.
+std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
+  return llir_to_ptx(&module, cc, version);
 }
 
 } // namespace triton
diff --git a/lib/driver/CMakeLists.txt b/lib/driver/CMakeLists.txt
deleted file mode 100644
index d08c5a107..000000000
--- a/lib/driver/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_library(TritonDriver
-  dispatch.cc
-  error.cc
-  llvm.cc
-)
diff --git a/lib/driver/dispatch.cc b/lib/driver/dispatch.cc
deleted file mode 100644
index 427453b38..000000000
--- a/lib/driver/dispatch.cc
+++ /dev/null
@@ -1,395 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files
- * (the "Software"), to deal in the Software without restriction,
- * including without limitation the rights to use, copy, modify, merge,
- * publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "triton/driver/dispatch.h"
-
-namespace triton {
-namespace driver {
-
-// Helpers for function definition
-#define DEFINE0(init, hlib, ret, fname)                                        \
-  ret dispatch::fname() {                                                      \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname);              \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE1(init, hlib, ret, fname, t1)                                    \
-  ret dispatch::fname(t1 a) {                                                  \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a);           \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE2(init, hlib, ret, fname, t1, t2)                                \
-  ret dispatch::fname(t1 a, t2 b) {                                            \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b);        \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE3(init, hlib, ret, fname, t1, t2, t3)                            \
-  ret dispatch::fname(t1 a, t2 b, t3 c) {                                      \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c);     \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4)                        \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d) {                                \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d);  \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5)                    \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e) {                          \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e);                                          \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6)                \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f) {                    \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f);                                       \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7)            \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g) {              \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g);                                    \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)        \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h) {        \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h);                                 \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)    \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i) {  \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i);                              \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10)                                                          \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j) {                                                 \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j);                           \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10, t11)                                                     \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j, t11 k) {                                          \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j, k);                        \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10, t11, t12, t13)                                           \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j, t11 k, t12 l, t13 m) {                            \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j, k, l, m);                  \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)             \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q,  \
-                      t18 r, t19 s) {                                          \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j, k, l, m, n, o, p, q, r,    \
-                                  s);                                          \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-/* ------------------- *
- * CUDA
- * ------------------- */
-
-bool dispatch::cuinit() {
-  if (cuda_ == nullptr) {
-#ifdef _WIN32
-    cuda_ = dlopen("cudart64_110.dll", RTLD_LAZY);
-#else
-    cuda_ = dlopen("libcuda.so", RTLD_LAZY);
-    if (!cuda_)
-      cuda_ = dlopen("libcuda.so.1", RTLD_LAZY);
-#endif
-    if (!cuda_)
-      throw std::runtime_error("Could not find `libcuda.so`. Make sure it is "
-                               "in your LD_LIBRARY_PATH.");
-  }
-  if (cuda_ == nullptr)
-    return false;
-  CUresult (*fptr)(unsigned int);
-  cuInit_ = dlsym(cuda_, "cuInit");
-  *reinterpret_cast<void **>(&fptr) = cuInit_;
-  CUresult res = (*fptr)(0);
-  check(res);
-  return true;
-}
-
-#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
-#define CUDA_DEFINE2(ret, fname, t1, t2)                                       \
-  DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
-#define CUDA_DEFINE3(ret, fname, t1, t2, t3)                                   \
-  DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
-#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4)                               \
-  DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
-#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5)                           \
-  DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
-#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6)                       \
-  DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7)                   \
-  DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)               \
-  DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)           \
-  DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)     \
-  DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,     \
-                      t11)                                                     \
-  DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \
-           t11)
-
-// context management
-CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
-CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
-CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice *)
-CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
-CUDA_DEFINE1(CUresult, cuInit, unsigned int)
-CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
-// device management
-CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
-CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute,
-             CUdevice)
-CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
-
-// link management
-CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void *,
-             size_t, const char *, unsigned int, CUjit_option *, void **);
-CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option *, void **,
-             CUlinkState *);
-CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
-CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void **, size_t *);
-// module management
-CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr *, size_t *, CUmodule,
-             const char *)
-CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
-CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
-CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
-CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *,
-             unsigned int, CUjit_option *, void **)
-CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule,
-             const char *)
-// stream management
-CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
-CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
-CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
-CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext *)
-CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int,
-              unsigned int, unsigned int, unsigned int, unsigned int,
-              unsigned int, CUstream, void **, void **)
-// function management
-CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int *, CUfunction_attribute,
-             CUfunction)
-CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute,
-             int)
-CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
-// memory management
-CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
-CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t,
-             CUstream)
-CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t,
-             CUstream)
-CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t)
-CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr *, size_t)
-CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void *, CUpointer_attribute,
-             CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t,
-             CUstream)
-// event management
-CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
-CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
-CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
-CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
-
-/* ------------------- *
- * NVML
- * ------------------- */
-bool dispatch::nvmlinit() {
-#ifdef _WIN32
-  if (nvml_ == nullptr)
-    nvml_ = dlopen("nvml.dll", RTLD_LAZY);
-#else
-  if (nvml_ == nullptr)
-    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
-#endif
-  nvmlReturn_t (*fptr)();
-  nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
-  *reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
-  nvmlReturn_t res = (*fptr)();
-  check(res);
-  return res;
-}
-
-#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
-#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
-#define NVML_DEFINE2(ret, fname, t1, t2)                                       \
-  DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
-#define NVML_DEFINE3(ret, fname, t1, t2, t3)                                   \
-  DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
-
-NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *,
-             nvmlDevice_t *)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t,
-             nvmlClockType_t, unsigned int *)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t,
-             nvmlClockType_t, unsigned int *)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t,
-             unsigned int, unsigned int)
-
-/* ------------------- *
- * HIP
- * ------------------- */
-bool dispatch::hipinit() {
-  if (hip_ == nullptr)
-    hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
-  if (hip_ == nullptr)
-    return false;
-  hipError_t (*fptr)();
-  hipInit_ = dlsym(hip_, "hipInit");
-  *reinterpret_cast<void **>(&fptr) = hipInit_;
-  hipError_t res = (*fptr)();
-  check(res);
-  return res;
-}
-
-#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
-#define HIP_DEFINE2(ret, fname, t1, t2)                                        \
-  DEFINE2(hipinit, hip_, ret, fname, t1, t2)
-#define HIP_DEFINE3(ret, fname, t1, t2, t3)                                    \
-  DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
-#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4)                                \
-  DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
-#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5)                            \
-  DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
-#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6)                        \
-  DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7)                    \
-  DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)                \
-  DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)            \
-  DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)      \
-  DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
-  DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \
-           t11)
-
-// context management
-HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
-HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
-HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t *)
-HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
-HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t *)
-HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
-HIP_DEFINE1(hipError_t, hipInit, unsigned int)
-HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
-// device management
-HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
-HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
-HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
-HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t,
-            hipDevice_t)
-HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
-// module management
-HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t *, size_t *,
-            hipModule_t, const char *)
-HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
-HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
-HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
-HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *,
-            unsigned int, hipJitOption *, void **)
-HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t,
-            const char *)
-// stream management
-HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
-HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
-HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
-HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int,
-             unsigned int, unsigned int, unsigned int, unsigned int,
-             unsigned int, unsigned int, hipStream_t, void **, void **)
-// function management
-HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes *, void *)
-HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
-// memory management
-HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
-HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
-HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t,
-            hipStream_t)
-HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *,
-            size_t, hipStream_t)
-HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t)
-HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t *, size_t)
-HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void *, CUpointer_attribute,
-            hipDeviceptr_t)
-HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t,
-            hipStream_t)
-// event management
-HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
-HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
-HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
-HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
-
-/* ------------------- *
- * COMMON
- * ------------------- */
-
-// Release
-void dispatch::release() {
-  if (cuda_) {
-    dlclose(cuda_);
-    cuda_ = nullptr;
-  }
-}
-
-void *dispatch::cuda_;
-void *dispatch::nvml_;
-void *dispatch::nvmlInit_v2_;
-void *dispatch::hip_;
-
-} // namespace driver
-} // namespace triton
diff --git a/lib/driver/error.cc b/lib/driver/error.cc
deleted file mode 100644
index 4b366746e..000000000
--- a/lib/driver/error.cc
+++ /dev/null
@@ -1,270 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files
- * (the "Software"), to deal in the Software without restriction,
- * including without limitation the rights to use, copy, modify, merge,
- * publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "triton/driver/error.h"
-
-namespace triton {
-namespace driver {
-
-void check(CUresult err) {
-  using namespace exception::cuda;
-  switch (err) {
-  case CUDA_SUCCESS:
-    break;
-  case CUDA_ERROR_INVALID_VALUE:
-    throw invalid_value();
-  case CUDA_ERROR_OUT_OF_MEMORY:
-    throw out_of_memory();
-  case CUDA_ERROR_NOT_INITIALIZED:
-    throw not_initialized();
-  case CUDA_ERROR_DEINITIALIZED:
-    throw deinitialized();
-  case CUDA_ERROR_PROFILER_DISABLED:
-    throw profiler_disabled();
-  case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
-    throw profiler_not_initialized();
-  case CUDA_ERROR_PROFILER_ALREADY_STARTED:
-    throw profiler_already_started();
-  case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
-    throw profiler_already_stopped();
-  case CUDA_ERROR_NO_DEVICE:
-    throw no_device();
-  case CUDA_ERROR_INVALID_DEVICE:
-    throw invalid_device();
-  case CUDA_ERROR_INVALID_IMAGE:
-    throw invalid_image();
-  case CUDA_ERROR_INVALID_CONTEXT:
-    throw invalid_context();
-  case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
-    throw context_already_current();
-  case CUDA_ERROR_MAP_FAILED:
-    throw map_failed();
-  case CUDA_ERROR_UNMAP_FAILED:
-    throw unmap_failed();
-  case CUDA_ERROR_ARRAY_IS_MAPPED:
-    throw array_is_mapped();
-  case CUDA_ERROR_ALREADY_MAPPED:
-    throw already_mapped();
-  case CUDA_ERROR_NO_BINARY_FOR_GPU:
-    throw no_binary_for_gpu();
-  case CUDA_ERROR_ALREADY_ACQUIRED:
-    throw already_acquired();
-  case CUDA_ERROR_NOT_MAPPED:
-    throw not_mapped();
-  case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
-    throw not_mapped_as_array();
-  case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
-    throw not_mapped_as_pointer();
-  case CUDA_ERROR_ECC_UNCORRECTABLE:
-    throw ecc_uncorrectable();
-  case CUDA_ERROR_UNSUPPORTED_LIMIT:
-    throw unsupported_limit();
-  case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
-    throw context_already_in_use();
-  case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
-    throw peer_access_unsupported();
-  case CUDA_ERROR_INVALID_PTX:
-    throw invalid_ptx();
-  case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
-    throw invalid_graphics_context();
-  case CUDA_ERROR_INVALID_SOURCE:
-    throw invalid_source();
-  case CUDA_ERROR_FILE_NOT_FOUND:
-    throw file_not_found();
-  case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
-    throw shared_object_symbol_not_found();
-  case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
-    throw shared_object_init_failed();
-  case CUDA_ERROR_OPERATING_SYSTEM:
-    throw operating_system();
-  case CUDA_ERROR_INVALID_HANDLE:
-    throw invalid_handle();
-  case CUDA_ERROR_NOT_FOUND:
-    throw not_found();
-  case CUDA_ERROR_NOT_READY:
-    throw not_ready();
-  case CUDA_ERROR_ILLEGAL_ADDRESS:
-    throw illegal_address();
-  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
-    throw launch_out_of_resources();
-  case CUDA_ERROR_LAUNCH_TIMEOUT:
-    throw launch_timeout();
-  case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
-    throw launch_incompatible_texturing();
-  case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
-    throw peer_access_already_enabled();
-  case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
-    throw peer_access_not_enabled();
-  case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
-    throw primary_context_active();
-  case CUDA_ERROR_CONTEXT_IS_DESTROYED:
-    throw context_is_destroyed();
-  case CUDA_ERROR_ASSERT:
-    throw assert_error();
-  case CUDA_ERROR_TOO_MANY_PEERS:
-    throw too_many_peers();
-  case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
-    throw host_memory_already_registered();
-  case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
-    throw host_memory_not_registered();
-  case CUDA_ERROR_HARDWARE_STACK_ERROR:
-    throw hardware_stack_error();
-  case CUDA_ERROR_ILLEGAL_INSTRUCTION:
-    throw illegal_instruction();
-  case CUDA_ERROR_MISALIGNED_ADDRESS:
-    throw misaligned_address();
-  case CUDA_ERROR_INVALID_ADDRESS_SPACE:
-    throw invalid_address_space();
-  case CUDA_ERROR_INVALID_PC:
-    throw invalid_pc();
-  case CUDA_ERROR_LAUNCH_FAILED:
-    throw launch_failed();
-  case CUDA_ERROR_NOT_PERMITTED:
-    throw not_permitted();
-  case CUDA_ERROR_NOT_SUPPORTED:
-    throw not_supported();
-  case CUDA_ERROR_UNKNOWN:
-    throw unknown();
-  default:
-    throw unknown();
-  }
-}
-
-void check(hipError_t error) {
-  using namespace exception::hip;
-  switch (error) {
-  case hipSuccess:
-    break;
-  case hipErrorInvalidValue:
-    throw invalid_value();
-  case hipErrorMemoryAllocation:
-    throw out_of_memory();
-  case hipErrorNotInitialized:
-    throw not_initialized();
-  case hipErrorDeinitialized:
-    throw deinitialized();
-  case hipErrorProfilerDisabled:
-    throw profiler_disabled();
-  case hipErrorProfilerNotInitialized:
-    throw profiler_not_initialized();
-  case hipErrorProfilerAlreadyStarted:
-    throw profiler_already_started();
-  case hipErrorProfilerAlreadyStopped:
-    throw profiler_already_stopped();
-  case hipErrorNoDevice:
-    throw no_device();
-  case hipErrorInvalidSymbol:
-    throw invalid_symbol();
-  case hipErrorInvalidDevice:
-    throw invalid_device();
-  case hipErrorInvalidImage:
-    throw invalid_image();
-  case hipErrorInvalidContext:
-    throw invalid_context();
-  case hipErrorContextAlreadyCurrent:
-    throw context_already_current();
-  case hipErrorMapFailed:
-    throw map_failed();
-  case hipErrorUnmapFailed:
-    throw unmap_failed();
-  case hipErrorArrayIsMapped:
-    throw array_is_mapped();
-  case hipErrorAlreadyMapped:
-    throw already_mapped();
-  case hipErrorNoBinaryForGpu:
-    throw no_binary_for_gpu();
-  case hipErrorAlreadyAcquired:
-    throw already_acquired();
-  case hipErrorNotMapped:
-    throw not_mapped();
-  case hipErrorNotMappedAsArray:
-    throw not_mapped_as_array();
-  case hipErrorNotMappedAsPointer:
-    throw not_mapped_as_pointer();
-  case hipErrorECCNotCorrectable:
-    throw ecc_uncorrectable();
-  case hipErrorUnsupportedLimit:
-    throw unsupported_limit();
-  case hipErrorContextAlreadyInUse:
-    throw context_already_in_use();
-  case hipErrorPeerAccessUnsupported:
-    throw peer_access_unsupported();
-  case hipErrorInvalidKernelFile:
-    throw invalid_ptx();
-  case hipErrorInvalidGraphicsContext:
-    throw invalid_graphics_context();
-  case hipErrorInvalidSource:
-    throw invalid_source();
-  case hipErrorFileNotFound:
-    throw file_not_found();
-  case hipErrorSharedObjectSymbolNotFound:
-    throw shared_object_symbol_not_found();
-  case hipErrorSharedObjectInitFailed:
-    throw shared_object_init_failed();
-  case hipErrorOperatingSystem:
-    throw operating_system();
-  case hipErrorInvalidResourceHandle:
-    throw invalid_handle();
-  case hipErrorNotFound:
-    throw not_found();
-  case hipErrorNotReady:
-    throw not_ready();
-  case hipErrorIllegalAddress:
-    throw illegal_address();
-  case hipErrorLaunchOutOfResources:
-    throw launch_out_of_resources();
-  case hipErrorLaunchTimeOut:
-    throw launch_timeout();
-  // case hipErrorLaunchIncompatibleTexturing  : throw
-  // launch_incompatible_texturing();
-  case hipErrorPeerAccessAlreadyEnabled:
-    throw peer_access_already_enabled();
-  case hipErrorPeerAccessNotEnabled:
-    throw peer_access_not_enabled();
-  // case hipErrorPrimaryContextActive         : throw primary_context_active();
-  // case hipErrorContextIsDestroyed           : throw context_is_destroyed();
-  case hipErrorAssert:
-    throw assert_error();
-  // case hipErrorTooManyPeers                 : throw too_many_peers();
-  case hipErrorHostMemoryAlreadyRegistered:
-    throw host_memory_already_registered();
-  case hipErrorHostMemoryNotRegistered:
-    throw host_memory_not_registered();
-  // case hipErrorHardwareStackError           : throw hardware_stack_error();
-  // case hipErrorIllegalInstruction            : throw illegal_instruction();
-  // case hipErrorMisalignedAddress             : throw misaligned_address();
-  // case hipErrorInvalidAddressSpace          : throw invalid_address_space();
-  // case hipErrorInvalidPc                     : throw invalid_pc();
-  case hipErrorLaunchFailure:
-    throw launch_failed();
-  // case hipErrorNotPermitted                  : throw not_permitted();
-  case hipErrorNotSupported:
-    throw not_supported();
-  case hipErrorUnknown:
-    throw unknown();
-  default:
-    throw unknown();
-  }
-}
-
-} // namespace driver
-} // namespace triton
diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc
deleted file mode 100644
index 140eff6cd..000000000
--- a/lib/driver/llvm.cc
+++ /dev/null
@@ -1,392 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files
- * (the "Software"), to deal in the Software without restriction,
- * including without limitation the rights to use, copy, modify, merge,
- * publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#include <fstream>
-
-#if defined __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#endif
-#endif
-
-#include "triton/driver/dispatch.h"
-#include "triton/driver/error.h"
-#include "triton/driver/llvm.h"
-#include "triton/tools/sha1.hpp"
-#include "triton/tools/sys/exec.hpp"
-#include "triton/tools/sys/getenv.hpp"
-#include "triton/tools/sys/mkdir.hpp"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IRPrintingPasses.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <memory>
-#include <regex>
-
-// begin AMD stuff
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/ToolOutputFile.h"
-// end AMD stuff
-
-extern "C" {
-int set_curterm(char *nterm) { return 0; }
-int del_curterm(char *nterm) { return 0; }
-int tigetnum(char *capname) { return 0; }
-int setupterm(char *term, int fildes, int *errret) { return 0; }
-}
-
-namespace triton {
-namespace driver {
-
-void init_llvm() {
-  LLVMInitializeNVPTXTargetInfo();
-  LLVMInitializeNVPTXTarget();
-  LLVMInitializeNVPTXTargetMC();
-  LLVMInitializeNVPTXAsmPrinter();
-  LLVMInitializeAMDGPUTargetInfo();
-  LLVMInitializeAMDGPUTarget();
-  LLVMInitializeAMDGPUTargetMC();
-  LLVMInitializeAMDGPUAsmPrinter();
-}
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-static bool find_and_replace(std::string &str, const std::string &begin,
-                             const std::string &end,
-                             const std::string &target) {
-  size_t start_replace = str.find(begin);
-  if (start_replace == std::string::npos)
-    return false;
-  size_t end_replace = str.find(end, start_replace);
-  if (end_replace == std::string::npos)
-    return false;
-  str.replace(start_replace, end_replace + 1 - start_replace, target);
-  return true;
-}
-
-std::string path_to_ptxas(int &version) {
-  std::vector<std::string> rets;
-  std::string ret;
-  // search paths for ptxas
-  std::vector<std::string> ptxas_prefixes = {"", "/usr/local/cuda/bin/"};
-  std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH");
-  if (!triton_ptxas.empty())
-    ptxas_prefixes.insert(ptxas_prefixes.begin(), triton_ptxas);
-  // see what path for ptxas are valid
-  std::vector<std::string> working_ptxas;
-  for (const std::string &prefix : ptxas_prefixes) {
-    std::string ptxas = prefix + "ptxas";
-    bool works = tools::exec(ptxas + " --version 2>&1", ret) == 0;
-    if (works) {
-      working_ptxas.push_back(ptxas);
-      rets.push_back(ret);
-    }
-  }
-  // error if no working ptxas was found
-  if (working_ptxas.empty())
-    throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, "
-                             "/usr/local/cuda/bin/ or PATH"
-                             " but a working version could not be found.");
-  std::string ptxas = working_ptxas.front();
-  // parse version
-  std::regex version_regex("release (\\d+)\\.(\\d+)");
-  std::smatch match;
-  bool found = false;
-  // currently choosing the first ptxas. Other logics can be implemented in
-  // future
-  size_t i = 0;
-  while (i < rets.size()) {
-    if (std::regex_search(rets[i], match, version_regex)) {
-      int major = std::stoi(match[1]);
-      int minor = std::stoi(match[2]);
-      version = major * 1000 + minor * 10;
-      found = true;
-      break;
-    }
-    ++i;
-  }
-  if (not found) {
-    throw std::runtime_error("Error in parsing version");
-  }
-  return working_ptxas[i];
-}
-
-int vptx(int version) {
-  if (version >= 11040)
-    return 74;
-  if (version >= 11030)
-    return 73;
-  if (version >= 11020)
-    return 72;
-  if (version >= 11010)
-    return 71;
-  if (version >= 11000)
-    return 70;
-  if (version >= 10020)
-    return 65;
-  if (version >= 10010)
-    return 64;
-  if (version >= 10000)
-    return 63;
-  throw std::runtime_error("Triton requires CUDA 10+");
-}
-
-std::string llir_to_ptx(llvm::Module *module, int cc, int version) {
-  // LLVM version in use may not officially support target hardware
-  int max_nvvm_cc = 75;
-  int max_nvvm_ptx = 74;
-  // options
-  auto options = llvm::cl::getRegisteredOptions();
-  auto *short_ptr =
-      static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
-  assert(short_ptr);
-  short_ptr->setValue(true);
-  // compute capability
-  std::string sm = "sm_" + std::to_string(cc);
-  // max PTX version
-  int ptx = vptx(version);
-  int ptx_major = ptx / 10;
-  int ptx_minor = ptx % 10;
-  // create
-  llvm::SmallVector<char, 0> buffer;
-  std::string triple = "nvptx64-nvidia-cuda";
-  std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
-  std::string layout = "";
-  std::string features = "";
-  // std::string features = "+ptx" + std::to_string(std::min(ptx,
-  // max_nvvm_ptx));
-  init_llvm();
-  // verify and store llvm
-  llvm::legacy::PassManager pm;
-  pm.add(llvm::createVerifierPass());
-  pm.run(*module);
-  // module->print(llvm::outs(), nullptr);
-
-  // create machine
-  module->setTargetTriple(triple);
-  std::string error;
-  auto target =
-      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
-  llvm::TargetOptions opt;
-  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
-  opt.NoInfsFPMath = false;
-  opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
-      llvm::None, llvm::CodeGenOpt::Aggressive);
-  // set data layout
-  if (layout.empty())
-    module->setDataLayout(machine->createDataLayout());
-  else
-    module->setDataLayout(layout);
-  // emit machine code
-  for (llvm::Function &f : module->functions())
-    f.addFnAttr(llvm::Attribute::AlwaysInline);
-  llvm::legacy::PassManager pass;
-  llvm::raw_svector_ostream stream(buffer);
-  // emit
-  machine->addPassesToEmitFile(pass, stream, nullptr,
-                               llvm::CodeGenFileType::CGFT_AssemblyFile);
-  pass.run(*module);
-
-  // post-process
-  std::string result(buffer.begin(), buffer.end());
-  find_and_replace(result, ".version", "\n",
-                   ".version " + std::to_string(ptx_major) + "." +
-                       std::to_string(ptx_minor) + "\n");
-  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
-  while (find_and_replace(result, "\t// begin inline asm", "\n", ""))
-    ;
-  while (find_and_replace(result, "\t// end inline asm", "\n", ""))
-    ;
-  return result;
-}
-
-std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas,
-                         int cc) {
-  // compile ptx with ptxas
-  char _fsrc[L_tmpnam];
-  char _flog[L_tmpnam];
-  std::tmpnam(_fsrc);
-  std::tmpnam(_flog);
-  std::string fsrc = _fsrc;
-  std::string flog = _flog;
-  std::string fbin = fsrc + ".o";
-  const char *_fbin = fbin.c_str();
-  std::ofstream ofs(fsrc);
-  ofs << ptx << std::endl;
-  ofs.close();
-  std::string cmd;
-  int err;
-  cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc +
-        " -o " + fsrc + ".o 2> " + flog;
-  err = system(cmd.c_str());
-  if (err != 0) {
-    std::ifstream _log(_flog);
-    std::string log(std::istreambuf_iterator<char>(_log), {});
-    unlink(_fsrc);
-    unlink(_flog);
-    throw std::runtime_error("Internal Triton PTX codegen error: \n" + log);
-  }
-  CUmodule ret;
-  std::ifstream _cubin(_fbin, std::ios::binary);
-  std::string cubin(std::istreambuf_iterator<char>(_cubin), {});
-  _cubin.close();
-  unlink(_fsrc);
-  unlink(_flog);
-  unlink(_fbin);
-  dispatch::cuModuleLoadData(&ret, cubin.c_str());
-  return cubin;
-}
-
-/* ------------------------ */
-//         HIP              //
-/* ------------------------ */
-
-std::string llir_to_amdgpu(llvm::Module *module, const std::string &_proc) {
-  init_llvm();
-
-  //  proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
-  //  features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
-
-  // create
-  llvm::SmallVector<char, 0> buffer;
-  std::string triple = "amdgcn-amd-amdhsa";
-  std::string layout = "";
-  std::string features;
-  std::string proc = "gfx908";
-  // verify and store llvm
-  llvm::legacy::PassManager pm;
-  pm.add(llvm::createVerifierPass());
-  pm.run(*module);
-  // create machine
-  module->setTargetTriple(triple);
-  std::string error;
-  auto target =
-      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
-  llvm::TargetOptions opt;
-  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
-  opt.NoInfsFPMath = false;
-  opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
-      llvm::None, llvm::CodeGenOpt::Aggressive);
-  // set data layout
-  if (layout.empty())
-    module->setDataLayout(machine->createDataLayout());
-  else
-    module->setDataLayout(layout);
-  // emit machine code
-  for (llvm::Function &f : module->functions())
-    f.addFnAttr(llvm::Attribute::AlwaysInline);
-  llvm::legacy::PassManager pass;
-  llvm::raw_svector_ostream stream(buffer);
-
-  // create dump files
-  std::string module_name = module->getModuleIdentifier();
-  std::error_code ec;
-
-  // Save GCN ISA binary.
-  std::string isabin_path =
-      std::string("/tmp/") + module_name + std::string(".o");
-  std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
-      new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
-  if (ec) {
-    std::cout << isabin_path << " was not created. error code: " << ec
-              << std::endl;
-  }
-
-  // emit
-  machine->addPassesToEmitFile(pass, *isabin_fs, nullptr,
-                               llvm::CGFT_ObjectFile);
-  pass.run(*module);
-  // Save GCN ISA.
-  std::string amdgcn_path =
-      std::string("/tmp/") + module_name + std::string(".gcn");
-  std::string result(buffer.begin(), buffer.end());
-  std::ofstream amdgcn(amdgcn_path);
-  amdgcn << result;
-  amdgcn.close();
-
-  // generate HASCO file
-  std::string hsaco_path =
-      std::string("/tmp/") + module_name + std::string(".hsaco");
-  std::string error_message;
-  int lld_result =
-      llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
-                                {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu",
-                                 "-shared", "-o", hsaco_path, isabin_path},
-                                llvm::None, {}, 0, 0, &error_message);
-  if (lld_result) {
-    std::cout << "ld.lld execute fail: " << std::endl;
-    std::cout << error_message << std::endl;
-    std::cout << lld_result << std::endl;
-  }
-
-  return hsaco_path;
-}
-
-hipModule_t amdgpu_to_hipmodule(const std::string &path) {
-  // Read HSACO.
-  std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
-  std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
-
-  std::vector<unsigned char> hsaco(hsaco_file_size);
-  hsaco_file.seekg(0, std::ios::beg);
-  hsaco_file.read(reinterpret_cast<char *>(&hsaco[0]), hsaco_file_size);
-  hsaco_file.close();
-  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
-                        hipJitOptionErrorLogBuffer,
-                        hipJitOptionInfoLogBufferSizeBytes,
-                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
-  const unsigned int errbufsize = 8192;
-  const unsigned int logbufsize = 8192;
-  char _err[errbufsize];
-  char _log[logbufsize];
-  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
-                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};
-  hipModule_t ret;
-  dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
-  return ret;
-}
-
-} // namespace driver
-} // namespace triton
diff --git a/python/src/triton.cc b/python/src/triton.cc
index 52dffd1ae..424c2a28e 100644
--- a/python/src/triton.cc
+++ b/python/src/triton.cc
@@ -1,7 +1,4 @@
-﻿#include "triton/driver/error.h"
-#include "triton/driver/llvm.h"
-
-#include "mlir/IR/Builders.h"
+﻿#include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Verifier.h"
@@ -10,6 +7,9 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
 
+#include "mlir/Parser.h"
+#include "mlir/Support/FileUtilities.h"
+
 #include "triton/Analysis/Allocation.h"
 #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h"
 #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h"
@@ -24,10 +24,14 @@
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include "llvm/Support/SourceMgr.h"
+
 #include <Python.h>
 #include <cctype>
+#include <fstream>
 #include <optional>
 #include <pybind11/buffer_info.h>
 #include <pybind11/functional.h>
@@ -40,10 +44,6 @@
 #include <string>
 
 namespace py = pybind11;
-// namespace ir = triton::ir;
-namespace drv = triton::driver;
-
-using triton::cuGetInfo;
 
 enum backend_t {
   HOST,
@@ -51,306 +51,6 @@ enum backend_t {
   ROCM,
 };
 
-void cu_enable_peer_access(uint64_t peer_ptr) {
-  CUcontext context;
-  drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT,
-                                       peer_ptr);
-  try {
-    drv::dispatch::cuCtxEnablePeerAccess(context, 0);
-  } catch (drv::exception::cuda::peer_access_already_enabled) {
-  }
-}
-
-void host_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0,
-                  uint64_t grid_1, uint64_t grid_2, uint64_t block_0,
-                  uint64_t block_1, uint64_t block_2, void *args_ptr,
-                  size_t args_size, int64_t shared_mem) {
-  throw std::runtime_error("unsupported");
-  // auto hst = kernel->module()->hst();
-  // hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
-  // char* params = new char[args_size];
-  // std::memcpy((void*)params, (void*)args, args_size);
-  // for(size_t i = 0; i < grid[0]; i++)
-  //   for(size_t j = 0; j < grid[1]; j++)
-  //     for(size_t k = 0; k < grid[2]; k++)
-  //       hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn,
-  //       (char**)params, int32_t(i), int32_t(j), int32_t(k)));
-}
-
-void cu_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0,
-                uint64_t grid_1, uint64_t grid_2, uint64_t block_0,
-                uint64_t block_1, uint64_t block_2, void *args_ptr,
-                size_t args_size, int64_t shared_mem) {
-  void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, (void *)args_ptr,
-                    CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
-                    CU_LAUNCH_PARAM_END};
-  drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
-                                block_0, block_1, block_2, shared_mem,
-                                (CUstream)stream, nullptr, config);
-}
-
-long pow2_divisor(long N) {
-  if (N % 16 == 0)
-    return 16;
-  if (N % 8 == 0)
-    return 8;
-  if (N % 4 == 0)
-    return 4;
-  if (N % 2 == 0)
-    return 2;
-  return 1;
-}
-
-// Returns something like "int16", whether dtype is a torch.dtype or
-// triton.language.dtype.
-std::string dtype_cache_key_part(const py::object &dtype) {
-  if (py::hasattr(dtype, "cache_key_part")) {
-    // Presumed to be a triton.language.dtype.
-    return std::string(py::str(py::getattr(dtype, "cache_key_part")));
-  } else {
-    // Remove 'torch.' prefix from repr of torch.dtype.
-    py::object repr = py::repr(dtype);
-    size_t repr_len = PyUnicode_GET_LENGTH(repr.ptr());
-    const char *repr_ptr = (const char *)PyUnicode_1BYTE_DATA(repr.ptr());
-    if (repr_len <= 6 || strncmp(repr_ptr, "torch.", 6)) {
-      throw std::logic_error("invalid dtype: " +
-                             std::string(repr_ptr, repr_len));
-    }
-    return std::string(repr_ptr + 6, repr_len - 6);
-  }
-}
-
-size_t get_pointer_range_size(uint64_t addr) {
-  if (addr == 0)
-    return 0;
-  size_t size;
-  drv::dispatch::cuPointerGetAttribute(&size, CU_POINTER_ATTRIBUTE_RANGE_SIZE,
-                                       (CUdeviceptr)addr);
-  return size;
-}
-
-// Launch
-void parse_args(py::list &args, py::list do_not_specialize,
-                const std::string &func_key, py::list &arg_names,
-                std::string &cache_key, std::string &params,
-                size_t &params_size, py::dict constants, int num_warps,
-                int num_stages) {
-  size_t len = PyList_Size(args.ptr());
-  params.reserve(8 * len); // 8 max bytes by argument
-  char *params_ptr = &params[0];
-  cache_key = func_key;
-  cache_key += "-" + std::to_string(num_warps);
-  cache_key += "-" + std::to_string(num_stages);
-  cache_key += "-";
-  for (int i = 0; i < len; i++) {
-    cache_key += "_";
-    py::int_ py_i = py::int_(i);
-    bool specialize = !do_not_specialize.contains(py_i);
-    py::object arg = args[i];
-    auto arg_ptr = arg.ptr();
-
-    // argument is `long`
-    if (PyLong_Check(arg_ptr)) {
-      int overflow;
-      long long value = PyLong_AsLongLongAndOverflow(arg_ptr, &overflow);
-      // values equal to 1 are specialized
-      if (specialize && (value == 1)) {
-        cache_key += "1";
-        continue;
-      }
-      // int32, uint32, int64, and uint64 have different kernels
-      if (!overflow && -0x8000'0000LL <= value && value <= 0x7FFF'FFFFLL) {
-        cache_key += "int32";
-        params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
-        std::memcpy(params_ptr, &value, 4);
-        params_ptr += 4;
-      } else if (!overflow && 0x8000'0000LL <= value &&
-                 value <= 0xFFFF'FFFFLL) {
-        cache_key += "uint32";
-        params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
-        std::memcpy(params_ptr, &value, 4);
-        params_ptr += 4;
-      } else if (!overflow) {
-        cache_key += "int64";
-        params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
-        std::memcpy(params_ptr, &value, 8);
-        params_ptr += 8;
-      } else {
-        if (PyErr_Occurred()) {
-          throw std::logic_error("An error occurred?");
-        }
-        unsigned long long unsigned_value = PyLong_AsUnsignedLongLong(arg_ptr);
-        if (PyErr_Occurred()) {
-          throw std::runtime_error("integer overflow in argument: " +
-                                   std::string(py::str(arg)));
-        }
-        cache_key += "uint64";
-        params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
-        std::memcpy(params_ptr, &unsigned_value, 8);
-        params_ptr += 8;
-      }
-      if (!specialize)
-        continue;
-      // values divisible by small powers of 2 are specialized
-      cache_key += "[multipleof(";
-      cache_key += std::to_string(pow2_divisor(value));
-      cache_key += ")]";
-      continue;
-    }
-    // argument is `float`
-    if (PyFloat_Check(arg_ptr)) {
-      cache_key += "float32";
-      float value = PyFloat_AsDouble(arg_ptr);
-      params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
-      std::memcpy(params_ptr, &value, 4);
-      params_ptr += 4;
-      continue;
-    }
-    // argument is `bool`
-    if (PyBool_Check(arg_ptr)) {
-      cache_key += "bool";
-      bool value = arg_ptr == Py_True ? true : false;
-      std::memcpy(params_ptr, &value, 1);
-      params_ptr += 1;
-      continue;
-    }
-    // argument is tensor
-    if (py::hasattr(arg, "data_ptr")) {
-      py::object data_ptr = arg.attr("data_ptr")();
-      long value = data_ptr.cast<long>();
-      params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
-      // copy param
-      std::memcpy(params_ptr, &value, 8);
-      params_ptr += 8;
-      // update cache key
-      cache_key += dtype_cache_key_part(arg.attr("dtype"));
-      cache_key += "*";
-      cache_key += "[multipleof(";
-      size_t range_size = get_pointer_range_size(value);
-      cache_key += std::to_string(
-          std::min(pow2_divisor(value), pow2_divisor(range_size)));
-      cache_key += ")]";
-      continue;
-    }
-    // argument is `constexpr`
-    if (py::hasattr(arg, "value")) {
-      py::object value = arg.attr("value");
-      py::object name = arg_names[i];
-      constants[name] = value;
-      py::object repr = py::repr(value);
-      const char *start = (const char *)PyUnicode_1BYTE_DATA(repr.ptr());
-      size_t len = PyUnicode_GET_LENGTH(repr.ptr());
-      cache_key += std::string(start, len);
-      continue;
-    }
-    std::string ty_str =
-        arg.attr("__class__").attr("__name__").cast<std::string>();
-    if (ty_str == "NoneType") {
-      cache_key += "None";
-      continue;
-    }
-    std::string err_msg = "Received type '" + ty_str + "' for argument " +
-                          std::to_string(i) + "." +
-                          " Only int, float, bool, torch.Tensor, and "
-                          "triton.language.constexpr are supported.";
-    throw std::runtime_error(err_msg);
-  }
-  params_size = (std::ptrdiff_t)(params_ptr - &params[0]);
-}
-
-void parse_args(py::list &args, py::list &arg_names, std::string &params,
-                size_t &params_size, py::dict constants) {
-  size_t len = PyList_Size(args.ptr());
-  params.reserve(8 * len); // 8 max bytes by argument
-  char *params_ptr = params.data();
-  for (int i = 0; i < len; i++) {
-    py::object arg = args[i];
-    auto arg_ptr = arg.ptr();
-
-    if (PyLong_Check(arg_ptr)) {
-      int overflow{};
-      long long value = PyLong_AsLongLongAndOverflow(arg_ptr, &overflow);
-
-      if (!overflow && -0x8000'0000LL <= value && value <= 0x7FFF'FFFFLL) {
-        params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
-        std::memcpy(params_ptr, &value, 4);
-        params_ptr += 4;
-      } else if (!overflow && 0x8000'0000LL <= value &&
-                 value <= 0xFFFF'FFFFLL) {
-        params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
-        std::memcpy(params_ptr, &value, 4);
-        params_ptr += 4;
-      } else if (!overflow) {
-        params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
-        std::memcpy(params_ptr, &value, 8);
-        params_ptr += 8;
-      } else {
-        if (PyErr_Occurred()) {
-          throw std::logic_error("An error occurred?");
-        }
-        unsigned long long unsigned_value = PyLong_AsUnsignedLongLong(arg_ptr);
-        if (PyErr_Occurred()) {
-          throw std::runtime_error("integer overflow in argument: " +
-                                   std::string(py::str(arg)));
-        }
-        params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
-        std::memcpy(params_ptr, &unsigned_value, 8);
-        params_ptr += 8;
-      }
-      continue;
-    }
-
-    if (PyFloat_Check(arg_ptr)) {
-      float value = PyFloat_AsDouble(arg_ptr);
-      params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4));
-      std::memcpy(params_ptr, &value, 4);
-      params_ptr += 4;
-      continue;
-    }
-
-    // argument is `bool`
-    if (PyBool_Check(arg_ptr)) {
-      bool value = arg_ptr == Py_True ? true : false;
-      std::memcpy(params_ptr, &value, 1);
-      params_ptr += 1;
-      continue;
-    }
-    // argument is torch.tensor, get data_ptr as memory address
-    if (py::hasattr(arg, "data_ptr")) {
-      py::object data_ptr = arg.attr("data_ptr")();
-      long value = data_ptr.cast<long>();
-      params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8));
-      // copy param
-      std::memcpy(params_ptr, &value, 8);
-      params_ptr += 8;
-      // update cache key
-      continue;
-    }
-    // argument is `constexpr`
-    if (py::hasattr(arg, "value")) {
-      py::object value = arg.attr("value");
-      py::object name = arg_names[i];
-      constants[name] = value;
-      continue;
-    }
-    // argument is `LoadedBinary`
-    if (py::hasattr(arg, "get_sass")) {
-      // Do nothing, just a placeholder here to indicate validity.
-      continue;
-    }
-
-    std::string ty_str =
-        arg.attr("__class__").attr("__name__").cast<std::string>();
-    std::string err_msg = "Received type '" + ty_str + "' for argument " +
-                          std::to_string(i) + "." +
-                          " Only int, float, bool, torch.Tensor, and "
-                          "triton.language.constexpr are supported.";
-    throw std::runtime_error(err_msg);
-  }
-
-  params_size = (std::ptrdiff_t)(params_ptr - &params[0]);
-}
-
 void init_triton_runtime(py::module &&m) {
   // wrap backend_t
   py::enum_<backend_t>(m, "backend")
@@ -358,192 +58,8 @@ void init_triton_runtime(py::module &&m) {
       .value("CUDA", CUDA)
       // .value("ROCM", ROCM)
       .export_values();
-
-  // enable peer-to-peer
-  m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) {
-    if (backend != CUDA)
-      throw std::runtime_error("P2P only supported on CUDA devices!");
-    cu_enable_peer_access(peer_ptr);
-  });
-
-  // get range size for the given pointer
-  m.def("get_pointer_range_size", &get_pointer_range_size);
-
-  // cache key
-  m.def("launch", [](py::list args, py::list do_not_specialize,
-                     const std::string &func_key, py::list &arg_names,
-                     py::object device, py::int_ stream, py::dict bin_cache,
-                     py::int_ num_warps, py::int_ num_stages,
-                     py::function add_to_cache, py::object grid) {
-    // parse arguments to compute cache key, compile-time constants and packed
-    // kernel arguments
-    long _num_warps = PyLong_AsLong(num_warps.ptr());
-    long _num_stages = PyLong_AsLong(num_stages.ptr());
-    std::string cache_key;
-    std::string params;
-    size_t params_size;
-    py::dict constants;
-    parse_args(args, do_not_specialize, func_key, arg_names, cache_key, params,
-               params_size, constants, _num_warps, _num_stages);
-
-    // get cached binary
-    py::str key(cache_key);
-    py::bool_ noop = false;
-    if (!bin_cache.contains(key)) {
-      noop = add_to_cache(key, args, device, num_warps, num_stages);
-    }
-    if (noop)
-      return (py::object)py::none();
-    py::object bin = bin_cache[key];
-
-    // get grid
-    py::sequence seq;
-    if (!PySequence_Check(grid.ptr()))
-      seq = grid(constants);
-    else
-      seq = grid;
-    int size = seq.size();
-    int grid_0 = py::cast<int>(seq[0]);
-    int grid_1 = size < 2 ? 1 : py::cast<int>(seq[1]);
-    int grid_2 = size < 3 ? 1 : py::cast<int>(seq[2]);
-
-    // enqueue
-    uint64_t kernel = py::cast<uint64_t>(bin.attr("kernel"));
-    uint64_t shared_mem = py::cast<uint64_t>(bin.attr("shared_mem"));
-
-    // actually launch
-    void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, params.data(),
-                      CU_LAUNCH_PARAM_BUFFER_SIZE, &params_size,
-                      CU_LAUNCH_PARAM_END};
-    uint64_t _stream = PyLong_AsLong(stream.ptr());
-    if (grid_0 * grid_1 * grid_2 > 0) {
-      // release the gil in case the enqueue blocks
-      // cuda will block if too many ops are enqueued
-      py::gil_scoped_release allow_threads;
-      drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
-                                    _num_warps * 32, 1, 1, shared_mem,
-                                    (CUstream)_stream, nullptr, config);
-    }
-    return bin;
-  });
-
-  m.def("cc", [](backend_t backend, uint64_t device) -> int {
-    if (backend == CUDA) {
-      CUdevice dev = (CUdevice)device;
-      int major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
-      int minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
-      return major * 10 + minor;
-    }
-    return -1;
-  });
-
-  m.def("launch_binary", [](py::object binary, py::list args,
-                            py::list do_not_specialize, py::list arg_names,
-                            py::int_ stream, py::int_ num_warps,
-                            py::int_ num_stages, py::object grid) {
-    long _num_warps = PyLong_AsLong(num_warps.ptr());
-    long _num_stages = PyLong_AsLong(num_stages.ptr());
-
-    // get grid
-    py::sequence seq;
-    py::dict constants;
-    std::string params;
-    size_t params_size{};
-    parse_args(args, arg_names, params, params_size, constants);
-    if (!PySequence_Check(grid.ptr()))
-      seq = grid(constants);
-    else
-      seq = grid;
-
-    int size = seq.size();
-    int grid_0 = py::cast<int>(seq[0]);
-    int grid_1 = size < 2 ? 1 : py::cast<int>(seq[1]);
-    int grid_2 = size < 3 ? 1 : py::cast<int>(seq[2]);
-
-    uint64_t kernel = py::cast<uint64_t>(binary.attr("kernel"));
-    uint64_t shared_mem = py::cast<uint64_t>(binary.attr("shared_mem"));
-
-    // actually launch
-    void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, params.data(),
-                      CU_LAUNCH_PARAM_BUFFER_SIZE, &params_size,
-                      CU_LAUNCH_PARAM_END};
-    uint64_t _stream = PyLong_AsLong(stream.ptr());
-    const int numGrids = grid_0 * grid_1 * grid_2;
-    if (numGrids) {
-      // release the gil in case the enqueue blocks
-      // cuda will block if too many ops are enqueued
-      py::gil_scoped_release allow_threads;
-      drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
-                                    _num_warps * 32, 1, 1, shared_mem,
-                                    (CUstream)_stream, nullptr, config);
-    }
-    return binary;
-  });
-
-  // query maximum shared memory
-  m.def("max_shared_memory", [](backend_t backend, uint64_t device) {
-    if (backend == HOST)
-      return 0;
-    if (backend == CUDA)
-      return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(
-          device);
-    return -1;
-  });
-
-  // query DRAM & L2 cache
-  m.def("memory_clock_rate", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA)
-      return cuGetInfo<CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE>(device);
-    return -1;
-  });
-  m.def("global_memory_bus_width", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA)
-      return cuGetInfo<CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH>(device);
-    return -1;
-  });
-  m.def("l2_cache_size", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA)
-      return cuGetInfo<CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE>(device);
-    return -1;
-  });
-
-  // query clock rate (in kilohertz)
-  m.def("clock_rate", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA)
-      return cuGetInfo<CU_DEVICE_ATTRIBUTE_CLOCK_RATE>(device);
-    return -1;
-  });
-
-  m.def("num_sm", [](backend_t backend, uint64_t device) {
-    if (backend == CUDA)
-      return cuGetInfo<CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT>(device);
-    return -1;
-  });
-
-  // enqueue
-  m.def("enqueue",
-        [](backend_t backend, uint64_t stream, uint64_t kernel, uint64_t grid_0,
-           uint64_t grid_1, uint64_t grid_2, uint64_t block_0, uint64_t block_1,
-           uint64_t block_2, const std::string &args, int64_t shared_mem) {
-          void *args_ptr = (void *)args.data();
-          size_t args_size = args.size();
-          // release the gil in case the enqueue blocks
-          // cuda will block if too many ops are enqueued
-          py::gil_scoped_release allow_threads;
-          if (backend == HOST)
-            host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0,
-                         block_1, block_2, args_ptr, args_size, shared_mem);
-          if (backend == CUDA)
-            cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1,
-                       block_2, args_ptr, args_size, shared_mem);
-        });
 }
 
-/*****************************************************************************/
-/* Python bindings for triton::codegen                                       */
-/*****************************************************************************/
-typedef std::map<std::string, py::object> asm_map_t;
-
 /*****************************************************************************/
 /* Python bindings for triton::ir                                            */
 /*****************************************************************************/
@@ -783,6 +299,38 @@ void init_triton_ir(py::module &&m) {
              return self.lookupSymbol<mlir::FuncOp>(funcName);
            });
 
+  m.def(
+      "parse_mlir_module",
+      [](const std::string &inputFilename, mlir::MLIRContext &context) {
+        // open file
+        std::string errorMessage;
+        auto input = mlir::openInputFile(inputFilename, &errorMessage);
+        if (!input)
+          throw std::runtime_error(errorMessage);
+
+        // initialize registry
+        mlir::DialectRegistry registry;
+        registry.insert<mlir::triton::TritonDialect,
+                        mlir::triton::gpu::TritonGPUDialect,
+                        mlir::math::MathDialect, mlir::arith::ArithmeticDialect,
+                        mlir::StandardOpsDialect, mlir::scf::SCFDialect>();
+
+        context.appendDialectRegistry(registry);
+        context.loadAllAvailableDialects();
+        context.allowUnregisteredDialects();
+
+        // parse module
+        llvm::SourceMgr sourceMgr;
+        sourceMgr.AddNewSourceBuffer(std::move(input), llvm::SMLoc());
+        mlir::OwningOpRef<mlir::ModuleOp> module(
+            mlir::parseSourceFile(sourceMgr, &context));
+        if (!module)
+          throw std::runtime_error("Parse MLIR file failed.");
+
+        return module->clone();
+      },
+      ret::take_ownership);
+
   py::class_<mlir::FuncOp, mlir::OpState>(m, "function")
       // .def_property_readonly("attrs", &ir::function::attrs)
       // .def("add_attr", &ir::function::add_attr);
@@ -1643,84 +1191,86 @@ void init_triton_ir(py::module &&m) {
 }
 
 void init_triton_translation(py::module &m) {
-  m.def("translate_triton_gpu_to_llvmir", [](mlir::ModuleOp op) -> std::string {
-    llvm::LLVMContext llvmContext;
-    auto llvmModule =
-        ::mlir::triton::translateTritonGPUToLLVMIR(&llvmContext, op);
 
-    std::string str;
-    llvm::raw_string_ostream os(str);
-    llvmModule->print(os, nullptr);
-    os.flush();
-    return str;
+  using ret = py::return_value_policy;
+
+  m.def("get_shared_memory_size", [](mlir::ModuleOp module) {
+    auto pass = std::make_unique<mlir::Allocation>(module);
+    return pass->getSharedMemorySize();
   });
 
-  m.def("translate_triton_gpu_to_ptx",
-        [](mlir::ModuleOp module, uint64_t device)
-            -> std::tuple<std::string /*ptx code*/, size_t /*shem size*/> {
-          auto [ptxCode, cc, version, ptxasPath] =
-              triton::translateTritonGPUToPTX(module, device);
+  m.def(
+      "translate_triton_gpu_to_llvmir",
+      [](mlir::ModuleOp op) {
+        llvm::LLVMContext llvmContext;
+        auto llvmModule =
+            ::mlir::triton::translateTritonGPUToLLVMIR(&llvmContext, op);
 
-          mlir::PassManager pm(module->getContext());
-          auto pass = std::make_unique<mlir::Allocation>(module);
-          size_t size = pass->getSharedMemorySize();
+        std::string str;
+        llvm::raw_string_ostream os(str);
+        llvmModule->print(os, nullptr);
+        os.flush();
+        return str;
+      },
+      ret::take_ownership);
 
-          return std::make_tuple(ptxCode, size);
-        });
+  m.def(
+      "translate_llvmir_to_ptx",
+      [](const std::string llvmIR, int capability, int version) -> std::string {
+        // create LLVM module from C++
+        llvm::LLVMContext context;
+        std::unique_ptr<llvm::MemoryBuffer> buffer =
+            llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str());
+        llvm::SMDiagnostic error;
+        std::unique_ptr<llvm::Module> module =
+            llvm::parseIR(buffer->getMemBufferRef(), error, context);
+        // translate module to PTX
+        auto ptxCode =
+            triton::translateLLVMIRToPTX(*module, capability, version);
+        return ptxCode;
+      },
+      ret::take_ownership);
 
   m.def("compile_ptx_to_cubin",
-        [](const std::string &ptxCode, uint64_t device) -> py::object {
+        [](const std::string &ptxCode, const std::string &ptxasPath,
+           int capability) -> py::object {
           py::gil_scoped_release allow_threads;
-          int version;
-          int cc;
-          std::string ptxasPath;
-          triton::getCuCCAndVersionFromDevice(device, &cc, &version,
-                                              &ptxasPath);
 
-          std::string cubin = drv::ptx_to_cubin(ptxCode, ptxasPath, cc);
+          // compile ptx with ptxas
+          char _fsrc[L_tmpnam];
+          char _flog[L_tmpnam];
+          std::tmpnam(_fsrc);
+          std::tmpnam(_flog);
+          std::string fsrc = _fsrc;
+          std::string flog = _flog;
+          std::string fbin = fsrc + ".o";
+          const char *_fbin = fbin.c_str();
+          std::ofstream ofs(fsrc);
+          ofs << ptxCode << std::endl;
+          ofs.close();
+          std::string cmd;
+          int err;
+          cmd = ptxasPath + " -v --gpu-name=sm_" + std::to_string(capability) +
+                " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
+          err = system(cmd.c_str());
+          if (err != 0) {
+            std::ifstream _log(_flog);
+            std::string log(std::istreambuf_iterator<char>(_log), {});
+            unlink(_fsrc);
+            unlink(_flog);
+            throw std::runtime_error("Internal Triton PTX codegen error: \n" +
+                                     log);
+          }
+          std::ifstream _cubin(_fbin, std::ios::binary);
+          std::string cubin(std::istreambuf_iterator<char>(_cubin), {});
+          _cubin.close();
+          unlink(_fsrc);
+          unlink(_flog);
+          unlink(_fbin);
+
           py::bytes bytes(cubin);
           return bytes;
         });
-
-  m.def(
-      "load_binary",
-      [](const std::string &name, const std::string &data,
-         size_t n_shared_bytes, uint64_t device) {
-        py::gil_scoped_release allow_threads;
-        // create driver handles
-        CUfunction fun;
-        CUmodule mod;
-        drv::dispatch::cuModuleLoadData(&mod, data.c_str());
-        drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str());
-        // get allocated registers and spilled registers from the function
-        int n_regs = 0;
-        int n_spills = 0;
-        drv::dispatch::cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS,
-                                          fun);
-        drv::dispatch::cuFuncGetAttribute(
-            &n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
-        n_spills /= 4;
-        // set dynamic shared memory if necessary
-        int shared_optin;
-        drv::dispatch::cuDeviceGetAttribute(
-            &shared_optin,
-            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device);
-        if (n_shared_bytes > 49152 && shared_optin > 49152) {
-          drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED);
-          int shared_total, shared_static;
-          drv::dispatch::cuDeviceGetAttribute(
-              &shared_total,
-              CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device);
-          drv::dispatch::cuFuncGetAttribute(
-              &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
-          drv::dispatch::cuFuncSetAttribute(
-              fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-              shared_optin - shared_static);
-        }
-        return std::make_tuple((uint64_t)mod, (uint64_t)fun, (uint64_t)n_regs,
-                               (uint64_t)n_spills);
-      },
-      py::return_value_policy::take_ownership);
 }
 
 void init_triton(py::module &m) {
diff --git a/python/triton/compiler.py b/python/triton/compiler.py
index a97252f75..e7d0b1318 100644
--- a/python/triton/compiler.py
+++ b/python/triton/compiler.py
@@ -7,6 +7,7 @@ import hashlib
 import io
 import json
 import os
+import re
 import shutil
 import subprocess
 import sys
@@ -843,7 +844,11 @@ def optimize_tritongpu_ir(mod, num_stages):
     return mod
 
 
-def make_ptx(mod: Any, device: int) -> Tuple[str, int]:
+def make_llvm_ir(mod):
+    return _triton.translate_triton_gpu_to_llvmir(mod)
+
+
+def make_ptx(mod: Any, compute_capability: int, ptx_version: int) -> Tuple[str, int]:
     '''
     Translate TritonGPU module to PTX code.
     :param mod: a TritonGPU dialect module
@@ -851,17 +856,17 @@ def make_ptx(mod: Any, device: int) -> Tuple[str, int]:
         - PTX code
         - shared memory alloaction size
     '''
-    return _triton.translate_triton_gpu_to_ptx(mod, device)
+    return _triton.translate_llvmir_to_ptx(mod, compute_capability, ptx_version)
 
 
-def make_cubin(ptx, device):
+def make_cubin(ptx: str, ptxas: str, compute_capability: int):
     '''
     Compile TritonGPU module to cubin.
     :param ptx: ptx code
     :param device: CUDA device
     :return: str
     '''
-    return _triton.compile_ptx_to_cubin(ptx, device)
+    return _triton.compile_ptx_to_cubin(ptx, ptxas, compute_capability)
 
 
 def ptx_get_kernel_name(ptx: str) -> str:
@@ -877,6 +882,46 @@ def ptx_get_kernel_name(ptx: str) -> str:
             return line.split()[-1]
 
 
+@functools.lru_cache
+def ptx_get_version(cuda_version) -> int:
+    '''
+    Get the highest PTX version supported by the current CUDA driver.
+    '''
+    assert isinstance(cuda_version, str)
+    major, minor = map(int, cuda_version.split('.'))
+    version = major * 1000 + minor * 10
+    if version >= 11040:
+        return 74
+    if version >= 11030:
+        return 73
+    if version >= 11020:
+        return 72
+    if version >= 11010:
+        return 71
+    if version >= 11000:
+        return 70
+    if version >= 10020:
+        return 65
+    if version >= 10010:
+        return 64
+    if version >= 10000:
+        return 63
+    raise RuntimeError("Triton only support CUDA 10.0 or higher")
+
+
+def path_to_ptxas():
+    prefixes = [os.environ.get("TRITON_PTXAS_PATH", ""), "", "/usr/local/cuda/"]
+    for prefix in prefixes:
+        ptxas = os.path.join(prefix, "bin", "ptxas")
+        if os.path.exists(ptxas):
+            result = subprocess.check_output([ptxas, "--version"], stderr=subprocess.STDOUT)
+            if result is not None:
+                version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE)
+                if version is not None:
+                    return ptxas, version.group(1)
+    raise RuntimeError("Cannot find ptxas")
+
+
 instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"], defaults=[set(), set()])
 
 
@@ -895,17 +940,24 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(), specializat
     # tritongpu-ir
     module = make_tritongpu_ir(module, num_warps)
     module = optimize_tritongpu_ir(module, num_stages)
-
     if output == "ttgir":
         return module.str()
 
+    # llvm-ir
+    llvm_ir = make_llvm_ir(module)
+
     assert device >= 0, "device should be provided."
-    ptx, shem_size = make_ptx(module, device)
+    ptxas, cuda_version = path_to_ptxas()
+    compute_capability = torch.cuda.get_device_capability(device)
+    compute_capability = compute_capability[0] * 10 + compute_capability[1]
+    ptx_version = ptx_get_version(cuda_version)
+    ptx = make_ptx(llvm_ir, compute_capability, ptx_version)
+    shem_size = _triton.get_shared_memory_size(module)
     kernel_name = ptx_get_kernel_name(ptx)
     if output == "ptx":
         return ptx, shem_size, kernel_name
 
-    cubin = make_cubin(ptx, device)
+    cubin = make_cubin(ptx, ptxas, compute_capability)
     if output == "cubin":
         return cubin, ptx, shem_size, kernel_name
 
@@ -980,6 +1032,7 @@ def generate_launcher(identifier, constants, signature):
     src = f"""
 #include \"cuda.h\"
 #include <Python.h>
+
 static inline void gpuAssert(CUresult code, const char *file, int line)
 {{
    if (code != CUDA_SUCCESS)
@@ -993,13 +1046,16 @@ static inline void gpuAssert(CUresult code, const char *file, int line)
       PyErr_SetString(PyExc_RuntimeError, err);
    }}
 }}
+
 #define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
+
 void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{
   void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }};
   if(gridX*gridY*gridZ > 0){{
     CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
   }}
 }}
+
 static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{
   if (PyLong_Check(obj)) {{
     return (CUdeviceptr)PyLong_AsUnsignedLongLong(obj);
@@ -1021,6 +1077,7 @@ static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{
   PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
   return (CUdeviceptr)0;
 }}
+
 static PyObject* launch(PyObject* self, PyObject* args) {{
   int gridX, gridY, gridZ;
   uint64_t _stream;
@@ -1039,10 +1096,12 @@ static PyObject* launch(PyObject* self, PyObject* args) {{
   Py_INCREF(Py_None);
   return Py_None;
 }}
+
 static PyMethodDef ModuleMethods[] = {{
   {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
   {{NULL, NULL, 0, NULL}} // sentinel
 }};
+
 static struct PyModuleDef ModuleDef = {{
   PyModuleDef_HEAD_INIT,
   \"launcher\",
@@ -1050,6 +1109,7 @@ static struct PyModuleDef ModuleDef = {{
   -1, //size
   ModuleMethods
 }};
+
 PyMODINIT_FUNC PyInit_launcher(void) {{
   PyObject *m = PyModule_Create(&ModuleDef);
   if(m == NULL) {{
@@ -1251,7 +1311,10 @@ class CompiledKernel:
             self.asm["ptx"] = f.read()
 
         device = torch.cuda.current_device()
-        mod, func, n_regs, n_spills = _triton.load_binary(metadata["name"], self.asm["cubin"], self.shared, device)
+        global cuda_utils
+        if cuda_utils is None:
+            cuda_utils = CudaUtils()
+        mod, func, n_regs, n_spills = cuda_utils.load_binary(metadata["name"], self.asm["cubin"], self.shared, device)
         self.cu_module = mod
         self.cu_function = func
 
@@ -1261,3 +1324,118 @@ class CompiledKernel:
                 stream = torch.cuda.current_stream().cuda_stream
             self.c_wrapper(grid[0], grid[1], grid[2], self.num_warps, self.shared, stream, self.cu_function, *args)
         return
+
+
+class CudaUtils(object):
+
+    def __new__(cls):
+        if not hasattr(cls, 'instance'):
+            cls.instance = super(CudaUtils, cls).__new__(cls)
+        return cls.instance
+
+    def _generate_src(self):
+        return """
+        #include <cuda.h>
+
+        #include \"cuda.h\"
+        #include <Python.h>
+
+        static inline void gpuAssert(CUresult code, const char *file, int line)
+        {
+           if (code != CUDA_SUCCESS)
+           {
+              const char* prefix = "Triton Error [CUDA]: ";
+              const char* str;
+              cuGetErrorString(code, &str);
+              char err[1024] = {0};
+              strcat(err, prefix);
+              strcat(err, str);
+              PyErr_SetString(PyExc_RuntimeError, err);
+           }
+        }
+
+        #define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+
+        static PyObject* loadBinary(PyObject* self, PyObject* args) {
+            const char* name;
+            const char* data;
+            Py_ssize_t data_size;
+            int shared;
+            int device;
+            if(!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, &device)) {
+                return NULL;
+            }
+            CUfunction fun;
+            CUmodule mod;
+            int32_t n_regs = 0;
+            int32_t n_spills = 0;
+            Py_BEGIN_ALLOW_THREADS;
+            // create driver handles
+            CUDA_CHECK(cuModuleLoadData(&mod, data));
+            CUDA_CHECK(cuModuleGetFunction(&fun, mod, name));
+            // get allocated registers and spilled registers from the function
+            CUDA_CHECK(cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
+            CUDA_CHECK(cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
+            n_spills /= 4;
+            // set dynamic shared memory if necessary
+            int shared_optin;
+            CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device));
+            if (shared > 49152 && shared_optin > 49152) {
+              CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
+              int shared_total, shared_static;
+              CUDA_CHECK(cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device));
+              CUDA_CHECK(cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
+              CUDA_CHECK(cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static));
+            }
+            Py_END_ALLOW_THREADS;
+
+            if(PyErr_Occurred()) {
+              return NULL;
+            }
+            return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills);
+        }
+
+        static PyMethodDef ModuleMethods[] = {
+          {"load_binary", loadBinary, METH_VARARGS, "Load provided cubin into CUDA driver"},
+          {NULL, NULL, 0, NULL} // sentinel
+        };
+
+        static struct PyModuleDef ModuleDef = {
+          PyModuleDef_HEAD_INIT,
+          \"cuda_utils\",
+          NULL, //documentation
+          -1, //size
+          ModuleMethods
+        };
+
+        PyMODINIT_FUNC PyInit_cuda_utils(void) {
+          PyObject *m = PyModule_Create(&ModuleDef);
+          if(m == NULL) {
+            return NULL;
+          }
+          PyModule_AddFunctions(m, ModuleMethods);
+          return m;
+        }
+        """
+
+    def __init__(self):
+        src = self._generate_src()
+        key = hashlib.md5(src.encode("utf-8")).hexdigest()
+        cache = CacheManager(key)
+        fname = "cuda_utils.so"
+        if not cache.has_file(fname):
+            with tempfile.TemporaryDirectory() as tmpdir:
+                src_path = os.path.join(tmpdir, "main.c")
+                with open(src_path, "w") as f:
+                    f.write(src)
+                so = _build("cuda_utils", src_path, tmpdir)
+                with open(so, "rb") as f:
+                    cache.put(f.read(), fname, binary=True)
+        import importlib.util
+        spec = importlib.util.spec_from_file_location("cuda_utils", cache._make_path(fname))
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        self.load_binary = mod.load_binary
+
+
+cuda_utils = None
diff --git a/python/triton/tools/aot.py b/python/triton/tools/aot.py
new file mode 100644
index 000000000..c1b6010df
--- /dev/null
+++ b/python/triton/tools/aot.py
@@ -0,0 +1,61 @@
+import argparse
+
+import triton
+import triton._C.libtriton.triton as libtriton
+
+if __name__ == '__main__':
+
+    # valid source and target formats
+    VALID_FORMATS = ['llvm-ir', 'ptx', 'triton-ir', 'triton-gpu-ir']
+
+    # set up the argument parser
+    # TODO: conditional requirements
+    parser = argparse.ArgumentParser()
+    parser.add_argument('src', help="Source file to compile")
+    parser.add_argument('--target', required=True,
+                        help="Target format, one of: " + ', '.join(VALID_FORMATS))
+    parser.add_argument('--sm', type=int, help="Compute capability to compile for")
+    parser.add_argument('--ptx-version', type=int, help="PTX version to compile for")
+
+    # parse the args
+    args = parser.parse_args()
+
+    # TODO: clean-up and re-use triton.compiler primitive functions
+    # check for validity of format arguments
+    if args.target not in VALID_FORMATS:
+        print("Invalid target format: " + args.target)
+        exit(0)
+
+    # parse source file to MLIR module
+    context = libtriton.ir.context()
+    module = libtriton.ir.parse_mlir_module(args.src, context)
+    module.context = context
+
+    # optimizer triton-ir
+    module = triton.compiler.optimize_triton_ir(module)
+    if args.target == 'triton-ir':
+        print(module.str())
+        exit(0)
+
+    # triton-ir -> triton-gpu-ir
+    module = triton.compiler.make_tritongpu_ir(module, num_warps=4)
+    module = triton.compiler.optimize_tritongpu_ir(module, num_stages=3)
+    if args.target == 'triton-gpu-ir':
+        print(module.str())
+        exit(0)
+
+    # triton-gpu-ir -> llvm-ir
+    module = triton.compiler.make_llvm_ir(module)
+    if args.target == 'llvm-ir':
+        print(module)
+        exit(0)
+
+    if not args.sm:
+        raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation")
+    if not args.ptx_version:
+        raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation")
+
+    # llvm-ir -> ptx
+    module = triton.compiler.make_ptx(module, compute_capability=args.sm, ptx_version=args.ptx_version)
+    assert args.target == 'ptx'
+    print(module)
diff --git a/test/Target/tritongpu_to_llvmir.mlir b/test/Target/tritongpu_to_llvmir.mlir
index 0f03323e9..7e203b1f9 100644
--- a/test/Target/tritongpu_to_llvmir.mlir
+++ b/test/Target/tritongpu_to_llvmir.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-translate %s --target=llvmir | FileCheck %s
+// RUN: python3 -m triton.tools.aot %s --target=llvm-ir | FileCheck %s
 
 // == LLVM IR check begin ==
 // CHECK-LABEL: ; ModuleID = 'LLVMDialectModule'
diff --git a/test/Target/tritongpu_to_ptx.mlir b/test/Target/tritongpu_to_ptx.mlir
index 1fa6d85bc..c652e1b08 100644
--- a/test/Target/tritongpu_to_ptx.mlir
+++ b/test/Target/tritongpu_to_ptx.mlir
@@ -1,5 +1,4 @@
-// RUN: triton-translate %s --target=ptx --sm=80 --ptx-version=10000 | FileCheck %s
-
+// RUN: python3 -m triton.tools.aot %s --target=ptx --sm=80 --ptx-version=63 | FileCheck %s
 // CHECK-LABEL: // Generated by LLVM NVPTX Back-End
 // CHECK: .version 6.3
 // CHECK: .target sm_80

CUDA array type	Valid extents that must always be met {(width range in elements), (height range), (depth range)}	Valid extents with CUDA_ARRAY3D_SURFACE_LDST set {(width range in elements), (height range), (depth range)}
1D	{ (1,TEXTURE1D_WIDTH), 0, 0 }	{ (1,SURFACE1D_WIDTH), 0, 0 }
2D	{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }	{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
3D	{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }	{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
1D Layered	{ (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }	{ (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
2D Layered	{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }	{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
Cubemap	{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }	{ (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
Cubemap Layered	{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }	{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }