diff --git a/include/triton/external/CUDA/cuda.h b/include/triton/external/CUDA/cuda.h
index 24d96bd6c..f7bf9fc12 100755
--- a/include/triton/external/CUDA/cuda.h
+++ b/include/triton/external/CUDA/cuda.h
@@ -60,9 +60,6 @@ typedef uint32_t cuuint32_t;
 typedef uint64_t cuuint64_t;
 #endif
 
-/**
- * CUDA API versioning support
- */
 #if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
 #define __CUDA_DEPRECATED
 #elif defined(_MSC_VER)
@@ -74,14 +71,8 @@ typedef uint64_t cuuint64_t;
 #endif
 
 #if defined(CUDA_FORCE_API_VERSION)
-    #if (CUDA_FORCE_API_VERSION == 3010)
-        #define __CUDA_API_VERSION 3010
-    #else
-        #error "Unsupported value of CUDA_FORCE_API_VERSION"
-    #endif
-#else
-    #define __CUDA_API_VERSION 10000
-#endif /* CUDA_FORCE_API_VERSION */
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
 
 #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
     #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
@@ -92,74 +83,66 @@ typedef uint64_t cuuint64_t;
     #define __CUDA_API_PTSZ(api) api
 #endif
 
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
-    #define cuDeviceTotalMem                    cuDeviceTotalMem_v2
-    #define cuCtxCreate                         cuCtxCreate_v2
-    #define cuModuleGetGlobal                   cuModuleGetGlobal_v2
-    #define cuMemGetInfo                        cuMemGetInfo_v2
-    #define cuMemAlloc                          cuMemAlloc_v2
-    #define cuMemAllocPitch                     cuMemAllocPitch_v2
-    #define cuMemFree                           cuMemFree_v2
-    #define cuMemGetAddressRange                cuMemGetAddressRange_v2
-    #define cuMemAllocHost                      cuMemAllocHost_v2
-    #define cuMemHostGetDevicePointer           cuMemHostGetDevicePointer_v2
-    #define cuMemcpyHtoD                        __CUDA_API_PTDS(cuMemcpyHtoD_v2)
-    #define cuMemcpyDtoH                        __CUDA_API_PTDS(cuMemcpyDtoH_v2)
-    #define cuMemcpyDtoD                        __CUDA_API_PTDS(cuMemcpyDtoD_v2)
-    #define cuMemcpyDtoA                        __CUDA_API_PTDS(cuMemcpyDtoA_v2)
-    #define cuMemcpyAtoD                        __CUDA_API_PTDS(cuMemcpyAtoD_v2)
-    #define cuMemcpyHtoA                        __CUDA_API_PTDS(cuMemcpyHtoA_v2)
-    #define cuMemcpyAtoH                        __CUDA_API_PTDS(cuMemcpyAtoH_v2)
-    #define cuMemcpyAtoA                        __CUDA_API_PTDS(cuMemcpyAtoA_v2)
-    #define cuMemcpyHtoAAsync                   __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
-    #define cuMemcpyAtoHAsync                   __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
-    #define cuMemcpy2D                          __CUDA_API_PTDS(cuMemcpy2D_v2)
-    #define cuMemcpy2DUnaligned                 __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
-    #define cuMemcpy3D                          __CUDA_API_PTDS(cuMemcpy3D_v2)
-    #define cuMemcpyHtoDAsync                   __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
-    #define cuMemcpyDtoHAsync                   __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
-    #define cuMemcpyDtoDAsync                   __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
-    #define cuMemcpy2DAsync                     __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
-    #define cuMemcpy3DAsync                     __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
-    #define cuMemsetD8                          __CUDA_API_PTDS(cuMemsetD8_v2)
-    #define cuMemsetD16                         __CUDA_API_PTDS(cuMemsetD16_v2)
-    #define cuMemsetD32                         __CUDA_API_PTDS(cuMemsetD32_v2)
-    #define cuMemsetD2D8                        __CUDA_API_PTDS(cuMemsetD2D8_v2)
-    #define cuMemsetD2D16                       __CUDA_API_PTDS(cuMemsetD2D16_v2)
-    #define cuMemsetD2D32                       __CUDA_API_PTDS(cuMemsetD2D32_v2)
-    #define cuArrayCreate                       cuArrayCreate_v2
-    #define cuArrayGetDescriptor                cuArrayGetDescriptor_v2
-    #define cuArray3DCreate                     cuArray3DCreate_v2
-    #define cuArray3DGetDescriptor              cuArray3DGetDescriptor_v2
-    #define cuTexRefSetAddress                  cuTexRefSetAddress_v2
-    #define cuTexRefGetAddress                  cuTexRefGetAddress_v2
-    #define cuGraphicsResourceGetMappedPointer  cuGraphicsResourceGetMappedPointer_v2
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000
-    #define cuCtxDestroy                        cuCtxDestroy_v2
-    #define cuCtxPopCurrent                     cuCtxPopCurrent_v2
-    #define cuCtxPushCurrent                    cuCtxPushCurrent_v2
-    #define cuStreamDestroy                     cuStreamDestroy_v2
-    #define cuEventDestroy                      cuEventDestroy_v2
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010
-    #define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v3
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050
-    #define cuLinkCreate                        cuLinkCreate_v2
-    #define cuLinkAddData                       cuLinkAddData_v2
-    #define cuLinkAddFile                       cuLinkAddFile_v2
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050
-    #define cuMemHostRegister                   cuMemHostRegister_v2
-    #define cuGraphicsResourceSetMapFlags       cuGraphicsResourceSetMapFlags_v2
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */
-
-#if !defined(__CUDA_API_VERSION_INTERNAL)
-#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010
-    #define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v2
-#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */
-#endif /* __CUDA_API_VERSION_INTERNAL */
+#define cuDeviceTotalMem                    cuDeviceTotalMem_v2
+#define cuCtxCreate                         cuCtxCreate_v2
+#define cuCtxCreate_v3                      cuCtxCreate_v3
+#define cuModuleGetGlobal                   cuModuleGetGlobal_v2
+#define cuMemGetInfo                        cuMemGetInfo_v2
+#define cuMemAlloc                          cuMemAlloc_v2
+#define cuMemAllocPitch                     cuMemAllocPitch_v2
+#define cuMemFree                           cuMemFree_v2
+#define cuMemGetAddressRange                cuMemGetAddressRange_v2
+#define cuMemAllocHost                      cuMemAllocHost_v2
+#define cuMemHostGetDevicePointer           cuMemHostGetDevicePointer_v2
+#define cuMemcpyHtoD                        __CUDA_API_PTDS(cuMemcpyHtoD_v2)
+#define cuMemcpyDtoH                        __CUDA_API_PTDS(cuMemcpyDtoH_v2)
+#define cuMemcpyDtoD                        __CUDA_API_PTDS(cuMemcpyDtoD_v2)
+#define cuMemcpyDtoA                        __CUDA_API_PTDS(cuMemcpyDtoA_v2)
+#define cuMemcpyAtoD                        __CUDA_API_PTDS(cuMemcpyAtoD_v2)
+#define cuMemcpyHtoA                        __CUDA_API_PTDS(cuMemcpyHtoA_v2)
+#define cuMemcpyAtoH                        __CUDA_API_PTDS(cuMemcpyAtoH_v2)
+#define cuMemcpyAtoA                        __CUDA_API_PTDS(cuMemcpyAtoA_v2)
+#define cuMemcpyHtoAAsync                   __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
+#define cuMemcpyAtoHAsync                   __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
+#define cuMemcpy2D                          __CUDA_API_PTDS(cuMemcpy2D_v2)
+#define cuMemcpy2DUnaligned                 __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
+#define cuMemcpy3D                          __CUDA_API_PTDS(cuMemcpy3D_v2)
+#define cuMemcpyHtoDAsync                   __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
+#define cuMemcpyDtoHAsync                   __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
+#define cuMemcpyDtoDAsync                   __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
+#define cuMemcpy2DAsync                     __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
+#define cuMemcpy3DAsync                     __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+#define cuMemsetD8                          __CUDA_API_PTDS(cuMemsetD8_v2)
+#define cuMemsetD16                         __CUDA_API_PTDS(cuMemsetD16_v2)
+#define cuMemsetD32                         __CUDA_API_PTDS(cuMemsetD32_v2)
+#define cuMemsetD2D8                        __CUDA_API_PTDS(cuMemsetD2D8_v2)
+#define cuMemsetD2D16                       __CUDA_API_PTDS(cuMemsetD2D16_v2)
+#define cuMemsetD2D32                       __CUDA_API_PTDS(cuMemsetD2D32_v2)
+#define cuArrayCreate                       cuArrayCreate_v2
+#define cuArrayGetDescriptor                cuArrayGetDescriptor_v2
+#define cuArray3DCreate                     cuArray3DCreate_v2
+#define cuArray3DGetDescriptor              cuArray3DGetDescriptor_v2
+#define cuTexRefSetAddress                  cuTexRefSetAddress_v2
+#define cuTexRefGetAddress                  cuTexRefGetAddress_v2
+#define cuGraphicsResourceGetMappedPointer  cuGraphicsResourceGetMappedPointer_v2
+#define cuCtxDestroy                        cuCtxDestroy_v2
+#define cuCtxPopCurrent                     cuCtxPopCurrent_v2
+#define cuCtxPushCurrent                    cuCtxPushCurrent_v2
+#define cuStreamDestroy                     cuStreamDestroy_v2
+#define cuEventDestroy                      cuEventDestroy_v2
+#define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v3
+#define cuLinkCreate                        cuLinkCreate_v2
+#define cuLinkAddData                       cuLinkAddData_v2
+#define cuLinkAddFile                       cuLinkAddFile_v2
+#define cuMemHostRegister                   cuMemHostRegister_v2
+#define cuGraphicsResourceSetMapFlags       cuGraphicsResourceSetMapFlags_v2
+#define cuStreamBeginCapture                __CUDA_API_PTSZ(cuStreamBeginCapture_v2)
+#define cuDevicePrimaryCtxRelease           cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxReset             cuDevicePrimaryCtxReset_v2
+#define cuDevicePrimaryCtxSetFlags          cuDevicePrimaryCtxSetFlags_v2
+#define cuDeviceGetUuid_v2                  cuDeviceGetUuid_v2
+#define cuIpcOpenMemHandle                  cuIpcOpenMemHandle_v2
+#define cuGraphInstantiate                  cuGraphInstantiate_v2
 
 #if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
     #define cuMemcpy                            __CUDA_API_PTDS(cuMemcpy)
@@ -181,14 +164,17 @@ typedef uint64_t cuuint64_t;
     #define cuStreamGetFlags                    __CUDA_API_PTSZ(cuStreamGetFlags)
     #define cuStreamGetCtx                      __CUDA_API_PTSZ(cuStreamGetCtx)
     #define cuStreamWaitEvent                   __CUDA_API_PTSZ(cuStreamWaitEvent)
-    #define cuStreamBeginCapture                __CUDA_API_PTSZ(cuStreamBeginCapture)
     #define cuStreamEndCapture                  __CUDA_API_PTSZ(cuStreamEndCapture)
     #define cuStreamIsCapturing                 __CUDA_API_PTSZ(cuStreamIsCapturing)
+    #define cuStreamGetCaptureInfo              __CUDA_API_PTSZ(cuStreamGetCaptureInfo)
+    #define cuStreamGetCaptureInfo_v2           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
+    #define cuStreamUpdateCaptureDependencies   __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies)
     #define cuStreamAddCallback                 __CUDA_API_PTSZ(cuStreamAddCallback)
     #define cuStreamAttachMemAsync              __CUDA_API_PTSZ(cuStreamAttachMemAsync)
     #define cuStreamQuery                       __CUDA_API_PTSZ(cuStreamQuery)
     #define cuStreamSynchronize                 __CUDA_API_PTSZ(cuStreamSynchronize)
     #define cuEventRecord                       __CUDA_API_PTSZ(cuEventRecord)
+    #define cuEventRecordWithFlags              __CUDA_API_PTSZ(cuEventRecordWithFlags)
     #define cuLaunchKernel                      __CUDA_API_PTSZ(cuLaunchKernel)
     #define cuLaunchHostFunc                    __CUDA_API_PTSZ(cuLaunchHostFunc)
     #define cuGraphicsMapResources              __CUDA_API_PTSZ(cuGraphicsMapResources)
@@ -205,7 +191,16 @@ typedef uint64_t cuuint64_t;
     #define cuSignalExternalSemaphoresAsync     __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
     #define cuWaitExternalSemaphoresAsync       __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)
 
+    #define cuGraphUpload                       __CUDA_API_PTSZ(cuGraphUpload)
     #define cuGraphLaunch                       __CUDA_API_PTSZ(cuGraphLaunch)
+    #define cuStreamCopyAttributes              __CUDA_API_PTSZ(cuStreamCopyAttributes)
+    #define cuStreamGetAttribute                __CUDA_API_PTSZ(cuStreamGetAttribute)
+    #define cuStreamSetAttribute                __CUDA_API_PTSZ(cuStreamSetAttribute)
+    #define cuMemMapArrayAsync                  __CUDA_API_PTSZ(cuMemMapArrayAsync)
+
+    #define cuMemFreeAsync                      __CUDA_API_PTSZ(cuMemFreeAsync)
+    #define cuMemAllocAsync                     __CUDA_API_PTSZ(cuMemAllocAsync)
+    #define cuMemAllocFromPoolAsync             __CUDA_API_PTSZ(cuMemAllocFromPoolAsync)
 #endif
 
 /**
@@ -229,7 +224,7 @@ typedef uint64_t cuuint64_t;
 /**
  * CUDA API version number
  */
-#define CUDA_VERSION 10000
+#define CUDA_VERSION 11050
 
 #ifdef __cplusplus
 extern "C" {
@@ -239,34 +234,36 @@ extern "C" {
  * CUDA device pointer
  * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
  */
-#if __CUDA_API_VERSION >= 3020
-
 #if defined(_WIN64) || defined(__LP64__)
-typedef unsigned long long CUdeviceptr;
+typedef unsigned long long CUdeviceptr_v2;
 #else
-typedef unsigned int CUdeviceptr;
+typedef unsigned int CUdeviceptr_v2;
 #endif
+typedef CUdeviceptr_v2 CUdeviceptr;                          /**< CUDA device pointer */
 
-#endif /* __CUDA_API_VERSION >= 3020 */
-
-typedef int CUdevice;                                     /**< CUDA device */
-typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
-typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
-typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
-typedef struct CUarray_st *CUarray;                       /**< CUDA array */
-typedef struct CUmipmappedArray_st *CUmipmappedArray;     /**< CUDA mipmapped array */
-typedef struct CUtexref_st *CUtexref;                     /**< CUDA texture reference */
-typedef struct CUsurfref_st *CUsurfref;                   /**< CUDA surface reference */
-typedef struct CUevent_st *CUevent;                       /**< CUDA event */
-typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
-typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
-typedef unsigned long long CUtexObject;                   /**< An opaque value that represents a CUDA texture object */
-typedef unsigned long long CUsurfObject;                  /**< An opaque value that represents a CUDA surface object */
-typedef struct CUextMemory_st *CUexternalMemory;          /**< CUDA external memory */
-typedef struct CUextSemaphore_st *CUexternalSemaphore;    /**< CUDA external semaphore */
-typedef struct CUgraph_st *CUgraph;                       /**< CUDA graph */
-typedef struct CUgraphNode_st *CUgraphNode;               /**< CUDA graph node */
-typedef struct CUgraphExec_st *CUgraphExec;               /**< CUDA executable graph */
+typedef int CUdevice_v1;                                     /**< CUDA device */
+typedef CUdevice_v1 CUdevice;                                /**< CUDA device */
+typedef struct CUctx_st *CUcontext;                          /**< CUDA context */
+typedef struct CUmod_st *CUmodule;                           /**< CUDA module */
+typedef struct CUfunc_st *CUfunction;                        /**< CUDA function */
+typedef struct CUarray_st *CUarray;                          /**< CUDA array */
+typedef struct CUmipmappedArray_st *CUmipmappedArray;        /**< CUDA mipmapped array */
+typedef struct CUtexref_st *CUtexref;                        /**< CUDA texture reference */
+typedef struct CUsurfref_st *CUsurfref;                      /**< CUDA surface reference */
+typedef struct CUevent_st *CUevent;                          /**< CUDA event */
+typedef struct CUstream_st *CUstream;                        /**< CUDA stream */
+typedef struct CUgraphicsResource_st *CUgraphicsResource;    /**< CUDA graphics interop resource */
+typedef unsigned long long CUtexObject_v1;                   /**< An opaque value that represents a CUDA texture object */
+typedef CUtexObject_v1 CUtexObject;                          /**< An opaque value that represents a CUDA texture object */
+typedef unsigned long long CUsurfObject_v1;                  /**< An opaque value that represents a CUDA surface object */
+typedef CUsurfObject_v1 CUsurfObject;                        /**< An opaque value that represents a CUDA surface object */ 
+typedef struct CUextMemory_st *CUexternalMemory;             /**< CUDA external memory */
+typedef struct CUextSemaphore_st *CUexternalSemaphore;       /**< CUDA external semaphore */
+typedef struct CUgraph_st *CUgraph;                          /**< CUDA graph */
+typedef struct CUgraphNode_st *CUgraphNode;                  /**< CUDA graph node */
+typedef struct CUgraphExec_st *CUgraphExec;                  /**< CUDA executable graph */
+typedef struct CUmemPoolHandle_st *CUmemoryPool;             /**< CUDA memory pool */
+typedef struct CUuserObject_st *CUuserObject;                /**< CUDA user object for graphs */
 
 #ifndef CU_UUID_HAS_BEEN_DEFINED
 #define CU_UUID_HAS_BEEN_DEFINED
@@ -275,8 +272,6 @@ typedef struct CUuuid_st {                                /**< CUDA definition o
 } CUuuid;
 #endif
 
-#if __CUDA_API_VERSION >= 4010
-
 /**
  * CUDA IPC handle size
  */
@@ -287,14 +282,16 @@ typedef struct CUuuid_st {                                /**< CUDA definition o
  */
 typedef struct CUipcEventHandle_st {
     char reserved[CU_IPC_HANDLE_SIZE];
-} CUipcEventHandle;
+} CUipcEventHandle_v1;
+typedef CUipcEventHandle_v1 CUipcEventHandle;
 
 /**
  * CUDA IPC mem handle
  */
 typedef struct CUipcMemHandle_st {
     char reserved[CU_IPC_HANDLE_SIZE];
-} CUipcMemHandle;
+} CUipcMemHandle_v1;
+typedef CUipcMemHandle_v1 CUipcMemHandle;
 
 /**
  * CUDA Ipc Mem Flags
@@ -303,7 +300,6 @@ typedef enum CUipcMem_flags_enum {
     CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
 } CUipcMem_flags;
 
-#endif
 
 /**
  * CUDA Mem Attach Flags
@@ -326,7 +322,9 @@ typedef enum CUctx_flags_enum {
                                          *  \deprecated This flag was deprecated as of CUDA 4.0
                                          *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
     CU_CTX_SCHED_MASK          = 0x07,
-    CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
+    CU_CTX_MAP_HOST            = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 
+                                         *  and it no longer has any effect. All contexts 
+                                         *  as of CUDA 3.2 behave as though the flag is enabled. */
     CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
     CU_CTX_FLAGS_MASK          = 0x1f
 } CUctx_flags;
@@ -335,8 +333,8 @@ typedef enum CUctx_flags_enum {
  * Stream creation flags
  */
 typedef enum CUstream_flags_enum {
-    CU_STREAM_DEFAULT      = 0x0, /**< Default stream flag */
-    CU_STREAM_NON_BLOCKING = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+    CU_STREAM_DEFAULT             = 0x0, /**< Default stream flag */
+    CU_STREAM_NON_BLOCKING        = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
 } CUstream_flags;
 
 /**
@@ -369,7 +367,26 @@ typedef enum CUevent_flags_enum {
     CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
 } CUevent_flags;
 
-#if __CUDA_API_VERSION >= 8000
+/**
+ * Event record flags
+ */
+typedef enum CUevent_record_flags_enum {
+    CU_EVENT_RECORD_DEFAULT  = 0x0, /**< Default event record flag */
+    CU_EVENT_RECORD_EXTERNAL = 0x1  /**< When using stream capture, create an event record node
+                                      *  instead of the default behavior.  This flag is invalid
+                                      *  when used outside of capture. */
+} CUevent_record_flags;
+
+/**
+ * Event wait flags
+ */
+typedef enum CUevent_wait_flags_enum {
+    CU_EVENT_WAIT_DEFAULT  = 0x0, /**< Default event wait flag */
+    CU_EVENT_WAIT_EXTERNAL = 0x1  /**< When using stream capture, create an event wait node
+                                    *  instead of the default behavior.  This flag is invalid
+                                    *  when used outside of capture.*/
+} CUevent_wait_flags;
+
 /**
  * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64
  */
@@ -448,8 +465,8 @@ typedef union CUstreamBatchMemOpParams_union {
         unsigned int flags;
     } flushRemoteWrites;
     cuuint64_t pad[6];
-} CUstreamBatchMemOpParams;
-#endif /* __CUDA_API_VERSION >= 8000 */
+} CUstreamBatchMemOpParams_v1;
+typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams;
 
 /**
  * Occupancy calculator flag
@@ -459,6 +476,14 @@ typedef enum CUoccupancy_flags_enum {
     CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1  /**< Assume global caching is enabled and cannot be automatically turned off */
 } CUoccupancy_flags;
 
+/**
+ * Flags for ::cuStreamUpdateCaptureDependencies
+ */
+typedef enum CUstreamUpdateCaptureDependencies_flags_enum {
+    CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */
+    CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1  /**< Replace the dependency set with the new nodes */
+} CUstreamUpdateCaptureDependencies_flags;
+
 /**
  * Array formats
  */
@@ -470,7 +495,34 @@ typedef enum CUarray_format_enum {
     CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
     CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
     CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
-    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
+    CU_AD_FORMAT_FLOAT          = 0x20, /**< 32-bit floating point */
+    CU_AD_FORMAT_NV12           = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_UNORM_INT8X1   = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X2   = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X4   = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X1  = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X2  = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X4  = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X1   = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X2   = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X4   = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X1  = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X2  = 0xca, /**< 2 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X4  = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_BC1_UNORM      = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC2_UNORM      = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC3_UNORM      = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC4_UNORM      = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC4_SNORM      = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC5_UNORM      = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC5_SNORM      = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC6H_UF16      = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC6H_SF16      = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC7_UNORM      = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e  /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
 } CUarray_format;
 
 /**
@@ -495,112 +547,131 @@ typedef enum CUfilter_mode_enum {
  * Device properties
  */
 typedef enum CUdevice_attribute_enum {
-    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
-    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
-    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
-    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
-    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
-    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
-    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
-    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
-    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
-    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
-    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
-    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
-    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Typical clock frequency in kilohertz */
-    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
-    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
-    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
-    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
-    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
-    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,   /**< Maximum 2D layered texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,  /**< Maximum 2D layered texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,  /**< Maximum layers in a 2D layered texture */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
-    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
-    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
-    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
-    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
-    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
-    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                        /**< Device is using TCC driver model */
-    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
-    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
-    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
-    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
-    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
-    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device shares a unified address space with the host */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,  /**< Maximum layers in a 1D layered texture */
-    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                  /**< Deprecated, do not use. */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,    /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,   /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */
-    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                     /**< PCI domain ID of the device */
-    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,           /**< Pitch alignment requirement for textures */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,      /**< Maximum cubemap texture width/height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,  /**< Maximum cubemap layered texture width/height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,           /**< Maximum 1D surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,           /**< Maximum 2D surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,          /**< Maximum 2D surface height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,           /**< Maximum 3D surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,          /**< Maximum 3D surface height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,           /**< Maximum 3D surface depth */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,   /**< Maximum 1D layered surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,  /**< Maximum layers in a 1D layered surface */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,   /**< Maximum 2D layered surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,  /**< Maximum 2D layered surface height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,  /**< Maximum layers in a 2D layered surface */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,      /**< Maximum cubemap surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,  /**< Maximum cubemap layered surface width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,    /**< Maximum 1D linear texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,    /**< Maximum 2D linear texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,   /**< Maximum 2D linear texture height */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,    /**< Maximum 2D linear texture pitch in bytes */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,          /**< Major compute capability version number */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,          /**< Minor compute capability version number */
-    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */
-    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,       /**< Device supports stream priorities */
-    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,         /**< Device supports caching globals in L1 */
-    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,          /**< Device supports caching locals in L1 */
-    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,  /**< Maximum shared memory available per multiprocessor in bytes */
-    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,  /**< Maximum number of 32-bit registers available per multiprocessor */
-    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                    /**< Device can allocate managed memory on this system */
-    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                    /**< Device is on a multi-GPU board */
-    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,           /**< Unique id for a group of devices on the same multi-GPU board */
-    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,       /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
-    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,  /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
-    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,            /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
-    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,         /**< Device can coherently access managed memory concurrently with the CPU */
-    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,      /**< Device supports compute preemption. */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,            /**< ::cuStreamBatchMemOp and related APIs are supported. */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,     /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
-    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,     /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */
-    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
-    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,   /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */
-    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */
-    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,           /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
-    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,           /**< Device supports host memory registration via ::cudaHostRegister. */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,                          /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                                /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                                /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                                /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                                 /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                                 /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                                 /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,                    /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,                        /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,                          /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                                     /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                                     /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,                       /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,                           /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                                    /**< Typical clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                             /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                                   /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,                          /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,                           /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                                    /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,                           /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                                  /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,                       /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,                       /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,                      /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,                       /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,                      /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,                       /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,               /**< Maximum 2D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,              /**< Maximum 2D layered texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,              /**< Maximum layers in a 2D layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,                 /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,                /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,             /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                             /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                            /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                                   /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                                    /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                                 /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                                    /**< Device is using TCC driver model */
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                             /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,                       /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                                 /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,                /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                            /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                            /**< Device shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,               /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,              /**< Maximum layers in a 1D layered texture */
+    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                              /**< Deprecated, do not use. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,                /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,               /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,             /**< Alternate maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,            /**< Alternate maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,             /**< Alternate maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                                 /**< PCI domain ID of the device */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,                       /**< Pitch alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,                  /**< Maximum cubemap texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,          /**< Maximum cubemap layered texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,         /**< Maximum layers in a cubemap layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,                       /**< Maximum 1D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,                       /**< Maximum 2D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,                      /**< Maximum 2D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,                       /**< Maximum 3D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,                      /**< Maximum 3D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,                       /**< Maximum 3D surface depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,               /**< Maximum 1D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,              /**< Maximum layers in a 1D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,               /**< Maximum 2D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,              /**< Maximum 2D layered surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,              /**< Maximum layers in a 2D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,                  /**< Maximum cubemap surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,          /**< Maximum cubemap layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,         /**< Maximum layers in a cubemap layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,                /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,                /**< Maximum 2D linear texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,               /**< Maximum 2D linear texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,                /**< Maximum 2D linear texture pitch in bytes */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,             /**< Maximum mipmapped 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,            /**< Maximum mipmapped 2D texture height */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,                      /**< Major compute capability version number */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,                      /**< Minor compute capability version number */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,             /**< Maximum mipmapped 1D texture width */
+    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,                   /**< Device supports stream priorities */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,                     /**< Device supports caching globals in L1 */
+    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,                      /**< Device supports caching locals in L1 */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,          /**< Maximum shared memory available per multiprocessor in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,              /**< Maximum number of 32-bit registers available per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                                /**< Device can allocate managed memory on this system */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                               /**< Device is on a multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,                      /**< Unique id for a group of devices on the same multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,                  /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
+    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,         /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,                        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,                     /**< Device can coherently access managed memory concurrently with the CPU */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,                  /**< Device supports compute preemption. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,       /**< Device can access host registered memory at the same virtual address as the CPU */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,                        /**< ::cuStreamBatchMemOp and related APIs are supported. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,                 /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,                 /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                            /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,               /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,             /**< Maximum optin shared memory per block */
+    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,                       /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
+    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,                       /**< Device supports host memory registration via ::cudaHostRegister. */
     CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
-    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */
+    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,          /**< The host can directly access managed memory on the device without migration. */
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,         /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,         /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,  /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,           /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,       /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,                /**< Maximum number of blocks per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,                /**< Device supports compression of memory */
+    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,                 /**< Maximum L2 persisting lines capacity setting in bytes. */
+    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,                /**< Maximum value of CUaccessPolicyWindow::num_bytes. */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,      /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,             /**< Shared memory reserved by CUDA driver per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,                  /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,            /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
+    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,         /**< External timeline semaphore interop is supported on the device */
+    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,                       /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,                    /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,         /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,              /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,               /**< Handle types supported with mempool based IPC */
     CU_DEVICE_ATTRIBUTE_MAX
 } CUdevice_attribute;
 
@@ -618,21 +689,30 @@ typedef struct CUdevprop_st {
     int regsPerBlock;           /**< 32-bit registers available per block */
     int clockRate;              /**< Clock frequency in kilohertz */
     int textureAlign;           /**< Alignment requirement for textures */
-} CUdevprop;
+} CUdevprop_v1;
+typedef CUdevprop_v1 CUdevprop;
 
 /**
  * Pointer information
  */
 typedef enum CUpointer_attribute_enum {
-    CU_POINTER_ATTRIBUTE_CONTEXT = 1,        /**< The ::CUcontext on which a pointer was allocated or registered */
-    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,    /**< The ::CUmemorytype describing the physical location of a pointer */
-    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */
-    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,   /**< The address at which a pointer's memory may be accessed on the host */
-    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,     /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */
-    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,    /**< Synchronize every synchronous memory operation initiated on this region */
-    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,      /**< A process-wide unique ID for an allocated memory region*/
-    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,     /**< Indicates if the pointer points to managed memory */
-    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9  /**< A device ordinal of a device on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_CONTEXT = 1,                     /**< The ::CUcontext on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,                 /**< The ::CUmemorytype describing the physical location of a pointer */
+    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,              /**< The address at which a pointer's memory may be accessed on the device */
+    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,                /**< The address at which a pointer's memory may be accessed on the host */
+    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,                  /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */
+    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,                 /**< Synchronize every synchronous memory operation initiated on this region */
+    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,                   /**< A process-wide unique ID for an allocated memory region*/
+    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,                  /**< Indicates if the pointer points to managed memory */
+    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9,              /**< A device ordinal of a device on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/
+    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11,           /**< Starting address for this requested pointer */
+    CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12,                 /**< Size of the address range for this requested pointer */
+    CU_POINTER_ATTRIBUTE_MAPPED = 13,                     /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/
+    CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14,       /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/
+    CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16,               /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */
+    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17              /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/
 } CUpointer_attribute;
 
 /**
@@ -697,13 +777,16 @@ typedef enum CUfunction_attribute_enum {
      * The maximum size in bytes of dynamically-allocated shared memory that can be used by
      * this function. If the user-specified dynamic shared memory size is larger than this
      * value, the launch will fail.
+     * See ::cuFuncSetAttribute
      */
     CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
 
     /**
-     * On devices where the L1 cache and shared memory use the same hardware resources,
-     * this sets the shared memory carveout preference, in percent of the total resources.
+     * On devices where the L1 cache and shared memory use the same hardware resources, 
+     * this sets the shared memory carveout preference, in percent of the total shared memory.
+     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
      * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+     * See ::cuFuncSetAttribute
      */
     CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
 
@@ -730,12 +813,12 @@ typedef enum CUsharedconfig_enum {
 } CUsharedconfig;
 
 /**
- * Shared memory carveout configurations
+ * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute
  */
 typedef enum CUshared_carveout_enum {
-    CU_SHAREDMEM_CARVEOUT_DEFAULT       = -1,  /** < no preference for shared memory or L1 (default) */
-    CU_SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, /** < prefer maximum available shared memory, minimum L1 cache */
-    CU_SHAREDMEM_CARVEOUT_MAX_L1        = 0    /** < prefer maximum available L1 cache, minimum shared memory */
+    CU_SHAREDMEM_CARVEOUT_DEFAULT       = -1,  /**< No preference for shared memory or L1 (default) */
+    CU_SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, /**< Prefer maximum available shared memory, minimum L1 cache */
+    CU_SHAREDMEM_CARVEOUT_MAX_L1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory */
 } CUshared_carveout;
 
 /**
@@ -947,6 +1030,51 @@ typedef enum CUjit_option_enum
      */
     CU_JIT_GLOBAL_SYMBOL_COUNT,
 
+    /**
+     * Enable link-time optimization (-dlto) for device code (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_LTO,
+
+    /**
+     * Control single-precision denormals (-ftz) support (0: false, default).
+     * 1 : flushes denormal values to zero
+     * 0 : preserves denormal values
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_FTZ,
+
+    /**
+     * Control single-precision floating-point division and reciprocals
+     * (-prec-div) support (1: true, default).
+     * 1 : Enables the IEEE round-to-nearest mode
+     * 0 : Enables the fast approximation mode
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_PREC_DIV,
+
+    /**
+     * Control single-precision floating-point square root
+     * (-prec-sqrt) support (1: true, default).
+     * 1 : Enables the IEEE round-to-nearest mode
+     * 0 : Enables the fast approximation mode
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_PREC_SQRT,
+
+    /**
+     * Enable/Disable the contraction of floating-point multiplies
+     * and adds/subtracts into floating-point multiply-add (-fma)
+     * operations (1: Enable, default; 0: Disable).
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_FMA,
+
     CU_JIT_NUM_OPTIONS
 
 } CUjit_option;
@@ -969,8 +1097,10 @@ typedef enum CUjit_target_enum
     CU_TARGET_COMPUTE_61 = 61,       /**< Compute device class 6.1.*/
     CU_TARGET_COMPUTE_62 = 62,       /**< Compute device class 6.2.*/
     CU_TARGET_COMPUTE_70 = 70,       /**< Compute device class 7.0.*/
-
-    CU_TARGET_COMPUTE_75 = 75        /**< Compute device class 7.5.*/
+    CU_TARGET_COMPUTE_72 = 72,       /**< Compute device class 7.2.*/
+    CU_TARGET_COMPUTE_75 = 75,       /**< Compute device class 7.5.*/
+    CU_TARGET_COMPUTE_80 = 80,       /**< Compute device class 8.0.*/
+    CU_TARGET_COMPUTE_86 = 86        /**< Compute device class 8.6.*/
 } CUjit_target;
 
 /**
@@ -1029,12 +1159,16 @@ typedef enum CUjitInputType_enum
      */
     CU_JIT_INPUT_LIBRARY,
 
+    /**
+     * High-level intermediate code for link-time optimization\n
+     * Applicable options: NVVM compiler options, PTX compiler options
+     */
+    CU_JIT_INPUT_NVVM,
+
     CU_JIT_NUM_INPUT_TYPES
 } CUjitInputType;
 
-#if __CUDA_API_VERSION >= 5050
 typedef struct CUlinkState_st *CUlinkState;
-#endif /* __CUDA_API_VERSION >= 5050 */
 
 /**
  * Flags to register a graphics resource
@@ -1078,6 +1212,7 @@ typedef enum CUlimit_enum {
     CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03, /**< GPU device runtime launch synchronize depth */
     CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
     CU_LIMIT_MAX_L2_FETCH_GRANULARITY         = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
+    CU_LIMIT_PERSISTING_L2_CACHE_SIZE         = 0x06, /**< A size in bytes for L2 persisting lines cache size */
     CU_LIMIT_MAX
 } CUlimit;
 
@@ -1097,14 +1232,42 @@ typedef enum CUresourcetype_enum {
 #define CUDA_CB
 #endif
 
-#if __CUDA_API_VERSION >= 10000
-
 /**
  * CUDA host function
  * \param userData Argument value passed to the function
  */
 typedef void (CUDA_CB *CUhostFn)(void *userData);
 
+/**
+ * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members.
+ */
+typedef enum CUaccessProperty_enum {
+    CU_ACCESS_PROPERTY_NORMAL           = 0,    /**< Normal cache persistence. */
+    CU_ACCESS_PROPERTY_STREAMING        = 1,    /**< Streaming access is less likely to persit from cache. */
+    CU_ACCESS_PROPERTY_PERSISTING       = 2     /**< Persisting access is more likely to persist in cache.*/
+} CUaccessProperty;
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE.
+ * Partition into many segments and assign segments such that:
+ * sum of "hit segments" / window == approx. ratio.
+ * sum of "miss segments" / window == approx 1-ratio.
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+typedef struct CUaccessPolicyWindow_st {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
+    CUaccessProperty hitProp;           /**< ::CUaccessProperty set for hit. */
+    CUaccessProperty missProp;          /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */
+} CUaccessPolicyWindow_v1;
+typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow;
+
 /**
  * GPU kernel node parameters
  */
@@ -1119,7 +1282,8 @@ typedef struct CUDA_KERNEL_NODE_PARAMS_st {
     unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
     void **kernelParams;         /**< Array of pointers to kernel parameters */
     void **extra;                /**< Extra options */
-} CUDA_KERNEL_NODE_PARAMS;
+} CUDA_KERNEL_NODE_PARAMS_v1;
+typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS;
 
 /**
  * Memset node parameters
@@ -1129,9 +1293,10 @@ typedef struct CUDA_MEMSET_NODE_PARAMS_st {
     size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
     unsigned int value;                     /**< Value to be set */
     unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
-    size_t width;                           /**< Width in bytes, of the row */
+    size_t width;                           /**< Width of the row in elements */
     size_t height;                          /**< Number of rows */
-} CUDA_MEMSET_NODE_PARAMS;
+} CUDA_MEMSET_NODE_PARAMS_v1;
+typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS;
 
 /**
  * Host node parameters
@@ -1139,21 +1304,51 @@ typedef struct CUDA_MEMSET_NODE_PARAMS_st {
 typedef struct CUDA_HOST_NODE_PARAMS_st {
     CUhostFn fn;    /**< The function to call when the node executes */
     void* userData; /**< Argument to pass to the function */
-} CUDA_HOST_NODE_PARAMS;
+} CUDA_HOST_NODE_PARAMS_v1;
+typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS;
 
 /**
  * Graph node types
  */
 typedef enum CUgraphNodeType_enum {
-    CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */
-    CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */
-    CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */
-    CU_GRAPH_NODE_TYPE_HOST   = 3, /**< Host (executable) node */
-    CU_GRAPH_NODE_TYPE_GRAPH  = 4, /**< Node which executes an embedded graph */
-    CU_GRAPH_NODE_TYPE_EMPTY  = 5, /**< Empty (no-op) node */
-    CU_GRAPH_NODE_TYPE_COUNT
+    CU_GRAPH_NODE_TYPE_KERNEL           = 0, /**< GPU kernel node */
+    CU_GRAPH_NODE_TYPE_MEMCPY           = 1, /**< Memcpy node */
+    CU_GRAPH_NODE_TYPE_MEMSET           = 2, /**< Memset node */
+    CU_GRAPH_NODE_TYPE_HOST             = 3, /**< Host (executable) node */
+    CU_GRAPH_NODE_TYPE_GRAPH            = 4, /**< Node which executes an embedded graph */
+    CU_GRAPH_NODE_TYPE_EMPTY            = 5, /**< Empty (no-op) node */
+    CU_GRAPH_NODE_TYPE_WAIT_EVENT       = 6, /**< External event wait node */
+    CU_GRAPH_NODE_TYPE_EVENT_RECORD     = 7, /**< External event record node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT   = 9, /**< External semaphore wait node */
+    CU_GRAPH_NODE_TYPE_MEM_ALLOC        = 10,/**< Memory Allocation Node */
+    CU_GRAPH_NODE_TYPE_MEM_FREE         = 11 /**< Memory Free Node */
 } CUgraphNodeType;
 
+typedef enum CUsynchronizationPolicy_enum {
+    CU_SYNC_POLICY_AUTO = 1,
+    CU_SYNC_POLICY_SPIN = 2,
+    CU_SYNC_POLICY_YIELD = 3,
+    CU_SYNC_POLICY_BLOCKING_SYNC = 4
+} CUsynchronizationPolicy;
+
+/**
+ * Graph kernel node Attributes
+ */
+typedef enum CUkernelNodeAttrID_enum {
+    CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW       = 1,    /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */
+    CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE                = 2     /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). */
+} CUkernelNodeAttrID;
+
+/**
+ * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute
+ */
+typedef union CUkernelNodeAttrValue_union {
+    CUaccessPolicyWindow accessPolicyWindow;    /**< Attribute ::CUaccessPolicyWindow. */
+    int cooperative;                            /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */
+} CUkernelNodeAttrValue_v1;
+typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
+
 /**
  * Possible stream capture statuses returned by ::cuStreamIsCapturing
  */
@@ -1164,7 +1359,68 @@ typedef enum CUstreamCaptureStatus_enum {
                                                    has been invalidated, but not terminated */
 } CUstreamCaptureStatus;
 
-#endif /* __CUDA_API_VERSION >= 10000 */
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode
+ */
+typedef enum CUstreamCaptureMode_enum {
+    CU_STREAM_CAPTURE_MODE_GLOBAL       = 0,
+    CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
+    CU_STREAM_CAPTURE_MODE_RELAXED      = 2
+} CUstreamCaptureMode;
+
+/**
+ * Stream Attributes 
+ */
+typedef enum CUstreamAttrID_enum {
+    CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW    = 1,   /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */
+    CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY  = 3    /**< ::CUsynchronizationPolicy for work queued up in this stream */
+} CUstreamAttrID;
+
+/**
+ * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute
+ */
+typedef union CUstreamAttrValue_union {
+    CUaccessPolicyWindow accessPolicyWindow;   /**< Attribute ::CUaccessPolicyWindow. */
+    CUsynchronizationPolicy syncPolicy;        /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */
+} CUstreamAttrValue_v1;
+typedef CUstreamAttrValue_v1 CUstreamAttrValue;
+
+/**
+ * Flags to specify search options. For more details see ::cuGetProcAddress
+ */
+typedef enum CUdriverProcAddress_flags_enum {
+    CU_GET_PROC_ADDRESS_DEFAULT = 0,                        /**< Default search mode for driver symbols. */
+    CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0,             /**< Search for legacy versions of driver symbols. */
+    CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1  /**< Search for per-thread versions of driver symbols. */ 
+} CUdriverProcAddress_flags;
+
+/**
+ * Execution Affinity Types 
+ */
+typedef enum CUexecAffinityType_enum {
+    CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0,  /**< Create a context with limited SMs. */
+    CU_EXEC_AFFINITY_TYPE_MAX
+} CUexecAffinityType;
+
+/**
+ * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ */
+typedef struct CUexecAffinitySmCount_st {
+    unsigned int val;    /**< The number of SMs the context is limited to use. */
+} CUexecAffinitySmCount_v1;
+typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount;
+
+/**
+ * Execution Affinity Parameters 
+ */
+typedef struct CUexecAffinityParam_st {
+    CUexecAffinityType type;
+    union {
+        CUexecAffinitySmCount smCount;    /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */
+    } param;
+} CUexecAffinityParam_v1;
+typedef CUexecAffinityParam_v1 CUexecAffinityParam;
 
 /**
  * Error codes
@@ -1229,6 +1485,13 @@ typedef enum cudaError_enum {
      */
     CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
 
+    /**
+     * This indicates that the CUDA driver that the application has loaded is a
+     * stub library. Applications that run with the stub rather than a real
+     * driver loaded will result in CUDA API returning this error.
+     */
+    CUDA_ERROR_STUB_LIBRARY                   = 34,
+
     /**
      * This indicates that no CUDA-capable devices were detected by the installed
      * CUDA driver.
@@ -1237,10 +1500,15 @@ typedef enum cudaError_enum {
 
     /**
      * This indicates that the device ordinal supplied by the user does not
-     * correspond to a valid CUDA device.
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
      */
     CUDA_ERROR_INVALID_DEVICE                 = 101,
 
+    /**
+     * This error indicates that the Grid license is not applied.
+     */
+    CUDA_ERROR_DEVICE_NOT_LICENSED            = 102,
 
     /**
      * This indicates that the device kernel image is invalid. This can also
@@ -1365,7 +1633,25 @@ typedef enum cudaError_enum {
     CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,
 
     /**
-     * This indicates that the device kernel source is invalid.
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     */
+
+    CUDA_ERROR_UNSUPPORTED_PTX_VERSION        = 222,
+
+    /**
+     * This indicates that the PTX JIT compilation was disabled.
+     */
+    CUDA_ERROR_JIT_COMPILATION_DISABLED       = 223,
+
+    /**
+     * This indicates that the ::CUexecAffinityType passed to the API call is not
+     * supported by the active device.
+     */ 
+    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY      = 224,
+
+    /**
+     * This indicates that the device kernel source is invalid. This includes
+     * compilation/linker errors encountered in device code or user error.
      */
     CUDA_ERROR_INVALID_SOURCE                 = 300,
 
@@ -1403,7 +1689,8 @@ typedef enum cudaError_enum {
 
     /**
      * This indicates that a named symbol was not found. Examples of symbols
-     * are global/constant variable names, texture names, and surface names.
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
      */
     CUDA_ERROR_NOT_FOUND                      = 500,
 
@@ -1553,7 +1840,8 @@ typedef enum cudaError_enum {
     /**
      * An exception occurred on the device while executing a kernel. Common
      * causes include dereferencing an invalid device pointer and accessing
-     * out of bounds shared memory.
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
      * This leaves the process in an inconsistent state and any further CUDA work
      * will return the same error. To continue using CUDA, the process must be terminated
      * and relaunched.
@@ -1584,9 +1872,53 @@ typedef enum cudaError_enum {
      * This error indicates that the system is not yet ready to start any CUDA
      * work.  To continue using CUDA, verify the system configuration is in a
      * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
      */
     CUDA_ERROR_SYSTEM_NOT_READY               = 802,
 
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    CUDA_ERROR_MPS_CONNECTION_FAILED          = 805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    CUDA_ERROR_MPS_RPC_FAILURE                = 806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    CUDA_ERROR_MPS_SERVER_NOT_READY           = 807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED        = 808,
+
+    /**
+     * This error indicates the the hardware resources required to support device connections have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED    = 809,
+
     /**
      * This error indicates that the operation is not permitted when
      * the stream is capturing.
@@ -1635,6 +1967,34 @@ typedef enum cudaError_enum {
      */
     CUDA_ERROR_CAPTURED_EVENT                 = 907,
 
+    /**
+     * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
+     * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
+     * different thread.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,
+
+    /**
+     * This error indicates that the timeout specified for the wait operation has lapsed.
+     */
+    CUDA_ERROR_TIMEOUT                        = 909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE      = 910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    CUDA_ERROR_EXTERNAL_DEVICE               = 911,
+
     /**
      * This indicates that an unknown internal error has occurred.
      */
@@ -1648,7 +2008,7 @@ typedef enum CUdevice_P2PAttribute_enum {
     CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK                     = 0x01,  /**< A relative value indicating the performance of the link between two devices */
     CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED                     = 0x02,  /**< P2P Access is enable */
     CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED              = 0x03,  /**< Atomic operation over the link supported */
-    CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED        = 0x04,  /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED              = 0x04,  /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
     CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED          = 0x04   /**< Accessing CUDA arrays over the link supported */
 } CUdevice_P2PAttribute;
 
@@ -1708,15 +2068,25 @@ typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
  * On Windows the flag is a no-op.
  * On Linux that memory is marked as non cache-coherent for the GPU and
  * is expected to be physically contiguous. It may return
- * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
- * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
- * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED
+ * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
+ * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
+ * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
  * is returned.
  * Flag for ::cuMemHostRegister()
  */
 #define CU_MEMHOSTREGISTER_IOMEMORY     0x04
 
-#if __CUDA_API_VERSION >= 3020
+/**
+* If set, the passed memory pointer is treated as pointing to memory that is
+* considered read-only by the device.  On platforms without
+* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+* required in order to register memory mapped to the CPU as read-only.  Support
+* for the use of this flag can be queried from the device attribute
+* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+* a current context associated with a device that does not have this attribute
+* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
+*/
+#define CU_MEMHOSTREGISTER_READ_ONLY    0x08
 
 /**
  * 2D memory copy parameters
@@ -1742,7 +2112,8 @@ typedef struct CUDA_MEMCPY2D_st {
 
     size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
     size_t Height;              /**< Height of 2D memory copy */
-} CUDA_MEMCPY2D;
+} CUDA_MEMCPY2D_v2;
+typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D;
 
 /**
  * 3D memory copy parameters
@@ -1775,7 +2146,8 @@ typedef struct CUDA_MEMCPY3D_st {
     size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
     size_t Height;              /**< Height of 3D memory copy */
     size_t Depth;               /**< Depth of 3D memory copy */
-} CUDA_MEMCPY3D;
+} CUDA_MEMCPY3D_v2;
+typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D;
 
 /**
  * 3D memory cross-context copy parameters
@@ -1808,7 +2180,8 @@ typedef struct CUDA_MEMCPY3D_PEER_st {
     size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
     size_t Height;              /**< Height of 3D memory copy */
     size_t Depth;               /**< Depth of 3D memory copy */
-} CUDA_MEMCPY3D_PEER;
+} CUDA_MEMCPY3D_PEER_v1;
+typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER;
 
 /**
  * Array descriptor
@@ -1820,7 +2193,8 @@ typedef struct CUDA_ARRAY_DESCRIPTOR_st
 
     CUarray_format Format;    /**< Array format */
     unsigned int NumChannels; /**< Channels per array element */
-} CUDA_ARRAY_DESCRIPTOR;
+} CUDA_ARRAY_DESCRIPTOR_v2;
+typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR;
 
 /**
  * 3D array descriptor
@@ -1834,11 +2208,39 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
     CUarray_format Format;    /**< Array format */
     unsigned int NumChannels; /**< Channels per array element */
     unsigned int Flags;       /**< Flags */
-} CUDA_ARRAY3D_DESCRIPTOR;
+} CUDA_ARRAY3D_DESCRIPTOR_v2;
+typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR;
 
-#endif /* __CUDA_API_VERSION >= 3020 */
+/**
+ * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
+ */
+#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1
 
-#if __CUDA_API_VERSION >= 5000
+/**
+ * CUDA array sparse properties
+ */
+typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st {
+    struct {
+        unsigned int width;     /**< Width of sparse tile in elements */
+        unsigned int height;    /**< Height of sparse tile in elements */
+        unsigned int depth;     /**< Depth of sparse tile in elements */
+    } tileExtent;
+
+    /**
+     * First mip level at which the mip tail begins.
+     */
+    unsigned int miptailFirstLevel;
+    /**
+     * Total size of the mip tail.
+     */
+    unsigned long long miptailSize;
+    /**
+     * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+     */
+    unsigned int flags;
+    unsigned int reserved[4];
+} CUDA_ARRAY_SPARSE_PROPERTIES_v1;
+typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES;
 
 /**
  * CUDA Resource descriptor
@@ -1874,7 +2276,8 @@ typedef struct CUDA_RESOURCE_DESC_st
     } res;
 
     unsigned int flags;                       /**< Flags (must be zero) */
-} CUDA_RESOURCE_DESC;
+} CUDA_RESOURCE_DESC_v1;
+typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC;
 
 /**
  * Texture descriptor
@@ -1890,7 +2293,8 @@ typedef struct CUDA_TEXTURE_DESC_st {
     float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
     float borderColor[4];           /**< Border Color */
     int reserved[12];
-} CUDA_TEXTURE_DESC;
+} CUDA_TEXTURE_DESC_v1;
+typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC;
 
 /**
  * Resource view format
@@ -1948,7 +2352,8 @@ typedef struct CUDA_RESOURCE_VIEW_DESC_st
     unsigned int firstLayer;       /**< First layer index */
     unsigned int lastLayer;        /**< Last layer index */
     unsigned int reserved[16];
-} CUDA_RESOURCE_VIEW_DESC;
+} CUDA_RESOURCE_VIEW_DESC_v1;
+typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC;
 
 /**
  * GPU Direct v3 tokens
@@ -1956,11 +2361,18 @@ typedef struct CUDA_RESOURCE_VIEW_DESC_st
 typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
     unsigned long long p2pToken;
     unsigned int vaSpaceToken;
-} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
+} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1;
+typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
 
-#endif /* __CUDA_API_VERSION >= 5000 */
-
-#if __CUDA_API_VERSION >= 9000
+/**
+* Access flags that specify the level of access the current context's device has
+* on the memory referenced.
+*/
+typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum {
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE      = 0x0,   /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ      = 0x1,   /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3    /**< Read-write access, the device has full read-write access to the memory */
+} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS;
 
 /**
  * Kernel launch parameters
@@ -1976,11 +2388,8 @@ typedef struct CUDA_LAUNCH_PARAMS_st {
     unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
     CUstream hStream;            /**< Stream identifier */
     void **kernelParams;         /**< Array of pointers to kernel parameters */
-} CUDA_LAUNCH_PARAMS;
-
-#endif /* __CUDA_API_VERSION >= 9000 */
-
-#if __CUDA_API_VERSION >= 10000
+} CUDA_LAUNCH_PARAMS_v1;
+typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS;
 
 /**
  * External memory handle types
@@ -1989,23 +2398,35 @@ typedef enum CUexternalMemoryHandleType_enum {
     /**
      * Handle is an opaque file descriptor
      */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD        = 1,
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
     /**
      * Handle is an opaque shared NT handle
      */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32     = 2,
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
     /**
      * Handle is an opaque, globally shared handle
      */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3,
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
     /**
      * Handle is a D3D12 heap object
      */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP       = 4,
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
     /**
      * Handle is a D3D12 committed resource
      */
-    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE   = 5
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+    /**
+     * Handle is a shared NT handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+    /**
+     * Handle is a globally shared handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+    /**
+     * Handle is an NvSciBuf object
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
 } CUexternalMemoryHandleType;
 
 /**
@@ -2013,6 +2434,37 @@ typedef enum CUexternalMemoryHandleType_enum {
  */
 #define CUDA_EXTERNAL_MEMORY_DEDICATED   0x1
 
+/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01
+
+/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+ * contains this flag, it indicates that waiting on an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that application needs signaler specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that application needs waiter specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_WAIT 0x2
 /**
  * External memory handle descriptor
  */
@@ -2035,9 +2487,12 @@ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
          * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
          * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP
          * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
          * Exactly one of 'handle' and 'name' must be non-NULL. If
-         * type is
-         * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * type is one of the following:
+         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
          * then 'name' must be NULL.
          */
         struct {
@@ -2051,6 +2506,11 @@ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
              */
             const void *name;
         } win32;
+        /**
+         * A handle representing an NvSciBuf Object. Valid when type
+         * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF
+         */
+        const void *nvSciBufObject;
     } handle;
     /**
      * Size of the memory allocation
@@ -2061,7 +2521,8 @@ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
      */
     unsigned int flags;
     unsigned int reserved[16];
-} CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
 
 /**
  * External memory buffer descriptor
@@ -2080,7 +2541,8 @@ typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
      */
     unsigned int flags;
     unsigned int reserved[16];
-} CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
 
 /**
  * External memory mipmap descriptor
@@ -2100,7 +2562,8 @@ typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
      */
     unsigned int numLevels;
     unsigned int reserved[16];
-} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
 
 /**
  * External semaphore handle types
@@ -2109,19 +2572,43 @@ typedef enum CUexternalSemaphoreHandleType_enum {
     /**
      * Handle is an opaque file descriptor
      */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD        = 1,
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD             = 1,
     /**
      * Handle is an opaque shared NT handle
      */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32     = 2,
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32          = 2,
     /**
      * Handle is an opaque, globally shared handle
      */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3,
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT      = 3,
     /**
      * Handle is a shared NT handle referencing a D3D12 fence object
      */
-    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE     = 4
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE           = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE           = 5,
+    /**
+     * Opaque handle to NvSciSync Object
+	 */
+	CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC             = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX     = 7,
+    /**
+     * Handle is a globally shared handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8,
+    /**
+     * Handle is an opaque file descriptor referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9,
+    /**
+     * Handle is an opaque shared NT handle referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
 } CUexternalSemaphoreHandleType;
 
 /**
@@ -2135,8 +2622,9 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
     union {
         /**
          * File descriptor referencing the semaphore object. Valid
-         * when type is
-         * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD
+         * when type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD
          */
         int fd;
         /**
@@ -2145,9 +2633,13 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
          * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32
          * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
          * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
          * Exactly one of 'handle' and 'name' must be non-NULL. If
-         * type is
-         * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
          * then 'name' must be NULL.
          */
         struct {
@@ -2161,13 +2653,18 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
              */
             const void *name;
         } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non NULL
+         */
+        const void* nvSciSyncObj;
     } handle;
     /**
      * Flags reserved for the future. Must be zero.
      */
     unsigned int flags;
     unsigned int reserved[16];
-} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
 
 /**
  * External semaphore signal parameters
@@ -2183,14 +2680,39 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
              */
             unsigned long long value;
         } fence;
-        unsigned int reserved[16];
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
+             * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
     } params;
     /**
-     * Flags reserved for the future. Must be zero.
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to
+     * signal a ::CUexternalSemaphore of type
+     * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
+     * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates
+     * that while signaling the ::CUexternalSemaphore, no memory synchronization
+     * operations should be performed for any external memory object imported
+     * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
      */
     unsigned int flags;
     unsigned int reserved[16];
-} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
+} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
 
 /**
  * External semaphore wait parameters
@@ -2206,17 +2728,400 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
              */
             unsigned long long value;
         } fence;
-        unsigned int reserved[16];
+        /**
+         * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType
+         * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+         */
+        union {
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
     } params;
     /**
-     * Flags reserved for the future. Must be zero.
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
+     * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+     * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+     * which indicates that while waiting for the ::CUexternalSemaphore, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
      */
     unsigned int flags;
     unsigned int reserved[16];
-} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
+} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
 
+/**
+ * Semaphore signal node parameters
+ */
+typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS;
 
-#endif /* __CUDA_API_VERSION >= 10000 */
+/**
+ * Semaphore wait node parameters
+ */
+typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS;
+
+typedef unsigned long long CUmemGenericAllocationHandle_v1;
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
+
+/**
+ * Flags for specifying particular handle types
+ */
+typedef enum CUmemAllocationHandleType_enum {
+    CU_MEM_HANDLE_TYPE_NONE                  = 0x0,  /**< Does not allow any export mechanism. > */
+    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    CU_MEM_HANDLE_TYPE_WIN32                 = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    CU_MEM_HANDLE_TYPE_WIN32_KMT             = 0x4,  /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+    CU_MEM_HANDLE_TYPE_MAX                   = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+/**
+ * Specifies the memory protection flags for mapping.
+ */
+typedef enum CUmemAccess_flags_enum {
+    CU_MEM_ACCESS_FLAGS_PROT_NONE        = 0x0,  /**< Default, make the address range not accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READ        = 0x1,  /**< Make the address range read accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READWRITE   = 0x3,  /**< Make the address range read-write accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_MAX         = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+/**
+ * Specifies the type of location
+ */
+typedef enum CUmemLocationType_enum {
+    CU_MEM_LOCATION_TYPE_INVALID = 0x0,
+    CU_MEM_LOCATION_TYPE_DEVICE  = 0x1,  /**< Location is a device location, thus id is a device ordinal */
+    CU_MEM_LOCATION_TYPE_MAX     = 0x7FFFFFFF
+} CUmemLocationType;
+
+/**
+* Defines the allocation types available
+*/
+typedef enum CUmemAllocationType_enum {
+    CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+
+    /** This allocation type is 'pinned', i.e. cannot migrate from its current
+      * location while the application is actively using it
+      */
+    CU_MEM_ALLOCATION_TYPE_PINNED  = 0x1,
+    CU_MEM_ALLOCATION_TYPE_MAX     = 0x7FFFFFFF
+} CUmemAllocationType;
+
+/**
+* Flag for requesting different optimal and required granularities for an allocation.
+*/
+typedef enum CUmemAllocationGranularity_flags_enum {
+    CU_MEM_ALLOC_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity for allocation */
+    CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for allocation for best performance */
+} CUmemAllocationGranularity_flags;
+
+/**
+ * Sparse subresource types
+ */
+typedef enum CUarraySparseSubresourceType_enum {
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+} CUarraySparseSubresourceType;
+
+/**
+ * Memory operation types
+ */
+typedef enum CUmemOperationType_enum {
+    CU_MEM_OPERATION_TYPE_MAP = 1,
+    CU_MEM_OPERATION_TYPE_UNMAP = 2
+} CUmemOperationType;
+
+/**
+ * Memory handle types
+ */
+typedef enum CUmemHandleType_enum {
+    CU_MEM_HANDLE_TYPE_GENERIC = 0
+} CUmemHandleType;
+
+/**
+ * Specifies the CUDA array or CUDA mipmapped array memory mapping information
+ */
+typedef struct CUarrayMapInfo_st {    
+    CUresourcetype resourceType;                    /**< Resource type */
+
+    union {
+        CUmipmappedArray mipmap;
+        CUarray array;
+    } resource;
+
+    CUarraySparseSubresourceType subresourceType;   /**< Sparse subresource type */
+
+    union {
+        struct {
+            unsigned int level;                     /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */            
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned int offsetX;                   /**< Starting X offset in elements */
+            unsigned int offsetY;                   /**< Starting Y offset in elements */
+            unsigned int offsetZ;                   /**< Starting Z offset in elements */            
+            unsigned int extentWidth;               /**< Width in elements */
+            unsigned int extentHeight;              /**< Height in elements */
+            unsigned int extentDepth;               /**< Depth in elements */
+        } sparseLevel;
+        struct {
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned long long offset;              /**< Offset within mip tail */
+            unsigned long long size;                /**< Extent in bytes */
+        } miptail;
+    } subresource;
+    
+    CUmemOperationType memOperationType;            /**< Memory operation type */
+    CUmemHandleType memHandleType;                  /**< Memory handle type */
+
+    union {
+        CUmemGenericAllocationHandle memHandle;
+    } memHandle;
+    
+    unsigned long long offset;                      /**< Offset within the memory */
+    unsigned int deviceBitMask;                     /**< Device ordinal bit mask */
+    unsigned int flags;                             /**< flags for future use, must be zero now. */
+    unsigned int reserved[2];                       /**< Reserved for future use, must be zero now. */
+} CUarrayMapInfo_v1;
+typedef CUarrayMapInfo_v1 CUarrayMapInfo;
+
+/**
+ * Specifies a memory location.
+ */
+typedef struct CUmemLocation_st {
+    CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                 /**< identifier for a given this location's ::CUmemLocationType. */
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+/**
+ * Specifies compression attribute for an allocation.
+ */
+typedef enum CUmemAllocationCompType_enum {
+    CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */
+    CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating  compressible memory */
+} CUmemAllocationCompType;
+
+/**
+ * This flag if set indicates that the memory will be used as a tile pool.
+ */
+#define CU_MEM_CREATE_USAGE_TILE_POOL    0x1
+
+/**
+* Specifies the allocation properties for a allocation.
+*/
+typedef struct CUmemAllocationProp_st {
+    /** Allocation type */
+    CUmemAllocationType type;
+    /** requested ::CUmemAllocationHandleType */
+    CUmemAllocationHandleType requestedHandleTypes;
+    /** Location of allocation */
+    CUmemLocation location;
+    /**
+     * Windows-specific POBJECT_ATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This object atributes structure
+     * includes security attributes that define
+     * the scope of which exported allocations may be tranferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32HandleMetaData;
+    struct {
+         /**
+         * Allocation hint for requesting compressible memory.
+         * On devices that support Compute Data Compression, compressible
+         * memory can be used to accelerate accesses to data with unstructured
+         * sparsity and other compressible data patterns. Applications are 
+         * expected to query allocation property of the handle obtained with 
+         * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to 
+         * validate if the obtained allocation is compressible or not. Note that 
+         * compressed memory may not be mappable on all devices.
+         */
+         unsigned char compressionType;
+         unsigned char gpuDirectRDMACapable;
+         /** Bitmask indicating intended usage for this allocation */
+         unsigned short usage;
+         unsigned char reserved[4];
+    } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
+/**
+ * Memory access descriptor
+ */
+typedef struct CUmemAccessDesc_st {
+    CUmemLocation location;        /**< Location on which the request is to change it's accessibility */
+    CUmemAccess_flags flags;       /**< ::CUmemProt accessibility flags to set on the request */
+} CUmemAccessDesc_v1;
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
+
+typedef enum CUgraphExecUpdateResult_enum {
+    CU_GRAPH_EXEC_UPDATE_SUCCESS                     = 0x0, /**< The update succeeded */
+    CU_GRAPH_EXEC_UPDATE_ERROR                       = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED      = 0x2, /**< The update failed because the topology changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED     = 0x3, /**< The update failed because a node type changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED      = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED    = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED         = 0x6, /**< The update failed because something about the node is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7 /**< The update failed because the function of a kernel node changed in an unsupported way */
+} CUgraphExecUpdateResult;
+
+/**
+ * CUDA memory pool attributes
+ */
+typedef enum CUmemPool_attribute_enum {
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in another streams as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * Cuda events and null stream interactions can create the required
+     * stream ordered dependencies. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by cuFreeAsync (default enabled).
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_HIGH
+} CUmemPool_attribute;
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+typedef struct CUmemPoolProps_st {
+    CUmemAllocationType allocType;         /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */
+    CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    CUmemLocation location;                /**< Location where allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be tranferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32SecurityAttributes;
+    unsigned char reserved[64]; /**< reserved for future use, must be 0 */
+} CUmemPoolProps_v1;
+typedef CUmemPoolProps_v1 CUmemPoolProps;
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+typedef struct CUmemPoolPtrExportData_st {
+    unsigned char reserved[64];
+} CUmemPoolPtrExportData_v1;
+typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData;
+
+/**
+ * Memory allocation node parameters
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+    */
+    CUmemPoolProps poolProps;
+    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t accessDescCount; /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
+    size_t bytesize; /**< in: size in bytes of the requested allocation */
+    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
+} CUDA_MEM_ALLOC_NODE_PARAMS;
+
+typedef enum CUgraphMem_attribute_enum {
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  High watermark can only be reset to zero.
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
+} CUgraphMem_attribute;
 
 /**
  * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only
@@ -2276,6 +3181,12 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
  */
 #define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20
 
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * is a sparse CUDA array or CUDA mipmapped array respectively
+ */
+#define CUDA_ARRAY3D_SPARSE 0x40
+
 /**
  * Override the texref format with a format inferred from the array.
  * Flag for ::cuTexRefSetArray()
@@ -2285,22 +3196,28 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
 /**
  * Read the texture as integers rather than promoting the values to floats
  * in the range [0,1].
- * Flag for ::cuTexRefSetFlags()
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
  */
 #define CU_TRSF_READ_AS_INTEGER         0x01
 
 /**
  * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
- * Flag for ::cuTexRefSetFlags()
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
  */
 #define CU_TRSF_NORMALIZED_COORDINATES  0x02
 
 /**
  * Perform sRGB->linear conversion during texture read.
- * Flag for ::cuTexRefSetFlags()
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
  */
 #define CU_TRSF_SRGB  0x10
 
+ /**
+  * Disable any trilinear filtering optimizations.
+  * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+  */
+#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION  0x20
+
 /**
  * End of array terminator for the \p extra parameter to
  * ::cuLaunchKernel
@@ -2344,8 +3261,86 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
  */
 #define CU_DEVICE_INVALID           ((CUdevice)-2)
 
+/**
+ * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ */
+typedef enum CUflushGPUDirectRDMAWritesOptions_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST   = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */
+} CUflushGPUDirectRDMAWritesOptions;
+
+/**
+ * Platform native ordering for GPUDirect RDMA writes
+ */
+typedef enum CUGPUDirectRDMAWritesOrdering_enum {
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE        = 0,   /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER       = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200  /**< Any CUDA device in the system can consistently consume remote writes to this device. */
+} CUGPUDirectRDMAWritesOrdering;
+
+/**
+ * The scopes for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesScope_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER       = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+} CUflushGPUDirectRDMAWritesScope;
+ 
+/**
+ * The targets for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesTarget_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
+} CUflushGPUDirectRDMAWritesTarget;
+
+/**
+ * The additional write options for ::cuGraphDebugDotPrint
+ */
+typedef enum CUgraphDebugDot_flags_enum {
+    CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE                        = 1<<0,  /** Output all debug data as if every debug flag is enabled */
+    CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES                  = 1<<1,  /** Use CUDA Runtime structures for output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS             = 1<<2,  /** Adds CUDA_KERNEL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS             = 1<<3,  /** Adds CUDA_MEMCPY3D values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS             = 1<<4,  /** Adds CUDA_MEMSET_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS               = 1<<5,  /** Adds CUDA_HOST_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS              = 1<<6,  /** Adds CUevent handle from record and wait nodes to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS   = 1<<7,  /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS     = 1<<8,  /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES         = 1<<9,  /** Adds CUkernelNodeAttrValue values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES                        = 1<<10, /** Adds node handles and every kernel function handle to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS          = 1<<11, /** Adds memory alloc node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS           = 1<<12  /** Adds memory free node parameters to output */
+} CUgraphDebugDot_flags;
+
+/**
+ * Flags for user objects for graphs
+ */
+typedef enum CUuserObject_flags_enum {
+    CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+} CUuserObject_flags;
+
+/**
+ * Flags for retaining user object references for graphs
+ */
+typedef enum CUuserObjectRetain_flags_enum {
+    CU_GRAPH_USER_OBJECT_MOVE = 1  /**< Transfer references from the caller rather than creating new references. */
+} CUuserObjectRetain_flags;
+
+/**
+ * Flags for instantiating a graph
+ */
+typedef enum CUgraphInstantiate_flags_enum {
+    CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH  = 1  /**< Automatically free memory allocated in a graph before relaunching. */
+} CUgraphInstantiate_flags;
+
 /** @} */ /* END CUDA_TYPES */
 
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility push(default)
+  #endif
+#endif
+
 #ifdef _WIN32
 #define CUDAAPI __stdcall
 #else
@@ -2433,7 +3428,9 @@ CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr);
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_INVALID_DEVICE
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
+ * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
  * \notefnerr
  */
 CUresult CUDAAPI cuInit(unsigned int Flags);
@@ -2514,7 +3511,8 @@ CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
  * ::cuDeviceGetName,
  * ::cuDeviceGetUuid,
  * ::cuDeviceGetLuid,
- * ::cuDeviceTotalMem
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport
  */
 CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
 
@@ -2542,6 +3540,7 @@ CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
  * ::cuDeviceGetLuid,
  * ::cuDeviceGet,
  * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
  * ::cudaGetDeviceCount
  */
 CUresult CUDAAPI cuDeviceGetCount(int *count);
@@ -2573,14 +3572,17 @@ CUresult CUDAAPI cuDeviceGetCount(int *count);
  * ::cuDeviceGetCount,
  * ::cuDeviceGet,
  * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
  * ::cudaGetDeviceProperties
  */
 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
 
-#if __CUDA_API_VERSION >= 9020
 /**
  * \brief Return an UUID for the device
  *
+ * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
+ * supplant this version in 12.0, which is retained for minor version compatibility.
+ *
  * Returns 16-octets identifing the device \p dev in the structure
  * pointed by the \p uuid.
  *
@@ -2596,6 +3598,37 @@ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
  * \notefnerr
  *
  * \sa
+ * ::cuDeviceGetUuid_v2
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return an UUID for the device (11.4+)
+ *
+ * Returns 16-octets identifing the device \p dev in the structure
+ * pointed by the \p uuid. If the device is in MIG mode, returns its
+ * MIG UUID which uniquely identifies the subscribed MIG compute instance.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
  * ::cuDeviceGetAttribute,
  * ::cuDeviceGetCount,
  * ::cuDeviceGetName,
@@ -2604,10 +3637,8 @@ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
  * ::cuDeviceTotalMem,
  * ::cudaGetDeviceProperties
  */
-CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
-#endif
+CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev);
 
-#if defined(_WIN32) && __CUDA_API_VERSION >= 10000
 /**
  * \brief Return an LUID and device node mask for the device
  *
@@ -2632,12 +3663,11 @@ CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
  * ::cuDeviceGetName,
  * ::cuDeviceGet,
  * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
  * ::cudaGetDeviceProperties
  */
 CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev);
-#endif
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Returns the total amount of memory on the device
  *
@@ -2662,10 +3692,41 @@ CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdev
  * ::cuDeviceGetName,
  * ::cuDeviceGetUuid,
  * ::cuDeviceGet,
+ * ::cuDeviceGetExecAffinitySupport,
  * ::cudaMemGetInfo
  */
 CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
-#endif /* __CUDA_API_VERSION >= 3020 */
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture
+ * for given \p format and \p numChannels.
+ *
+ * \param maxWidthInElements    - Returned maximum number of texture elements allocatable for given \p format and \p numChannels.
+ * \param format                - Texture format.
+ * \param numChannels           - Number of channels per texture element.
+ * \param dev                   - Device handle.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cudaMemGetInfo,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev);
 
 /**
  * \brief Returns information about the device
@@ -2674,117 +3735,117 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
  * \p dev. The supported attributes are:
  * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
  *   block;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
  * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
- *   shared memory available to a thread block in bytes;
+ *   shared memory available to a thread block in bytes
  * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
- *   __constant__ variables in a CUDA C kernel in bytes;
- * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads;
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
  * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
  *   memory copy functions that involve memory regions allocated through
- *   ::cuMemAllocPitch();
+ *   ::cuMemAllocPitch()
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
- *  texture width;
+ *  texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
- *  for a 1D texture bound to linear memory;
+ *  for a 1D texture bound to linear memory
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
- *  mipmapped 1D texture width;
+ *  mipmapped 1D texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
- *  texture width;
+ *  texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
- *  texture height;
+ *  texture height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
- *  for a 2D texture bound to linear memory;
+ *  for a 2D texture bound to linear memory
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
- *  for a 2D texture bound to linear memory;
+ *  for a 2D texture bound to linear memory
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
- *  in bytes for a 2D texture bound to linear memory;
+ *  in bytes for a 2D texture bound to linear memory
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
- *  mipmapped 2D texture width;
+ *  mipmapped 2D texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
- *  mipmapped 2D texture height;
+ *  mipmapped 2D texture height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
- *  texture width;
+ *  texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
- *  texture height;
+ *  texture height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
- *  texture depth;
+ *  texture depth
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
  *  Alternate maximum 3D texture width, 0 if no alternate
- *  maximum 3D texture size is supported;
+ *  maximum 3D texture size is supported
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
  *  Alternate maximum 3D texture height, 0 if no alternate
- *  maximum 3D texture size is supported;
+ *  maximum 3D texture size is supported
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
  *  Alternate maximum 3D texture depth, 0 if no alternate
- *  maximum 3D texture size is supported;
+ *  maximum 3D texture size is supported
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
- *  Maximum cubemap texture width or height;
+ *  Maximum cubemap texture width or height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
- *  Maximum 1D layered texture width;
+ *  Maximum 1D layered texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
- *   Maximum layers in a 1D layered texture;
+ *   Maximum layers in a 1D layered texture
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
- *  Maximum 2D layered texture width;
+ *  Maximum 2D layered texture width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
- *   Maximum 2D layered texture height;
+ *   Maximum 2D layered texture height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
- *   Maximum layers in a 2D layered texture;
+ *   Maximum layers in a 2D layered texture
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
- *   Maximum cubemap layered texture width or height;
+ *   Maximum cubemap layered texture width or height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
- *   Maximum layers in a cubemap layered texture;
+ *   Maximum layers in a cubemap layered texture
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
- *   Maximum 1D surface width;
+ *   Maximum 1D surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
- *   Maximum 2D surface width;
+ *   Maximum 2D surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
- *   Maximum 2D surface height;
+ *   Maximum 2D surface height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
- *   Maximum 3D surface width;
+ *   Maximum 3D surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
- *   Maximum 3D surface height;
+ *   Maximum 3D surface height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
- *   Maximum 3D surface depth;
+ *   Maximum 3D surface depth
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
- *   Maximum 1D layered surface width;
+ *   Maximum 1D layered surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
- *   Maximum layers in a 1D layered surface;
+ *   Maximum layers in a 1D layered surface
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
- *   Maximum 2D layered surface width;
+ *   Maximum 2D layered surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
- *   Maximum 2D layered surface height;
+ *   Maximum 2D layered surface height
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
- *   Maximum layers in a 2D layered surface;
+ *   Maximum layers in a 2D layered surface
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
- *   Maximum cubemap surface width;
+ *   Maximum cubemap surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
- *   Maximum cubemap layered surface width;
+ *   Maximum cubemap layered surface width
  * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
- *   Maximum layers in a cubemap layered surface;
+ *   Maximum layers in a cubemap layered surface
  * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
- *   registers available to a thread block;
- * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz;
+ *   registers available to a thread block
+ * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
  * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
  *   base addresses aligned to ::textureAlign bytes do not need an offset
- *   applied to texture fetches;
+ *   applied to texture fetches
  * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
- *   for 2D texture references bound to pitched memory;
+ *   for 2D texture references bound to pitched memory
  * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
- *   memory between host and device while executing a kernel, or 0 if not;
+ *   memory between host and device while executing a kernel, or 0 if not
  * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
- *   the device;
+ *   the device
  * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
- *   for kernels executed on the device, or 0 if not;
+ *   for kernels executed on the device, or 0 if not
  * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
- *   memory subsystem, or 0 if not;
+ *   memory subsystem, or 0 if not
  * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
- *   memory into the CUDA address space, or 0 if not;
+ *   memory into the CUDA address space, or 0 if not
  * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
  *   in. Available modes are as follows:
  *   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
@@ -2797,33 +3858,33 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
  *   executing multiple kernels within the same context simultaneously, or 0 if
  *   not. It is not guaranteed that multiple kernels will be resident
  *   on the device concurrently so this feature should not be relied upon for
- *   correctness;
+ *   correctness.
  * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
- *    device, 0 if error correction is disabled or not supported by the device;
- * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device;
+ *    device, 0 if error correction is disabled or not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
  * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
- *   of the device;
+ *   of the device
  * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
  * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
- *    is only available on Tesla hardware running Windows Vista or later;
- * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz;
- * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits;
- * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache;
- * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor;
+ *    is only available on Tesla hardware running Windows Vista or later
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
+ * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
  * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
- *   the host, or 0 if not;
- * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number;
- * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number;
+ *   the host, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
  * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
- *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device;
+ *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device
  * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
- *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device;
+ *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device
  * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
  *   shared memory available to a multiprocessor in bytes; this amount is shared
- *   by all thread blocks simultaneously resident on a multiprocessor;
+ *   by all thread blocks simultaneously resident on a multiprocessor
  * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
  *   registers available to a multiprocessor; this number is shared by all thread
- *   blocks simultaneously resident on a multiprocessor;
+ *   blocks simultaneously resident on a multiprocessor
  * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
  *   on this system, 0 if allocating managed memory is not supported by the device on this system.
  * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
@@ -2846,6 +3907,23 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
  * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
  *   page tables.
  * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration.
+ * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED:  Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:  Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes 
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
+ * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
+ * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
+ * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
  *
  * \param pi     - Returned device attribute value
  * \param attrib - Device attribute to query
@@ -2866,11 +3944,144 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
  * ::cuDeviceGetUuid,
  * ::cuDeviceGet,
  * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
  * ::cudaDeviceGetAttribute,
  * ::cudaGetDeviceProperties
  */
 CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
 
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that
+ * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync object that matches this device's capabilities.
+ * 
+ * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set this API will return ::CUDA_ERROR_INVALID_VALUE.
+ * 
+ * The applications should set \p nvSciSyncAttrList to a valid 
+ * NvSciSyncAttrList failing which this API will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ * 
+ * The \p flags controls how applications intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to 
+ * signal an NvSciSync on this CUDA device.
+ * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to 
+ * wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set, failing which the API
+ * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal
+ * to one another: a developer may set both these flags that allows to
+ * set both wait and signal specific attributes in the same \p nvSciSyncAttrList.
+ *
+ * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ * \param dev                   - Valid Cuda Device to get NvSciSync attributes for.
+ * \param flags                 - flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags);
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different
+ * than the one the stream runs on. 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cuDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cuDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device.
+ * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool
+ */
+CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev);
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev);
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until GPUDirect RDMA writes to the target context via mappings
+ * created through APIs like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
+ * will be a no-op and can be safely omitted for performance. This can be
+ * determined by comparing the numerical values between the two enums, with
+ * smaller scopes having smaller values.
+ *
+ * Users may query support for this API via
+ * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS.
+ *
+ * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+
 /** @} */ /* END CUDA_DEVICE */
 
 /**
@@ -3000,20 +4211,19 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *mi
  * @{
  */
 
-#if __CUDA_API_VERSION >= 7000
-
 /**
  * \brief Retain the primary context on the GPU
  *
- * Retains the primary context on the device, creating it if necessary,
- * increasing its usage count. The caller must call
- * ::cuDevicePrimaryCtxRelease() when done using the context.
- * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack.
+ * Retains the primary context on the device.
+ * Once the user successfully retains the primary context, the primary context
+ * will be active and available to the user until the user releases it
+ * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset().
+ * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack.
  *
- * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
- * the device is ::CU_COMPUTEMODE_PROHIBITED.  The function ::cuDeviceGetAttribute()
- * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode
- * of the device.
+ * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN
+ * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function
+ * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to
+ * determine the compute mode  of the device.
  * The <i>nvidia-smi</i> tool can be used to set the compute mode for
  * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
  * -h option to it.
@@ -3054,9 +4264,15 @@ CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
 /**
  * \brief Release the primary context on the GPU
  *
- * Releases the primary context interop on the device by decreasing the usage
- * count by 1. If the usage drops to 0 the primary context of device \p dev
- * will be destroyed regardless of how many threads it is current to.
+ * Releases the primary context interop on the device.
+ * A retained context should always be released once the user is done using
+ * it. The context is automatically reset once the last reference to it is
+ * released. This behavior is different when the primary context was retained
+ * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary
+ * context remains always active.
+ *
+ * Releasing a primary context that has not been previously retained will
+ * fail with ::CUDA_ERROR_INVALID_CONTEXT.
  *
  * Please note that unlike ::cuCtxDestroy() this method does not pop the context
  * from stack in any circumstances.
@@ -3067,7 +4283,8 @@ CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_DEINITIALIZED,
  * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_DEVICE
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
  * \notefnerr
  *
  * \sa ::cuDevicePrimaryCtxRetain,
@@ -3089,8 +4306,7 @@ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
  * \brief Set flags for the primary context
  *
  * Sets the flags for the primary context on the device overwriting perviously
- * set ones. If the primary context is already created
- * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned.
+ * set ones.
  *
  * The three LSBs of the \p flags parameter can be used to control how the OS
  * thread, which owns the CUDA context at the time of an API call, interacts
@@ -3121,13 +4337,16 @@ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
  * \e C > \e P, then CUDA will yield to other OS threads when waiting for
  * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
  * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
- * However, on low power devices like Tegra, it always defaults to
- * ::CU_CTX_SCHED_BLOCKING_SYNC.
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
  *
  * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
  * after resizing local memory for a kernel. This can prevent thrashing by
  * local memory allocations when launching many kernels with high local
- * memory usage at the cost of potentially increased memory usage.
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
  *
  * \param dev   - Device for which the primary context flags are set
  * \param flags - New flags for the device
@@ -3138,7 +4357,6 @@ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
  * ::CUDA_ERROR_NOT_INITIALIZED,
  * ::CUDA_ERROR_INVALID_DEVICE,
  * ::CUDA_ERROR_INVALID_VALUE,
- * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
  * \notefnerr
  *
  * \sa ::cuDevicePrimaryCtxRetain,
@@ -3186,6 +4404,8 @@ CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, i
  * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
  * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
  * even after resetting the device.
+ * Resetting the primary context does not release it, an application that has
+ * retained the primary context should explicitly release its usage.
  *
  * \param dev - Device for which primary context is destroyed
  *
@@ -3213,10 +4433,38 @@ CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, i
  */
 CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
 
-#endif /* __CUDA_API_VERSION >= 7000 */
-
 /** @} */ /* END CUDA_PRIMARY_CTX */
 
+/**
+ * \brief Returns information about the execution affinity support of the device.
+ *
+ * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
+ * The supported types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device,
+ *   or 0 if not;
+ *
+ * \param pi   - 1 if the execution affinity type \p type is supported by the device, or 0 if not
+ * \param type - Execution affinity type to query
+ * \param dev  - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev);
 
 /**
  * \defgroup CUDA_CTX Context Management
@@ -3227,13 +4475,17 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
  * This section describes the context management functions of the low-level
  * CUDA driver application programming interface.
  *
+ * Please note that some functions are described in
+ * \ref CUDA_PRIMARY_CTX "Primary Context Management" section.
+ *
  * @{
  */
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Create a CUDA context
  *
+ * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain.
+ *
  * Creates a new CUDA context and associates it with the calling thread. The
  * \p flags parameter is described below. The context is created with a usage
  * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or
@@ -3270,8 +4522,9 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
  * \e C > \e P, then CUDA will yield to other OS threads when waiting for
  * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
  * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
- * However, on low power devices like Tegra, it always defaults to
- * ::CU_CTX_SCHED_BLOCKING_SYNC.
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
  *
  * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
  * This flag must be set in order to allocate pinned host memory that is
@@ -3280,7 +4533,10 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
  * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
  * after resizing local memory for a kernel. This can prevent thrashing by
  * local memory allocations when launching many kernels with high local
- * memory usage at the cost of potentially increased memory usage.
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
  *
  * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
  * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
@@ -3318,9 +4574,114 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
  * ::cuCtxSynchronize
  */
 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
-#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Create a CUDA context with execution affinity
+ *
+ * Creates a new CUDA context with execution affinity and associates it with
+ * the calling thread. The \p paramsArray and \p flags parameter are described below.
+ * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must
+ * call ::cuCtxDestroy() or when done using the context. If a context is already
+ * current to the thread, it is supplanted by the newly created context and may
+ * be restored by a subsequent call to ::cuCtxPopCurrent().
+ *
+ * The type and the amount of execution resource the context can use is limited by \p paramsArray
+ * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
+ * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ *   is only supported under Volta+ MPS.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for * devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx        - Returned context handle of the new context
+ * \param paramsArray - Execution affinity parameters
+ * \param numParams   - Number of execution affinity parameters
+ * \param flags       - Context creation flags
+ * \param dev         - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
+
 /**
  * \brief Destroy a CUDA context
  *
@@ -3329,6 +4690,13 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
  * It is the responsibility of the calling function to ensure that no API
  * call issues using \p ctx while ::cuCtxDestroy() is executing.
  *
+ * Destroys and cleans up all resources associated with the context.
+ * It is the caller's responsibility to ensure that the context or its resources
+ * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
+ * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
+ * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
+ * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
+ *
  * If \p ctx is current to the calling thread then \p ctx will also be
  * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
  * were called).  If \p ctx is current to other threads, then \p ctx will
@@ -3358,9 +4726,7 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
  * ::cuCtxSynchronize
  */
 CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
-#endif /* __CUDA_API_VERSION >= 4000 */
 
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Pushes a context on the current CPU thread
  *
@@ -3472,7 +4838,6 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_DEINITIALIZED,
  * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
  *
  * \sa
@@ -3482,7 +4847,6 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
  * ::cudaGetDevice
  */
 CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
-#endif /* __CUDA_API_VERSION >= 4000 */
 
 /**
  * \brief Returns the device ID for the current context
@@ -3514,7 +4878,6 @@ CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
  */
 CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
 
-#if __CUDA_API_VERSION >= 7000
 /**
  * \brief Returns the flags for the current context
  *
@@ -3535,14 +4898,13 @@ CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
  * ::cuCtxGetApiVersion,
  * ::cuCtxGetCacheConfig,
  * ::cuCtxGetCurrent,
- * ::cuCtxGetDevice
+ * ::cuCtxGetDevice,
  * ::cuCtxGetLimit,
  * ::cuCtxGetSharedMemConfig,
  * ::cuCtxGetStreamPriorityRange,
  * ::cudaGetDeviceFlags
  */
 CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags);
-#endif /* __CUDA_API_VERSION >= 7000 */
 
 /**
  * \brief Block for a context's tasks to complete
@@ -3588,6 +4950,11 @@ CUresult CUDAAPI cuCtxSynchronize(void);
  * discussed here.
  *
  * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread.
+ *   The driver automatically increases the per-thread stack size
+ *   for each kernel launch as needed. This size isn't reset back to the
+ *   original value after each launch. Setting this value will take effect 
+ *   immediately, and if necessary, the device will block until all preceding 
+ *   requested tasks are complete.
  *
  * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
  *   by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
@@ -3610,7 +4977,7 @@ CUresult CUDAAPI cuCtxSynchronize(void);
  *   launch depth of 24. When setting this limit, keep in mind that additional
  *   levels of sync depth require the driver to reserve large amounts of device
  *   memory which can no longer be used for user allocations. If these
- *   reservations of device memory fail, ::cuCtxSetLimit will return
+ *   reservations of device memory fail, ::cuCtxSetLimit() will return
  *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
  *   This limit is only applicable to devices of compute capability 3.5 and
  *   higher. Attempting to set this limit on devices of compute capability less
@@ -3627,13 +4994,21 @@ CUresult CUDAAPI cuCtxSynchronize(void);
  *   runtime, this limit can be increased. Keep in mind that being able to
  *   sustain additional pending launches will require the driver to reserve
  *   larger amounts of device memory upfront which can no longer be used for
- *   allocations. If these reservations fail, ::cuCtxSetLimit will return
+ *   allocations. If these reservations fail, ::cuCtxSetLimit() will return
  *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
  *   This limit is only applicable to devices of compute capability 3.5 and
  *   higher. Attempting to set this limit on devices of compute capability less
  *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
  *   returned.
  *
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performence hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes availabe for
+ *   persisting L2 cache. This is purely a performance hint and it can be
+ *   ignored or clamped depending on the platform.
+ *
  * \param limit - Limit to set
  * \param value - Size of limit
  *
@@ -3675,6 +5050,8 @@ CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
  *   child grid launches to complete.
  * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding
  *   device runtime launches that can be made from this context.
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity.
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes
  *
  * \param limit  - Limit to query
  * \param pvalue - Returned size of limit
@@ -3795,7 +5172,6 @@ CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
  */
 CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
 
-#if __CUDA_API_VERSION >= 4020
 /**
  * \brief Returns the current shared memory configuration for the current context.
  *
@@ -3890,7 +5266,6 @@ CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
  * ::cudaDeviceSetSharedMemConfig
  */
 CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
-#endif
 
 /**
  * \brief Gets the context's API version.
@@ -3970,6 +5345,47 @@ CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
  */
 CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
 
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal
+ * status. Takes effect on function return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Returns the execution affinity setting for the current context.
+ *
+ * Returns in \p *pExecAffinity the current value of \p type. The supported
+ * ::CUexecAffinityType values are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use.
+ *
+ * \param type          - Execution affinity type to query
+ * \param pExecAffinity - Returned execution affinity
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY
+ * \notefnerr
+ *
+ * \sa
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+
+
 /** @} */ /* END CUDA_CTX */
 
 /**
@@ -4097,6 +5513,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
  * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
  * ::CUDA_ERROR_NOT_FOUND,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_FILE_NOT_FOUND,
@@ -4136,6 +5553,7 @@ CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
  * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
@@ -4179,6 +5597,7 @@ CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
  * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
  * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
@@ -4220,6 +5639,7 @@ CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigne
  * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
  * ::CUDA_ERROR_NOT_FOUND,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
@@ -4252,6 +5672,7 @@ CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
  * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
+ * \note_destroy_ub
  *
  * \sa ::cuModuleGetFunction,
  * ::cuModuleGetGlobal,
@@ -4293,7 +5714,6 @@ CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
  */
 CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Returns a global pointer from a module
  *
@@ -4328,7 +5748,6 @@ CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const cha
  * ::cudaGetSymbolSize
  */
 CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
 /**
  * \brief Returns a handle to a texture reference
@@ -4396,8 +5815,6 @@ CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char
  */
 CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
 
-#if __CUDA_API_VERSION >= 5050
-
 /**
  * \brief Creates a pending JIT linker invocation.
  *
@@ -4465,6 +5882,7 @@ cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues
  * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_IMAGE,
  * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU
  *
@@ -4504,6 +5922,7 @@ cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, c
  * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_IMAGE,
  * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_NO_BINARY_FOR_GPU
  *
@@ -4556,8 +5975,6 @@ cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
 CUresult CUDAAPI
 cuLinkDestroy(CUlinkState state);
 
-#endif /* __CUDA_API_VERSION >= 5050 */
-
 /** @} */ /* END CUDA_MODULE */
 
 
@@ -4573,12 +5990,12 @@ cuLinkDestroy(CUlinkState state);
  * @{
  */
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Gets free and total memory
  *
- * Returns in \p *free and \p *total respectively, the free and total amount of
- * memory available for allocation by the CUDA context, in bytes.
+ * Returns in \p *total the total amount of memory available to the the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
  *
  * \param free  - Returned free memory in bytes
  * \param total - Returned total memory in bytes
@@ -4811,7 +6228,6 @@ CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdevic
  * ::cudaMallocHost
  */
 CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
 /**
  * \brief Frees page-locked host memory
@@ -4878,9 +6294,6 @@ CUresult CUDAAPI cuMemFreeHost(void *p);
  * All of these flags are orthogonal to one another: a developer may allocate
  * memory that is portable, mapped and/or write-combined with no restrictions.
  *
- * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
- * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect.
- *
  * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
  * devices that do not support mapped pinned memory. The failure is deferred
  * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
@@ -4925,7 +6338,6 @@ CUresult CUDAAPI cuMemFreeHost(void *p);
  */
 CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Passes back device pointer of mapped pinned memory
  *
@@ -4978,7 +6390,6 @@ CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
  * ::cudaHostGetDevicePointer
  */
 CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
 /**
  * \brief Passes back flags that were used for a pinned allocation
@@ -5007,8 +6418,6 @@ CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned
  */
 CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
 
-#if __CUDA_API_VERSION >= 6000
-
 /**
  * \brief Allocates memory that will be automatically managed by the Unified Memory system
  *
@@ -5119,10 +6528,6 @@ CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
  */
 CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
 
-#endif /* __CUDA_API_VERSION >= 6000 */
-
-#if __CUDA_API_VERSION >= 4010
-
 /**
  * \brief Returns a handle to a compute device
  *
@@ -5201,8 +6606,7 @@ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode.
- * IPC functionality is not supported on Tegra platforms.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode
  *
  * \param pHandle - Pointer to a user allocated CUipcEventHandle
  *                    in which to return the opaque event handle
@@ -5243,8 +6647,7 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode.
- * IPC functionality is not supported on Tegra platforms.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode
  *
  * \param phEvent - Returns the imported event
  * \param handle  - Interprocess handle to open
@@ -5287,8 +6690,7 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle)
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode.
- * IPC functionality is not supported on Tegra platforms.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode
  *
  * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
  *                    the handle in.
@@ -5297,6 +6699,7 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle)
  * \returns
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
  * ::CUDA_ERROR_OUT_OF_MEMORY,
  * ::CUDA_ERROR_MAP_FAILED,
  * ::CUDA_ERROR_INVALID_VALUE
@@ -5327,6 +6730,10 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
  * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
  * by one ::CUcontext per ::CUdevice per other process.
  *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
  * Memory returned from ::cuIpcOpenMemHandle must be freed with
  * ::cuIpcCloseMemHandle.
  *
@@ -5336,8 +6743,7 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode.
- * IPC functionality is not supported on Tegra platforms.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode
  *
  * \param pdptr  - Returned device pointer
  * \param handle - ::CUipcMemHandle to open
@@ -5368,9 +6774,10 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
 CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
 
 /**
- * \brief Close memory mapped with ::cuIpcOpenMemHandle
+ * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle
  *
- * Unmaps memory returnd by ::cuIpcOpenMemHandle. The original allocation
+ * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
  * in the exporting process as well as imported mappings in other processes
  * will be unaffected.
  *
@@ -5379,8 +6786,7 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u
  *
  * IPC functionality is restricted to devices with support for unified
  * addressing on Linux and Windows operating systems.
- * IPC functionality on Windows is restricted to GPUs in TCC mode.
- * IPC functionality is not supported on Tegra platforms.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode
  *
  * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
  *
@@ -5401,9 +6807,6 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u
  */
 CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
 
-#endif /* __CUDA_API_VERSION >= 4010 */
-
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Registers an existing host memory range for use by CUDA
  *
@@ -5420,9 +6823,6 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
  *
  * This function has limited support on Mac OS X. OS 10.7 or higher is required.
  *
- * This function is supported only on I/O coherent devices that have a non-zero value
- * for the device attribute ::CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED.
- *
  * The \p Flags parameter enables different options to be specified that
  * affect the allocation, as follows.
  *
@@ -5437,12 +6837,18 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
  * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
  *   I/O memory space, e.g. the PCI Express resource of a 3rd party device.
  *
+ * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
+ *   that is considered read-only by the device.  On platforms without
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+ *   required in order to register memory mapped to the CPU as read-only.  Support
+ *   for the use of this flag can be queried from the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
+ *
  * All of these flags are orthogonal to one another: a developer may page-lock
  * memory that is portable or mapped with no restrictions.
  *
- * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
- * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect.
- *
  * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
  * devices that do not support mapped pinned memory. The failure is deferred
  * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
@@ -5538,6 +6944,7 @@ CUresult CUDAAPI cuMemHostUnregister(void *p);
  * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
  * \note_sync
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -5585,9 +6992,6 @@ CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
  */
 CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
 
-#endif /* __CUDA_API_VERSION >= 4000 */
-
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Copies memory from Host to Device
  *
@@ -5607,6 +7011,7 @@ CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdev
  * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
  * \note_sync
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -5642,6 +7047,7 @@ CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t
  * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
  * \note_sync
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -5789,6 +7195,7 @@ CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t sr
  * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
  * \note_sync
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -5825,6 +7232,7 @@ CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *sr
  * ::CUDA_ERROR_INVALID_VALUE
  * \notefnerr
  * \note_sync
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -6374,9 +7782,7 @@ CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
  * ::cudaMemcpy3D
  */
 CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Copies memory between contexts
  *
@@ -6426,6 +7832,7 @@ CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
  * \notefnerr
  * \note_async
  * \note_null_stream
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -6477,9 +7884,7 @@ CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCoun
  * ::cudaMemcpyPeerAsync
  */
 CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
-#endif /* __CUDA_API_VERSION >= 4000 */
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Copies memory from Host to Device
  *
@@ -6502,6 +7907,7 @@ CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext,
  * \notefnerr
  * \note_async
  * \note_null_stream
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -6542,6 +7948,7 @@ CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, s
  * \notefnerr
  * \note_async
  * \note_null_stream
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -6625,6 +8032,7 @@ CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice,
  * \notefnerr
  * \note_async
  * \note_null_stream
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -6666,6 +8074,7 @@ CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const voi
  * \notefnerr
  * \note_async
  * \note_null_stream
+ * \note_memcpy
  *
  * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
  * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
@@ -7025,9 +8434,7 @@ CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
  * ::cudaMemcpy3DAsync
  */
 CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Copies memory between contexts asynchronously.
  *
@@ -7053,9 +8460,7 @@ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
  * ::cudaMemcpy3DPeerAsync
  */
 CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
-#endif /* __CUDA_API_VERSION >= 4000 */
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Initializes device memory
  *
@@ -7171,7 +8576,7 @@ CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
  * ::cuMemAllocPitch().
  *
  * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
  * \param uc        - Value to set
  * \param Width     - Width of row
  * \param Height    - Number of rows
@@ -7212,7 +8617,7 @@ CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned c
  * ::cuMemAllocPitch().
  *
  * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
  * \param us        - Value to set
  * \param Width     - Width of row
  * \param Height    - Number of rows
@@ -7253,7 +8658,7 @@ CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned
  * ::cuMemAllocPitch().
  *
  * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
  * \param ui        - Value to set
  * \param Width     - Width of row
  * \param Height    - Number of rows
@@ -7403,7 +8808,7 @@ CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t
  * ::cuMemAllocPitch().
  *
  * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
  * \param uc        - Value to set
  * \param Width     - Width of row
  * \param Height    - Number of rows
@@ -7446,7 +8851,7 @@ CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsig
  * ::cuMemAllocPitch().
  *
  * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
  * \param us        - Value to set
  * \param Width     - Width of row
  * \param Height    - Number of rows
@@ -7489,7 +8894,7 @@ CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
  * ::cuMemAllocPitch().
  *
  * \param dstDevice - Destination device pointer
- * \param dstPitch  - Pitch of destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer(Unused if \p Height is 1)
  * \param ui        - Value to set
  * \param Width     - Width of row
  * \param Height    - Number of rows
@@ -7582,7 +8987,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
  * float16's:
  * \code
     CUDA_ARRAY_DESCRIPTOR desc;
-    desc.FormatFlags = CU_AD_FORMAT_HALF;
+    desc.Format = CU_AD_FORMAT_HALF;
     desc.NumChannels = 4;
     desc.Width = width;
     desc.Height = height;
@@ -7592,7 +8997,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
  * of which is two 8-bit unsigned chars:
  * \code
     CUDA_ARRAY_DESCRIPTOR arrayDesc;
-    desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8;
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
     desc.NumChannels = 2;
     desc.Width = width;
     desc.Height = height;
@@ -7658,8 +9063,88 @@ CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pA
  * ::cudaArrayGetInfo
  */
 CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties
+ * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE 
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
+ * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties 
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] array - CUDA array to get the sparse properties of
+ * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties
+ * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE 
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. 
+ * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer.
+ * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of
+ * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap);
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array or if the array does
+ * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ *
+ * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ * \param hArray        - Multiplanar CUDA array
+ * \param planeIdx      - Plane index
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayCreate,
+ * ::cudaGetArrayPlane
+ */
+CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
 
 /**
  * \brief Destroys a CUDA array
@@ -7692,7 +9177,6 @@ CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, C
  */
 CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Creates a 3D CUDA array
  *
@@ -7839,7 +9323,7 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
  * 4x16-bit float16's:
  * \code
     CUDA_ARRAY3D_DESCRIPTOR desc;
-    desc.FormatFlags = CU_AD_FORMAT_HALF;
+    desc.Format = CU_AD_FORMAT_HALF;
     desc.NumChannels = 4;
     desc.Width = width;
     desc.Height = height;
@@ -7910,9 +9394,6 @@ CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR
  * ::cudaArrayGetInfo
  */
 CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-#endif /* __CUDA_API_VERSION >= 3020 */
-
-#if __CUDA_API_VERSION >= 5000
 
 /**
  * \brief Creates a CUDA mipmapped array
@@ -8111,10 +9592,932 @@ CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray
  */
 CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
 
-#endif /* __CUDA_API_VERSION >= 5000 */
-
 /** @} */ /* END CUDA_MEM */
 
+/**
+ * \defgroup CUDA_VA Virtual Memory Management
+ *
+ * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the virtual memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+* \brief Allocate an address range reservation. 
+* 
+* Reserves a virtual address range based on the given parameters, giving
+* the starting address of the range in \p ptr.  This API requires a system that
+* supports UVA.  The size and address parameters must be a multiple of the
+* host page size and the alignment must be a power of two or zero for default
+* alignment.
+*
+* \param[out] ptr       - Resulting pointer to start of virtual address range allocated
+* \param[in]  size      - Size of the reserved virtual address range requested
+* \param[in]  alignment - Alignment of the reserved virtual address range requested
+* \param[in]  addr      - Fixed starting address range requested
+* \param[in]  flags     - Currently unused, must be zero
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressFree
+*/
+CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
+
+/**
+* \brief Free an address range reservation.
+* 
+* Frees a virtual address range reserved by cuMemAddressReserve.  The size
+* must match what was given to memAddressReserve and the ptr given must
+* match what was returned from memAddressReserve.
+*
+* \param[in] ptr  - Starting address of the virtual address range to free
+* \param[in] size - Size of the virtual address region to free
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
+*
+* This creates a memory allocation on the target device specified through the
+* \p prop strcuture. The created allocation will not have any device or host
+* mappings. The generic memory \p handle for the allocation can be
+* mapped to the address space of calling process via ::cuMemMap. This handle
+* cannot be transmitted directly to other processes (see
+* ::cuMemExportToShareableHandle).  On Windows, the caller must also pass
+* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
+* limits or allows access to this handle for a recepient process (see
+* ::CUmemAllocationProp::win32HandleMetaData for more).  The \p size of this
+* allocation must be a multiple of the the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
+* flag.
+* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
+* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
+* and sparse CUDA mipmapped arrays.
+* (see ::cuMemMapArrayAsync).
+*
+* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
+* \param[in]  size   - Size of the allocation requested
+* \param[in]  prop   - Properties of the allocation to create.
+* \param[in]  flags  - flags for future use, must be zero now.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
+
+/**
+* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+* 
+* Frees the memory that was allocated on a device through cuMemCreate.
+*
+* The memory allocation will be freed when all outstanding mappings to the memory
+* are unmapped and when all outstanding references to the handle (including it's
+* shareable counterparts) are also released. The generic memory handle can be
+* freed when there are still outstanding mappings made with this handle. Each
+* time a recepient process imports a shareable handle, it needs to pair it with
+* ::cuMemRelease for the handle to be freed.  If \p handle is not a valid handle
+* the behavior is undefined. 
+*
+* \param[in] handle Value of handle which was returned previously by cuMemCreate.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemCreate
+*/
+CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Maps an allocation handle to a reserved virtual address range.
+*
+* Maps bytes of memory represented by \p handle starting from byte \p offset to
+* \p size to address range [\p addr, \p addr + \p size]. This range must be an
+* address reservation previously reserved with ::cuMemAddressReserve, and
+* \p offset + \p size must be less than the size of the memory allocation.
+* Both \p ptr, \p size, and \p offset must be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
+* 
+* Please note calling ::cuMemMap does not make the address accessible,
+* the caller needs to update accessibility of a contiguous mapped VA
+* range by calling ::cuMemSetAccess.
+* 
+* Once a recipient process obtains a shareable memory handle
+* from ::cuMemImportFromShareableHandle, the process must
+* use ::cuMemMap to map the memory into its address ranges before
+* setting accessibility with ::cuMemSetAccess.
+*  
+* ::cuMemMap can only create mappings on VA range reservations 
+* that are not currently mapped.
+* 
+* \param[in] ptr    - Address where memory will be mapped. 
+* \param[in] size   - Size of the memory mapping. 
+* \param[in] offset - Offset into the memory represented by 
+*                   - \p handle from which to start mapping
+*                   - Note: currently must be zero.
+* \param[in] handle - Handle to a shareable memory 
+* \param[in] flags  - flags for future use, must be zero now. 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
+
+/**
+ * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays
+ *
+ * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
+ * The structure ::CUarrayMapInfo is defined as follow:
+ \code
+     typedef struct CUarrayMapInfo_st {
+        CUresourcetype resourceType;                   
+        union {
+            CUmipmappedArray mipmap;
+            CUarray array;
+        } resource;
+
+        CUarraySparseSubresourceType subresourceType;   
+        union {
+            struct {
+                unsigned int level;                     
+                unsigned int layer;                     
+                unsigned int offsetX;                   
+                unsigned int offsetY;                   
+                unsigned int offsetZ;                   
+                unsigned int extentWidth;               
+                unsigned int extentHeight;              
+                unsigned int extentDepth;               
+            } sparseLevel;
+            struct {
+                unsigned int layer;
+                unsigned long long offset;              
+                unsigned long long size;                
+            } miptail;
+        } subresource;
+
+        CUmemOperationType memOperationType;
+        
+        CUmemHandleType memHandleType;                  
+        union {
+            CUmemGenericAllocationHandle memHandle;
+        } memHandle;
+
+        unsigned long long offset;                      
+        unsigned int deviceBitMask;                     
+        unsigned int flags;                             
+        unsigned int reserved[2];                       
+    } CUarrayMapInfo;
+ \endcode
+ *
+ * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then 
+ * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle.
+ * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using
+ * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE. 
+ * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY 
+ * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle.
+ * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
+ * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE.
+ *
+ * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. 
+ * ::CUarraySparseSubresourceType_enum is defined as:
+ \code
+    typedef enum CUarraySparseSubresourceType_enum {
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+    } CUarraySparseSubresourceType;
+ \endcode
+ *
+ * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a
+ * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which
+ * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by 
+ * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type.
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
+ * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY
+ * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight
+ * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively.
+ * These offsets and extents must be aligned to the corresponding tile dimension.
+ * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise,
+ * must be zero.
+ * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise,
+ * must be zero.
+ * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth
+ * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays.
+ * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
+ * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in 
+ * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size.
+ * Both, mip tail offset and mip tail size must be aligned to the tile size. 
+ * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags
+ * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index.
+ * Otherwise, must be zero.
+ * 
+ * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as:
+ \code
+    typedef enum CUmemOperationType_enum {
+        CU_MEM_OPERATION_TYPE_MAP = 1,
+        CU_MEM_OPERATION_TYPE_UNMAP = 2
+    } CUmemOperationType;
+ \endcode
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource 
+ * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. 
+ * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, 
+ * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC.
+ * 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation
+ * is performed. ::CUarrayMapInfo::memHandle must be NULL.
+ *
+ * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. 
+ * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match 
+ * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle.
+ *
+ * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \param[in] mapInfoList - List of ::CUarrayMapInfo
+ * \param[in] count       - Count of ::CUarrayMapInfo  in \p mapInfoList
+ * \param[in] hStream     - Stream identifier for the stream to use for map or unmap operations
+ *
+ * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties
+ */
+CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo  *mapInfoList, unsigned int count, CUstream hStream);
+
+/**
+* \brief Unmap the backing memory of a given address range.
+*
+* The range must be the entire contiguous address range that was mapped to.  In
+* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped
+* by ::cuMemCreate / ::cuMemMap.  Any backing memory allocations will be freed
+* if there are no existing mappings and there are no unreleased memory handles.
+*
+* When ::cuMemUnmap returns successfully the address range is converted to an
+* address reservation and can be used for a future calls to ::cuMemMap.  Any new
+* mapping to this virtual address will need to have access granted through
+* ::cuMemSetAccess, as all mappings start with no accessibility setup.
+*
+* \param[in] ptr  - Starting address for the virtual address range to unmap
+* \param[in] size - Size of the virtual address range to unmap
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemCreate, ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Set the access flags for each location specified in \p desc for the given virtual address range
+* 
+* Given the virtual address range via \p ptr and \p size, and the locations
+* in the array given by \p desc and \p count, set the access flags for the
+* target locations.  The range must be a fully mapped address range
+* containing all allocations created by ::cuMemMap / ::cuMemCreate.
+*
+* \param[in] ptr   - Starting address for the virtual address range
+* \param[in] size  - Length of the virtual address range
+* \param[in] desc  - Array of ::CUmemAccessDesc that describe how to change the
+*                  - mapping for each location specified
+* \param[in] count - Number of ::CUmemAccessDesc in \p desc
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap
+*/
+CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);
+
+/**
+* \brief Get the access \p flags set for the given \p location and \p ptr
+*
+* \param[out] flags   - Flags set for this location
+* \param[in] location - Location in which to check the flags for
+* \param[in] ptr      - Address in which to check the access flags for
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemSetAccess
+*/
+CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr);
+
+/**
+* \brief Exports an allocation to a requested shareable handle type
+*
+* Given a CUDA memory handle, create a shareable memory
+* allocation handle that can be used to share the memory with other
+* processes. The recipient process can convert the shareable handle back into a
+* CUDA memory handle using ::cuMemImportFromShareableHandle and map
+* it with ::cuMemMap. The implementation of what this handle is and how it
+* can be transferred is defined by the requested handle type in \p handleType
+*
+* Once all shareable handles are closed and the allocation is released, the allocated
+* memory referenced will be released back to the OS and uses of the CUDA handle afterward
+* will lead to undefined behavior.
+*
+* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
+* that support importing memory from the shareable type
+*
+* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
+* \param[in] handle           - CUDA handle for the memory allocation
+* \param[in] handleType       - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
+* \param[in] flags            - Reserved, must be zero
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+* \brief Imports an allocation from a requested shareable handle type.
+*
+* If the current process cannot support the memory described by this shareable
+* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED.
+*
+* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc)
+* created on devices under an SLI group may not be supported, and thus this API will
+* return CUDA_ERROR_NOT_SUPPORTED.
+* There is no guarantee that the contents of \p handle will be the same CUDA memory handle
+* for the same given OS shareable handle, or the same underlying allocation.
+*
+* \param[out] handle       - CUDA Memory handle for the memory allocation.
+* \param[in]  osHandle     - Shareable Handle representing the memory allocation that is to be imported. 
+* \param[in]  shHandleType - handle type of the exported handle ::CUmemAllocationHandleType.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease
+*/
+CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+
+/**
+* \brief Calculates either the minimal or recommended granularity 
+*
+* Calculates either the minimal or recommended granularity
+* for a given allocation specification and returns it in granularity.  This
+* granularity can be used as a multiple for alignment, size, or address mapping.
+*
+* \param[out] granularity Returned granularity.
+* \param[in]  prop Property for which to determine the granularity for
+* \param[in]  option Determines which granularity to return
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
+
+/**
+* \brief Retrieve the contents of the property structure defining properties for this handle
+*
+* \param[out] prop  - Pointer to a properties structure which will hold the information about this handle
+* \param[in] handle - Handle which to perform the query on
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation.
+*
+* The handle is guaranteed to be the same handle value used to map the memory. If the address
+* requested is not mapped, the function will fail. The returned handle must be released with
+* corresponding number of calls to ::cuMemRelease.
+*
+* \note The address \p addr, can be any address in a range previously mapped
+* by ::cuMemMap, and not necessarily the start address.
+*
+* \param[out] handle CUDA Memory handle for the backing memory allocation.
+* \param[in] addr Memory address to query, that has been mapped previously.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr);
+
+/** @} */ /* END CUDA_VA */
+
+/**
+ * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream ordered memory allocator exposed by the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_MALLOC_ASYNC_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, a use before allocation / use after free error
+ * will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
+ *
+ * \section CUDA_MALLOC_ASYNC_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cuDeviceGetAttribute() with the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+ */
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ * 
+ * \param dptr - memory to free
+ * \param hStream - The stream establishing the stream ordering contract. 
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the the allocation operation completes.
+ * The allocation comes from the memory pool current to the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs. 
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note: Allocations that have not been freed count as outstanding. 
+ * \note: Allocations that have been asynchronously freed but whose completion has
+ *        not been observed on the host (eg. by a synchronize) can count as outstanding.
+ *
+ * \param[in] pool           - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    Cuda events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool.
+ *
+ * \param[in] pool  - The memory pool to modify
+ * \param[in] attr  - The attribute to modify
+ * \param[in] value - Pointer to the value to assign
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    Cuda events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since the
+ *                    last time it was reset.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the application.
+ *
+ * \param[in] pool   - The memory pool to get attributes of
+ * \param[in] attr   - The attribute to get 
+ * \param[out] value - Retrieved value
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] pool  - The pool being modified
+ * \param[in] map   - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu.
+ * \param[in] count - Number of descriptors in the map array.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count);
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns the accessibility of the pool's memory from the specified location. 
+ *
+ * \param[out] flags   - the accessibility of the pool from the specified location
+ * \param[in] memPool  - the pool being queried
+ * \param[in] location - the location accessing the pool
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p pool.  The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities. 
+ *
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ *
+ * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool,
+ *     ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle
+ */
+CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps);
+
+/**
+ * \brief Destroys the specified memory pool
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cuMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations. 
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different than that of the specified \p hStream. 
+ * 
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs. 
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] pool     - The pool to allocate from 
+ * \param[in] hStream  - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess,
+ *     ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
+ *
+ * \param[out] handle_out  - Returned OS handle 
+ * \param[in] pool         - pool to export 
+ * \param[in] handleType   - the type of handle to create 
+ * \param[in] flags        - must be 0 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer,
+ *     ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+ * \brief imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer.
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such imported memory pools may not be used in cuDeviceSetMemPool
+ *       or ::cuMemAllocFromPoolAsync calls.
+ *
+ * \param[out] pool_out    - Returned memory pool
+ * \param[in] handle       - OS handle of the pool to open 
+ * \param[in] handleType   - The type of handle being imported 
+ * \param[in] flags        - must be 0 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
+        CUmemoryPool *pool_out,
+        void *handle,
+        CUmemAllocationHandleType handleType,
+        unsigned long long flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cuMemPoolImportPointer api.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] shareData_out - Returned export data  
+ * \param[in] ptr            - pointer to memory being exported
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p ptr_out a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with cuMemFree
+ * or cuMemFreeAsync.  If cuMemFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The cuMemFreeAsync api may be used in the exporting process before
+ *       the cuMemFreeAsync operation completes in its stream as long as the
+ *       cuMemFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's cuMemFreeAsync.
+ *
+ * \param[out] ptr_out  - pointer to imported memory
+ * \param[in] pool      - pool from which to import
+ * \param[in] shareData - data specifying the memory to import
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData);
+
+/** @} */ /* END CUDA_MALLOC_ASYNC */
+
 /**
  * \defgroup CUDA_UNIFIED Unified Addressing
  *
@@ -8208,7 +10611,6 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
  *
  */
 
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Returns information about a pointer
  *
@@ -8314,11 +10716,47 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
  *      Returns in \p *data a boolean that indicates whether the pointer points to
  *      managed memory or not.
  *
+ *      If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
  * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL:
  *
  *      Returns in \p *data an integer representing a device ordinal of a device against
  *      which the memory was allocated or registered.
  *
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer maps to
+ *      an allocation that is suitable for ::cudaIpcGetMemHandle.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR:
+ *
+ *      Returns in \p *data the starting address for the allocation referenced
+ *      by the device pointer \p ptr.  Note that this is not necessarily the
+ *      address of the mapped region, but the address of the mappable address
+ *      range \p ptr references (e.g. from ::cuMemAddressReserve).
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE:
+ *
+ *      Returns in \p *data the size for the allocation referenced by the device
+ *      pointer \p ptr.  Note that this is not necessarily the size of the mapped
+ *      region, but the size of the mappable address range \p ptr references
+ *      (e.g. from ::cuMemAddressReserve).  To retrieve the size of the mapped
+ *      region, see ::cuMemGetAddressRange
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer is in a
+ *      valid address range that is mapped to a backing allocation.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES:
+ *
+ *      Returns a bitmask of the allowed handle types for an allocation that may
+ *      be passed to ::cuMemExportToShareableHandle.
+ * 
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE:
+ * 
+ *      Returns in \p *data the handle to the mempool that the allocation was obtained from.
+ *
  * \par
  *
  * Note that for most allocations in the unified virtual address space
@@ -8362,9 +10800,7 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
  * ::cudaPointerGetAttributes
  */
 CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
-#endif /* __CUDA_API_VERSION >= 4000 */
 
-#if __CUDA_API_VERSION >= 8000
 /**
  * \brief Prefetches memory to the specified destination device
  *
@@ -8639,14 +11075,12 @@ CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range
  * ::CUDA_ERROR_INVALID_DEVICE
  * \notefnerr
  *
- * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise
+ * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
  * ::cuMemPrefetchAsync,
  * ::cudaMemRangeGetAttributes
  */
 CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
-#endif /* __CUDA_API_VERSION >= 8000 */
 
-#if __CUDA_API_VERSION >= 6000
 /**
  * \brief Set attributes on a previously allocated memory region
  *
@@ -8688,9 +11122,7 @@ CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_r
  * ::cuMemHostUnregister
  */
 CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
-#endif /* __CUDA_API_VERSION >= 6000 */
 
-#if __CUDA_API_VERSION >= 7000
 /**
  * \brief Returns information about a pointer.
  *
@@ -8704,6 +11136,12 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at
  * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID
  * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED
  * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
  *
  * \param numAttributes - Number of attributes to query
  * \param attributes    - An array of attributes to query
@@ -8733,7 +11171,6 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at
  * ::cudaPointerGetAttributes
  */
 CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr);
-#endif /* __CUDA_API_VERSION >= 7000 */
 
 /** @} */ /* END CUDA_UNIFIED */
 
@@ -8753,7 +11190,9 @@ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_at
  * \brief Create a stream
  *
  * Creates a stream and returns a handle in \p phStream.  The \p Flags argument
- * determines behaviors of the stream.  Valid values for \p Flags are:
+ * determines behaviors of the stream.
+ *
+ * Valid values for \p Flags are:
  * - ::CU_STREAM_DEFAULT: Default stream creation flag.
  * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created
  *   stream may run concurrently with work in stream 0 (the NULL stream), and that
@@ -8892,8 +11331,6 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
  */
 CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
 
-#if __CUDA_API_VERSION >= 9020
-
 /**
  * \brief Query the context associated with a stream
  *
@@ -8938,8 +11375,6 @@ CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
  */
 CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
 
-#endif /* __CUDA_API_VERSION >= 9020 */
-
 /**
  * \brief Make a compute stream wait on an event
  *
@@ -8948,9 +11383,15 @@ CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
  * The synchronization will be performed efficiently on the device when applicable.
  * \p hEvent may be from a different context or device than \p hStream.
  *
+ * flags include:
+ * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
  * \param hStream - Stream to wait
  * \param hEvent  - Event to wait on (may not be NULL)
- * \param Flags   - Parameters for the operation (must be 0)
+ * \param Flags   - See ::CUevent_capture_flags
  *
  * \return
  * ::CUDA_SUCCESS,
@@ -9046,8 +11487,6 @@ CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned in
  */
 CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
 
-#if __CUDA_API_VERSION >= 10000
-
 /**
  * \brief Begins graph capture on a stream
  *
@@ -9056,9 +11495,16 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback
  * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated
  * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which
  * it was initiated, and it may only be initiated if the stream is not already in capture
- * mode. The capture mode may be queried via ::cuStreamIsCapturing.
+ * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ * called on this stream from the same thread.
  *
  * \param hStream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cuThreadExchangeStreamCaptureMode.
  *
  * \note Kernels captured using this API must not use texture and surface references.
  *       Reading or writing through any texture or surface reference is undefined
@@ -9074,9 +11520,63 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback
  * \sa
  * ::cuStreamCreate,
  * ::cuStreamIsCapturing,
- * ::cuStreamEndCapture
+ * ::cuStreamEndCapture,
+ * ::cuThreadExchangeStreamCaptureMode
  */
-CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
+CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     CUstreamCaptureMode mode = desiredMode;
+     cuThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cuStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture
+ *   sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cuEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBeginCapture
+ */
+CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode);
 
 /**
  * \brief Ends capture on a stream, returning the captured graph
@@ -9086,6 +11586,10 @@ CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
  * If capture was invalidated, due to a violation of the rules of stream capture, then
  * a NULL graph will be returned.
  *
+ * If the \p mode argument to ::cuStreamBeginCapture was not
+ * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as
+ * ::cuStreamBeginCapture.
+ *
  * \param hStream - Stream to query
  * \param phGraph - The captured graph
  *
@@ -9093,7 +11597,8 @@ CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_DEINITIALIZED,
  * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
  * \notefnerr
  *
  * \sa
@@ -9143,9 +11648,120 @@ CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
  */
 CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
 
-#endif /* __CUDA_API_VERSION >= 10000 */
+/**
+ * \brief Query capture status of a stream
+ *
+ * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will
+ * supplant this version in 12.0, which is retained for minor version compatibility.
+ *
+ * Query the capture status of a stream and and get an id for 
+ * the capture sequence, which is unique over the lifetime of the process.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * A valid id is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo_v2,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
 
-#if __CUDA_API_VERSION >= 6000
+/**
+ * \brief Query a stream's capture state (11.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the
+ * previous version in 12.0. Developers requiring compatibility across minor versions to
+ * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback
+ * path.
+ *
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cuStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until end of
+ *           capture. The node handles may be copied out and are valid until they or the
+ *           graph is destroyed. The driver-owned array may also be passed directly to
+ *           APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamUpdateCaptureDependencies
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
+        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ * ::cuStreamEndCapture.
+ *
+ * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions to CUDA 11.0 should not use this API or provide a fallback.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamGetCaptureInfo,
+ * ::cuStreamGetCaptureInfo_v2
+ */
+CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
 
 /**
  * \brief Attach memory to a stream asynchronously
@@ -9235,8 +11851,6 @@ CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *ca
  */
 CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
 
-#endif /* __CUDA_API_VERSION >= 6000 */
-
 /**
  * \brief Determine status of a compute stream
  *
@@ -9296,7 +11910,6 @@ CUresult CUDAAPI cuStreamQuery(CUstream hStream);
  */
 CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
 
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Destroys a stream
  *
@@ -9326,7 +11939,71 @@ CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
  * ::cudaStreamDestroy
  */
 CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
-#endif /* __CUDA_API_VERSION >= 4000 */
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ * For list of attributes see ::CUstreamAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src);
+
+/**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hStream
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      CUstreamAttrValue *value_out);
+
+/**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
+ * \param[out] hStream
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      const CUstreamAttrValue *value);
 
 /** @} */ /* END CUDA_STREAM */
 
@@ -9421,10 +12098,60 @@ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
  * ::cuStreamWaitEvent,
  * ::cuEventDestroy,
  * ::cuEventElapsedTime,
- * ::cudaEventRecord
+ * ::cudaEventRecord,
+ * ::cuEventRecordWithFlags
  */
 CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
 
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * flags include:
+ * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ * \param flags   - See ::CUevent_capture_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cuEventRecord,
+ * ::cudaEventRecord
+ */
+CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+
 /**
  * \brief Queries an event's status
  *
@@ -9488,7 +12215,6 @@ CUresult CUDAAPI cuEventQuery(CUevent hEvent);
  */
 CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
 
-#if __CUDA_API_VERSION >= 4000
 /**
  * \brief Destroys an event
  *
@@ -9517,7 +12243,6 @@ CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
  * ::cudaEventDestroy
  */
 CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
-#endif /* __CUDA_API_VERSION >= 4000 */
 
 /**
  * \brief Computes the elapsed time between two events
@@ -9577,8 +12302,6 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
  * @{
  */
 
-#if __CUDA_API_VERSION >= 10000
-
  /**
  * \brief Imports an external memory object
  *
@@ -9598,7 +12321,9 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
                     void *handle;
                     const void *name;
                 } win32;
+                const void *nvSciBufObject;
             } handle;
+            unsigned long long size;
             unsigned int flags;
         } CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
  * \endcode
@@ -9609,11 +12334,14 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
  *
  * \code
         typedef enum CUexternalMemoryHandleType_enum {
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD        = 1,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32     = 2,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP       = 4,
-            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE   = 5
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
         } CUexternalMemoryHandleType;
  * \endcode
  *
@@ -9654,7 +12382,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
  * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
  * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
  * is not NULL, then it must represent a valid shared NT handle that
- * is returned by ID3DDevice::CreateSharedHandle when referring to a
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
  * ID3D12Heap object. This handle holds a reference to the underlying
  * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
  * is not NULL, then it must point to a NULL-terminated array of
@@ -9666,17 +12394,55 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
  * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
  * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
  * is not NULL, then it must represent a valid shared NT handle that
- * is returned by ID3DDevice::CreateSharedHandle when referring to a
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
  * ID3D12Resource object. This handle holds a reference to the
  * underlying object. If
  * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
  * is not NULL, then it must point to a NULL-terminated array of
  * UTF-16 characters that refers to a ID3D12Resource object.
  *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared NT handle that is returned by
+ * IDXGIResource1::CreateSharedHandle when referring to a
+ * ID3D11Resource object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D11Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared KMT handle that is returned by
+ * IDXGIResource::GetSharedHandle when referring to a
+ * ID3D11Resource object and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+ * for memory synchronization.
+ *
+ *
+ * The size of the memory object must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size.
+ *
  * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in
  * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the
  * resource is a dedicated resource. The definition of what a
  * dedicated resource is outside the scope of this extension.
+ * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type
+ * is one of the following:
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
  *
  * \param extMem_out    - Returned handle to an external memory object
  * \param memHandleDesc - Memory import handle descriptor
@@ -9684,6 +12450,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_HANDLE
  * \notefnerr
  *
@@ -9734,6 +12501,8 @@ CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA
  * appropriate offsets to the returned pointer to derive the
  * individual buffers.
  *
+ * The returned pointer \p devPtr must be freed using ::cuMemFree.
+ *
  * \param devPtr     - Returned device pointer to buffer
  * \param extMem     - Handle to external memory object
  * \param bufferDesc - Buffer descriptor
@@ -9741,10 +12510,11 @@ CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_HANDLE
  * \notefnerr
  *
- * \sa ::cuImportExternalMemory
+ * \sa ::cuImportExternalMemory,
  * ::cuDestroyExternalMemory,
  * ::cuExternalMemoryGetMappedMipmappedArray
  */
@@ -9781,6 +12551,11 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternal
  * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies
  * the total number of levels in the mipmap chain.
  *
+ * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
+ *
+ * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
+ *
  * \param mipmap     - Returned CUDA mipmapped array
  * \param extMem     - Handle to external memory object
  * \param mipmapDesc - CUDA array descriptor
@@ -9788,22 +12563,23 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternal
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
  * ::CUDA_ERROR_INVALID_HANDLE
  * \notefnerr
  *
- * \sa ::cuImportExternalMemory
+ * \sa ::cuImportExternalMemory,
  * ::cuDestroyExternalMemory,
  * ::cuExternalMemoryGetMappedBuffer
  */
 CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc);
 
 /**
- * \brief Releases all resources associated with an external memory
- * object.
+ * \brief Destroys an external memory object.
  *
- * Frees all buffers and CUDA mipmapped arrays that were
- * mapped onto this external memory object and releases any reference
- * on the underlying memory itself.
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cuMemFree and
+ * ::cuMipmappedArrayDestroy respectively.
  *
  * \param extMem - External memory object to be destroyed
  *
@@ -9813,7 +12589,7 @@ CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipma
  * ::CUDA_ERROR_INVALID_HANDLE
  * \notefnerr
  *
- * \sa ::cuImportExternalMemory
+ * \sa ::cuImportExternalMemory,
  * ::cuExternalMemoryGetMappedBuffer,
  * ::cuExternalMemoryGetMappedMipmappedArray
  */
@@ -9836,8 +12612,9 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
                 int fd;
                 struct {
                     void *handle;
-                const void *name;
+                    const void *name;
                 } win32;
+                const void* NvSciSyncObj;
             } handle;
             unsigned int flags;
         } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
@@ -9849,10 +12626,16 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
  *
  * \code
         typedef enum CUexternalSemaphoreHandleType_enum {
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD        = 1,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32     = 2,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3,
-            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE      = 4
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
         } CUexternalSemaphoreHandleType;
  * \endcode
  *
@@ -9867,7 +12650,7 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
  * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
  * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
  * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
- * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
  * NULL. If
  * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
  * is not NULL, then it must represent a valid shared NT handle that
@@ -9890,23 +12673,76 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
  * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
  * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
  * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
- * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
  * NULL. If
  * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
  * is not NULL, then it must represent a valid shared NT handle that
- * is returned by ID3DDevice::CreateSharedHandle when referring to a
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
  * ID3D12Fence object. This handle holds a reference to the underlying
  * object. If
  * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
  * is not NULL, then it must name a valid synchronization object that
  * refers to a valid ID3D12Fence object.
  *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that is returned by
+ * ID3D11Fence::CreateSharedHandle. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that
+ * is returned by IDXGIResource1::CreateSharedHandle when referring to
+ * a IDXGIKeyedMutex object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid IDXGIKeyedMutex object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared KMT handle that
+ * is returned by IDXGIResource::GetSharedHandle when referring to
+ * a IDXGIKeyedMutex object and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
  * \param extSem_out    - Returned handle to an external semaphore
  * \param semHandleDesc - Semaphore import handle descriptor
  *
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
  * ::CUDA_ERROR_INVALID_HANDLE
  * \notefnerr
  *
@@ -9932,20 +12768,49 @@ CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, cons
  * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
  * then signaling the semaphore will set it to the signaled state.
  *
- * If the semaphore object is of the type
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then the
- * semaphore will be set to the value specified in
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then the semaphore will be set to the value specified in
  * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value.
  *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence
+ * to a value that can be used by subsequent waiters of the same NvSciSync object
+ * to order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all external memory objects that are imported as
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be released with the key specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key.
+ *
  * \param extSemArray - Set of external semaphores to be signaled
  * \param paramsArray - Array of semaphore parameters
  * \param numExtSems  - Number of semaphores to signal
- * \param stream     - Stream to enqueue the signal operations in
+ * \param stream      - Stream to enqueue the signal operations in
  *
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
  * \notefnerr
  *
  * \sa ::cuImportExternalSemaphore,
@@ -9973,12 +12838,44 @@ CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extS
  * unsignaled state. Therefore for every signal operation, there can
  * only be one wait operation.
  *
- * If the semaphore object is of the type
- * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then waiting on
- * the semaphore will wait until the value of the semaphore is
- * greater than or equal to
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
  * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
  *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * then, waiting on the semaphore will wait until the
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be acquired when it is released with the key 
+ * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key 
+ * or until the timeout specified by
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs
+ * has lapsed. The timeout interval can either be a finite value
+ * specified in milliseconds or an infinite value. In case an infinite
+ * value is specified the timeout never elapses. The windows INFINITE
+ * macro must be used to specify infinite timeout.
+ *
  * \param extSemArray - External semaphores to be waited on
  * \param paramsArray - Array of semaphore parameters
  * \param numExtSems  - Number of semaphores to wait on
@@ -9987,7 +12884,9 @@ CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extS
  * \return
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_NOT_INITIALIZED,
- * ::CUDA_ERROR_INVALID_HANDLE
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_TIMEOUT
  * \notefnerr
  *
  * \sa ::cuImportExternalSemaphore,
@@ -10017,8 +12916,6 @@ CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSem
  */
 CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
 
-#endif /* __CUDA_API_VERSION >= 10000 */
-
 /** @} */ /* END CUDA_EXTRES_INTEROP */
 
 /**
@@ -10064,7 +12961,6 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
  * @{
  */
 
-#if __CUDA_API_VERSION >= 8000
 /**
  * \brief Wait on a memory location
  *
@@ -10097,7 +12993,7 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
  *
  * \sa ::cuStreamWaitValue64,
  * ::cuStreamWriteValue32,
- * ::cuStreamWriteValue64
+ * ::cuStreamWriteValue64,
  * ::cuStreamBatchMemOp,
  * ::cuMemHostRegister,
  * ::cuStreamWaitEvent
@@ -10242,7 +13138,6 @@ CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint6
  * ::cuMemHostRegister
  */
 CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
-#endif /* __CUDA_API_VERSION >= 8000 */
 
 /** @} */ /* END CUDA_MEMOP */
 
@@ -10293,7 +13188,7 @@ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstrea
  * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
  *   dynamically-allocated shared memory.
  * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
- *   cache split ratio in percent of shared memory.
+ *   cache split ratio in percent of total shared memory.
  *
  * \param pi     - Returned attribute value
  * \param attrib - Attribute requested
@@ -10312,13 +13207,11 @@ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstrea
  * ::cuCtxSetCacheConfig,
  * ::cuFuncSetCacheConfig,
  * ::cuLaunchKernel,
- * ::cudaFuncGetAttributes
+ * ::cudaFuncGetAttributes,
  * ::cudaFuncSetAttribute
  */
 CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
 
-#if __CUDA_API_VERSION >= 9000
-
 /**
  * \brief Sets information about a function
  *
@@ -10339,8 +13232,9 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc
  *   architecture.
  * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
  *   cache and shared memory use the same hardware resources, this sets the shared memory
- *   carveout preference, in percent of the total resources. This is only a hint, and the
- *   driver can choose a different ratio if required to execute the function.
+ *   carveout preference, in percent of the total shared memory. 
+ *   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
  *
  * \param hfunc  - Function to query attribute of
  * \param attrib - Attribute requested
@@ -10359,11 +13253,10 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc
  * ::cuCtxSetCacheConfig,
  * ::cuFuncSetCacheConfig,
  * ::cuLaunchKernel,
- * ::cudaFuncGetAttributes
+ * ::cudaFuncGetAttributes,
  * ::cudaFuncSetAttribute
  */
 CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
-#endif // __CUDA_API_VERSION >= 9000
 
 /**
  * \brief Sets the preferred cache configuration for a device function
@@ -10409,7 +13302,6 @@ CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attri
  */
 CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
 
-#if __CUDA_API_VERSION >= 4020
 /**
  * \brief Sets the shared memory configuration for a device function.
  *
@@ -10461,9 +13353,33 @@ CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
  * ::cudaFuncSetSharedMemConfig
  */
 CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
-#endif
 
-#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Returns a module handle
+ *
+ * Returns in \p *hmod the handle of the module that function \p hfunc
+ * is located in. The lifetime of the module corresponds to the lifetime of
+ * the context it was loaded in or until the module is explicitly unloaded.
+ *
+ * The CUDA runtime manages its own modules loaded into the primary context.
+ * If the handle returned by this API refers to a module loaded by the CUDA runtime,
+ * calling ::cuModuleUnload() on that module will result in undefined behavior.
+ *
+ * \param hmod - Returned module handle
+ * \param hfunc   - Function to retrieve module for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc);
+
 /**
  * \brief Launches a CUDA function
  *
@@ -10522,8 +13438,8 @@ CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig confi
  * parameters are specified with both \p kernelParams and \p extra
  * (i.e. both \p kernelParams and \p extra are non-NULL).
  *
- * Calling ::cuLaunchKernel() sets persistent function state that is
- * the same as function state set through the following deprecated APIs:
+ * Calling ::cuLaunchKernel() invalidates the persistent function state
+ * set through the following deprecated APIs:
  *  ::cuFuncSetBlockShape(),
  *  ::cuFuncSetSharedSize(),
  *  ::cuParamSetSize(),
@@ -10531,10 +13447,6 @@ CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig confi
  *  ::cuParamSetf(),
  *  ::cuParamSetv().
  *
- * When the kernel \p f is launched via ::cuLaunchKernel(), the previous
- * block shape, shared size and parameter info associated with \p f
- * is overwritten.
- *
  * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
  * been compiled with toolchain version 3.2 or later so that it will
  * contain kernel parameter information, or have no kernel parameters.
@@ -10586,8 +13498,7 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f,
                                 CUstream hStream,
                                 void **kernelParams,
                                 void **extra);
-#endif /* __CUDA_API_VERSION >= 4000 */
-#if __CUDA_API_VERSION >= 9000
+
 /**
  * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute
  *
@@ -10678,6 +13589,8 @@ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
 /**
  * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute
  *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
  * Invokes kernels as specified in the \p launchParamsList array where each element
  * of the array specifies all the parameters required to perform a single kernel launch.
  * These kernels can cooperate and synchronize as they execute. The size of the array is
@@ -10808,11 +13721,7 @@ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
  * ::cuLaunchCooperativeKernel,
  * ::cudaLaunchCooperativeKernelMultiDevice
  */
-CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags);
-
-#endif /* __CUDA_API_VERSION >= 9000 */
-
-#if __CUDA_API_VERSION >= 10000
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags);
 
 /**
  * \brief Enqueues a host function call in a stream
@@ -10879,8 +13788,6 @@ CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launch
  */
 CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
 
-#endif /* __CUDA_API_VERSION >= 10000 */
-
 /** @} */ /* END CUDA_EXEC */
 
 /**
@@ -11107,6 +14014,21 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, voi
  * contains the number of threads specified by a previous call to
  * ::cuFuncSetBlockShape().
  *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
  * \param f - Kernel to launch
  *
  * \return
@@ -11144,6 +14066,21 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f);
  * blocks. Each block contains the number of threads specified by a previous
  * call to ::cuFuncSetBlockShape().
  *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
  * \param f           - Kernel to launch
  * \param grid_width  - Width of grid in blocks
  * \param grid_height - Height of grid in blocks
@@ -11183,6 +14120,21 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, in
  * blocks. Each block contains the number of threads specified by a previous
  * call to ::cuFuncSetBlockShape().
  *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
  * \param f           - Kernel to launch
  * \param grid_width  - Width of grid in blocks
  * \param grid_height - Height of grid in blocks
@@ -11202,8 +14154,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, in
  * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
  *
  * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no),
- *       this function may serialize kernel launches. In order to force the CUDA driver to retain
- *       asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate).
+ *       this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by
+ *       growing the per-thread stack as needed per launch and not shrinking it afterwards.
  *
  * \note_null_stream
  * \notefnerr
@@ -11247,7 +14199,6 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_widt
 __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
 /** @} */ /* END CUDA_EXEC_DEPRECATED */
 
-#if __CUDA_API_VERSION >= 10000
 /**
  * \defgroup CUDA_GRAPH Graph Management
  *
@@ -11334,9 +14285,9 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
  * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
  * to be specified as that information is retrieved directly from the kernel's image.
  *
- * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in
- * via \p extra. This places the burden on the application of knowing each kernel
- * parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single
+ * buffer that is passed in via \p extra. This places the burden on the application of knowing each
+ * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists
  * to allow this function to take additional less commonly used arguments. \p extra specifies
  * a list of names of extra settings and their corresponding values. Each extra setting name is
  * immediately followed by the corresponding value. The list must be terminated with either NULL or
@@ -11354,8 +14305,8 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
  *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
  *
  * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both
- * \p kernelParams and \p extra (i.e. both \p kernelParams and
- * \p extra are non-NULL).
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL).
+ * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel.
  *
  * The \p kernelParams or \p extra array, as well as the argument values it points to,
  * are copied during this call.
@@ -11380,6 +14331,7 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
  *
  * \sa
  * ::cuLaunchKernel,
+ * ::cuLaunchCooperativeKernel,
  * ::cuGraphKernelNodeGetParams,
  * ::cuGraphKernelNodeSetParams,
  * ::cuGraphCreate,
@@ -11390,7 +14342,7 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
  * ::cuGraphAddMemcpyNode,
  * ::cuGraphAddMemsetNode
  */
-CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
 
 /**
  * \brief Returns a kernel node's parameters
@@ -11493,7 +14445,7 @@ CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL
  * ::cuGraphAddHostNode,
  * ::cuGraphAddMemsetNode
  */
-CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
 
 /**
  * \brief Returns a memcpy node's parameters
@@ -11581,7 +14533,7 @@ CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY
  * ::cuGraphAddHostNode,
  * ::cuGraphAddMemcpyNode
  */
-CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
 
 /**
  * \brief Returns a memset node's parameters
@@ -11639,6 +14591,7 @@ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET
  * A handle to the new node will be returned in \p phGraphNode.
  *
  * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
  *
  * \param phGraphNode     - Returns newly created node
  * \param hGraph          - Graph to which to add the node
@@ -11650,6 +14603,7 @@ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET
  * ::CUDA_SUCCESS,
  * ::CUDA_ERROR_DEINITIALIZED,
  * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
  * ::CUDA_ERROR_INVALID_VALUE
  * \note_graph_thread_safety
  * \notefnerr
@@ -11666,7 +14620,7 @@ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET
  * ::cuGraphAddMemcpyNode,
  * ::cuGraphAddMemsetNode
  */
-CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams);
+CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams);
 
 /**
  * \brief Returns a host node's parameters
@@ -11723,6 +14677,8 @@ CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NOD
  * at the root of the graph. \p dependencies may not have any duplicate entries.
  * A handle to the new node will be returned in \p phGraphNode.
  *
+ * If \p hGraph contains allocation or free nodes, this call will return an error.
+ *
  * The node executes an embedded child graph. The child graph is cloned in this call.
  *
  * \param phGraphNode     - Returns newly created node
@@ -11750,7 +14706,7 @@ CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NOD
  * ::cuGraphAddMemsetNode,
  * ::cuGraphClone
  */
-CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
 
 /**
  * \brief Gets a handle to the embedded graph of a child graph node
@@ -11759,6 +14715,9 @@ CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGra
  * does not clone the graph. Changes to the graph will be reflected in
  * the node, and the node retains ownership of the graph.
  *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
  * \param hNode   - Node to get the embedded graph for
  * \param phGraph - Location to store a handle to the graph
  *
@@ -11812,12 +14771,654 @@ CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGra
  * ::cuGraphAddMemcpyNode,
  * ::cuGraphAddMemsetNode
  */
-CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies);
+CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ */
+CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+ 
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
+ * for details on what is captured by an event. \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ */
+CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p dependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The node's dependencies will not be launched until
+ * the wait operation has completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
+ * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
+ * 
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p hNode in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns the address of a memory free node \p hNode in \p dptr_out.
+ *
+ * \param hNode    - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out);
+
+/**
+ * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuDeviceGetGraphMemAttribute
+ */
+CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
+
+/**
+ * \brief Query asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Set asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - pointer to value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
 
 /**
  * \brief Clones a graph
  *
- * This function creates a copy of \p originalGraph and returns it in \p * phGraphClone.
+ * This function creates a copy of \p originalGraph and returns it in \p phGraphClone.
  * All parameters are copied into the cloned graph. The original graph may be modified
  * after this call without affecting the clone.
  *
@@ -12082,7 +15683,7 @@ CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *de
  * ::cuGraphNodeGetDependencies,
  * ::cuGraphNodeGetDependentNodes
  */
-CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies);
+CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
 
 /**
  * \brief Removes dependency edges from a graph
@@ -12094,6 +15695,9 @@ CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgra
  * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
  * Specifying a non-existing dependency will return an error.
  *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
  * \param hGraph - Graph from which to remove dependencies
  * \param from - Array of nodes that provide the dependencies
  * \param to - Array of dependent nodes
@@ -12111,7 +15715,7 @@ CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgra
  * ::cuGraphNodeGetDependencies,
  * ::cuGraphNodeGetDependentNodes
  */
-CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies);
+CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
 
 /**
  * \brief Remove a node from the graph
@@ -12119,6 +15723,9 @@ CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from, CU
  * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes
  * on \p hNode and vice versa.
  *
+ * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed.
+ * Any attempt to do so will return an error.
+ *
  * \param hNode  - Node to remove
  *
  * \return
@@ -12143,7 +15750,7 @@ CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode);
  * Instantiates \p hGraph as an executable graph. The graph is validated for any
  * structural constraints or intra-node constraints which were not previously
  * validated. If instantiation is successful, a handle to the instantiated graph
- * is returned in \p graphExec.
+ * is returned in \p phGraphExec.
  *
  * If there are any errors, diagnostic information may be returned in \p errorNode and
  * \p logBuffer. This is the primary way to inspect instantiation errors. The output
@@ -12167,12 +15774,461 @@ CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode);
  * \notefnerr
  *
  * \sa
+ * ::cuGraphInstantiateWithFlags,
  * ::cuGraphCreate,
+ * ::cuGraphUpload,
  * ::cuGraphLaunch,
  * ::cuGraphExecDestroy
  */
 CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
 
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p phGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * If \p hGraph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time.
+ *
+ * An attempt to instantiate a second executable graph before destroying the first
+ * with ::cuGraphExecDestroy will result in an error.
+ *
+ * \param phGraphExec - Returns instantiated graph
+ * \param hGraph      - Graph to instantiate
+ * \param flags       - Flags to control instantiation.  See ::CUgraphInstantiate_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphCreate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p hNode in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p hNode must not have been removed from the original graph. The \p func field 
+ * of \p nodeParams cannot be modified and must match the original value.
+ * All other values can be modified. 
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p hNode is also not modified by this call.
+ * 
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param hNode       - kernel node from the graph from which graphExec was instantiated
+ * \param nodeParams  - Updated Parameters to set
+ * 
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p copyParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The source and destination memory in \p copyParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p copyParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Memcpy node from the graph which was used to instantiate graphExec
+ * \param copyParams - The updated parameters to set
+ * \param ctx        - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p memsetParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The destination memory in \p memsetParams must be allocated from the same 
+ * contexts as the original destination memory.  Both the instantiation-time 
+ * memory operand and the memory operand in \p memsetParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
+ * either the original or new memory operand are multidimensional.
+ *
+ * \param hGraphExec   - The executable graph in which to set the specified node
+ * \param hNode        - Memset node from the graph which was used to instantiate graphExec
+ * \param memsetParams - The updated parameters to set
+ * \param ctx          - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p nodeParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Host node from the graph which was used to instantiate graphExec
+ * \param nodeParams - The updated parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained
+ * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p hNode must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also 
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order,  must match that
+ * of the graph contained in \p hNode.  See ::cuGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Host node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p hStream and any previous launches of \p hGraphExec.
+ * Uses memory cached by \p stream to back the allocations owned by \p hGraphExec.
+ *
+ * \param hGraphExec - Executable graph to upload
+ * \param hStream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
+
 /**
  * \brief Launches an executable graph in a stream
  *
@@ -12181,6 +16237,10 @@ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CU
  * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
  * instantiated multiple times into multiple executable graphs.
  *
+ * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
+ * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
+ * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
+ *
  * \param hGraphExec - Executable graph to launch
  * \param hStream    - Stream in which to launch the graph
  *
@@ -12194,6 +16254,7 @@ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CU
  *
  * \sa
  * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
  * ::cuGraphExecDestroy
  */
 CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
@@ -12218,6 +16279,7 @@ CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
  *
  * \sa
  * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
  * ::cuGraphLaunch
  */
 CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
@@ -12241,10 +16303,307 @@ CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
  * ::cuGraphCreate
  */
 CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
-/** @} */ /* END CUDA_GRAPH */
-#endif /* __CUDA_API_VERSION >= 10000 */
 
-#if __CUDA_API_VERSION >= 6050
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - Only 1D memsets can be changed.
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ * - External semaphore wait nodes and record nodes:
+ *   - Changing the number of semaphores is not supported.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under
+ * the following conditions:
+ *
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraph but not not its pair from \p hGraphExec, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
+ *   the pairless node from \p hGraph.
+ * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
+ *
+ * cuGraphExecUpdate sets \p updateResult_out to:
+ * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+ *   \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+ *   way(see note above), in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way 
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like 
+ *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph
+ *
+ * If \p updateResult_out isn't set in one of the situations described above, the update check passes
+ * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph.  If an error happens
+ * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise,
+ * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+ *
+ * cuGraphExecUpdate returns CUDA_SUCCESS when the updated was performed successfully.  It returns
+ * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
+ * \param updateResult_out Whether the graph update was permitted.  If was forbidden, the reason why
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ */
+CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p src to destination node \p dst.
+ * Both node must have the same context.
+ *
+ * \param[out] dst Destination node
+ * \param[in] src Source node
+ * For list of attributes see ::CUkernelNodeAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
+
+/**
+ * \brief Queries node attribute.
+ * 
+ * Queries attribute \p attr from node \p hNode and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode
+ * \param[in] attr
+ * \param[out] value_out 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *  
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      CUkernelNodeAttrValue *value_out);
+ 
+/**
+ * \brief Sets node attribute.
+ * 
+ * Sets attribute \p attr on node \p hNode from corresponding attribute of
+ * \p value.
+ *
+ * \param[out] hNode
+ * \param[in] attr
+ * \param[out] value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      const CUkernelNodeAttrValue *value);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param hGraph - The graph to create a DOT file from
+ * \param path   - The path to write the DOT file to
+ * \param flags  - Flags from CUgraphDebugDot_flags for specifying which additional node information to write
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ */
+CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags);
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy,
+                                    unsigned int initialRefcount, unsigned int flags);
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references
+ *                 from the calling thread, rather than create new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count);
+
+/** @} */ /* END CUDA_GRAPH */
+
 /**
  * \defgroup CUDA_OCCUPANCY Occupancy
  *
@@ -12422,17 +16781,39 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSi
  */
 CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
 
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM 
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ */
+CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+
 /** @} */ /* END CUDA_OCCUPANCY */
-#endif /* __CUDA_API_VERSION >= 6050 */
 
 /**
- * \defgroup CUDA_TEXREF Texture Reference Management
+ * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
  *
- * ___MANBRIEF___ texture reference management functions of the low-level CUDA
- * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ * ___MANBRIEF___ deprecated texture reference management functions of the
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
  *
- * This section describes the texture reference management functions of the
- * low-level CUDA driver application programming interface.
+ * This section describes the deprecated texture reference management
+ * functions of the low-level CUDA driver application programming interface.
  *
  * @{
  */
@@ -12440,6 +16821,8 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int
 /**
  * \brief Binds an array as a texture reference
  *
+ * \deprecated
+ *
  * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
  * previous address or CUDA array state associated with the texture reference
  * is superseded by this function. \p Flags must be set to
@@ -12464,11 +16847,13 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTextureToArray
  */
-CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
 
 /**
  * \brief Binds a mipmapped array to a texture reference
  *
+ * \deprecated
+ *
  * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
  * Any previous address or CUDA array state associated with the texture reference
  * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT.
@@ -12492,12 +16877,13 @@ CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Binds an address as a texture reference
  *
+ * \deprecated
+ *
  * Binds a linear address range to the texture reference \p hTexRef. Any
  * previous address or CUDA array state associated with the texture reference
  * is superseded by this function. Any memory previously bound to \p hTexRef
@@ -12537,11 +16923,13 @@ CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hM
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTexture
  */
-CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
 
 /**
  * \brief Binds an address as a 2D texture reference
  *
+ * \deprecated
+ *
  * Binds a linear address range to the texture reference \p hTexRef. Any
  * previous address or CUDA array state associated with the texture reference
  * is superseded by this function. Any memory previously bound to \p hTexRef
@@ -12590,12 +16978,13 @@ CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdevi
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTexture2D
  */
-CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
-#endif /* __CUDA_API_VERSION >= 3020 */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
 
 /**
  * \brief Sets the format for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the format of the data to be read by the texture reference
  * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
  * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
@@ -12624,11 +17013,13 @@ CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIP
  * ::cudaBindTextureToArray,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
 
 /**
  * \brief Sets the addressing mode for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the addressing mode \p am for the given dimension \p dim of the
  * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
  * applied to the first parameter of the functions used to fetch from the
@@ -12668,11 +17059,13 @@ CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int Num
  * ::cudaBindTextureToArray,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
 
 /**
  * \brief Sets the filtering mode for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the filtering mode \p fm to be used when reading memory through
  * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
  *
@@ -12702,11 +17095,13 @@ CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mod
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTextureToArray
  */
-CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
 
 /**
  * \brief Sets the mipmap filtering mode for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the mipmap filtering mode \p fm to be used when reading memory through
  * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
  *
@@ -12736,11 +17131,13 @@ CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
 
 /**
  * \brief Sets the mipmap level bias for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when
  * reading memory through the texture reference \p hTexRef.
  *
@@ -12763,11 +17160,13 @@ CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm)
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
 
 /**
  * \brief Sets the mipmap min/max mipmap level clamps for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
  * respectively, to be used when reading memory through the texture reference
  * \p hTexRef.
@@ -12792,11 +17191,13 @@ CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
 
 /**
  * \brief Sets the maximum anisotropy for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
  * the texture reference \p hTexRef.
  *
@@ -12820,11 +17221,13 @@ CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLe
  * ::cudaBindTextureToArray,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
 
 /**
  * \brief Sets the border color for a texture reference
  *
+ * \deprecated
+ *
  * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference
  * \p hTexRef. The color value supports only float type and holds color components in
  * the following sequence:
@@ -12854,11 +17257,13 @@ CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAnis
  * ::cudaBindTextureToArray,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
 
 /**
  * \brief Sets the flags for a texture reference
  *
+ * \deprecated
+ *
  * Specifies optional flags via \p Flags to specify the behavior of data
  * returned through the texture reference \p hTexRef. The valid flags are:
  *
@@ -12872,6 +17277,10 @@ CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
  *   from [0, Dim) where Dim is the width or height of the CUDA
  *   array. Instead, the texture coordinates [0, 1.0) reference
  *   the entire breadth of the array dimension;
+ * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
  *
  * \param hTexRef - Texture reference
  * \param Flags   - Optional flags to set
@@ -12893,12 +17302,13 @@ CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
  * ::cudaBindTextureToArray,
  * ::cudaBindTextureToMipmappedArray
  */
-CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
 
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Gets the address associated with a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *pdptr the base address bound to the texture reference
  * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
  * is not bound to any device memory range.
@@ -12919,12 +17329,13 @@ CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
  * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
-#endif /* __CUDA_API_VERSION >= 3020 */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
 
 /**
  * \brief Gets the array bound to a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *phArray the CUDA array bound to the texture reference
  * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
  * is not bound to any CUDA array.
@@ -12945,11 +17356,13 @@ CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
 
 /**
  * \brief Gets the mipmapped array bound to a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
  * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
  * is not bound to any CUDA mipmapped array.
@@ -12970,11 +17383,13 @@ CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
 
 /**
  * \brief Gets the addressing mode used by a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *pam the addressing mode corresponding to the
  * dimension \p dim of the texture reference \p hTexRef. Currently, the only
  * valid value for \p dim are 0 and 1.
@@ -12996,11 +17411,13 @@ CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, C
  * ::cuTexRefGetAddress, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
 
 /**
  * \brief Gets the filter-mode used by a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *pfm the filtering mode of the texture reference
  * \p hTexRef.
  *
@@ -13020,11 +17437,13 @@ CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, i
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
 
 /**
  * \brief Gets the format used by a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *pFormat and \p *pNumChannels the format and number
  * of components of the CUDA array bound to the texture reference \p hTexRef.
  * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
@@ -13046,11 +17465,13 @@ CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
  */
-CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
 
 /**
  * \brief Gets the mipmap filtering mode for a texture reference
  *
+ * \deprecated
+ *
  * Returns the mipmap filtering mode in \p pfm that's used when reading memory through
  * the texture reference \p hTexRef.
  *
@@ -13070,11 +17491,13 @@ CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, C
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
 
 /**
  * \brief Gets the mipmap level bias for a texture reference
  *
+ * \deprecated
+ *
  * Returns the mipmap level bias in \p pBias that's added to the specified mipmap
  * level when reading memory through the texture reference \p hTexRef.
  *
@@ -13094,11 +17517,13 @@ CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRe
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
 
 /**
  * \brief Gets the min/max mipmap level clamps for a texture reference
  *
+ * \deprecated
+ *
  * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
  * that's used when reading memory through the texture reference \p hTexRef.
  *
@@ -13119,11 +17544,13 @@ CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
 
 /**
  * \brief Gets the maximum anisotropy for a texture reference
  *
+ * \deprecated
+ *
  * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through
  * the texture reference \p hTexRef.
  *
@@ -13143,11 +17570,13 @@ CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
 
 /**
  * \brief Gets the border color used by a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p pBorderColor, values of the RGBA color used by
  * the texture reference \p hTexRef.
  * The color value is of type float and holds color components in
@@ -13170,11 +17599,13 @@ CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
  * \sa ::cuTexRefSetAddressMode,
  * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor
  */
-CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
 
 /**
  * \brief Gets the flags used by a texture reference
  *
+ * \deprecated
+ *
  * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
  *
  * \param pFlags  - Returned flags
@@ -13193,21 +17624,7 @@ CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
  * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
  * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
  */
-CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
-
-/** @} */ /* END CUDA_TEXREF */
-
-/**
- * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated texture reference management functions of the
- * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the deprecated texture reference management
- * functions of the low-level CUDA driver application programming interface.
- *
- * @{
- */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
 
 /**
  * \brief Creates a texture reference
@@ -13258,7 +17675,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
 
 
 /**
- * \defgroup CUDA_SURFREF Surface Reference Management
+ * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED]
  *
  * ___MANBRIEF___ surface reference management functions of the low-level CUDA
  * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
@@ -13272,6 +17689,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
 /**
  * \brief Sets the CUDA array for a surface reference.
  *
+ * \deprecated
+ *
  * Sets the CUDA array \p hArray to be read and written by the surface reference
  * \p hSurfRef.  Any previous CUDA array state associated with the surface
  * reference is superseded by this function.  \p Flags must be set to 0.
@@ -13294,11 +17713,13 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
  * ::cuSurfRefGetArray,
  * ::cudaBindSurfaceToArray
  */
-CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
 
 /**
  * \brief Passes back the CUDA array bound to a surface reference.
  *
+ * \deprecated
+ *
  * Returns in \p *phArray the CUDA array bound to the surface reference
  * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
  * is not bound to any CUDA array.
@@ -13315,11 +17736,10 @@ CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned
  *
  * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
  */
-CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
 
-/** @} */ /* END CUDA_SURFREF */
+/** @} */ /* END CUDA_SURFREF_DEPRECATED */
 
-#if __CUDA_API_VERSION >= 5000
 /**
  * \defgroup CUDA_TEXOBJECT Texture Object Management
  *
@@ -13457,11 +17877,19 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
  *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
  *
  * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
- *   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the
- *     range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified.
- *   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is
- *     the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note
- *     that for CUDA mipmapped arrays, this flag has to be set.
+ *   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that texture with 32-bit integer format would not be 
+ *   promoted, regardless of whether or not this flag is specified.
+ *   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior
+ *   of having the texture coordinates range from [0, Dim) where Dim is the 
+ *   width or height of the CUDA array. Instead, the texture coordinates 
+ *   [0, 1.0) reference the entire breadth of the array dimension; Note that
+ *   for CUDA mipmapped arrays, this flag has to be set.
+ *   - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
  *
  * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
  *   clamped to the range [1,16].
@@ -13710,7 +18138,6 @@ CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject);
 CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject);
 
 /** @} */ /* END CUDA_SURFOBJECT */
-#endif /* __CUDA_API_VERSION >= 5000 */
 
 /**
  * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
@@ -13724,8 +18151,6 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur
  * @{
  */
 
-#if __CUDA_API_VERSION >= 4000
-
 /**
  * \brief Queries if a device may directly access a peer device's memory.
  *
@@ -13767,7 +18192,9 @@ CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevic
  * memory from the current context in \p peerContext, a separate symmetric call
  * to ::cuCtxEnablePeerAccess() is required.
  *
- * There is a system-wide maximum of eight peer connections per device.
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
  *
  * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates
  * that the ::CUdevice of the current context cannot directly access memory
@@ -13832,10 +18259,6 @@ CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags
  */
 CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
 
-#endif /* __CUDA_API_VERSION >= 4000 */
-
-#if __CUDA_API_VERSION >= 8000
-
 /**
  * \brief Queries attributes of the link between two devices.
  *
@@ -13876,8 +18299,6 @@ CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
  */
 CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
 
-#endif /* __CUDA_API_VERSION >= 8000 */
-
 /** @} */ /* END CUDA_PEER_ACCESS */
 
 /**
@@ -13962,8 +18383,6 @@ CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
  */
 CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
 
-#if __CUDA_API_VERSION >= 5000
-
 /**
  * \brief Get a mipmapped array through which to access a mapped graphics resource.
  *
@@ -13995,9 +18414,6 @@ CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphics
  */
 CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
 
-#endif /* __CUDA_API_VERSION >= 5000 */
-
-#if __CUDA_API_VERSION >= 3020
 /**
  * \brief Get a device pointer through which to access a mapped graphics resource.
  *
@@ -14031,7 +18447,6 @@ CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMi
  * ::cudaGraphicsResourceGetMappedPointer
  */
 CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
-#endif /* __CUDA_API_VERSION >= 3020 */
 
 /**
  * \brief Set usage flags for mapping a graphics resource
@@ -14153,8 +18568,73 @@ CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource
 
 /** @} */ /* END CUDA_GRAPHICS */
 
-CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+/**
+ * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access 
+ *
+ * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
 
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p **pfn the address of the CUDA driver function for the requested
+ * CUDA version and flags.
+ *
+ * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
+ * should be specified as 11020. For a requested driver symbol, if the specified
+ * CUDA version is greater than or equal to the CUDA version in which the driver symbol
+ * was introduced, this API will return the function pointer to the corresponding
+ * versioned function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not
+ * supported on the platform, no ABI compatible driver function exists for the specified
+ * \p cudaVersion or if the driver symbol is invalid.
+ *
+ * The requested flags can be:
+ * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to
+ *   ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise.
+ * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and
+ *                 \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. 
+ * \param pfn - Location to return the function pointer to the requested driver function
+ * \param cudaVersion - The CUDA version to look for the requested driver symbol 
+ * \param flags -  Flags to specify search options.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_version_mixing
+ *
+ * \sa
+ * ::cudaGetDriverEntryPoint
+ */
+CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
+
+/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */
+
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
 
 /**
  * CUDA API versioning support
@@ -14233,6 +18713,7 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp
     #undef cuStreamQuery
     #undef cuStreamSynchronize
     #undef cuEventRecord
+    #undef cuEventRecordWithFlags
     #undef cuLaunchKernel
     #undef cuLaunchHostFunc
     #undef cuGraphicsMapResources
@@ -14249,184 +18730,160 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp
     #undef cuStreamBeginCapture
     #undef cuStreamEndCapture
     #undef cuStreamIsCapturing
+    #undef cuStreamGetCaptureInfo
+    #undef cuStreamGetCaptureInfo_v2
+    #undef cuGraphUpload
     #undef cuGraphLaunch
-#endif /* __CUDA_API_VERSION_INTERNAL */
+    #undef cuDevicePrimaryCtxRelease
+    #undef cuDevicePrimaryCtxReset
+    #undef cuDevicePrimaryCtxSetFlags
+    #undef cuIpcOpenMemHandle
+    #undef cuStreamCopyAttributes
+    #undef cuStreamSetAttribute
+    #undef cuStreamGetAttribute
+    #undef cuGraphInstantiate
+    #undef cuMemMapArrayAsync
+    #undef cuMemFreeAsync 
+    #undef cuMemAllocAsync 
+    #undef cuMemAllocFromPoolAsync 
+    #undef cuStreamUpdateCaptureDependencies
 
-#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050)
-CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-#endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */
+    CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+    CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+    CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
 
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050
-CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
-#endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */
+    typedef unsigned int CUdeviceptr_v1;
 
-#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050)
-CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
-CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
-    unsigned int numOptions, CUjit_option *options, void **optionValues);
-CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
-    unsigned int numOptions, CUjit_option *options, void **optionValues);
-#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */
+    typedef struct CUDA_MEMCPY2D_v1_st
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
+        CUarray srcArray;           /**< Source array reference */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
 
-#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010)
-CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
-#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
+        CUarray dstArray;           /**< Destination array reference */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
 
-/**
- * CUDA API made obselete at API version 3020
- */
-#if defined(__CUDA_API_VERSION_INTERNAL)
-    #define CUdeviceptr                  CUdeviceptr_v1
-    #define CUDA_MEMCPY2D_st             CUDA_MEMCPY2D_v1_st
-    #define CUDA_MEMCPY2D                CUDA_MEMCPY2D_v1
-    #define CUDA_MEMCPY3D_st             CUDA_MEMCPY3D_v1_st
-    #define CUDA_MEMCPY3D                CUDA_MEMCPY3D_v1
-    #define CUDA_ARRAY_DESCRIPTOR_st     CUDA_ARRAY_DESCRIPTOR_v1_st
-    #define CUDA_ARRAY_DESCRIPTOR        CUDA_ARRAY_DESCRIPTOR_v1
-    #define CUDA_ARRAY3D_DESCRIPTOR_st   CUDA_ARRAY3D_DESCRIPTOR_v1_st
-    #define CUDA_ARRAY3D_DESCRIPTOR      CUDA_ARRAY3D_DESCRIPTOR_v1
-#endif /* CUDA_FORCE_LEGACY32_INTERNAL */
+        unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
+        unsigned int Height;        /**< Height of 2D memory copy */
+    } CUDA_MEMCPY2D_v1;
 
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
+    typedef struct CUDA_MEMCPY3D_v1_st
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        unsigned int srcZ;          /**< Source Z */
+        unsigned int srcLOD;        /**< Source LOD */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
+        CUarray srcArray;           /**< Source array reference */
+        void *reserved0;            /**< Must be NULL */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+        unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
 
-typedef unsigned int CUdeviceptr;
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        unsigned int dstZ;          /**< Destination Z */
+        unsigned int dstLOD;        /**< Destination LOD */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
+        CUarray dstArray;           /**< Destination array reference */
+        void *reserved1;            /**< Must be NULL */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+        unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
 
-typedef struct CUDA_MEMCPY2D_st
-{
-    unsigned int srcXInBytes;   /**< Source X in bytes */
-    unsigned int srcY;          /**< Source Y */
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost;        /**< Source host pointer */
-    CUdeviceptr srcDevice;      /**< Source device pointer */
-    CUarray srcArray;           /**< Source array reference */
-    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+        unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
+        unsigned int Height;        /**< Height of 3D memory copy */
+        unsigned int Depth;         /**< Depth of 3D memory copy */
+    } CUDA_MEMCPY3D_v1;
 
-    unsigned int dstXInBytes;   /**< Destination X in bytes */
-    unsigned int dstY;          /**< Destination Y */
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost;              /**< Destination host pointer */
-    CUdeviceptr dstDevice;      /**< Destination device pointer */
-    CUarray dstArray;           /**< Destination array reference */
-    unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+    typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st
+    {
+        unsigned int Width;         /**< Width of array */
+        unsigned int Height;        /**< Height of array */
 
-    unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
-    unsigned int Height;        /**< Height of 2D memory copy */
-} CUDA_MEMCPY2D;
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+    } CUDA_ARRAY_DESCRIPTOR_v1;
 
-typedef struct CUDA_MEMCPY3D_st
-{
-    unsigned int srcXInBytes;   /**< Source X in bytes */
-    unsigned int srcY;          /**< Source Y */
-    unsigned int srcZ;          /**< Source Z */
-    unsigned int srcLOD;        /**< Source LOD */
-    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
-    const void *srcHost;        /**< Source host pointer */
-    CUdeviceptr srcDevice;      /**< Source device pointer */
-    CUarray srcArray;           /**< Source array reference */
-    void *reserved0;            /**< Must be NULL */
-    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
-    unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+    typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st
+    {
+        unsigned int Width;         /**< Width of 3D array */
+        unsigned int Height;        /**< Height of 3D array */
+        unsigned int Depth;         /**< Depth of 3D array */
 
-    unsigned int dstXInBytes;   /**< Destination X in bytes */
-    unsigned int dstY;          /**< Destination Y */
-    unsigned int dstZ;          /**< Destination Z */
-    unsigned int dstLOD;        /**< Destination LOD */
-    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
-    void *dstHost;              /**< Destination host pointer */
-    CUdeviceptr dstDevice;      /**< Destination device pointer */
-    CUarray dstArray;           /**< Destination array reference */
-    void *reserved1;            /**< Must be NULL */
-    unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
-    unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+        unsigned int Flags;         /**< Flags */
+    } CUDA_ARRAY3D_DESCRIPTOR_v1;
 
-    unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
-    unsigned int Height;        /**< Height of 3D memory copy */
-    unsigned int Depth;         /**< Depth of 3D memory copy */
-} CUDA_MEMCPY3D;
+    CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
+    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
+    CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
+    CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
 
-typedef struct CUDA_ARRAY_DESCRIPTOR_st
-{
-    unsigned int Width;         /**< Width of array */
-    unsigned int Height;        /**< Height of array */
+    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
+    CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+    CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+    CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+    CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+    CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
 
-    CUarray_format Format;      /**< Array format */
-    unsigned int NumChannels;   /**< Channels per array element */
-} CUDA_ARRAY_DESCRIPTOR;
-
-typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
-{
-    unsigned int Width;         /**< Width of 3D array */
-    unsigned int Height;        /**< Height of 3D array */
-    unsigned int Depth;         /**< Depth of 3D array */
-
-    CUarray_format Format;      /**< Array format */
-    unsigned int NumChannels;   /**< Channels per array element */
-    unsigned int Flags;         /**< Flags */
-} CUDA_ARRAY3D_DESCRIPTOR;
-
-CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
-CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
-CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
-CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
-CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
-CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
-CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
-CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
-CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
-CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
-CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
-CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
-CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
-CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
-CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
-CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
-CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
-CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
-CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
-CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
-CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
-CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
-CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
-CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
-CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
-CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
-CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
-CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
-CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
-CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
-CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
-CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
-CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
-CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */
-#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000
-CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
-CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
-CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
-CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
-CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
-#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */
-#if defined(__CUDA_API_VERSION_INTERNAL)
-    #undef CUdeviceptr
-    #undef CUDA_MEMCPY2D_st
-    #undef CUDA_MEMCPY2D
-    #undef CUDA_MEMCPY3D_st
-    #undef CUDA_MEMCPY3D
-    #undef CUDA_ARRAY_DESCRIPTOR_st
-    #undef CUDA_ARRAY_DESCRIPTOR
-    #undef CUDA_ARRAY3D_DESCRIPTOR_st
-    #undef CUDA_ARRAY3D_DESCRIPTOR
-#endif /* __CUDA_API_VERSION_INTERNAL */
-
-#if defined(__CUDA_API_VERSION_INTERNAL)
     CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
     CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
     CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
@@ -14474,6 +18931,7 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
     CUresult CUDAAPI cuStreamQuery(CUstream hStream);
     CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
     CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+    CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
     CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
     CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
     CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
@@ -14488,16 +18946,49 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
     CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
     CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
     CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode);
     CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
     CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+    CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+    CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream);
     CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream);
+    CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value);
+    CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param);
+
+    CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+    CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream);
+
+    CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) {
+    const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM|
+                                 CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM);
+    if ((flags & procAddressMask) == 0) {
+        flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM;
+    }
+    return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); 
+}
+#define cuGetProcAddress cuGetProcAddress_ptsz
 #endif
 
 #ifdef __cplusplus
 }
 #endif
 
-#undef __CUDA_API_VERSION
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility pop
+  #endif
+#endif
+
 #undef __CUDA_DEPRECATED
 
 #endif /* __cuda_cuda_h__ */
diff --git a/python/src/triton.cc b/python/src/triton.cc
index 4d7df76ff..cb6923afe 100644
--- a/python/src/triton.cc
+++ b/python/src/triton.cc
@@ -101,12 +101,12 @@ void hip_enqueue(uint64_t stream, uint64_t kernel,
 
 }
 
-std::string pow2_divisor(long N){
-    if(N % 16 == 0) return "16";
-    if(N % 8 == 0) return "8";
-    if(N % 4 == 0) return "4";
-    if(N % 2 == 0) return "2";
-    return "1";
+long pow2_divisor(long N){
+    if(N % 16 == 0) return 16;
+    if(N % 8 == 0) return 8;
+    if(N % 4 == 0) return 4;
+    if(N % 2 == 0) return 2;
+    return 1;
 }
 
 // Returns something like "int16", whether dtype is a torch.dtype or
@@ -127,6 +127,14 @@ std::string dtype_cache_key_part(const py::object& dtype) {
   }
 }
 
+size_t get_pointer_range_size(uint64_t addr){
+  if(addr == 0)
+    return 0;
+  size_t size;
+  drv::dispatch::cuPointerGetAttribute(&size, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)addr);
+  return size;
+}
+
 // Launch
 void parse_args(py::list& args, py::list do_not_specialize, const std::string& func_key, py::list& arg_names,
                 std::string& cache_key, std::string& params, size_t& params_size, py::dict constants,
@@ -187,7 +195,7 @@ void parse_args(py::list& args, py::list do_not_specialize, const std::string& f
           continue;
         // values divisible by small powers of 2 are specialized
         cache_key += "[multipleof(";
-        cache_key += pow2_divisor(value);
+        cache_key += std::to_string(pow2_divisor(value));
         cache_key += ")]";
         continue;
       }
@@ -213,12 +221,15 @@ void parse_args(py::list& args, py::list do_not_specialize, const std::string& f
         py::object data_ptr = arg.attr("data_ptr")();
         long value = data_ptr.cast<long>();
         params_ptr = (char*)(((uintptr_t)params_ptr + 7) & (-8));
+        // copy param
         std::memcpy(params_ptr, &value, 8);
         params_ptr += 8;
+        // udpate cache key
         cache_key += dtype_cache_key_part(arg.attr("dtype"));
         cache_key += "*";
         cache_key += "[multipleof(";
-        cache_key += pow2_divisor(value);
+        size_t range_size = get_pointer_range_size(value);
+        cache_key += std::to_string(std::min(pow2_divisor(value), pow2_divisor(range_size)));
         cache_key += ")]";
         continue;
       }
@@ -268,6 +279,10 @@ void init_triton_runtime(py::module &&m) {
     }
   );
 
+  // get range size for the given pointer
+  m.def("get_pointer_range_size", &get_pointer_range_size);
+
+
   // cache key
   m.def("launch", [](py::list args, py::list do_not_specialize, const std::string& func_key, py::list& arg_names, 
                      py::object device, py::int_ stream, py::dict bin_cache, py::int_ num_warps, py::int_ num_stages, 
diff --git a/python/triton/code_gen.py b/python/triton/code_gen.py
index b7da2047e..63e3d0aa0 100644
--- a/python/triton/code_gen.py
+++ b/python/triton/code_gen.py
@@ -674,9 +674,17 @@ class Kernel:
     def add_to_cache(self, key, wargs, device_idx, num_warps, num_stages):
         tensor_idxs = [i for i, arg in enumerate(wargs) if hasattr(arg, 'data_ptr')]
         # attributes
-        args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)]
-        attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args)
-                      if isinstance(a, int) and i not in self.fn.do_not_specialize}
+        attributes = dict()
+        for i, arg in enumerate(wargs):
+            if i in self.fn.do_not_specialize:
+                continue
+            if isinstance(arg, int):
+                attributes[i] = Kernel.pow2_divisor(arg)
+            elif i in tensor_idxs:
+                addr = arg.data_ptr()
+                range_size = _triton.runtime.get_pointer_range_size(addr)
+                attributes[i] = min(Kernel.pow2_divisor(addr),
+                                    Kernel.pow2_divisor(range_size))
 
         # transforms ints whose value is one into constants for just-in-time compilation
         constants = {i: arg for i, arg in enumerate(wargs) if isinstance(arg, int) and arg == 1 and i not in self.fn.do_not_specialize}
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
index 4ef6408d0..6f1bf3ba7 100644
--- a/python/triton/language/core.py
+++ b/python/triton/language/core.py
@@ -165,6 +165,7 @@ class block:
         self.numel = 1
         for s in self.shape:
             self.numel *= s
+        self.numel = constexpr(self.numel)
         # Data-type wrapper
         self.dtype = block._init_dtype(self.handle.type.scalar)
 
@@ -873,7 +874,7 @@ def ravel(x):
     :param x: the input block
     :type x: Block
     """
-    return triton.language.reshape(x, [x.type.numel])
+    return triton.language.reshape(x, [x.numel])
 
 
 @triton.jit