diff --git a/include/triton/external/CUDA/cuda.h b/include/triton/external/CUDA/cuda.h index 24d96bd6c..f7bf9fc12 100755 --- a/include/triton/external/CUDA/cuda.h +++ b/include/triton/external/CUDA/cuda.h @@ -60,9 +60,6 @@ typedef uint32_t cuuint32_t; typedef uint64_t cuuint64_t; #endif -/** - * CUDA API versioning support - */ #if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) #define __CUDA_DEPRECATED #elif defined(_MSC_VER) @@ -74,14 +71,8 @@ typedef uint64_t cuuint64_t; #endif #if defined(CUDA_FORCE_API_VERSION) - #if (CUDA_FORCE_API_VERSION == 3010) - #define __CUDA_API_VERSION 3010 - #else - #error "Unsupported value of CUDA_FORCE_API_VERSION" - #endif -#else - #define __CUDA_API_VERSION 10000 -#endif /* CUDA_FORCE_API_VERSION */ +#error "CUDA_FORCE_API_VERSION is no longer supported." +#endif #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) #define __CUDA_API_PER_THREAD_DEFAULT_STREAM @@ -92,74 +83,66 @@ typedef uint64_t cuuint64_t; #define __CUDA_API_PTSZ(api) api #endif -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020 - #define cuDeviceTotalMem cuDeviceTotalMem_v2 - #define cuCtxCreate cuCtxCreate_v2 - #define cuModuleGetGlobal cuModuleGetGlobal_v2 - #define cuMemGetInfo cuMemGetInfo_v2 - #define cuMemAlloc cuMemAlloc_v2 - #define cuMemAllocPitch cuMemAllocPitch_v2 - #define cuMemFree cuMemFree_v2 - #define cuMemGetAddressRange cuMemGetAddressRange_v2 - #define cuMemAllocHost cuMemAllocHost_v2 - #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 - #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) - #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) - #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) - #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) - #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) - #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) - #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) - #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) - #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) - #define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) - #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) - #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) - #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) - #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) - #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) - #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) - #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) - #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) - #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) - #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) - #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) - #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) - #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) - #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) - #define cuArrayCreate cuArrayCreate_v2 - #define cuArrayGetDescriptor cuArrayGetDescriptor_v2 - #define cuArray3DCreate cuArray3DCreate_v2 - #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 - #define cuTexRefSetAddress cuTexRefSetAddress_v2 - #define cuTexRefGetAddress cuTexRefGetAddress_v2 - #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */ -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000 - #define cuCtxDestroy cuCtxDestroy_v2 - #define cuCtxPopCurrent cuCtxPopCurrent_v2 - #define cuCtxPushCurrent cuCtxPushCurrent_v2 - #define cuStreamDestroy cuStreamDestroy_v2 - #define cuEventDestroy cuEventDestroy_v2 -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */ -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010 - #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */ -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 - #define cuLinkCreate cuLinkCreate_v2 - #define cuLinkAddData cuLinkAddData_v2 - #define cuLinkAddFile cuLinkAddFile_v2 -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 - #define cuMemHostRegister cuMemHostRegister_v2 - #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ - -#if !defined(__CUDA_API_VERSION_INTERNAL) -#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 - #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2 -#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */ -#endif /* __CUDA_API_VERSION_INTERNAL */ +#define cuDeviceTotalMem cuDeviceTotalMem_v2 +#define cuCtxCreate cuCtxCreate_v2 +#define cuCtxCreate_v3 cuCtxCreate_v3 +#define cuModuleGetGlobal cuModuleGetGlobal_v2 +#define cuMemGetInfo cuMemGetInfo_v2 +#define cuMemAlloc cuMemAlloc_v2 +#define cuMemAllocPitch cuMemAllocPitch_v2 +#define cuMemFree cuMemFree_v2 +#define cuMemGetAddressRange cuMemGetAddressRange_v2 +#define cuMemAllocHost cuMemAllocHost_v2 +#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 +#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) +#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) +#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) +#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) +#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) +#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) +#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) +#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) +#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) +#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) +#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) +#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) +#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) +#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) +#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) +#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) +#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) +#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) +#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) +#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) +#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) +#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) +#define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) +#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) +#define cuArrayCreate cuArrayCreate_v2 +#define cuArrayGetDescriptor cuArrayGetDescriptor_v2 +#define cuArray3DCreate cuArray3DCreate_v2 +#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 +#define cuTexRefSetAddress cuTexRefSetAddress_v2 +#define cuTexRefGetAddress cuTexRefGetAddress_v2 +#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 +#define cuCtxDestroy cuCtxDestroy_v2 +#define cuCtxPopCurrent cuCtxPopCurrent_v2 +#define cuCtxPushCurrent cuCtxPushCurrent_v2 +#define cuStreamDestroy cuStreamDestroy_v2 +#define cuEventDestroy cuEventDestroy_v2 +#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#define cuLinkCreate cuLinkCreate_v2 +#define cuLinkAddData cuLinkAddData_v2 +#define cuLinkAddFile cuLinkAddFile_v2 +#define cuMemHostRegister cuMemHostRegister_v2 +#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) +#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 +#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 +#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 +#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 +#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 +#define cuGraphInstantiate cuGraphInstantiate_v2 #if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) @@ -181,14 +164,17 @@ typedef uint64_t cuuint64_t; #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) - #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture) #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) + #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) + #define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) + #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) @@ -205,7 +191,16 @@ typedef uint64_t cuuint64_t; #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) + #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) + #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) + #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) + #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) + #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) + + #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) + #define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync) + #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) #endif /** @@ -229,7 +224,7 @@ typedef uint64_t cuuint64_t; /** * CUDA API version number */ -#define CUDA_VERSION 10000 +#define CUDA_VERSION 11050 #ifdef __cplusplus extern "C" { @@ -239,34 +234,36 @@ extern "C" { * CUDA device pointer * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. */ -#if __CUDA_API_VERSION >= 3020 - #if defined(_WIN64) || defined(__LP64__) -typedef unsigned long long CUdeviceptr; +typedef unsigned long long CUdeviceptr_v2; #else -typedef unsigned int CUdeviceptr; +typedef unsigned int CUdeviceptr_v2; #endif +typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ -#endif /* __CUDA_API_VERSION >= 3020 */ - -typedef int CUdevice; /**< CUDA device */ -typedef struct CUctx_st *CUcontext; /**< CUDA context */ -typedef struct CUmod_st *CUmodule; /**< CUDA module */ -typedef struct CUfunc_st *CUfunction; /**< CUDA function */ -typedef struct CUarray_st *CUarray; /**< CUDA array */ -typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ -typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ -typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ -typedef struct CUevent_st *CUevent; /**< CUDA event */ -typedef struct CUstream_st *CUstream; /**< CUDA stream */ -typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ -typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ -typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ -typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ -typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ -typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ -typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ -typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ +typedef int CUdevice_v1; /**< CUDA device */ +typedef CUdevice_v1 CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ +typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ +typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ +typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ +typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ +typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ +typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ +typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ +typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ +typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ #ifndef CU_UUID_HAS_BEEN_DEFINED #define CU_UUID_HAS_BEEN_DEFINED @@ -275,8 +272,6 @@ typedef struct CUuuid_st { /**< CUDA definition o } CUuuid; #endif -#if __CUDA_API_VERSION >= 4010 - /** * CUDA IPC handle size */ @@ -287,14 +282,16 @@ typedef struct CUuuid_st { /**< CUDA definition o */ typedef struct CUipcEventHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcEventHandle; +} CUipcEventHandle_v1; +typedef CUipcEventHandle_v1 CUipcEventHandle; /** * CUDA IPC mem handle */ typedef struct CUipcMemHandle_st { char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcMemHandle; +} CUipcMemHandle_v1; +typedef CUipcMemHandle_v1 CUipcMemHandle; /** * CUDA Ipc Mem Flags @@ -303,7 +300,6 @@ typedef enum CUipcMem_flags_enum { CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ } CUipcMem_flags; -#endif /** * CUDA Mem Attach Flags @@ -326,7 +322,9 @@ typedef enum CUctx_flags_enum { * \deprecated This flag was deprecated as of CUDA 4.0 * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ CU_CTX_SCHED_MASK = 0x07, - CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 + * and it no longer has any effect. All contexts + * as of CUDA 3.2 behave as though the flag is enabled. */ CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ CU_CTX_FLAGS_MASK = 0x1f } CUctx_flags; @@ -335,8 +333,8 @@ typedef enum CUctx_flags_enum { * Stream creation flags */ typedef enum CUstream_flags_enum { - CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ - CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ } CUstream_flags; /** @@ -369,7 +367,26 @@ typedef enum CUevent_flags_enum { CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ } CUevent_flags; -#if __CUDA_API_VERSION >= 8000 +/** + * Event record flags + */ +typedef enum CUevent_record_flags_enum { + CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ + CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node + * instead of the default behavior. This flag is invalid + * when used outside of capture. */ +} CUevent_record_flags; + +/** + * Event wait flags + */ +typedef enum CUevent_wait_flags_enum { + CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ + CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node + * instead of the default behavior. This flag is invalid + * when used outside of capture.*/ +} CUevent_wait_flags; + /** * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 */ @@ -448,8 +465,8 @@ typedef union CUstreamBatchMemOpParams_union { unsigned int flags; } flushRemoteWrites; cuuint64_t pad[6]; -} CUstreamBatchMemOpParams; -#endif /* __CUDA_API_VERSION >= 8000 */ +} CUstreamBatchMemOpParams_v1; +typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; /** * Occupancy calculator flag @@ -459,6 +476,14 @@ typedef enum CUoccupancy_flags_enum { CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ } CUoccupancy_flags; +/** + * Flags for ::cuStreamUpdateCaptureDependencies + */ +typedef enum CUstreamUpdateCaptureDependencies_flags_enum { + CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ + CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ +} CUstreamUpdateCaptureDependencies_flags; + /** * Array formats */ @@ -470,7 +495,34 @@ typedef enum CUarray_format_enum { CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ - CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ + CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ + CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ + CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ + CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ + CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ + CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ + CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ + CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ + CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ + CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ + CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ + CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ } CUarray_format; /** @@ -495,112 +547,131 @@ typedef enum CUfilter_mode_enum { * Device properties */ typedef enum CUdevice_attribute_enum { - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ - CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ - CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ - CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ - CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ - CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ - CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ - CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ - CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ - CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ - CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ - CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ + CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ + CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ + CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ + CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ + CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ + CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */ + CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ CU_DEVICE_ATTRIBUTE_MAX } CUdevice_attribute; @@ -618,21 +689,30 @@ typedef struct CUdevprop_st { int regsPerBlock; /**< 32-bit registers available per block */ int clockRate; /**< Clock frequency in kilohertz */ int textureAlign; /**< Alignment requirement for textures */ -} CUdevprop; +} CUdevprop_v1; +typedef CUdevprop_v1 CUdevprop; /** * Pointer information */ typedef enum CUpointer_attribute_enum { - CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ - CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ - CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ - CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ - CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ - CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9 /**< A device ordinal of a device on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a device on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ + CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ + CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ + CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ + CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ + CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ + CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ } CUpointer_attribute; /** @@ -697,13 +777,16 @@ typedef enum CUfunction_attribute_enum { * The maximum size in bytes of dynamically-allocated shared memory that can be used by * this function. If the user-specified dynamic shared memory size is larger than this * value, the launch will fail. + * See ::cuFuncSetAttribute */ CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the total resources. + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::cuFuncSetAttribute */ CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, @@ -730,12 +813,12 @@ typedef enum CUsharedconfig_enum { } CUsharedconfig; /** - * Shared memory carveout configurations + * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute */ typedef enum CUshared_carveout_enum { - CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /** < no preference for shared memory or L1 (default) */ - CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /** < prefer maximum available shared memory, minimum L1 cache */ - CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /** < prefer maximum available L1 cache, minimum shared memory */ + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ } CUshared_carveout; /** @@ -947,6 +1030,51 @@ typedef enum CUjit_option_enum */ CU_JIT_GLOBAL_SYMBOL_COUNT, + /** + * Enable link-time optimization (-dlto) for device code (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LTO, + + /** + * Control single-precision denormals (-ftz) support (0: false, default). + * 1 : flushes denormal values to zero + * 0 : preserves denormal values + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_FTZ, + + /** + * Control single-precision floating-point division and reciprocals + * (-prec-div) support (1: true, default). + * 1 : Enables the IEEE round-to-nearest mode + * 0 : Enables the fast approximation mode + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_PREC_DIV, + + /** + * Control single-precision floating-point square root + * (-prec-sqrt) support (1: true, default). + * 1 : Enables the IEEE round-to-nearest mode + * 0 : Enables the fast approximation mode + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_PREC_SQRT, + + /** + * Enable/Disable the contraction of floating-point multiplies + * and adds/subtracts into floating-point multiply-add (-fma) + * operations (1: Enable, default; 0: Disable). + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_FMA, + CU_JIT_NUM_OPTIONS } CUjit_option; @@ -969,8 +1097,10 @@ typedef enum CUjit_target_enum CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ - - CU_TARGET_COMPUTE_75 = 75 /**< Compute device class 7.5.*/ + CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ + CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ + CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ + CU_TARGET_COMPUTE_86 = 86 /**< Compute device class 8.6.*/ } CUjit_target; /** @@ -1029,12 +1159,16 @@ typedef enum CUjitInputType_enum */ CU_JIT_INPUT_LIBRARY, + /** + * High-level intermediate code for link-time optimization\n + * Applicable options: NVVM compiler options, PTX compiler options + */ + CU_JIT_INPUT_NVVM, + CU_JIT_NUM_INPUT_TYPES } CUjitInputType; -#if __CUDA_API_VERSION >= 5050 typedef struct CUlinkState_st *CUlinkState; -#endif /* __CUDA_API_VERSION >= 5050 */ /** * Flags to register a graphics resource @@ -1078,6 +1212,7 @@ typedef enum CUlimit_enum { CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */ + CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ CU_LIMIT_MAX } CUlimit; @@ -1097,14 +1232,42 @@ typedef enum CUresourcetype_enum { #define CUDA_CB #endif -#if __CUDA_API_VERSION >= 10000 - /** * CUDA host function * \param userData Argument value passed to the function */ typedef void (CUDA_CB *CUhostFn)(void *userData); +/** + * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. + */ +typedef enum CUaccessProperty_enum { + CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ + CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persit from cache. */ + CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ +} CUaccessProperty; + +/** + * Specifies an access policy for a window, a contiguous extent of memory + * beginning at base_ptr and ending at base_ptr + num_bytes. + * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. + * Partition into many segments and assign segments such that: + * sum of "hit segments" / window == approx. ratio. + * sum of "miss segments" / window == approx 1-ratio. + * Segments and ratio specifications are fitted to the capabilities of + * the architecture. + * Accesses in a hit segment apply the hitProp access policy. + * Accesses in a miss segment apply the missProp access policy. + */ +typedef struct CUaccessPolicyWindow_st { + void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ + size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ + float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ + CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ + CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */ +} CUaccessPolicyWindow_v1; +typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; + /** * GPU kernel node parameters */ @@ -1119,7 +1282,8 @@ typedef struct CUDA_KERNEL_NODE_PARAMS_st { unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ void **kernelParams; /**< Array of pointers to kernel parameters */ void **extra; /**< Extra options */ -} CUDA_KERNEL_NODE_PARAMS; +} CUDA_KERNEL_NODE_PARAMS_v1; +typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; /** * Memset node parameters @@ -1129,9 +1293,10 @@ typedef struct CUDA_MEMSET_NODE_PARAMS_st { size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ unsigned int value; /**< Value to be set */ unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ - size_t width; /**< Width in bytes, of the row */ + size_t width; /**< Width of the row in elements */ size_t height; /**< Number of rows */ -} CUDA_MEMSET_NODE_PARAMS; +} CUDA_MEMSET_NODE_PARAMS_v1; +typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; /** * Host node parameters @@ -1139,21 +1304,51 @@ typedef struct CUDA_MEMSET_NODE_PARAMS_st { typedef struct CUDA_HOST_NODE_PARAMS_st { CUhostFn fn; /**< The function to call when the node executes */ void* userData; /**< Argument to pass to the function */ -} CUDA_HOST_NODE_PARAMS; +} CUDA_HOST_NODE_PARAMS_v1; +typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; /** * Graph node types */ typedef enum CUgraphNodeType_enum { - CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ - CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ - CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ - CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ - CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ - CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ - CU_GRAPH_NODE_TYPE_COUNT + CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ + CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ + CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ + CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ + CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ + CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ + CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ + CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ + CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ + CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ + CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ + CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ } CUgraphNodeType; +typedef enum CUsynchronizationPolicy_enum { + CU_SYNC_POLICY_AUTO = 1, + CU_SYNC_POLICY_SPIN = 2, + CU_SYNC_POLICY_YIELD = 3, + CU_SYNC_POLICY_BLOCKING_SYNC = 4 +} CUsynchronizationPolicy; + +/** + * Graph kernel node Attributes + */ +typedef enum CUkernelNodeAttrID_enum { + CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ + CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). */ +} CUkernelNodeAttrID; + +/** + * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute + */ +typedef union CUkernelNodeAttrValue_union { + CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ + int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ +} CUkernelNodeAttrValue_v1; +typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; + /** * Possible stream capture statuses returned by ::cuStreamIsCapturing */ @@ -1164,7 +1359,68 @@ typedef enum CUstreamCaptureStatus_enum { has been invalidated, but not terminated */ } CUstreamCaptureStatus; -#endif /* __CUDA_API_VERSION >= 10000 */ +/** + * Possible modes for stream capture thread interactions. For more details see + * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode + */ +typedef enum CUstreamCaptureMode_enum { + CU_STREAM_CAPTURE_MODE_GLOBAL = 0, + CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, + CU_STREAM_CAPTURE_MODE_RELAXED = 2 +} CUstreamCaptureMode; + +/** + * Stream Attributes + */ +typedef enum CUstreamAttrID_enum { + CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */ + CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< ::CUsynchronizationPolicy for work queued up in this stream */ +} CUstreamAttrID; + +/** + * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute + */ +typedef union CUstreamAttrValue_union { + CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ + CUsynchronizationPolicy syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ +} CUstreamAttrValue_v1; +typedef CUstreamAttrValue_v1 CUstreamAttrValue; + +/** + * Flags to specify search options. For more details see ::cuGetProcAddress + */ +typedef enum CUdriverProcAddress_flags_enum { + CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ + CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ + CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ +} CUdriverProcAddress_flags; + +/** + * Execution Affinity Types + */ +typedef enum CUexecAffinityType_enum { + CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ + CU_EXEC_AFFINITY_TYPE_MAX +} CUexecAffinityType; + +/** + * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT + */ +typedef struct CUexecAffinitySmCount_st { + unsigned int val; /**< The number of SMs the context is limited to use. */ +} CUexecAffinitySmCount_v1; +typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; + +/** + * Execution Affinity Parameters + */ +typedef struct CUexecAffinityParam_st { + CUexecAffinityType type; + union { + CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ + } param; +} CUexecAffinityParam_v1; +typedef CUexecAffinityParam_v1 CUexecAffinityParam; /** * Error codes @@ -1229,6 +1485,13 @@ typedef enum cudaError_enum { */ CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + /** + * This indicates that the CUDA driver that the application has loaded is a + * stub library. Applications that run with the stub rather than a real + * driver loaded will result in CUDA API returning this error. + */ + CUDA_ERROR_STUB_LIBRARY = 34, + /** * This indicates that no CUDA-capable devices were detected by the installed * CUDA driver. @@ -1237,10 +1500,15 @@ typedef enum cudaError_enum { /** * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. + * correspond to a valid CUDA device or that the action requested is + * invalid for the specified device. */ CUDA_ERROR_INVALID_DEVICE = 101, + /** + * This error indicates that the Grid license is not applied. + */ + CUDA_ERROR_DEVICE_NOT_LICENSED = 102, /** * This indicates that the device kernel image is invalid. This can also @@ -1365,7 +1633,25 @@ typedef enum cudaError_enum { CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, /** - * This indicates that the device kernel source is invalid. + * This indicates that the provided PTX was compiled with an unsupported toolchain. + */ + + CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, + + /** + * This indicates that the PTX JIT compilation was disabled. + */ + CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, + + /** + * This indicates that the ::CUexecAffinityType passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, + + /** + * This indicates that the device kernel source is invalid. This includes + * compilation/linker errors encountered in device code or user error. */ CUDA_ERROR_INVALID_SOURCE = 300, @@ -1403,7 +1689,8 @@ typedef enum cudaError_enum { /** * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, texture names, and surface names. + * are global/constant variable names, driver function names, texture names, + * and surface names. */ CUDA_ERROR_NOT_FOUND = 500, @@ -1553,7 +1840,8 @@ typedef enum cudaError_enum { /** * An exception occurred on the device while executing a kernel. Common * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. @@ -1584,9 +1872,53 @@ typedef enum cudaError_enum { * This error indicates that the system is not yet ready to start any CUDA * work. To continue using CUDA, verify the system configuration is in a * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. */ CUDA_ERROR_SYSTEM_NOT_READY = 802, + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. + */ + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + + /** + * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + */ + CUDA_ERROR_MPS_CONNECTION_FAILED = 805, + + /** + * This error indicates that the remote procedural call between the MPS server and the MPS client failed. + */ + CUDA_ERROR_MPS_RPC_FAILURE = 806, + + /** + * This error indicates that the MPS server is not ready to accept new MPS client requests. + * This error can be returned when the MPS server is in the process of recovering from a fatal failure. + */ + CUDA_ERROR_MPS_SERVER_NOT_READY = 807, + + /** + * This error indicates that the hardware resources required to create MPS client have been exhausted. + */ + CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808, + + /** + * This error indicates the the hardware resources required to support device connections have been exhausted. + */ + CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809, + /** * This error indicates that the operation is not permitted when * the stream is capturing. @@ -1635,6 +1967,34 @@ typedef enum cudaError_enum { */ CUDA_ERROR_CAPTURED_EVENT = 907, + /** + * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED + * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a + * different thread. + */ + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + + /** + * This error indicates that the timeout specified for the wait operation has lapsed. + */ + CUDA_ERROR_TIMEOUT = 909, + + /** + * This error indicates that the graph update was not performed because it included + * changes which violated constraints specific to instantiated graph update. + */ + CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, + + /** + * This indicates that an async error has occurred in a device outside of CUDA. + * If CUDA was waiting for an external device's signal before consuming shared data, + * the external device signaled an error indicating that the data is not valid for + * consumption. This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must be + * terminated and relaunched. + */ + CUDA_ERROR_EXTERNAL_DEVICE = 911, + /** * This indicates that an unknown internal error has occurred. */ @@ -1648,7 +2008,7 @@ typedef enum CUdevice_P2PAttribute_enum { CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ - CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ } CUdevice_P2PAttribute; @@ -1708,15 +2068,25 @@ typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); * On Windows the flag is a no-op. * On Linux that memory is marked as non cache-coherent for the GPU and * is expected to be physically contiguous. It may return - * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, - * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. - * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED + * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, + * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. + * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED * is returned. * Flag for ::cuMemHostRegister() */ #define CU_MEMHOSTREGISTER_IOMEMORY 0x04 -#if __CUDA_API_VERSION >= 3020 +/** +* If set, the passed memory pointer is treated as pointing to memory that is +* considered read-only by the device. On platforms without +* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is +* required in order to register memory mapped to the CPU as read-only. Support +* for the use of this flag can be queried from the device attribute +* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with +* a current context associated with a device that does not have this attribute +* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED. +*/ +#define CU_MEMHOSTREGISTER_READ_ONLY 0x08 /** * 2D memory copy parameters @@ -1742,7 +2112,8 @@ typedef struct CUDA_MEMCPY2D_st { size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ size_t Height; /**< Height of 2D memory copy */ -} CUDA_MEMCPY2D; +} CUDA_MEMCPY2D_v2; +typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; /** * 3D memory copy parameters @@ -1775,7 +2146,8 @@ typedef struct CUDA_MEMCPY3D_st { size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ size_t Height; /**< Height of 3D memory copy */ size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D; +} CUDA_MEMCPY3D_v2; +typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; /** * 3D memory cross-context copy parameters @@ -1808,7 +2180,8 @@ typedef struct CUDA_MEMCPY3D_PEER_st { size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ size_t Height; /**< Height of 3D memory copy */ size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_PEER; +} CUDA_MEMCPY3D_PEER_v1; +typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; /** * Array descriptor @@ -1820,7 +2193,8 @@ typedef struct CUDA_ARRAY_DESCRIPTOR_st CUarray_format Format; /**< Array format */ unsigned int NumChannels; /**< Channels per array element */ -} CUDA_ARRAY_DESCRIPTOR; +} CUDA_ARRAY_DESCRIPTOR_v2; +typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; /** * 3D array descriptor @@ -1834,11 +2208,39 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st CUarray_format Format; /**< Array format */ unsigned int NumChannels; /**< Channels per array element */ unsigned int Flags; /**< Flags */ -} CUDA_ARRAY3D_DESCRIPTOR; +} CUDA_ARRAY3D_DESCRIPTOR_v2; +typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; -#endif /* __CUDA_API_VERSION >= 3020 */ +/** + * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers + */ +#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 -#if __CUDA_API_VERSION >= 5000 +/** + * CUDA array sparse properties + */ +typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { + struct { + unsigned int width; /**< Width of sparse tile in elements */ + unsigned int height; /**< Height of sparse tile in elements */ + unsigned int depth; /**< Depth of sparse tile in elements */ + } tileExtent; + + /** + * First mip level at which the mip tail begins. + */ + unsigned int miptailFirstLevel; + /** + * Total size of the mip tail. + */ + unsigned long long miptailSize; + /** + * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL + */ + unsigned int flags; + unsigned int reserved[4]; +} CUDA_ARRAY_SPARSE_PROPERTIES_v1; +typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; /** * CUDA Resource descriptor @@ -1874,7 +2276,8 @@ typedef struct CUDA_RESOURCE_DESC_st } res; unsigned int flags; /**< Flags (must be zero) */ -} CUDA_RESOURCE_DESC; +} CUDA_RESOURCE_DESC_v1; +typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; /** * Texture descriptor @@ -1890,7 +2293,8 @@ typedef struct CUDA_TEXTURE_DESC_st { float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ float borderColor[4]; /**< Border Color */ int reserved[12]; -} CUDA_TEXTURE_DESC; +} CUDA_TEXTURE_DESC_v1; +typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; /** * Resource view format @@ -1948,7 +2352,8 @@ typedef struct CUDA_RESOURCE_VIEW_DESC_st unsigned int firstLayer; /**< First layer index */ unsigned int lastLayer; /**< Last layer index */ unsigned int reserved[16]; -} CUDA_RESOURCE_VIEW_DESC; +} CUDA_RESOURCE_VIEW_DESC_v1; +typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; /** * GPU Direct v3 tokens @@ -1956,11 +2361,18 @@ typedef struct CUDA_RESOURCE_VIEW_DESC_st typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { unsigned long long p2pToken; unsigned int vaSpaceToken; -} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; +typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; -#endif /* __CUDA_API_VERSION >= 5000 */ - -#if __CUDA_API_VERSION >= 9000 +/** +* Access flags that specify the level of access the current context's device has +* on the memory referenced. +*/ +typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. */ + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ +} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; /** * Kernel launch parameters @@ -1976,11 +2388,8 @@ typedef struct CUDA_LAUNCH_PARAMS_st { unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ CUstream hStream; /**< Stream identifier */ void **kernelParams; /**< Array of pointers to kernel parameters */ -} CUDA_LAUNCH_PARAMS; - -#endif /* __CUDA_API_VERSION >= 9000 */ - -#if __CUDA_API_VERSION >= 10000 +} CUDA_LAUNCH_PARAMS_v1; +typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; /** * External memory handle types @@ -1989,23 +2398,35 @@ typedef enum CUexternalMemoryHandleType_enum { /** * Handle is an opaque file descriptor */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, /** * Handle is an opaque shared NT handle */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, /** * Handle is an opaque, globally shared handle */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, /** * Handle is a D3D12 heap object */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, /** * Handle is a D3D12 committed resource */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + /** + * Handle is a shared NT handle to a D3D11 resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + /** + * Handle is a globally shared handle to a D3D11 resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + /** + * Handle is an NvSciBuf object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 } CUexternalMemoryHandleType; /** @@ -2013,6 +2434,37 @@ typedef enum CUexternalMemoryHandleType_enum { */ #define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 +/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS + * contains this flag, it indicates that signaling an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 + +/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS + * contains this flag, it indicates that waiting on an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 + +/** + * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application needs signaler specific NvSciSyncAttr + * to be filled by ::cuDeviceGetNvSciSyncAttributes. + */ +#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 + +/** + * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application needs waiter specific NvSciSyncAttr + * to be filled by ::cuDeviceGetNvSciSyncAttributes. + */ +#define CUDA_NVSCISYNC_ATTR_WAIT 0x2 /** * External memory handle descriptor */ @@ -2035,9 +2487,12 @@ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * type is one of the following: + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT * then 'name' must be NULL. */ struct { @@ -2051,6 +2506,11 @@ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { */ const void *name; } win32; + /** + * A handle representing an NvSciBuf Object. Valid when type + * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF + */ + const void *nvSciBufObject; } handle; /** * Size of the memory allocation @@ -2061,7 +2521,8 @@ typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { */ unsigned int flags; unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_HANDLE_DESC; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; /** * External memory buffer descriptor @@ -2080,7 +2541,8 @@ typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { */ unsigned int flags; unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_BUFFER_DESC; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; /** * External memory mipmap descriptor @@ -2100,7 +2562,8 @@ typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { */ unsigned int numLevels; unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; /** * External semaphore handle types @@ -2109,19 +2572,43 @@ typedef enum CUexternalSemaphoreHandleType_enum { /** * Handle is an opaque file descriptor */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, /** * Handle is an opaque shared NT handle */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, /** * Handle is an opaque, globally shared handle */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, /** * Handle is a shared NT handle referencing a D3D12 fence object */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + /** + * Handle is a shared NT handle referencing a D3D11 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + /** + * Opaque handle to NvSciSync Object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + /** + * Handle is a shared NT handle referencing a D3D11 keyed mutex object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + /** + * Handle is a globally shared handle referencing a D3D11 keyed mutex object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, + /** + * Handle is an opaque file descriptor referencing a timeline semaphore + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, + /** + * Handle is an opaque shared NT handle referencing a timeline semaphore + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 } CUexternalSemaphoreHandleType; /** @@ -2135,8 +2622,9 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { union { /** * File descriptor referencing the semaphore object. Valid - * when type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + * when type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD */ int fd; /** @@ -2145,9 +2633,13 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT * then 'name' must be NULL. */ struct { @@ -2161,13 +2653,18 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { */ const void *name; } win32; + /** + * Valid NvSciSyncObj. Must be non NULL + */ + const void* nvSciSyncObj; } handle; /** * Flags reserved for the future. Must be zero. */ unsigned int flags; unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; /** * External semaphore signal parameters @@ -2183,14 +2680,39 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { */ unsigned long long value; } fence; - unsigned int reserved[16]; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType + * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. + */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + unsigned int reserved[12]; } params; /** - * Flags reserved for the future. Must be zero. + * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to + * signal a ::CUexternalSemaphore of type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates + * that while signaling the ::CUexternalSemaphore, no memory synchronization + * operations should be performed for any external memory object imported + * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. + * For all other types of ::CUexternalSemaphore, flags must be zero. */ unsigned int flags; unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; /** * External semaphore wait parameters @@ -2206,17 +2728,400 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { */ unsigned long long value; } fence; - unsigned int reserved[16]; + /** + * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType + * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. + */ + union { + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + unsigned int reserved[10]; } params; /** - * Flags reserved for the future. Must be zero. + * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on + * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC + * which indicates that while waiting for the ::CUexternalSemaphore, no memory + * synchronization operations should be performed for any external memory + * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. + * For all other types of ::CUexternalSemaphore, flags must be zero. */ unsigned int flags; unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; +/** + * Semaphore signal node parameters + */ +typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { + CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; +typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; -#endif /* __CUDA_API_VERSION >= 10000 */ +/** + * Semaphore wait node parameters + */ +typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { + CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; +typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; + +typedef unsigned long long CUmemGenericAllocationHandle_v1; +typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; + +/** + * Flags for specifying particular handle types + */ +typedef enum CUmemAllocationHandleType_enum { + CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ + CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationHandleType; + +/** + * Specifies the memory protection flags for mapping. + */ +typedef enum CUmemAccess_flags_enum { + CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ + CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF +} CUmemAccess_flags; + +/** + * Specifies the type of location + */ +typedef enum CUmemLocationType_enum { + CU_MEM_LOCATION_TYPE_INVALID = 0x0, + CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ + CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemLocationType; + +/** +* Defines the allocation types available +*/ +typedef enum CUmemAllocationType_enum { + CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, + + /** This allocation type is 'pinned', i.e. cannot migrate from its current + * location while the application is actively using it + */ + CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, + CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationType; + +/** +* Flag for requesting different optimal and required granularities for an allocation. +*/ +typedef enum CUmemAllocationGranularity_flags_enum { + CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ + CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ +} CUmemAllocationGranularity_flags; + +/** + * Sparse subresource types + */ +typedef enum CUarraySparseSubresourceType_enum { + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 +} CUarraySparseSubresourceType; + +/** + * Memory operation types + */ +typedef enum CUmemOperationType_enum { + CU_MEM_OPERATION_TYPE_MAP = 1, + CU_MEM_OPERATION_TYPE_UNMAP = 2 +} CUmemOperationType; + +/** + * Memory handle types + */ +typedef enum CUmemHandleType_enum { + CU_MEM_HANDLE_TYPE_GENERIC = 0 +} CUmemHandleType; + +/** + * Specifies the CUDA array or CUDA mipmapped array memory mapping information + */ +typedef struct CUarrayMapInfo_st { + CUresourcetype resourceType; /**< Resource type */ + + union { + CUmipmappedArray mipmap; + CUarray array; + } resource; + + CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ + + union { + struct { + unsigned int level; /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */ + unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ + unsigned int offsetX; /**< Starting X offset in elements */ + unsigned int offsetY; /**< Starting Y offset in elements */ + unsigned int offsetZ; /**< Starting Z offset in elements */ + unsigned int extentWidth; /**< Width in elements */ + unsigned int extentHeight; /**< Height in elements */ + unsigned int extentDepth; /**< Depth in elements */ + } sparseLevel; + struct { + unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ + unsigned long long offset; /**< Offset within mip tail */ + unsigned long long size; /**< Extent in bytes */ + } miptail; + } subresource; + + CUmemOperationType memOperationType; /**< Memory operation type */ + CUmemHandleType memHandleType; /**< Memory handle type */ + + union { + CUmemGenericAllocationHandle memHandle; + } memHandle; + + unsigned long long offset; /**< Offset within the memory */ + unsigned int deviceBitMask; /**< Device ordinal bit mask */ + unsigned int flags; /**< flags for future use, must be zero now. */ + unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ +} CUarrayMapInfo_v1; +typedef CUarrayMapInfo_v1 CUarrayMapInfo; + +/** + * Specifies a memory location. + */ +typedef struct CUmemLocation_st { + CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ + int id; /**< identifier for a given this location's ::CUmemLocationType. */ +} CUmemLocation_v1; +typedef CUmemLocation_v1 CUmemLocation; + +/** + * Specifies compression attribute for an allocation. + */ +typedef enum CUmemAllocationCompType_enum { + CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ + CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ +} CUmemAllocationCompType; + +/** + * This flag if set indicates that the memory will be used as a tile pool. + */ +#define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 + +/** +* Specifies the allocation properties for a allocation. +*/ +typedef struct CUmemAllocationProp_st { + /** Allocation type */ + CUmemAllocationType type; + /** requested ::CUmemAllocationHandleType */ + CUmemAllocationHandleType requestedHandleTypes; + /** Location of allocation */ + CUmemLocation location; + /** + * Windows-specific POBJECT_ATTRIBUTES required when + * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object atributes structure + * includes security attributes that define + * the scope of which exported allocations may be tranferred to other + * processes. In all other cases, this field is required to be zero. + */ + void *win32HandleMetaData; + struct { + /** + * Allocation hint for requesting compressible memory. + * On devices that support Compute Data Compression, compressible + * memory can be used to accelerate accesses to data with unstructured + * sparsity and other compressible data patterns. Applications are + * expected to query allocation property of the handle obtained with + * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to + * validate if the obtained allocation is compressible or not. Note that + * compressed memory may not be mappable on all devices. + */ + unsigned char compressionType; + unsigned char gpuDirectRDMACapable; + /** Bitmask indicating intended usage for this allocation */ + unsigned short usage; + unsigned char reserved[4]; + } allocFlags; +} CUmemAllocationProp_v1; +typedef CUmemAllocationProp_v1 CUmemAllocationProp; + +/** + * Memory access descriptor + */ +typedef struct CUmemAccessDesc_st { + CUmemLocation location; /**< Location on which the request is to change it's accessibility */ + CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ +} CUmemAccessDesc_v1; +typedef CUmemAccessDesc_v1 CUmemAccessDesc; + +typedef enum CUgraphExecUpdateResult_enum { + CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ + CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ + CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ + CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ + CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ + CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ + CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ + CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7 /**< The update failed because the function of a kernel node changed in an unsupported way */ +} CUgraphExecUpdateResult; + +/** + * CUDA memory pool attributes + */ +typedef enum CUmemPool_attribute_enum { + /** + * (value type = int) + * Allow cuMemAllocAsync to use memory asynchronously freed + * in another streams as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + */ + CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, + + /** + * (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + */ + CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, + + /** + * (value type = int) + * Allow cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by cuFreeAsync (default enabled). + */ + CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, + + /** + * (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + */ + CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + + /** + * (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool. + */ + CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset. High watermark can only be reset to zero. + */ + CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, + + /** + * (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + */ + CU_MEMPOOL_ATTR_USED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application since + * the last time it was reset. High watermark can only be reset to zero. + */ + CU_MEMPOOL_ATTR_USED_MEM_HIGH +} CUmemPool_attribute; + +/** + * Specifies the properties of allocations made from the pool. + */ +typedef struct CUmemPoolProps_st { + CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */ + CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ + CUmemLocation location; /**< Location where allocations should reside. */ + /** + * Windows-specific LPSECURITYATTRIBUTES required when + * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines + * the scope of which exported allocations may be tranferred to other + * processes. In all other cases, this field is required to be zero. + */ + void *win32SecurityAttributes; + unsigned char reserved[64]; /**< reserved for future use, must be 0 */ +} CUmemPoolProps_v1; +typedef CUmemPoolProps_v1 CUmemPoolProps; + +/** + * Opaque data for exporting a pool allocation + */ +typedef struct CUmemPoolPtrExportData_st { + unsigned char reserved[64]; +} CUmemPoolPtrExportData_v1; +typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; + +/** + * Memory allocation node parameters + */ +typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { + /** + * in: location where the allocation should reside (specified in ::location). + * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. + */ + CUmemPoolProps poolProps; + const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ + size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. */ + size_t bytesize; /**< in: size in bytes of the requested allocation */ + CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ +} CUDA_MEM_ALLOC_NODE_PARAMS; + +typedef enum CUgraphMem_attribute_enum { + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently associated with graphs + */ + CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + */ + CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, + + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH +} CUgraphMem_attribute; /** * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only @@ -2276,6 +3181,12 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { */ #define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 +/** + * This flag if set indicates that the CUDA array or CUDA mipmapped array + * is a sparse CUDA array or CUDA mipmapped array respectively + */ +#define CUDA_ARRAY3D_SPARSE 0x40 + /** * Override the texref format with a format inferred from the array. * Flag for ::cuTexRefSetArray() @@ -2285,22 +3196,28 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { /** * Read the texture as integers rather than promoting the values to floats * in the range [0,1]. - * Flag for ::cuTexRefSetFlags() + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_READ_AS_INTEGER 0x01 /** * Use normalized texture coordinates in the range [0,1) instead of [0,dim). - * Flag for ::cuTexRefSetFlags() + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_NORMALIZED_COORDINATES 0x02 /** * Perform sRGB->linear conversion during texture read. - * Flag for ::cuTexRefSetFlags() + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() */ #define CU_TRSF_SRGB 0x10 + /** + * Disable any trilinear filtering optimizations. + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 + /** * End of array terminator for the \p extra parameter to * ::cuLaunchKernel @@ -2344,8 +3261,86 @@ typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { */ #define CU_DEVICE_INVALID ((CUdevice)-2) +/** + * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS + */ +typedef enum CUflushGPUDirectRDMAWritesOptions_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ +} CUflushGPUDirectRDMAWritesOptions; + +/** + * Platform native ordering for GPUDirect RDMA writes + */ +typedef enum CUGPUDirectRDMAWritesOrdering_enum { + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. */ +} CUGPUDirectRDMAWritesOrdering; + +/** + * The scopes for ::cuFlushGPUDirectRDMAWrites + */ +typedef enum CUflushGPUDirectRDMAWritesScope_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ +} CUflushGPUDirectRDMAWritesScope; + +/** + * The targets for ::cuFlushGPUDirectRDMAWrites + */ +typedef enum CUflushGPUDirectRDMAWritesTarget_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */ +} CUflushGPUDirectRDMAWritesTarget; + +/** + * The additional write options for ::cuGraphDebugDotPrint + */ +typedef enum CUgraphDebugDot_flags_enum { + CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /** Output all debug data as if every debug flag is enabled */ + CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /** Use CUDA Runtime structures for output */ + CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /** Adds CUDA_KERNEL_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /** Adds CUDA_MEMCPY3D values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /** Adds CUDA_MEMSET_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /** Adds CUDA_HOST_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /** Adds CUevent handle from record and wait nodes to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /** Adds CUkernelNodeAttrValue values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /** Adds node handles and every kernel function handle to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /** Adds memory alloc node parameters to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /** Adds memory free node parameters to output */ +} CUgraphDebugDot_flags; + +/** + * Flags for user objects for graphs + */ +typedef enum CUuserObject_flags_enum { + CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ +} CUuserObject_flags; + +/** + * Flags for retaining user object references for graphs + */ +typedef enum CUuserObjectRetain_flags_enum { + CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ +} CUuserObjectRetain_flags; + +/** + * Flags for instantiating a graph + */ +typedef enum CUgraphInstantiate_flags_enum { + CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. */ +} CUgraphInstantiate_flags; + /** @} */ /* END CUDA_TYPES */ +#if defined(__GNUC__) + #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) + #pragma GCC visibility push(default) + #endif +#endif + #ifdef _WIN32 #define CUDAAPI __stdcall #else @@ -2433,7 +3428,9 @@ CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, + * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE * \notefnerr */ CUresult CUDAAPI cuInit(unsigned int Flags); @@ -2514,7 +3511,8 @@ CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGetLuid, - * ::cuDeviceTotalMem + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport */ CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); @@ -2542,6 +3540,7 @@ CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); * ::cuDeviceGetLuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceCount */ CUresult CUDAAPI cuDeviceGetCount(int *count); @@ -2573,14 +3572,17 @@ CUresult CUDAAPI cuDeviceGetCount(int *count); * ::cuDeviceGetCount, * ::cuDeviceGet, * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); -#if __CUDA_API_VERSION >= 9020 /** * \brief Return an UUID for the device * + * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will + * supplant this version in 12.0, which is retained for minor version compatibility. + * * Returns 16-octets identifing the device \p dev in the structure * pointed by the \p uuid. * @@ -2596,6 +3598,37 @@ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); * \notefnerr * * \sa + * ::cuDeviceGetUuid_v2 + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); + +/** + * \brief Return an UUID for the device (11.4+) + * + * Returns 16-octets identifing the device \p dev in the structure + * pointed by the \p uuid. If the device is in MIG mode, returns its + * MIG UUID which uniquely identifies the subscribed MIG compute instance. + * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, @@ -2604,10 +3637,8 @@ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); * ::cuDeviceTotalMem, * ::cudaGetDeviceProperties */ -CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); -#endif +CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); -#if defined(_WIN32) && __CUDA_API_VERSION >= 10000 /** * \brief Return an LUID and device node mask for the device * @@ -2632,12 +3663,11 @@ CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); * ::cuDeviceGetName, * ::cuDeviceGet, * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); -#endif -#if __CUDA_API_VERSION >= 3020 /** * \brief Returns the total amount of memory on the device * @@ -2662,10 +3692,41 @@ CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdev * ::cuDeviceGetName, * ::cuDeviceGetUuid, * ::cuDeviceGet, + * ::cuDeviceGetExecAffinitySupport, * ::cudaMemGetInfo */ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); -#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. + * + * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture + * for given \p format and \p numChannels. + * + * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. + * \param format - Texture format. + * \param numChannels - Number of channels per texture element. + * \param dev - Device handle. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cudaMemGetInfo, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); /** * \brief Returns information about the device @@ -2674,117 +3735,117 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * \p dev. The supported attributes are: * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per * block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared memory available to a thread block in bytes; + * shared memory available to a thread block in bytes * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for - * __constant__ variables in a CUDA C kernel in bytes; - * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; + * __constant__ variables in a CUDA C kernel in bytes + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the * memory copy functions that involve memory regions allocated through - * ::cuMemAllocPitch(); + * ::cuMemAllocPitch() * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D - * texture width; + * texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width - * for a 1D texture bound to linear memory; + * for a 1D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum - * mipmapped 1D texture width; + * mipmapped 1D texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D - * texture width; + * texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D - * texture height; + * texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width - * for a 2D texture bound to linear memory; + * for a 2D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height - * for a 2D texture bound to linear memory; + * for a 2D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch - * in bytes for a 2D texture bound to linear memory; + * in bytes for a 2D texture bound to linear memory * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum - * mipmapped 2D texture width; + * mipmapped 2D texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum - * mipmapped 2D texture height; + * mipmapped 2D texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D - * texture width; + * texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D - * texture height; + * texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D - * texture depth; + * texture depth * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: * Alternate maximum 3D texture width, 0 if no alternate - * maximum 3D texture size is supported; + * maximum 3D texture size is supported * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: * Alternate maximum 3D texture height, 0 if no alternate - * maximum 3D texture size is supported; + * maximum 3D texture size is supported * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: * Alternate maximum 3D texture depth, 0 if no alternate - * maximum 3D texture size is supported; + * maximum 3D texture size is supported * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: - * Maximum cubemap texture width or height; + * Maximum cubemap texture width or height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: - * Maximum 1D layered texture width; + * Maximum 1D layered texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered texture; + * Maximum layers in a 1D layered texture * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: - * Maximum 2D layered texture width; + * Maximum 2D layered texture width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: - * Maximum 2D layered texture height; + * Maximum 2D layered texture height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered texture; + * Maximum layers in a 2D layered texture * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered texture width or height; + * Maximum cubemap layered texture width or height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered texture; + * Maximum layers in a cubemap layered texture * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: - * Maximum 1D surface width; + * Maximum 1D surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: - * Maximum 2D surface width; + * Maximum 2D surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: - * Maximum 2D surface height; + * Maximum 2D surface height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: - * Maximum 3D surface width; + * Maximum 3D surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: - * Maximum 3D surface height; + * Maximum 3D surface height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: - * Maximum 3D surface depth; + * Maximum 3D surface depth * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: - * Maximum 1D layered surface width; + * Maximum 1D layered surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered surface; + * Maximum layers in a 1D layered surface * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: - * Maximum 2D layered surface width; + * Maximum 2D layered surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: - * Maximum 2D layered surface height; + * Maximum 2D layered surface height * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered surface; + * Maximum layers in a 2D layered surface * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: - * Maximum cubemap surface width; + * Maximum cubemap surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered surface width; + * Maximum cubemap layered surface width * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered surface; + * Maximum layers in a cubemap layered surface * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers available to a thread block; - * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; + * registers available to a thread block + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture * base addresses aligned to ::textureAlign bytes do not need an offset - * applied to texture fetches; + * applied to texture fetches * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement - * for 2D texture references bound to pitched memory; + * for 2D texture references bound to pitched memory * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not; + * memory between host and device while executing a kernel, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device; + * the device * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not; + * for kernels executed on the device, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not; + * memory subsystem, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not; + * memory into the CUDA address space, or 0 if not * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently * in. Available modes are as follows: * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and @@ -2797,33 +3858,33 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * executing multiple kernels within the same context simultaneously, or 0 if * not. It is not guaranteed that multiple kernels will be resident * on the device concurrently so this feature should not be relied upon for - * correctness; + * correctness. * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the - * device, 0 if error correction is disabled or not supported by the device; - * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; + * device, 0 if error correction is disabled or not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier - * of the device; + * of the device * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC - * is only available on Tesla hardware running Windows Vista or later; - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz; - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; - * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; + * is only available on Tesla hardware running Windows Vista or later + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with - * the host, or 0 if not; - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; + * the host, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals - * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; + * in L1 cache, 0 if caching globals in L1 cache is not supported by the device * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals - * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; + * in L1 cache, 0 if caching locals in L1 cache is not supported by the device * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of * shared memory available to a multiprocessor in bytes; this amount is shared - * by all thread blocks simultaneously resident on a multiprocessor; + * by all thread blocks simultaneously resident on a multiprocessor * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit * registers available to a multiprocessor; this number is shared by all thread - * blocks simultaneously resident on a multiprocessor; + * blocks simultaneously resident on a multiprocessor * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory * on this system, 0 if allocating managed memory is not supported by the device on this system. * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. @@ -2846,6 +3907,23 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's * page tables. * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. + * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes + * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. + * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes + * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. + * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. + * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC * * \param pi - Returned device attribute value * \param attrib - Device attribute to query @@ -2866,11 +3944,144 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, * ::cudaDeviceGetAttribute, * ::cudaGetDeviceProperties */ CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); +/** + * \brief Return NvSciSync attributes that this device can support. + * + * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that + * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList + * can be used to create an NvSciSync object that matches this device's capabilities. + * + * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is + * already set this API will return ::CUDA_ERROR_INVALID_VALUE. + * + * The applications should set \p nvSciSyncAttrList to a valid + * NvSciSyncAttrList failing which this API will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * The \p flags controls how applications intends to use + * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: + * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to + * signal an NvSciSync on this CUDA device. + * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to + * wait on an NvSciSync on this CUDA device. + * + * At least one of these flags must be set, failing which the API + * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal + * to one another: a developer may set both these flags that allows to + * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. + * + * \param nvSciSyncAttrList - Return NvSciSync attributes supported. + * \param dev - Valid Cuda Device to get NvSciSync attributes for. + * \param flags - flags describing NvSciSync usage. + * + * \return + * + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa + * ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); + +/** + * \brief Sets the current memory pool of a device + * + * The memory pool must be local to the specified device. + * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. + * By default, a device's current memory pool is its default memory pool. + * + * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different + * than the one the stream runs on. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync + */ +CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); + +/** + * \brief Gets the current mempool for a device + * + * Returns the last pool provided to ::cuDeviceSetMemPool for this device + * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. + * By default the current mempool is the default mempool for a device. + * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool + */ +CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); + +/** + * \brief Returns the default mempool of a device + * + * The default mempool of a device contains device memory from that device. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); + +/** + * \brief Blocks until remote writes are visible to the specified scope + * + * Blocks until GPUDirect RDMA writes to the target context via mappings + * created through APIs like nvidia_p2p_get_pages (see + * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are + * visible to the specified scope. + * + * If the scope equals or lies within the scope indicated by + * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call + * will be a no-op and can be safely omitted for performance. This can be + * determined by comparing the numerical values between the two enums, with + * smaller scopes having smaller values. + * + * Users may query support for this API via + * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. + * + * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget + * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + */ +CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); + /** @} */ /* END CUDA_DEVICE */ /** @@ -3000,20 +4211,19 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *mi * @{ */ -#if __CUDA_API_VERSION >= 7000 - /** * \brief Retain the primary context on the GPU * - * Retains the primary context on the device, creating it if necessary, - * increasing its usage count. The caller must call - * ::cuDevicePrimaryCtxRelease() when done using the context. - * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack. + * Retains the primary context on the device. + * Once the user successfully retains the primary context, the primary context + * will be active and available to the user until the user releases it + * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). + * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode - * of the device. + * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN + * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function + * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to + * determine the compute mode of the device. * The nvidia-smi tool can be used to set the compute mode for * devices. Documentation for nvidia-smi can be obtained by passing a * -h option to it. @@ -3054,9 +4264,15 @@ CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); /** * \brief Release the primary context on the GPU * - * Releases the primary context interop on the device by decreasing the usage - * count by 1. If the usage drops to 0 the primary context of device \p dev - * will be destroyed regardless of how many threads it is current to. + * Releases the primary context interop on the device. + * A retained context should always be released once the user is done using + * it. The context is automatically reset once the last reference to it is + * released. This behavior is different when the primary context was retained + * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary + * context remains always active. + * + * Releasing a primary context that has not been previously retained will + * fail with ::CUDA_ERROR_INVALID_CONTEXT. * * Please note that unlike ::cuCtxDestroy() this method does not pop the context * from stack in any circumstances. @@ -3067,7 +4283,8 @@ CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, @@ -3089,8 +4306,7 @@ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); * \brief Set flags for the primary context * * Sets the flags for the primary context on the device overwriting perviously - * set ones. If the primary context is already created - * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned. + * set ones. * * The three LSBs of the \p flags parameter can be used to control how the OS * thread, which owns the CUDA context at the time of an API call, interacts @@ -3121,13 +4337,16 @@ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * However, on low power devices like Tegra, it always defaults to - * ::CU_CTX_SCHED_BLOCKING_SYNC. + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. * * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage. + * memory usage at the cost of potentially increased memory usage.
+ * Deprecated: This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. * * \param dev - Device for which the primary context flags are set * \param flags - New flags for the device @@ -3138,7 +4357,6 @@ CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, @@ -3186,6 +4404,8 @@ CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, i * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() * even after resetting the device. + * Resetting the primary context does not release it, an application that has + * retained the primary context should explicitly release its usage. * * \param dev - Device for which primary context is destroyed * @@ -3213,10 +4433,38 @@ CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, i */ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); -#endif /* __CUDA_API_VERSION >= 7000 */ - /** @} */ /* END CUDA_PRIMARY_CTX */ +/** + * \brief Returns information about the execution affinity support of the device. + * + * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. + * The supported types are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, + * or 0 if not; + * + * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not + * \param type - Execution affinity type to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); /** * \defgroup CUDA_CTX Context Management @@ -3227,13 +4475,17 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * This section describes the context management functions of the low-level * CUDA driver application programming interface. * + * Please note that some functions are described in + * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. + * * @{ */ -#if __CUDA_API_VERSION >= 3020 /** * \brief Create a CUDA context * + * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. + * * Creates a new CUDA context and associates it with the calling thread. The * \p flags parameter is described below. The context is created with a usage * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or @@ -3270,8 +4522,9 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * However, on low power devices like Tegra, it always defaults to - * ::CU_CTX_SCHED_BLOCKING_SYNC. + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. * * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. * This flag must be set in order to allocate pinned host memory that is @@ -3280,7 +4533,10 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage. + * memory usage at the cost of potentially increased memory usage.
+ * Deprecated: This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. + * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). * * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() @@ -3318,9 +4574,114 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); -#endif /* __CUDA_API_VERSION >= 3020 */ -#if __CUDA_API_VERSION >= 4000 +/** + * \brief Create a CUDA context with execution affinity + * + * Creates a new CUDA context with execution affinity and associates it with + * the calling thread. The \p paramsArray and \p flags parameter are described below. + * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must + * call ::cuCtxDestroy() or when done using the context. If a context is already + * current to the thread, it is supplanted by the newly created context and may + * be restored by a subsequent call to ::cuCtxPopCurrent(). + * + * The type and the amount of execution resource the context can use is limited by \p paramsArray + * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams + * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, + * the latter execution affinity parameter overrides the former execution affinity parameter. + * The supported execution affinity types are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion + * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally + * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution + * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute + * is only supported under Volta+ MPS. + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + * This flag must be set in order to allocate pinned host memory that is + * accessible to the GPU. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage.
+ * Deprecated: This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. + * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + * compute mode of the device. The nvidia-smi tool can be used to set + * the compute mode for * devices. + * Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * \param pctx - Returned context handle of the new context + * \param paramsArray - Execution affinity parameters + * \param numParams - Number of execution affinity parameters + * \param flags - Context creation flags + * \param dev - Device to create context on + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::CUexecAffinityParam + */ +CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); + /** * \brief Destroy a CUDA context * @@ -3329,6 +4690,13 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); * It is the responsibility of the calling function to ensure that no API * call issues using \p ctx while ::cuCtxDestroy() is executing. * + * Destroys and cleans up all resources associated with the context. + * It is the caller's responsibility to ensure that the context or its resources + * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. + * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, + * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, + * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. + * * If \p ctx is current to the calling thread then \p ctx will also be * popped from the current thread's context stack (as though ::cuCtxPopCurrent() * were called). If \p ctx is current to other threads, then \p ctx will @@ -3358,9 +4726,7 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); * ::cuCtxSynchronize */ CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); -#endif /* __CUDA_API_VERSION >= 4000 */ -#if __CUDA_API_VERSION >= 4000 /** * \brief Pushes a context on the current CPU thread * @@ -3472,7 +4838,6 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa @@ -3482,7 +4847,6 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); * ::cudaGetDevice */ CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); -#endif /* __CUDA_API_VERSION >= 4000 */ /** * \brief Returns the device ID for the current context @@ -3514,7 +4878,6 @@ CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); */ CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); -#if __CUDA_API_VERSION >= 7000 /** * \brief Returns the flags for the current context * @@ -3535,14 +4898,13 @@ CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetCurrent, - * ::cuCtxGetDevice + * ::cuCtxGetDevice, * ::cuCtxGetLimit, * ::cuCtxGetSharedMemConfig, * ::cuCtxGetStreamPriorityRange, * ::cudaGetDeviceFlags */ CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); -#endif /* __CUDA_API_VERSION >= 7000 */ /** * \brief Block for a context's tasks to complete @@ -3588,6 +4950,11 @@ CUresult CUDAAPI cuCtxSynchronize(void); * discussed here. * * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * The driver automatically increases the per-thread stack size + * for each kernel launch as needed. This size isn't reset back to the + * original value after each launch. Setting this value will take effect + * immediately, and if necessary, the device will block until all preceding + * requested tasks are complete. * * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE @@ -3610,7 +4977,7 @@ CUresult CUDAAPI cuCtxSynchronize(void); * launch depth of 24. When setting this limit, keep in mind that additional * levels of sync depth require the driver to reserve large amounts of device * memory which can no longer be used for user allocations. If these - * reservations of device memory fail, ::cuCtxSetLimit will return + * reservations of device memory fail, ::cuCtxSetLimit() will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less @@ -3627,13 +4994,21 @@ CUresult CUDAAPI cuCtxSynchronize(void); * runtime, this limit can be increased. Keep in mind that being able to * sustain additional pending launches will require the driver to reserve * larger amounts of device memory upfront which can no longer be used for - * allocations. If these reservations fail, ::cuCtxSetLimit will return + * allocations. If these reservations fail, ::cuCtxSetLimit() will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being * returned. * + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. + * Values can range from 0B to 128B. This is purely a performence hint and + * it can be ignored or clamped depending on the platform. + * + * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes availabe for + * persisting L2 cache. This is purely a performance hint and it can be + * ignored or clamped depending on the platform. + * * \param limit - Limit to set * \param value - Size of limit * @@ -3675,6 +5050,8 @@ CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); * child grid launches to complete. * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding * device runtime launches that can be made from this context. + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. + * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes * * \param limit - Limit to query * \param pvalue - Returned size of limit @@ -3795,7 +5172,6 @@ CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); */ CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); -#if __CUDA_API_VERSION >= 4020 /** * \brief Returns the current shared memory configuration for the current context. * @@ -3890,7 +5266,6 @@ CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); * ::cudaDeviceSetSharedMemConfig */ CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); -#endif /** * \brief Gets the context's API version. @@ -3970,6 +5345,47 @@ CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); */ CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); +/** + * \brief Resets all persisting lines in cache to normal status. + * + * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal + * status. Takes effect on function return. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); + +/** + * \brief Returns the execution affinity setting for the current context. + * + * Returns in \p *pExecAffinity the current value of \p type. The supported + * ::CUexecAffinityType values are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. + * + * \param type - Execution affinity type to query + * \param pExecAffinity - Returned execution affinity + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY + * \notefnerr + * + * \sa + * ::CUexecAffinityParam + */ +CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); + + /** @} */ /* END CUDA_CTX */ /** @@ -4097,6 +5513,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_FILE_NOT_FOUND, @@ -4136,6 +5553,7 @@ CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, @@ -4179,6 +5597,7 @@ CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, @@ -4220,6 +5639,7 @@ CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigne * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, @@ -4252,6 +5672,7 @@ CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr + * \note_destroy_ub * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, @@ -4293,7 +5714,6 @@ CUresult CUDAAPI cuModuleUnload(CUmodule hmod); */ CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); -#if __CUDA_API_VERSION >= 3020 /** * \brief Returns a global pointer from a module * @@ -4328,7 +5748,6 @@ CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const cha * ::cudaGetSymbolSize */ CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); -#endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Returns a handle to a texture reference @@ -4396,8 +5815,6 @@ CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char */ CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); -#if __CUDA_API_VERSION >= 5050 - /** * \brief Creates a pending JIT linker invocation. * @@ -4465,6 +5882,7 @@ cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU * @@ -4504,6 +5922,7 @@ cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, c * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU * @@ -4556,8 +5975,6 @@ cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); CUresult CUDAAPI cuLinkDestroy(CUlinkState state); -#endif /* __CUDA_API_VERSION >= 5050 */ - /** @} */ /* END CUDA_MODULE */ @@ -4573,12 +5990,12 @@ cuLinkDestroy(CUlinkState state); * @{ */ -#if __CUDA_API_VERSION >= 3020 /** * \brief Gets free and total memory * - * Returns in \p *free and \p *total respectively, the free and total amount of - * memory available for allocation by the CUDA context, in bytes. + * Returns in \p *total the total amount of memory available to the the current context. + * Returns in \p *free the amount of memory on the device that is free according to the OS. + * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free. * * \param free - Returned free memory in bytes * \param total - Returned total memory in bytes @@ -4811,7 +6228,6 @@ CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdevic * ::cudaMallocHost */ CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); -#endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Frees page-locked host memory @@ -4878,9 +6294,6 @@ CUresult CUDAAPI cuMemFreeHost(void *p); * All of these flags are orthogonal to one another: a developer may allocate * memory that is portable, mapped and/or write-combined with no restrictions. * - * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in - * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. - * * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for * devices that do not support mapped pinned memory. The failure is deferred * to ::cuMemHostGetDevicePointer() because the memory may be mapped into @@ -4925,7 +6338,6 @@ CUresult CUDAAPI cuMemFreeHost(void *p); */ CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); -#if __CUDA_API_VERSION >= 3020 /** * \brief Passes back device pointer of mapped pinned memory * @@ -4978,7 +6390,6 @@ CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); * ::cudaHostGetDevicePointer */ CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); -#endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Passes back flags that were used for a pinned allocation @@ -5007,8 +6418,6 @@ CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned */ CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); -#if __CUDA_API_VERSION >= 6000 - /** * \brief Allocates memory that will be automatically managed by the Unified Memory system * @@ -5119,10 +6528,6 @@ CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); */ CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); -#endif /* __CUDA_API_VERSION >= 6000 */ - -#if __CUDA_API_VERSION >= 4010 - /** * \brief Returns a handle to a compute device * @@ -5201,8 +6606,7 @@ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode. - * IPC functionality is not supported on Tegra platforms. + * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param pHandle - Pointer to a user allocated CUipcEventHandle * in which to return the opaque event handle @@ -5243,8 +6647,7 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode. - * IPC functionality is not supported on Tegra platforms. + * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param phEvent - Returns the imported event * \param handle - Interprocess handle to open @@ -5287,8 +6690,7 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle) * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode. - * IPC functionality is not supported on Tegra platforms. + * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return * the handle in. @@ -5297,6 +6699,7 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle) * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_VALUE @@ -5327,6 +6730,10 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened * by one ::CUcontext per ::CUdevice per other process. * + * If the memory handle has already been opened by the current context, the + * reference count on the handle is incremented by 1 and the existing device pointer + * is returned. + * * Memory returned from ::cuIpcOpenMemHandle must be freed with * ::cuIpcCloseMemHandle. * @@ -5336,8 +6743,7 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode. - * IPC functionality is not supported on Tegra platforms. + * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param pdptr - Returned device pointer * \param handle - ::CUipcMemHandle to open @@ -5368,9 +6774,10 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); /** - * \brief Close memory mapped with ::cuIpcOpenMemHandle + * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle * - * Unmaps memory returnd by ::cuIpcOpenMemHandle. The original allocation + * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. + * When the reference count reaches 0, this API unmaps the memory. The original allocation * in the exporting process as well as imported mappings in other processes * will be unaffected. * @@ -5379,8 +6786,7 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u * * IPC functionality is restricted to devices with support for unified * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode. - * IPC functionality is not supported on Tegra platforms. + * IPC functionality on Windows is restricted to GPUs in TCC mode * * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle * @@ -5401,9 +6807,6 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u */ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); -#endif /* __CUDA_API_VERSION >= 4010 */ - -#if __CUDA_API_VERSION >= 4000 /** * \brief Registers an existing host memory range for use by CUDA * @@ -5420,9 +6823,6 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * * This function has limited support on Mac OS X. OS 10.7 or higher is required. * - * This function is supported only on I/O coherent devices that have a non-zero value - * for the device attribute ::CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED. - * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * @@ -5437,12 +6837,18 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some * I/O memory space, e.g. the PCI Express resource of a 3rd party device. * + * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory + * that is considered read-only by the device. On platforms without + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is + * required in order to register memory mapped to the CPU as read-only. Support + * for the use of this flag can be queried from the device attribute + * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with + * a current context associated with a device that does not have this attribute + * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. + * * All of these flags are orthogonal to one another: a developer may page-lock * memory that is portable or mapped with no restrictions. * - * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in - * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect. - * * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for * devices that do not support mapped pinned memory. The failure is deferred * to ::cuMemHostGetDevicePointer() because the memory may be mapped into @@ -5538,6 +6944,7 @@ CUresult CUDAAPI cuMemHostUnregister(void *p); * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -5585,9 +6992,6 @@ CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); */ CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); -#endif /* __CUDA_API_VERSION >= 4000 */ - -#if __CUDA_API_VERSION >= 3020 /** * \brief Copies memory from Host to Device * @@ -5607,6 +7011,7 @@ CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdev * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -5642,6 +7047,7 @@ CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -5789,6 +7195,7 @@ CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t sr * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -5825,6 +7232,7 @@ CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *sr * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -6374,9 +7782,7 @@ CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); * ::cudaMemcpy3D */ CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); -#endif /* __CUDA_API_VERSION >= 3020 */ -#if __CUDA_API_VERSION >= 4000 /** * \brief Copies memory between contexts * @@ -6426,6 +7832,7 @@ CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); * \notefnerr * \note_async * \note_null_stream + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -6477,9 +7884,7 @@ CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCoun * ::cudaMemcpyPeerAsync */ CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); -#endif /* __CUDA_API_VERSION >= 4000 */ -#if __CUDA_API_VERSION >= 3020 /** * \brief Copies memory from Host to Device * @@ -6502,6 +7907,7 @@ CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, * \notefnerr * \note_async * \note_null_stream + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -6542,6 +7948,7 @@ CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, s * \notefnerr * \note_async * \note_null_stream + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -6625,6 +8032,7 @@ CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, * \notefnerr * \note_async * \note_null_stream + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -6666,6 +8074,7 @@ CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const voi * \notefnerr * \note_async * \note_null_stream + * \note_memcpy * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, @@ -7025,9 +8434,7 @@ CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); * ::cudaMemcpy3DAsync */ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); -#endif /* __CUDA_API_VERSION >= 3020 */ -#if __CUDA_API_VERSION >= 4000 /** * \brief Copies memory between contexts asynchronously. * @@ -7053,9 +8460,7 @@ CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); * ::cudaMemcpy3DPeerAsync */ CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); -#endif /* __CUDA_API_VERSION >= 4000 */ -#if __CUDA_API_VERSION >= 3020 /** * \brief Initializes device memory * @@ -7171,7 +8576,7 @@ CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param uc - Value to set * \param Width - Width of row * \param Height - Number of rows @@ -7212,7 +8617,7 @@ CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned c * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param us - Value to set * \param Width - Width of row * \param Height - Number of rows @@ -7253,7 +8658,7 @@ CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param ui - Value to set * \param Width - Width of row * \param Height - Number of rows @@ -7403,7 +8808,7 @@ CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param uc - Value to set * \param Width - Width of row * \param Height - Number of rows @@ -7446,7 +8851,7 @@ CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsig * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param us - Value to set * \param Width - Width of row * \param Height - Number of rows @@ -7489,7 +8894,7 @@ CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsi * ::cuMemAllocPitch(). * * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) * \param ui - Value to set * \param Width - Width of row * \param Height - Number of rows @@ -7582,7 +8987,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi * float16's: * \code CUDA_ARRAY_DESCRIPTOR desc; - desc.FormatFlags = CU_AD_FORMAT_HALF; + desc.Format = CU_AD_FORMAT_HALF; desc.NumChannels = 4; desc.Width = width; desc.Height = height; @@ -7592,7 +8997,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi * of which is two 8-bit unsigned chars: * \code CUDA_ARRAY_DESCRIPTOR arrayDesc; - desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; + desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; desc.NumChannels = 2; desc.Width = width; desc.Height = height; @@ -7658,8 +9063,88 @@ CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pA * ::cudaArrayGetInfo */ CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); -#endif /* __CUDA_API_VERSION >= 3020 */ +/** + * \brief Returns the layout properties of a sparse CUDA array + * + * Returns the layout properties of a sparse CUDA array in \p sparseProperties + * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. + * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. + * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained + * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties + * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + * \param[in] array - CUDA array to get the sparse properties of + * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); + +/** + * \brief Returns the layout properties of a sparse CUDA mipmapped array + * + * Returns the sparse array layout properties in \p sparseProperties + * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the + * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth + * is less than that of the tile. + * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. + * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. + * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of + * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); + +/** + * \brief Gets a CUDA array plane from a CUDA array + * + * Returns in \p pPlaneArray a CUDA array that represents a single format plane + * of the CUDA array \p hArray. + * + * If \p planeIdx is greater than the maximum number of planes in this array or if the array does + * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns + * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width + * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + * + * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx + * \param hArray - Multiplanar CUDA array + * \param planeIdx - Plane index + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuArrayCreate, + * ::cudaGetArrayPlane + */ +CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); /** * \brief Destroys a CUDA array @@ -7692,7 +9177,6 @@ CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, C */ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); -#if __CUDA_API_VERSION >= 3020 /** * \brief Creates a 3D CUDA array * @@ -7839,7 +9323,7 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); * 4x16-bit float16's: * \code CUDA_ARRAY3D_DESCRIPTOR desc; - desc.FormatFlags = CU_AD_FORMAT_HALF; + desc.Format = CU_AD_FORMAT_HALF; desc.NumChannels = 4; desc.Width = width; desc.Height = height; @@ -7910,9 +9394,6 @@ CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR * ::cudaArrayGetInfo */ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); -#endif /* __CUDA_API_VERSION >= 3020 */ - -#if __CUDA_API_VERSION >= 5000 /** * \brief Creates a CUDA mipmapped array @@ -8111,10 +9592,932 @@ CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray */ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); -#endif /* __CUDA_API_VERSION >= 5000 */ - /** @} */ /* END CUDA_MEM */ +/** + * \defgroup CUDA_VA Virtual Memory Management + * + * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the virtual memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** +* \brief Allocate an address range reservation. +* +* Reserves a virtual address range based on the given parameters, giving +* the starting address of the range in \p ptr. This API requires a system that +* supports UVA. The size and address parameters must be a multiple of the +* host page size and the alignment must be a power of two or zero for default +* alignment. +* +* \param[out] ptr - Resulting pointer to start of virtual address range allocated +* \param[in] size - Size of the reserved virtual address range requested +* \param[in] alignment - Alignment of the reserved virtual address range requested +* \param[in] addr - Fixed starting address range requested +* \param[in] flags - Currently unused, must be zero +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemAddressFree +*/ +CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); + +/** +* \brief Free an address range reservation. +* +* Frees a virtual address range reserved by cuMemAddressReserve. The size +* must match what was given to memAddressReserve and the ptr given must +* match what was returned from memAddressReserve. +* +* \param[in] ptr - Starting address of the virtual address range to free +* \param[in] size - Size of the virtual address region to free +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemAddressReserve +*/ +CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); + +/** +* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties +* +* This creates a memory allocation on the target device specified through the +* \p prop strcuture. The created allocation will not have any device or host +* mappings. The generic memory \p handle for the allocation can be +* mapped to the address space of calling process via ::cuMemMap. This handle +* cannot be transmitted directly to other processes (see +* ::cuMemExportToShareableHandle). On Windows, the caller must also pass +* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which +* limits or allows access to this handle for a recepient process (see +* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this +* allocation must be a multiple of the the value given via +* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM +* flag. +* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then +* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays +* and sparse CUDA mipmapped arrays. +* (see ::cuMemMapArrayAsync). +* +* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. +* \param[in] size - Size of the allocation requested +* \param[in] prop - Properties of the allocation to create. +* \param[in] flags - flags for future use, must be zero now. +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); + +/** +* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. +* +* Frees the memory that was allocated on a device through cuMemCreate. +* +* The memory allocation will be freed when all outstanding mappings to the memory +* are unmapped and when all outstanding references to the handle (including it's +* shareable counterparts) are also released. The generic memory handle can be +* freed when there are still outstanding mappings made with this handle. Each +* time a recepient process imports a shareable handle, it needs to pair it with +* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle +* the behavior is undefined. +* +* \param[in] handle Value of handle which was returned previously by cuMemCreate. +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemCreate +*/ +CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); + +/** +* \brief Maps an allocation handle to a reserved virtual address range. +* +* Maps bytes of memory represented by \p handle starting from byte \p offset to +* \p size to address range [\p addr, \p addr + \p size]. This range must be an +* address reservation previously reserved with ::cuMemAddressReserve, and +* \p offset + \p size must be less than the size of the memory allocation. +* Both \p ptr, \p size, and \p offset must be a multiple of the value given via +* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. +* +* Please note calling ::cuMemMap does not make the address accessible, +* the caller needs to update accessibility of a contiguous mapped VA +* range by calling ::cuMemSetAccess. +* +* Once a recipient process obtains a shareable memory handle +* from ::cuMemImportFromShareableHandle, the process must +* use ::cuMemMap to map the memory into its address ranges before +* setting accessibility with ::cuMemSetAccess. +* +* ::cuMemMap can only create mappings on VA range reservations +* that are not currently mapped. +* +* \param[in] ptr - Address where memory will be mapped. +* \param[in] size - Size of the memory mapping. +* \param[in] offset - Offset into the memory represented by +* - \p handle from which to start mapping +* - Note: currently must be zero. +* \param[in] handle - Handle to a shareable memory +* \param[in] flags - flags for future use, must be zero now. +* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); + +/** + * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays + * + * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. + * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. + * The structure ::CUarrayMapInfo is defined as follow: + \code + typedef struct CUarrayMapInfo_st { + CUresourcetype resourceType; + union { + CUmipmappedArray mipmap; + CUarray array; + } resource; + + CUarraySparseSubresourceType subresourceType; + union { + struct { + unsigned int level; + unsigned int layer; + unsigned int offsetX; + unsigned int offsetY; + unsigned int offsetZ; + unsigned int extentWidth; + unsigned int extentHeight; + unsigned int extentDepth; + } sparseLevel; + struct { + unsigned int layer; + unsigned long long offset; + unsigned long long size; + } miptail; + } subresource; + + CUmemOperationType memOperationType; + + CUmemHandleType memHandleType; + union { + CUmemGenericAllocationHandle memHandle; + } memHandle; + + unsigned long long offset; + unsigned int deviceBitMask; + unsigned int flags; + unsigned int reserved[2]; + } CUarrayMapInfo; + \endcode + * + * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. + * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then + * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. + * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using + * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE. + * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. + * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY + * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. + * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been + * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE. + * + * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. + * ::CUarraySparseSubresourceType_enum is defined as: + \code + typedef enum CUarraySparseSubresourceType_enum { + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 + } CUarraySparseSubresourceType; + \endcode + * + * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a + * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which + * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by + * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. + * + * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL + * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. + * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY + * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. + * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight + * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. + * These offsets and extents must be aligned to the corresponding tile dimension. + * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, + * must be zero. + * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, + * must be zero. + * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth + * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. + * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties + * + * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL + * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in + * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. + * Both, mip tail offset and mip tail size must be aligned to the tile size. + * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags + * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. + * Otherwise, must be zero. + * + * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: + \code + typedef enum CUmemOperationType_enum { + CU_MEM_OPERATION_TYPE_MAP = 1, + CU_MEM_OPERATION_TYPE_UNMAP = 2 + } CUmemOperationType; + \endcode + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource + * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. + * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, + * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. + * + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation + * is performed. ::CUarrayMapInfo::memHandle must be NULL. + * + * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. + * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match + * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. + * + * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \param[in] mapInfoList - List of ::CUarrayMapInfo + * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList + * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations + * + * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties + */ +CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); + +/** +* \brief Unmap the backing memory of a given address range. +* +* The range must be the entire contiguous address range that was mapped to. In +* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped +* by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed +* if there are no existing mappings and there are no unreleased memory handles. +* +* When ::cuMemUnmap returns successfully the address range is converted to an +* address reservation and can be used for a future calls to ::cuMemMap. Any new +* mapping to this virtual address will need to have access granted through +* ::cuMemSetAccess, as all mappings start with no accessibility setup. +* +* \param[in] ptr - Starting address for the virtual address range to unmap +* \param[in] size - Size of the virtual address range to unmap +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* \note_sync +* +* \sa ::cuMemCreate, ::cuMemAddressReserve +*/ +CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); + +/** +* \brief Set the access flags for each location specified in \p desc for the given virtual address range +* +* Given the virtual address range via \p ptr and \p size, and the locations +* in the array given by \p desc and \p count, set the access flags for the +* target locations. The range must be a fully mapped address range +* containing all allocations created by ::cuMemMap / ::cuMemCreate. +* +* \param[in] ptr - Starting address for the virtual address range +* \param[in] size - Length of the virtual address range +* \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the +* - mapping for each location specified +* \param[in] count - Number of ::CUmemAccessDesc in \p desc +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* \note_sync +* +* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap +*/ +CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); + +/** +* \brief Get the access \p flags set for the given \p location and \p ptr +* +* \param[out] flags - Flags set for this location +* \param[in] location - Location in which to check the flags for +* \param[in] ptr - Address in which to check the access flags for +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemSetAccess +*/ +CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); + +/** +* \brief Exports an allocation to a requested shareable handle type +* +* Given a CUDA memory handle, create a shareable memory +* allocation handle that can be used to share the memory with other +* processes. The recipient process can convert the shareable handle back into a +* CUDA memory handle using ::cuMemImportFromShareableHandle and map +* it with ::cuMemMap. The implementation of what this handle is and how it +* can be transferred is defined by the requested handle type in \p handleType +* +* Once all shareable handles are closed and the allocation is released, the allocated +* memory referenced will be released back to the OS and uses of the CUDA handle afterward +* will lead to undefined behavior. +* +* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) +* that support importing memory from the shareable type +* +* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type +* \param[in] handle - CUDA handle for the memory allocation +* \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) +* \param[in] flags - Reserved, must be zero +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); + +/** +* \brief Imports an allocation from a requested shareable handle type. +* +* If the current process cannot support the memory described by this shareable +* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. +* +* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) +* created on devices under an SLI group may not be supported, and thus this API will +* return CUDA_ERROR_NOT_SUPPORTED. +* There is no guarantee that the contents of \p handle will be the same CUDA memory handle +* for the same given OS shareable handle, or the same underlying allocation. +* +* \param[out] handle - CUDA Memory handle for the memory allocation. +* \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. +* \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease +*/ +CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); + +/** +* \brief Calculates either the minimal or recommended granularity +* +* Calculates either the minimal or recommended granularity +* for a given allocation specification and returns it in granularity. This +* granularity can be used as a multiple for alignment, size, or address mapping. +* +* \param[out] granularity Returned granularity. +* \param[in] prop Property for which to determine the granularity for +* \param[in] option Determines which granularity to return +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemMap +*/ +CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); + +/** +* \brief Retrieve the contents of the property structure defining properties for this handle +* +* \param[out] prop - Pointer to a properties structure which will hold the information about this handle +* \param[in] handle - Handle which to perform the query on +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); + +/** +* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. +* +* The handle is guaranteed to be the same handle value used to map the memory. If the address +* requested is not mapped, the function will fail. The returned handle must be released with +* corresponding number of calls to ::cuMemRelease. +* +* \note The address \p addr, can be any address in a range previously mapped +* by ::cuMemMap, and not necessarily the start address. +* +* \param[out] handle CUDA Memory handle for the backing memory allocation. +* \param[in] addr Memory address to query, that has been mapped previously. +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap +*/ +CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); + +/** @} */ /* END CUDA_VA */ + +/** + * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator + * + * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. + * Functions for controlling the behavior of the underlying allocator. + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream ordered memory allocator exposed by the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_MALLOC_ASYNC_overview overview + * + * The asynchronous allocator allows the user to allocate and free in stream order. + * All asynchronous accesses of the allocation must happen between + * the stream executions of the allocation and the free. If the memory is accessed + * outside of the promised stream order, a use before allocation / use after free error + * will cause undefined behavior. + * + * The allocator is free to reallocate the memory as long as it can guarantee + * that compliant memory accesses will not overlap temporally. + * The allocator may refer to internal stream ordering as well as inter-stream dependencies + * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. + * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. + * + * \section CUDA_MALLOC_ASYNC_support Supported Platforms + * + * Whether or not a device supports the integrated stream ordered memory allocator + * may be queried by calling ::cuDeviceGetAttribute() with the device attribute + * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED + */ + +/** + * \brief Frees memory with stream ordered semantics + * + * Inserts a free operation into \p hStream. + * The allocation must not be accessed after stream execution reaches the free. + * After this API returns, accessing the memory from any subsequent work launched on the GPU + * or querying its pointer attributes results in undefined behavior. + * + * \note During stream capture, this function results in the creation of a free node and + * must therefore be passed the address of a graph allocation. + * + * \param dptr - memory to free + * \param hStream - The stream establishing the stream ordering contract. + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED + */ +CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); + +/** + * \brief Allocates memory with stream ordered semantics + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the memory pool current to the stream's device. + * + * \note The default memory pool of a device contains device memory from that device. + * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * \note During stream capture, this function results in the creation of an allocation node. In this case, + * the allocation is owned by the graph instead of the memory pool. The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] dptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, + * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, + * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); + +/** + * \brief Tries to release memory back to the OS + * + * Releases memory back to the OS until the pool contains fewer than minBytesToKeep + * reserved bytes, or there is no more memory that the allocator can safely release. + * The allocator cannot release OS allocations that back outstanding asynchronous allocations. + * The OS allocations may happen at different granularity from the user allocations. + * + * \note: Allocations that have not been freed count as outstanding. + * \note: Allocations that have been asynchronously freed but whose completion has + * not been observed on the host (eg. by a synchronize) can count as outstanding. + * + * \param[in] pool - The memory pool to trim + * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, + * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have + * at least minBytesToKeep bytes reserved after the operation. + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); + +/** + * \brief Sets attributes of a memory pool + * + * Supported attributes are: + * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cuMemFreeAsync (default enabled). + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of backing memory that was + * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. + * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of used memory that was + * allocated for the memory pool. + * + * \param[in] pool - The memory pool to modify + * \param[in] attr - The attribute to modify + * \param[in] value - Pointer to the value to assign + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); + +/** + * \brief Gets attributes of a memory pool + * + * Supported attributes are: + * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cuMemFreeAsync (default enabled). + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset. + * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application. + * + * \param[in] pool - The memory pool to get attributes of + * \param[in] attr - The attribute to get + * \param[out] value - Retrieved value + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); + +/** + * \brief Controls visibility of pools between devices + * + * \param[in] pool - The pool being modified + * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. + * \param[in] count - Number of descriptors in the map array. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); + +/** + * \brief Returns the accessibility of a pool from a device + * + * Returns the accessibility of the pool's memory from the specified location. + * + * \param[out] flags - the accessibility of the pool from the specified location + * \param[in] memPool - the pool being queried + * \param[in] location - the location accessing the pool + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); + +/** + * \brief Creates a memory pool + * + * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines + * the properties of the pool such as the backing device and IPC capabilities. + * + * By default, the pool's memory will be accessible from the device it is allocated on. + * + * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NOT_SUPPORTED + * + * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, + * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle + */ +CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); + +/** + * \brief Destroys the specified memory pool + * + * If any pointers obtained from this pool haven't been freed or + * the pool has free operations that haven't completed + * when ::cuMemPoolDestroy is invoked, the function will return immediately and the + * resources associated with the pool will be released automatically + * once there are no more outstanding allocations. + * + * Destroying the current mempool of a device sets the default mempool of + * that device as the current mempool for that device. + * + * \note A device's default memory pool cannot be destroyed. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, + * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); + +/** + * \brief Allocates memory from a specified pool with stream ordered semantics. + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the specified memory pool. + * + * \note + * - The specified memory pool may be from a device different than that of the specified \p hStream. + * + * - Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * + * \note During stream capture, this function results in the creation of an allocation node. In this case, + * the allocation is owned by the graph instead of the memory pool. The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] dptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] pool - The pool to allocate from + * \param[in] hStream - The stream establishing the stream ordering semantic + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, + * ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); + +/** + * \brief Exports a memory pool to the requested handle type. + * + * Given an IPC capable mempool, create an OS handle to share the pool with another process. + * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. + * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. + * The implementation of what the shareable handle is and how it can be transferred is defined by the requested + * handle type. + * + * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. + * + * \param[out] handle_out - Returned OS handle + * \param[in] pool - pool to export + * \param[in] handleType - the type of handle to create + * \param[in] flags - must be 0 + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, + * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, + * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, + * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); + +/** + * \brief imports a memory pool from a shared handle. + * + * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. + * + * \note Imported memory pools do not support creating new allocations. + * As such imported memory pools may not be used in cuDeviceSetMemPool + * or ::cuMemAllocFromPoolAsync calls. + * + * \param[out] pool_out - Returned memory pool + * \param[in] handle - OS handle of the pool to open + * \param[in] handleType - The type of handle being imported + * \param[in] flags - must be 0 + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer + */ +CUresult CUDAAPI cuMemPoolImportFromShareableHandle( + CUmemoryPool *pool_out, + void *handle, + CUmemAllocationHandleType handleType, + unsigned long long flags); + +/** + * \brief Export data to share a memory pool allocation between processes. + * + * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. + * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. + * The data is not a handle and may be shared through any IPC mechanism. + * + * \param[out] shareData_out - Returned export data + * \param[in] ptr - pointer to memory being exported + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer + */ +CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); + +/** + * \brief Import a memory pool allocation from another process. + * + * Returns in \p ptr_out a pointer to the imported memory. + * The imported memory must not be accessed before the allocation operation completes + * in the exporting process. The imported memory must be freed from all importing processes before + * being freed in the exporting process. The pointer may be freed with cuMemFree + * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed + * on the importing process before the free operation on the exporting process. + * + * \note The cuMemFreeAsync api may be used in the exporting process before + * the cuMemFreeAsync operation completes in its stream as long as the + * cuMemFreeAsync in the exporting process specifies a stream with + * a stream dependency on the importing process's cuMemFreeAsync. + * + * \param[out] ptr_out - pointer to imported memory + * \param[in] pool - pool from which to import + * \param[in] shareData - data specifying the memory to import + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer + */ +CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); + +/** @} */ /* END CUDA_MALLOC_ASYNC */ + /** * \defgroup CUDA_UNIFIED Unified Addressing * @@ -8208,7 +10611,6 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * */ -#if __CUDA_API_VERSION >= 4000 /** * \brief Returns information about a pointer * @@ -8314,11 +10716,47 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * Returns in \p *data a boolean that indicates whether the pointer points to * managed memory or not. * + * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. + * * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: * * Returns in \p *data an integer representing a device ordinal of a device against * which the memory was allocated or registered. * + * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: + * + * Returns in \p *data a boolean that indicates if this pointer maps to + * an allocation that is suitable for ::cudaIpcGetMemHandle. + * + * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: + * + * Returns in \p *data the starting address for the allocation referenced + * by the device pointer \p ptr. Note that this is not necessarily the + * address of the mapped region, but the address of the mappable address + * range \p ptr references (e.g. from ::cuMemAddressReserve). + * + * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: + * + * Returns in \p *data the size for the allocation referenced by the device + * pointer \p ptr. Note that this is not necessarily the size of the mapped + * region, but the size of the mappable address range \p ptr references + * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped + * region, see ::cuMemGetAddressRange + * + * - ::CU_POINTER_ATTRIBUTE_MAPPED: + * + * Returns in \p *data a boolean that indicates if this pointer is in a + * valid address range that is mapped to a backing allocation. + * + * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: + * + * Returns a bitmask of the allowed handle types for an allocation that may + * be passed to ::cuMemExportToShareableHandle. + * + * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: + * + * Returns in \p *data the handle to the mempool that the allocation was obtained from. + * * \par * * Note that for most allocations in the unified virtual address space @@ -8362,9 +10800,7 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * ::cudaPointerGetAttributes */ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); -#endif /* __CUDA_API_VERSION >= 4000 */ -#if __CUDA_API_VERSION >= 8000 /** * \brief Prefetches memory to the specified destination device * @@ -8639,14 +11075,12 @@ CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * - * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise + * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, * ::cuMemPrefetchAsync, * ::cudaMemRangeGetAttributes */ CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); -#endif /* __CUDA_API_VERSION >= 8000 */ -#if __CUDA_API_VERSION >= 6000 /** * \brief Set attributes on a previously allocated memory region * @@ -8688,9 +11122,7 @@ CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_r * ::cuMemHostUnregister */ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); -#endif /* __CUDA_API_VERSION >= 6000 */ -#if __CUDA_API_VERSION >= 7000 /** * \brief Returns information about a pointer. * @@ -8704,6 +11136,12 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR + * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE + * - ::CU_POINTER_ATTRIBUTE_MAPPED + * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE + * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES + * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE * * \param numAttributes - Number of attributes to query * \param attributes - An array of attributes to query @@ -8733,7 +11171,6 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at * ::cudaPointerGetAttributes */ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); -#endif /* __CUDA_API_VERSION >= 7000 */ /** @} */ /* END CUDA_UNIFIED */ @@ -8753,7 +11190,9 @@ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_at * \brief Create a stream * * Creates a stream and returns a handle in \p phStream. The \p Flags argument - * determines behaviors of the stream. Valid values for \p Flags are: + * determines behaviors of the stream. + * + * Valid values for \p Flags are: * - ::CU_STREAM_DEFAULT: Default stream creation flag. * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created * stream may run concurrently with work in stream 0 (the NULL stream), and that @@ -8892,8 +11331,6 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); */ CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); -#if __CUDA_API_VERSION >= 9020 - /** * \brief Query the context associated with a stream * @@ -8938,8 +11375,6 @@ CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); */ CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); -#endif /* __CUDA_API_VERSION >= 9020 */ - /** * \brief Make a compute stream wait on an event * @@ -8948,9 +11383,15 @@ CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); * The synchronization will be performed efficiently on the device when applicable. * \p hEvent may be from a different context or device than \p hStream. * + * flags include: + * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external + * event node when performing stream capture. This flag is invalid outside + * of stream capture. + * * \param hStream - Stream to wait * \param hEvent - Event to wait on (may not be NULL) - * \param Flags - Parameters for the operation (must be 0) + * \param Flags - See ::CUevent_capture_flags * * \return * ::CUDA_SUCCESS, @@ -9046,8 +11487,6 @@ CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned in */ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); -#if __CUDA_API_VERSION >= 10000 - /** * \brief Begins graph capture on a stream * @@ -9056,9 +11495,16 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which * it was initiated, and it may only be initiated if the stream is not already in capture - * mode. The capture mode may be queried via ::cuStreamIsCapturing. + * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id + * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. + * + * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be + * called on this stream from the same thread. * * \param hStream - Stream in which to initiate capture + * \param mode - Controls the interaction of this capture sequence with other API + * calls that are potentially unsafe. For more details see + * ::cuThreadExchangeStreamCaptureMode. * * \note Kernels captured using this API must not use texture and surface references. * Reading or writing through any texture or surface reference is undefined @@ -9074,9 +11520,63 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback * \sa * ::cuStreamCreate, * ::cuStreamIsCapturing, - * ::cuStreamEndCapture + * ::cuStreamEndCapture, + * ::cuThreadExchangeStreamCaptureMode */ -CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); + +/** + * \brief Swaps the stream capture interaction mode for a thread + * + * Sets the calling thread's stream capture interaction mode to the value contained + * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To + * facilitate deterministic behavior across function or module boundaries, callers + * are encouraged to use this API in a push-pop fashion: \code + CUstreamCaptureMode mode = desiredMode; + cuThreadExchangeStreamCaptureMode(&mode); + ... + cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode + * \endcode + * + * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call + * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is + * not enqueued asynchronously to a stream, and is not observed by stream capture. + * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture + * depended on the allocation being replayed whenever the graph is launched, the + * captured graph would be invalid. + * + * Therefore, stream capture places restrictions on API calls that can be made within + * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This + * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. + * + * A thread's mode is one of the following: + * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has + * an ongoing capture sequence that was not initiated with + * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread + * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, + * this thread is prohibited from potentially unsafe API calls. + * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture + * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited + * from potentially unsafe API calls. Concurrent capture sequences in other threads + * are ignored. + * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially + * unsafe API calls. Note that the thread is still prohibited from API calls which + * necessarily conflict with stream capture, for example, attempting ::cuEventQuery + * on an event that was last recorded inside a capture sequence. + * + * \param mode - Pointer to mode value to swap with the current mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture + */ +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); /** * \brief Ends capture on a stream, returning the captured graph @@ -9086,6 +11586,10 @@ CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); * If capture was invalidated, due to a violation of the rules of stream capture, then * a NULL graph will be returned. * + * If the \p mode argument to ::cuStreamBeginCapture was not + * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as + * ::cuStreamBeginCapture. + * * \param hStream - Stream to query * \param phGraph - The captured graph * @@ -9093,7 +11597,8 @@ CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD * \notefnerr * * \sa @@ -9143,9 +11648,120 @@ CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); */ CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); -#endif /* __CUDA_API_VERSION >= 10000 */ +/** + * \brief Query capture status of a stream + * + * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will + * supplant this version in 12.0, which is retained for minor version compatibility. + * + * Query the capture status of a stream and and get an id for + * the capture sequence, which is unique over the lifetime of the process. + * + * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created + * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. + * + * A valid id is returned only if both of the following are true: + * - the call returns CUDA_SUCCESS + * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamGetCaptureInfo_v2, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ +CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); -#if __CUDA_API_VERSION >= 6000 +/** + * \brief Query a stream's capture state (11.3+) + * + * Query stream state related to stream capture. + * + * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created + * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. + * + * Valid data (other than capture status) is returned only if both of the following are true: + * - the call returns CUDA_SUCCESS + * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE + * + * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the + * previous version in 12.0. Developers requiring compatibility across minor versions to + * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback + * path. + * + * \param hStream - The stream to query + * \param captureStatus_out - Location to return the capture status of the stream; required + * \param id_out - Optional location to return an id for the capture sequence, which is + * unique over the lifetime of the process + * \param graph_out - Optional location to return the graph being captured into. All + * operations other than destroy and node removal are permitted on the graph + * while the capture sequence is in progress. This API does not transfer + * ownership of the graph, which is transferred or destroyed at + * ::cuStreamEndCapture. Note that the graph handle may be invalidated before + * end of capture for certain errors. Nodes that are or become + * unreachable from the original stream at ::cuStreamEndCapture due to direct + * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. + * \param dependencies_out - Optional location to store a pointer to an array of nodes. + * The next node to be captured in the stream will depend on this set of nodes, + * absent operations such as event wait which modify this set. The array pointer + * is valid until the next API call which operates on the stream or until end of + * capture. The node handles may be copied out and are valid until they or the + * graph is destroyed. The driver-owned array may also be passed directly to + * APIs that operate on the graph (not the stream) without copying. + * \param numDependencies_out - Optional location to store the size of the array + * returned in dependencies_out. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamGetCaptureInfo, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing, + * ::cuStreamUpdateCaptureDependencies + */ +CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, + cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); + +/** + * \brief Update the set of dependencies in a capturing stream (11.3+) + * + * Modifies the dependency set of a capturing stream. The dependency set is the set + * of nodes that the next captured node in the stream will depend on. + * + * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and + * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to + * the API is added to the existing set or replaces it. A flags value of 0 defaults + * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. + * + * Nodes that are removed from the dependency set via this API do not result in + * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at + * ::cuStreamEndCapture. + * + * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. + * + * This API is new in CUDA 11.3. Developers requiring compatibility across minor + * versions to CUDA 11.0 should not use this API or provide a fallback. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_ILLEGAL_STATE + * + * \sa + * ::cuStreamBeginCapture, + * ::cuStreamGetCaptureInfo, + * ::cuStreamGetCaptureInfo_v2 + */ +CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); /** * \brief Attach memory to a stream asynchronously @@ -9235,8 +11851,6 @@ CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *ca */ CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); -#endif /* __CUDA_API_VERSION >= 6000 */ - /** * \brief Determine status of a compute stream * @@ -9296,7 +11910,6 @@ CUresult CUDAAPI cuStreamQuery(CUstream hStream); */ CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); -#if __CUDA_API_VERSION >= 4000 /** * \brief Destroys a stream * @@ -9326,7 +11939,71 @@ CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); * ::cudaStreamDestroy */ CUresult CUDAAPI cuStreamDestroy(CUstream hStream); -#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Copies attributes from source stream to destination stream. + * + * Copies attributes from source stream \p src to destination stream \p dst. + * Both streams must have the same context. + * + * \param[out] dst Destination stream + * \param[in] src Source stream + * For list of attributes see ::CUstreamAttrID + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); + +/** + * \brief Queries stream attribute. + * + * Queries attribute \p attr from \p hStream and stores it in corresponding + * member of \p value_out. + * + * \param[in] hStream + * \param[in] attr + * \param[out] value_out + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, + CUstreamAttrValue *value_out); + +/** + * \brief Sets stream attribute. + * + * Sets attribute \p attr on \p hStream from corresponding attribute of + * \p value. The updated attribute will be applied to subsequent work + * submitted to the stream. It will not affect previously submitted work. + * + * \param[out] hStream + * \param[in] attr + * \param[in] value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, + const CUstreamAttrValue *value); /** @} */ /* END CUDA_STREAM */ @@ -9421,10 +12098,60 @@ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); * ::cuStreamWaitEvent, * ::cuEventDestroy, * ::cuEventElapsedTime, - * ::cudaEventRecord + * ::cudaEventRecord, + * ::cuEventRecordWithFlags */ CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecordWithFlags() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * flags include: + * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. + * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external + * event node when performing stream capture. This flag is invalid outside + * of stream capture. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * \param flags - See ::CUevent_capture_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cuEventRecord, + * ::cudaEventRecord + */ +CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); + /** * \brief Queries an event's status * @@ -9488,7 +12215,6 @@ CUresult CUDAAPI cuEventQuery(CUevent hEvent); */ CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); -#if __CUDA_API_VERSION >= 4000 /** * \brief Destroys an event * @@ -9517,7 +12243,6 @@ CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); * ::cudaEventDestroy */ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); -#endif /* __CUDA_API_VERSION >= 4000 */ /** * \brief Computes the elapsed time between two events @@ -9577,8 +12302,6 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven * @{ */ -#if __CUDA_API_VERSION >= 10000 - /** * \brief Imports an external memory object * @@ -9598,7 +12321,9 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven void *handle; const void *name; } win32; + const void *nvSciBufObject; } handle; + unsigned long long size; unsigned int flags; } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; * \endcode @@ -9609,11 +12334,14 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven * * \code typedef enum CUexternalMemoryHandleType_enum { - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 } CUexternalMemoryHandleType; * \endcode * @@ -9654,7 +12382,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3DDevice::CreateSharedHandle when referring to a + * is returned by ID3D12Device::CreateSharedHandle when referring to a * ID3D12Heap object. This handle holds a reference to the underlying * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * is not NULL, then it must point to a NULL-terminated array of @@ -9666,17 +12394,55 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3DDevice::CreateSharedHandle when referring to a + * is returned by ID3D12Device::CreateSharedHandle when referring to a * ID3D12Resource object. This handle holds a reference to the * underlying object. If * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name * is not NULL, then it must point to a NULL-terminated array of * UTF-16 characters that refers to a ID3D12Resource object. * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * represent a valid shared NT handle that is returned by + * IDXGIResource1::CreateSharedHandle when referring to a + * ID3D11Resource object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D11Resource object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * represent a valid shared KMT handle that is returned by + * IDXGIResource::GetSharedHandle when referring to a + * ID3D11Resource object and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL + * and reference a valid NvSciBuf object. + * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the + * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync + * as appropriate barriers to maintain coherence between CUDA and the other drivers. + * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC + * for memory synchronization. + * + * + * The size of the memory object must be specified in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. + * * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the * resource is a dedicated resource. The definition of what a * dedicated resource is outside the scope of this extension. + * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type + * is one of the following: + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT * * \param extMem_out - Returned handle to an external memory object * \param memHandleDesc - Memory import handle descriptor @@ -9684,6 +12450,7 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * @@ -9734,6 +12501,8 @@ CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA * appropriate offsets to the returned pointer to derive the * individual buffers. * + * The returned pointer \p devPtr must be freed using ::cuMemFree. + * * \param devPtr - Returned device pointer to buffer * \param extMem - Handle to external memory object * \param bufferDesc - Buffer descriptor @@ -9741,10 +12510,11 @@ CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * - * \sa ::cuImportExternalMemory + * \sa ::cuImportExternalMemory, * ::cuDestroyExternalMemory, * ::cuExternalMemoryGetMappedMipmappedArray */ @@ -9781,6 +12551,11 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternal * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies * the total number of levels in the mipmap chain. * + * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. + * + * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. + * * \param mipmap - Returned CUDA mipmapped array * \param extMem - Handle to external memory object * \param mipmapDesc - CUDA array descriptor @@ -9788,22 +12563,23 @@ CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternal * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * - * \sa ::cuImportExternalMemory + * \sa ::cuImportExternalMemory, * ::cuDestroyExternalMemory, * ::cuExternalMemoryGetMappedBuffer */ CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); /** - * \brief Releases all resources associated with an external memory - * object. + * \brief Destroys an external memory object. * - * Frees all buffers and CUDA mipmapped arrays that were - * mapped onto this external memory object and releases any reference - * on the underlying memory itself. + * Destroys the specified external memory object. Any existing buffers + * and CUDA mipmapped arrays mapped onto this object must no longer be + * used and must be explicitly freed using ::cuMemFree and + * ::cuMipmappedArrayDestroy respectively. * * \param extMem - External memory object to be destroyed * @@ -9813,7 +12589,7 @@ CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipma * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * - * \sa ::cuImportExternalMemory + * \sa ::cuImportExternalMemory, * ::cuExternalMemoryGetMappedBuffer, * ::cuExternalMemoryGetMappedMipmappedArray */ @@ -9836,8 +12612,9 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); int fd; struct { void *handle; - const void *name; + const void *name; } win32; + const void* NvSciSyncObj; } handle; unsigned int flags; } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; @@ -9849,10 +12626,16 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); * * \code typedef enum CUexternalSemaphoreHandleType_enum { - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 } CUexternalSemaphoreHandleType; * \endcode * @@ -9867,7 +12650,7 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be * NULL. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that @@ -9890,23 +12673,76 @@ CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be * NULL. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3DDevice::CreateSharedHandle when referring to a + * is returned by ID3D12Device::CreateSharedHandle when referring to a * ID3D12Fence object. This handle holds a reference to the underlying * object. If * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name * is not NULL, then it must name a valid synchronization object that * refers to a valid ID3D12Fence object. * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * represents a valid shared NT handle that is returned by + * ID3D11Fence::CreateSharedHandle. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D11Fence object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj + * represents a valid NvSciSyncObj. + * + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * represents a valid shared NT handle that + * is returned by IDXGIResource1::CreateSharedHandle when referring to + * a IDXGIKeyedMutex object. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid IDXGIKeyedMutex object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * represents a valid shared KMT handle that + * is returned by IDXGIResource::GetSharedHandle when referring to + * a IDXGIKeyedMutex object and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * * \param extSem_out - Returned handle to an external semaphore * \param semHandleDesc - Semaphore import handle descriptor * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * @@ -9932,20 +12768,49 @@ CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, cons * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT * then signaling the semaphore will set it to the signaled state. * - * If the semaphore object is of the type - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then the - * semaphore will be set to the value specified in + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * then the semaphore will be set to the value specified in * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. * + * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC + * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence + * to a value that can be used by subsequent waiters of the same NvSciSync object + * to order operations with those currently submitted in \p stream. Such an update + * will overwrite previous contents of + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, + * signaling such an external semaphore object causes appropriate memory synchronization + * operations to be performed over all external memory objects that are imported as + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses + * made by other importers of the same set of NvSciBuf memory object(s) are coherent. + * These operations can be skipped by specifying the flag + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. + * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return + * CUDA_ERROR_NOT_SUPPORTED. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then the keyed mutex will be released with the key specified in + * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key. + * * \param extSemArray - Set of external semaphores to be signaled * \param paramsArray - Array of semaphore parameters * \param numExtSems - Number of semaphores to signal - * \param stream - Stream to enqueue the signal operations in + * \param stream - Stream to enqueue the signal operations in * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuImportExternalSemaphore, @@ -9973,12 +12838,44 @@ CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extS * unsignaled state. Therefore for every signal operation, there can * only be one wait operation. * - * If the semaphore object is of the type - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then waiting on - * the semaphore will wait until the value of the semaphore is - * greater than or equal to + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * then waiting on the semaphore will wait until the value of the + * semaphore is greater than or equal to * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. * + * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC + * then, waiting on the semaphore will wait until the + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the + * signaler of the NvSciSyncObj that was associated with this semaphore object. + * By default, waiting on such an external semaphore object causes appropriate + * memory synchronization operations to be performed over all external memory objects + * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that + * any subsequent accesses made by other importers of the same set of NvSciBuf memory + * object(s) are coherent. These operations can be skipped by specifying the flag + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. + * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return + * CUDA_ERROR_NOT_SUPPORTED. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then the keyed mutex will be acquired when it is released with the key + * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key + * or until the timeout specified by + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs + * has lapsed. The timeout interval can either be a finite value + * specified in milliseconds or an infinite value. In case an infinite + * value is specified the timeout never elapses. The windows INFINITE + * macro must be used to specify infinite timeout. + * * \param extSemArray - External semaphores to be waited on * \param paramsArray - Array of semaphore parameters * \param numExtSems - Number of semaphores to wait on @@ -9987,7 +12884,9 @@ CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extS * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_TIMEOUT * \notefnerr * * \sa ::cuImportExternalSemaphore, @@ -10017,8 +12916,6 @@ CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSem */ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); -#endif /* __CUDA_API_VERSION >= 10000 */ - /** @} */ /* END CUDA_EXTRES_INTEROP */ /** @@ -10064,7 +12961,6 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); * @{ */ -#if __CUDA_API_VERSION >= 8000 /** * \brief Wait on a memory location * @@ -10097,7 +12993,7 @@ CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); * * \sa ::cuStreamWaitValue64, * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64 + * ::cuStreamWriteValue64, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuStreamWaitEvent @@ -10242,7 +13138,6 @@ CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint6 * ::cuMemHostRegister */ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); -#endif /* __CUDA_API_VERSION >= 8000 */ /** @} */ /* END CUDA_MEMOP */ @@ -10293,7 +13188,7 @@ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstrea * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of * dynamically-allocated shared memory. * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 - * cache split ratio in percent of shared memory. + * cache split ratio in percent of total shared memory. * * \param pi - Returned attribute value * \param attrib - Attribute requested @@ -10312,13 +13207,11 @@ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstrea * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuLaunchKernel, - * ::cudaFuncGetAttributes + * ::cudaFuncGetAttributes, * ::cudaFuncSetAttribute */ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); -#if __CUDA_API_VERSION >= 9000 - /** * \brief Sets information about a function * @@ -10339,8 +13232,9 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc * architecture. * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 * cache and shared memory use the same hardware resources, this sets the shared memory - * carveout preference, in percent of the total resources. This is only a hint, and the - * driver can choose a different ratio if required to execute the function. + * carveout preference, in percent of the total shared memory. + * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + * This is only a hint, and the driver can choose a different ratio if required to execute the function. * * \param hfunc - Function to query attribute of * \param attrib - Attribute requested @@ -10359,11 +13253,10 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuLaunchKernel, - * ::cudaFuncGetAttributes + * ::cudaFuncGetAttributes, * ::cudaFuncSetAttribute */ CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); -#endif // __CUDA_API_VERSION >= 9000 /** * \brief Sets the preferred cache configuration for a device function @@ -10409,7 +13302,6 @@ CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attri */ CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); -#if __CUDA_API_VERSION >= 4020 /** * \brief Sets the shared memory configuration for a device function. * @@ -10461,9 +13353,33 @@ CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); * ::cudaFuncSetSharedMemConfig */ CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); -#endif -#if __CUDA_API_VERSION >= 4000 +/** + * \brief Returns a module handle + * + * Returns in \p *hmod the handle of the module that function \p hfunc + * is located in. The lifetime of the module corresponds to the lifetime of + * the context it was loaded in or until the module is explicitly unloaded. + * + * The CUDA runtime manages its own modules loaded into the primary context. + * If the handle returned by this API refers to a module loaded by the CUDA runtime, + * calling ::cuModuleUnload() on that module will result in undefined behavior. + * + * \param hmod - Returned module handle + * \param hfunc - Function to retrieve module for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + */ +CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); + /** * \brief Launches a CUDA function * @@ -10522,8 +13438,8 @@ CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig confi * parameters are specified with both \p kernelParams and \p extra * (i.e. both \p kernelParams and \p extra are non-NULL). * - * Calling ::cuLaunchKernel() sets persistent function state that is - * the same as function state set through the following deprecated APIs: + * Calling ::cuLaunchKernel() invalidates the persistent function state + * set through the following deprecated APIs: * ::cuFuncSetBlockShape(), * ::cuFuncSetSharedSize(), * ::cuParamSetSize(), @@ -10531,10 +13447,6 @@ CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig confi * ::cuParamSetf(), * ::cuParamSetv(). * - * When the kernel \p f is launched via ::cuLaunchKernel(), the previous - * block shape, shared size and parameter info associated with \p f - * is overwritten. - * * Note that to use ::cuLaunchKernel(), the kernel \p f must either have * been compiled with toolchain version 3.2 or later so that it will * contain kernel parameter information, or have no kernel parameters. @@ -10586,8 +13498,7 @@ CUresult CUDAAPI cuLaunchKernel(CUfunction f, CUstream hStream, void **kernelParams, void **extra); -#endif /* __CUDA_API_VERSION >= 4000 */ -#if __CUDA_API_VERSION >= 9000 + /** * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute * @@ -10678,6 +13589,8 @@ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, /** * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute * + * \deprecated This function is deprecated as of CUDA 11.3. + * * Invokes kernels as specified in the \p launchParamsList array where each element * of the array specifies all the parameters required to perform a single kernel launch. * These kernels can cooperate and synchronize as they execute. The size of the array is @@ -10808,11 +13721,7 @@ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, * ::cuLaunchCooperativeKernel, * ::cudaLaunchCooperativeKernelMultiDevice */ -CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); - -#endif /* __CUDA_API_VERSION >= 9000 */ - -#if __CUDA_API_VERSION >= 10000 +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); /** * \brief Enqueues a host function call in a stream @@ -10879,8 +13788,6 @@ CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launch */ CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); -#endif /* __CUDA_API_VERSION >= 10000 */ - /** @} */ /* END CUDA_EXEC */ /** @@ -11107,6 +14014,21 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, voi * contains the number of threads specified by a previous call to * ::cuFuncSetBlockShape(). * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * * \param f - Kernel to launch * * \return @@ -11144,6 +14066,21 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); * blocks. Each block contains the number of threads specified by a previous * call to ::cuFuncSetBlockShape(). * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * * \param f - Kernel to launch * \param grid_width - Width of grid in blocks * \param grid_height - Height of grid in blocks @@ -11183,6 +14120,21 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, in * blocks. Each block contains the number of threads specified by a previous * call to ::cuFuncSetBlockShape(). * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * * \param f - Kernel to launch * \param grid_width - Width of grid in blocks * \param grid_height - Height of grid in blocks @@ -11202,8 +14154,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, in * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), - * this function may serialize kernel launches. In order to force the CUDA driver to retain - * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). + * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by + * growing the per-thread stack as needed per launch and not shrinking it afterwards. * * \note_null_stream * \notefnerr @@ -11247,7 +14199,6 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_widt __CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); /** @} */ /* END CUDA_EXEC_DEPRECATED */ -#if __CUDA_API_VERSION >= 10000 /** * \defgroup CUDA_GRAPH Graph Management * @@ -11334,9 +14285,9 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need * to be specified as that information is retrieved directly from the kernel's image. * - * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in - * via \p extra. This places the burden on the application of knowing each kernel - * parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single + * buffer that is passed in via \p extra. This places the burden on the application of knowing each + * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists * to allow this function to take additional less commonly used arguments. \p extra specifies * a list of names of extra settings and their corresponding values. Each extra setting name is * immediately followed by the corresponding value. The list must be terminated with either NULL or @@ -11354,8 +14305,8 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); * ::CU_LAUNCH_PARAM_BUFFER_POINTER; * * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both - * \p kernelParams and \p extra (i.e. both \p kernelParams and - * \p extra are non-NULL). + * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). + * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. * * The \p kernelParams or \p extra array, as well as the argument values it points to, * are copied during this call. @@ -11380,6 +14331,7 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); * * \sa * ::cuLaunchKernel, + * ::cuLaunchCooperativeKernel, * ::cuGraphKernelNodeGetParams, * ::cuGraphKernelNodeSetParams, * ::cuGraphCreate, @@ -11390,7 +14342,7 @@ CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ -CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); +CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); /** * \brief Returns a kernel node's parameters @@ -11493,7 +14445,7 @@ CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL * ::cuGraphAddHostNode, * ::cuGraphAddMemsetNode */ -CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); /** * \brief Returns a memcpy node's parameters @@ -11581,7 +14533,7 @@ CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY * ::cuGraphAddHostNode, * ::cuGraphAddMemcpyNode */ -CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); +CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); /** * \brief Returns a memset node's parameters @@ -11639,6 +14591,7 @@ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET * A handle to the new node will be returned in \p phGraphNode. * * When the graph is launched, the node will invoke the specified CPU function. + * Host nodes are not supported under MPS with pre-Volta GPUs. * * \param phGraphNode - Returns newly created node * \param hGraph - Graph to which to add the node @@ -11650,6 +14603,7 @@ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \note_graph_thread_safety * \notefnerr @@ -11666,7 +14620,7 @@ CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ -CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); /** * \brief Returns a host node's parameters @@ -11723,6 +14677,8 @@ CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NOD * at the root of the graph. \p dependencies may not have any duplicate entries. * A handle to the new node will be returned in \p phGraphNode. * + * If \p hGraph contains allocation or free nodes, this call will return an error. + * * The node executes an embedded child graph. The child graph is cloned in this call. * * \param phGraphNode - Returns newly created node @@ -11750,7 +14706,7 @@ CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NOD * ::cuGraphAddMemsetNode, * ::cuGraphClone */ -CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); /** * \brief Gets a handle to the embedded graph of a child graph node @@ -11759,6 +14715,9 @@ CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGra * does not clone the graph. Changes to the graph will be reflected in * the node, and the node retains ownership of the graph. * + * Allocation and free nodes cannot be added to the returned graph. + * Attempting to do so will return an error. + * * \param hNode - Node to get the embedded graph for * \param phGraph - Location to store a handle to the graph * @@ -11812,12 +14771,654 @@ CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGra * ::cuGraphAddMemcpyNode, * ::cuGraphAddMemsetNode */ -CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies); +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); + +/** + * \brief Creates an event record node and adds it to a graph + * + * Creates a new event record node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * Each launch of the graph will record \p event to capture execution of the + * node's dependencies. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); + +/** + * \brief Returns the event associated with an event record node + * + * Returns the event of event record node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); + +/** + * \brief Sets an event record node's event + * + * Sets the event of event record node \p hNode to \p event. + * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); + +/** + * \brief Creates an event wait node and adds it to a graph + * + * Creates a new event wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The graph node will wait for all work captured in \p event. See ::cuEventRecord() + * for details on what is captured by an event. \p event may be from a different context + * or device than the launch stream. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); + +/** + * \brief Returns the event associated with an event wait node + * + * Returns the event of event wait node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); + +/** + * \brief Sets an event wait node's event + * + * Sets the event of event wait node \p hNode to \p event. + * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); + +/** + * \brief Creates an external semaphore signal node and adds it to a graph + * + * Creates a new external semaphore signal node and adds it to \p hGraph with \p + * numDependencies dependencies specified via \p dependencies and arguments specified + * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the + * node will be placed at the root of the graph. \p dependencies may not have any + * duplicate entries. A handle to the new node will be returned in \p phGraphNode. + * + * Performs a signal operation on a set of externally allocated semaphore objects + * when the node is launched. The operation(s) will occur after all of the node's + * dependencies have completed. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphExternalSemaphoresSignalNodeGetParams, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns an external semaphore signal node's parameters + * + * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. + * The \p extSemArray and \p paramsArray returned in \p params_out, + * are owned by the node. This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the + * parameters of this node. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); + +/** + * \brief Sets an external semaphore signal node's parameters + * + * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates an external semaphore wait node and adds it to a graph + * + * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * Performs a wait operation on a set of externally allocated semaphore objects + * when the node is launched. The node's dependencies will not be launched until + * the wait operation has completed. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphExternalSemaphoresWaitNodeGetParams, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Returns an external semaphore wait node's parameters + * + * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. + * The \p extSemArray and \p paramsArray returned in \p params_out, + * are owned by the node. This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the + * parameters of this node. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); + +/** + * \brief Sets an external semaphore wait node's parameters + * + * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Creates an allocation node and adds it to a graph + * + * Creates a new allocation node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in + * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. + * + * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, + * the allocation can be accessed by nodes ordered after the allocation node but before the free node. + * These allocations cannot be freed outside the owning graph, and they can only be freed once in the + * owning graph. + * + * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the + * graph which are ordered after the allocation node, but also by stream operations ordered after the + * graph's execution but before the allocation is freed. + * + * Allocations which are not freed in the same graph can be freed by: + * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; + * - launching a graph with a free node for that allocation; or + * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes + * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. + * + * It is not possible to free an allocation in both the owning graph and another graph. If the allocation + * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed + * in another graph, a free node can no longer be added to the owning graph. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemFreeNode, + * ::cuGraphMemAllocNodeGetParams, + * ::cuDeviceGraphMemTrim, + * ::cuDeviceGetGraphMemAttribute, + * ::cuDeviceSetGraphMemAttribute, + * ::cuMemAllocAsync, + * ::cuMemFreeAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a memory alloc node's parameters + * + * Returns the parameters of a memory alloc node \p hNode in \p params_out. + * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the + * node. This memory remains valid until the node is destroyed. The returned + * parameters must not be modified. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphMemFreeNodeGetParams + */ +CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); + +/** + * \brief Creates a memory free node and adds it to a graph + * + * Creates a new memory free node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dptr - Address of memory to free + * + * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: + * - an allocation twice in the same graph. + * - an address that was not returned by an allocation node. + * - an invalid address. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphMemFreeNodeGetParams, + * ::cuDeviceGraphMemTrim, + * ::cuDeviceGetGraphMemAttribute, + * ::cuDeviceSetGraphMemAttribute, + * ::cuMemAllocAsync, + * ::cuMemFreeAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); + +/** + * \brief Returns a memory free node's parameters + * + * Returns the address of a memory free node \p hNode in \p dptr_out. + * + * \param hNode - Node to get the parameters for + * \param dptr_out - Pointer to return the device address + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemFreeNode, + * ::cuGraphMemAllocNodeGetParams + */ +CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); + +/** + * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. + * + * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are + * freed back to the operating system. + * + * \param device - The device for which cached memory should be freed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode, + * ::cuDeviceSetGraphMemAttribute, + * ::cuDeviceGetGraphMemAttribute + */ +CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); + +/** + * \brief Query asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - retrieved value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuDeviceSetGraphMemAttribute, + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode + */ +CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); + +/** + * \brief Set asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - pointer to value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuDeviceGetGraphMemAttribute, + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode + */ +CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); /** * \brief Clones a graph * - * This function creates a copy of \p originalGraph and returns it in \p * phGraphClone. + * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. * All parameters are copied into the cloned graph. The original graph may be modified * after this call without affecting the clone. * @@ -12082,7 +15683,7 @@ CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *de * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ -CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies); +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); /** * \brief Removes dependency edges from a graph @@ -12094,6 +15695,9 @@ CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgra * If \p numDependencies is 0, elements in \p from and \p to will be ignored. * Specifying a non-existing dependency will return an error. * + * Dependencies cannot be removed from graphs which contain allocation or free nodes. + * Any attempt to do so will return an error. + * * \param hGraph - Graph from which to remove dependencies * \param from - Array of nodes that provide the dependencies * \param to - Array of dependent nodes @@ -12111,7 +15715,7 @@ CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgra * ::cuGraphNodeGetDependencies, * ::cuGraphNodeGetDependentNodes */ -CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies); +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); /** * \brief Remove a node from the graph @@ -12119,6 +15723,9 @@ CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from, CU * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes * on \p hNode and vice versa. * + * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. + * Any attempt to do so will return an error. + * * \param hNode - Node to remove * * \return @@ -12143,7 +15750,7 @@ CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); * Instantiates \p hGraph as an executable graph. The graph is validated for any * structural constraints or intra-node constraints which were not previously * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p graphExec. + * is returned in \p phGraphExec. * * If there are any errors, diagnostic information may be returned in \p errorNode and * \p logBuffer. This is the primary way to inspect instantiation errors. The output @@ -12167,12 +15774,461 @@ CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); * \notefnerr * * \sa + * ::cuGraphInstantiateWithFlags, * ::cuGraphCreate, + * ::cuGraphUpload, * ::cuGraphLaunch, * ::cuGraphExecDestroy */ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p phGraphExec. + * + * The \p flags parameter controls the behavior of instantiation and subsequent + * graph launches. Valid flags are: + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a + * graph containing memory allocation nodes to automatically free any + * unfreed memory allocations before the graph is relaunched. + * + * If \p hGraph contains any allocation or free nodes, there can be at most one + * executable graph in existence for that graph at a time. + * + * An attempt to instantiate a second executable graph before destroying the first + * with ::cuGraphExecDestroy will result in an error. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphCreate, + * ::cuGraphUpload, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); + +/** + * \brief Sets the parameters for a kernel node in the given graphExec + * + * Sets the parameters of a kernel node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. The \p func field + * of \p nodeParams cannot be modified and must match the original value. + * All other values can be modified. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - kernel node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p copyParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The source and destination memory in \p copyParams must be allocated from the same + * contexts as the original source and destination memory. Both the instantiation-time + * memory operands and the memory operands in \p copyParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or + * either the original or new memory operands are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Memcpy node from the graph which was used to instantiate graphExec + * \param copyParams - The updated parameters to set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Sets the parameters for a memset node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p memsetParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The destination memory in \p memsetParams must be allocated from the same + * contexts as the original destination memory. Both the instantiation-time + * memory operand and the memory operand in \p memsetParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or + * either the original or new memory operand are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Memset node from the graph which was used to instantiate graphExec + * \param memsetParams - The updated parameters to set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Sets the parameters for a host node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p nodeParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Host node from the graph which was used to instantiate graphExec + * \param nodeParams - The updated parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Updates node parameters in the child graph node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained + * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. + * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. + * Changed edges to and from \p hNode are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p hNode is also + * not modified by this call. + * + * The topology of \p childGraph, as well as the node insertion order, must match that + * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions + * on what can be updated in an instantiated graph. The update is recursive, so child graph + * nodes contained within the top level child graph will also be updated. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Host node from the graph which was used to instantiate graphExec + * \param childGraph - The graph supplying the updated parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); + +/** + * \brief Sets the event for an event record node in the given graphExec + * + * Sets the event of an event record node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - event record node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); + +/** + * \brief Sets the event for an event wait node in the given graphExec + * + * Sets the event of an event wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - event wait node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); + +/** + * \brief Sets the parameters for an external semaphore signal node in the given graphExec + * + * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore signal node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for an external semaphore wait node in the given graphExec + * + * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore wait node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Uploads an executable graph in a stream + * + * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of + * the same \p hGraphExec will be serialized. Each upload is ordered behind both any + * previous work in \p hStream and any previous launches of \p hGraphExec. + * Uses memory cached by \p stream to back the allocations owned by \p hGraphExec. + * + * \param hGraphExec - Executable graph to upload + * \param hStream - Stream in which to upload the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream); + /** * \brief Launches an executable graph in a stream * @@ -12181,6 +16237,10 @@ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CU * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be * instantiated multiple times into multiple executable graphs. * + * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and + * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, + * the launch will fail with ::CUDA_ERROR_INVALID_VALUE. + * * \param hGraphExec - Executable graph to launch * \param hStream - Stream in which to launch the graph * @@ -12194,6 +16254,7 @@ CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CU * * \sa * ::cuGraphInstantiate, + * ::cuGraphUpload, * ::cuGraphExecDestroy */ CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); @@ -12218,6 +16279,7 @@ CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); * * \sa * ::cuGraphInstantiate, + * ::cuGraphUpload, * ::cuGraphLaunch */ CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); @@ -12241,10 +16303,307 @@ CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); * ::cuGraphCreate */ CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); -/** @} */ /* END CUDA_GRAPH */ -#endif /* __CUDA_API_VERSION >= 10000 */ -#if __CUDA_API_VERSION >= 6050 +/** + * \brief Check whether an executable graph can be updated with a graph and perform the update if possible + * + * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the + * node parameters in a topologically identical graph specified by \p hGraph. + * + * Limitations: + * + * - Kernel nodes: + * - The owning context of the function cannot change. + * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated + * to a function which uses CDP + * - Memset and memcpy nodes: + * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. + * - The source/destination memory must be allocated from the same contexts as the original + * source/destination memory. + * - Only 1D memsets can be changed. + * - Additional memcpy node restrictions: + * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE, + * CU_MEMORYTYPE_ARRAY, etc.) is not supported. + * - External semaphore wait nodes and record nodes: + * - Changing the number of semaphores is not supported. + * + * Note: The API may add further restrictions in future releases. The return code should always be checked. + * + * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under + * the following conditions: + * + * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out + * is NULL. + * - A node is deleted in \p hGraph but not not its pair from \p hGraphExec, in which case \p hErrorNode_out + * is NULL. + * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is + * the pairless node from \p hGraph. + * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph. + * + * cuGraphExecUpdate sets \p updateResult_out to: + * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. + * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed + * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case + * \p hErrorNode_out is set to the node from \p hGraph. + * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported + * way(see note above), in which case \p hErrorNode_out is set to the node from \p hGraph + * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way + * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. + * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like + * the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph + * + * If \p updateResult_out isn't set in one of the situations described above, the update check passes + * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens + * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise, + * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS. + * + * cuGraphExecUpdate returns CUDA_SUCCESS when the updated was performed successfully. It returns + * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included + * changes which violated constraints specific to instantiated graph update. + * + * \param hGraphExec The instantiated graph to be updated + * \param hGraph The graph containing the updated parameters + * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any + * \param updateResult_out Whether the graph update was permitted. If was forbidden, the reason why + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + */ +CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); + +/** + * \brief Copies attributes from source node to destination node. + * + * Copies attributes from source node \p src to destination node \p dst. + * Both node must have the same context. + * + * \param[out] dst Destination node + * \param[in] src Source node + * For list of attributes see ::CUkernelNodeAttrID + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); + +/** + * \brief Queries node attribute. + * + * Queries attribute \p attr from node \p hNode and stores it in corresponding + * member of \p value_out. + * + * \param[in] hNode + * \param[in] attr + * \param[out] value_out + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + CUkernelNodeAttrValue *value_out); + +/** + * \brief Sets node attribute. + * + * Sets attribute \p attr on node \p hNode from corresponding attribute of + * \p value. + * + * \param[out] hNode + * \param[in] attr + * \param[out] value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + const CUkernelNodeAttrValue *value); + +/** + * \brief Write a DOT file describing graph structure + * + * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. + * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. + * \p flags can be specified to write more detailed information about each node type such as + * parameter values, kernel attributes, node and function handles. + * + * \param hGraph - The graph to create a DOT file from + * \param path - The path to write the DOT file to + * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OPERATING_SYSTEM + */ +CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); + +/** + * \brief Create a user object + * + * Create a user object with the specified destructor callback and initial reference count. The + * initial references are owned by the caller. + * + * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they + * are executed by a shared internal thread. Another thread may be signaled to perform such + * actions, if it does not block forward progress of tasks scheduled through CUDA. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object_out - Location to return the user object handle + * \param ptr - The pointer to pass to the destroy function + * \param destroy - Callback to free the user object when it is no longer in use + * \param initialRefcount - The initial refcount to create the object with, typically 1. The + * initial references are owned by the calling thread. + * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, + * which is the only defined flag. This indicates that the destroy + * callback cannot be waited on by any CUDA API. Users requiring + * synchronization of the callback should signal its completion + * manually. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, + unsigned int initialRefcount, unsigned int flags); + +/** + * \brief Retain a reference to a user object + * + * Retains new references to a user object. The new references are owned by the caller. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to retain + * \param count - The number of references to retain, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); + +/** + * \brief Release a reference to a user object + * + * Releases user object references owned by the caller. The object's destructor is invoked if + * the reference count reaches zero. + * + * It is undefined behavior to release references not owned by the caller, or to use a user + * object handle after all references are released. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to release + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); + +/** + * \brief Retain a reference to a user object from a graph + * + * Creates or moves user object references that will be owned by a CUDA graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph to associate the reference with + * \param object - The user object to retain a reference for + * \param count - The number of references to add to the graph, typically 1. Must be + * nonzero and not larger than INT_MAX. + * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references + * from the calling thread, rather than create new references. Pass 0 + * to create new references. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); + +/** + * \brief Release a user object reference from a graph + * + * Releases user object references owned by a graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph that will release the reference + * \param object - The user object to release a reference for + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); + +/** @} */ /* END CUDA_GRAPH */ + /** * \defgroup CUDA_OCCUPANCY Occupancy * @@ -12422,17 +16781,39 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSi */ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); +/** + * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM + * + * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + * + * \param dynamicSmemSize - Returned maximum dynamic shared memory + * \param func - Kernel function for which occupancy is calculated + * \param numBlocks - Number of blocks to fit on SM + * \param blockSize - Size of the blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + */ +CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); + /** @} */ /* END CUDA_OCCUPANCY */ -#endif /* __CUDA_API_VERSION >= 6050 */ /** - * \defgroup CUDA_TEXREF Texture Reference Management + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] * - * ___MANBRIEF___ texture reference management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * - * This section describes the texture reference management functions of the - * low-level CUDA driver application programming interface. + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. * * @{ */ @@ -12440,6 +16821,8 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int /** * \brief Binds an array as a texture reference * + * \deprecated + * * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. \p Flags must be set to @@ -12464,11 +16847,13 @@ CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToArray */ -CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); /** * \brief Binds a mipmapped array to a texture reference * + * \deprecated + * * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. * Any previous address or CUDA array state associated with the texture reference * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. @@ -12492,12 +16877,13 @@ CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); -#if __CUDA_API_VERSION >= 3020 /** * \brief Binds an address as a texture reference * + * \deprecated + * * Binds a linear address range to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. Any memory previously bound to \p hTexRef @@ -12537,11 +16923,13 @@ CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hM * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTexture */ -CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); /** * \brief Binds an address as a 2D texture reference * + * \deprecated + * * Binds a linear address range to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. Any memory previously bound to \p hTexRef @@ -12590,12 +16978,13 @@ CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdevi * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTexture2D */ -CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); -#endif /* __CUDA_API_VERSION >= 3020 */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); /** * \brief Sets the format for a texture reference * + * \deprecated + * * Specifies the format of the data to be read by the texture reference * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: @@ -12624,11 +17013,13 @@ CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIP * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); /** * \brief Sets the addressing mode for a texture reference * + * \deprecated + * * Specifies the addressing mode \p am for the given dimension \p dim of the * texture reference \p hTexRef. If \p dim is zero, the addressing mode is * applied to the first parameter of the functions used to fetch from the @@ -12668,11 +17059,13 @@ CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int Num * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); /** * \brief Sets the filtering mode for a texture reference * + * \deprecated + * * Specifies the filtering mode \p fm to be used when reading memory through * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: * @@ -12702,11 +17095,13 @@ CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mod * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToArray */ -CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); /** * \brief Sets the mipmap filtering mode for a texture reference * + * \deprecated + * * Specifies the mipmap filtering mode \p fm to be used when reading memory through * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: * @@ -12736,11 +17131,13 @@ CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); /** * \brief Sets the mipmap level bias for a texture reference * + * \deprecated + * * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when * reading memory through the texture reference \p hTexRef. * @@ -12763,11 +17160,13 @@ CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); /** * \brief Sets the mipmap min/max mipmap level clamps for a texture reference * + * \deprecated + * * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp * respectively, to be used when reading memory through the texture reference * \p hTexRef. @@ -12792,11 +17191,13 @@ CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); /** * \brief Sets the maximum anisotropy for a texture reference * + * \deprecated + * * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through * the texture reference \p hTexRef. * @@ -12820,11 +17221,13 @@ CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLe * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); /** * \brief Sets the border color for a texture reference * + * \deprecated + * * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference * \p hTexRef. The color value supports only float type and holds color components in * the following sequence: @@ -12854,11 +17257,13 @@ CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAnis * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); /** * \brief Sets the flags for a texture reference * + * \deprecated + * * Specifies optional flags via \p Flags to specify the behavior of data * returned through the texture reference \p hTexRef. The valid flags are: * @@ -12872,6 +17277,10 @@ CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); * from [0, Dim) where Dim is the width or height of the CUDA * array. Instead, the texture coordinates [0, 1.0) reference * the entire breadth of the array dimension; + * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear + * filtering optimizations. Trilinear optimizations improve texture filtering + * performance by allowing bilinear filtering on textures in scenarios where + * it can closely approximate the expected results. * * \param hTexRef - Texture reference * \param Flags - Optional flags to set @@ -12893,12 +17302,13 @@ CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); * ::cudaBindTextureToArray, * ::cudaBindTextureToMipmappedArray */ -CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); -#if __CUDA_API_VERSION >= 3020 /** * \brief Gets the address associated with a texture reference * + * \deprecated + * * Returns in \p *pdptr the base address bound to the texture reference * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any device memory range. @@ -12919,12 +17329,13 @@ CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); -#endif /* __CUDA_API_VERSION >= 3020 */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); /** * \brief Gets the array bound to a texture reference * + * \deprecated + * * Returns in \p *phArray the CUDA array bound to the texture reference * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA array. @@ -12945,11 +17356,13 @@ CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); /** * \brief Gets the mipmapped array bound to a texture reference * + * \deprecated + * * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA mipmapped array. @@ -12970,11 +17383,13 @@ CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); /** * \brief Gets the addressing mode used by a texture reference * + * \deprecated + * * Returns in \p *pam the addressing mode corresponding to the * dimension \p dim of the texture reference \p hTexRef. Currently, the only * valid value for \p dim are 0 and 1. @@ -12996,11 +17411,13 @@ CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, C * ::cuTexRefGetAddress, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); /** * \brief Gets the filter-mode used by a texture reference * + * \deprecated + * * Returns in \p *pfm the filtering mode of the texture reference * \p hTexRef. * @@ -13020,11 +17437,13 @@ CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, i * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); /** * \brief Gets the format used by a texture reference * + * \deprecated + * * Returns in \p *pFormat and \p *pNumChannels the format and number * of components of the CUDA array bound to the texture reference \p hTexRef. * If \p pFormat or \p pNumChannels is NULL, it will be ignored. @@ -13046,11 +17465,13 @@ CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags */ -CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); /** * \brief Gets the mipmap filtering mode for a texture reference * + * \deprecated + * * Returns the mipmap filtering mode in \p pfm that's used when reading memory through * the texture reference \p hTexRef. * @@ -13070,11 +17491,13 @@ CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, C * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); /** * \brief Gets the mipmap level bias for a texture reference * + * \deprecated + * * Returns the mipmap level bias in \p pBias that's added to the specified mipmap * level when reading memory through the texture reference \p hTexRef. * @@ -13094,11 +17517,13 @@ CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRe * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); /** * \brief Gets the min/max mipmap level clamps for a texture reference * + * \deprecated + * * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp * that's used when reading memory through the texture reference \p hTexRef. * @@ -13119,11 +17544,13 @@ CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); /** * \brief Gets the maximum anisotropy for a texture reference * + * \deprecated + * * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through * the texture reference \p hTexRef. * @@ -13143,11 +17570,13 @@ CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); /** * \brief Gets the border color used by a texture reference * + * \deprecated + * * Returns in \p pBorderColor, values of the RGBA color used by * the texture reference \p hTexRef. * The color value is of type float and holds color components in @@ -13170,11 +17599,13 @@ CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); * \sa ::cuTexRefSetAddressMode, * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor */ -CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); /** * \brief Gets the flags used by a texture reference * + * \deprecated + * * Returns in \p *pFlags the flags of the texture reference \p hTexRef. * * \param pFlags - Returned flags @@ -13193,21 +17624,7 @@ CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat */ -CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); - -/** @} */ /* END CUDA_TEXREF */ - -/** - * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated texture reference management functions of the - * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated texture reference management - * functions of the low-level CUDA driver application programming interface. - * - * @{ - */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); /** * \brief Creates a texture reference @@ -13258,7 +17675,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); /** - * \defgroup CUDA_SURFREF Surface Reference Management + * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] * * ___MANBRIEF___ surface reference management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ @@ -13272,6 +17689,8 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); /** * \brief Sets the CUDA array for a surface reference. * + * \deprecated + * * Sets the CUDA array \p hArray to be read and written by the surface reference * \p hSurfRef. Any previous CUDA array state associated with the surface * reference is superseded by this function. \p Flags must be set to 0. @@ -13294,11 +17713,13 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); * ::cuSurfRefGetArray, * ::cudaBindSurfaceToArray */ -CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); /** * \brief Passes back the CUDA array bound to a surface reference. * + * \deprecated + * * Returns in \p *phArray the CUDA array bound to the surface reference * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference * is not bound to any CUDA array. @@ -13315,11 +17736,10 @@ CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned * * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray */ -CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); -/** @} */ /* END CUDA_SURFREF */ +/** @} */ /* END CUDA_SURFREF_DEPRECATED */ -#if __CUDA_API_VERSION >= 5000 /** * \defgroup CUDA_TEXOBJECT Texture Object Management * @@ -13457,11 +17877,19 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. * * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the - * range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified. - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is - * the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note - * that for CUDA mipmapped arrays, this flag has to be set. + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format would not be + * promoted, regardless of whether or not this flag is specified. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior + * of having the texture coordinates range from [0, Dim) where Dim is the + * width or height of the CUDA array. Instead, the texture coordinates + * [0, 1.0) reference the entire breadth of the array dimension; Note that + * for CUDA mipmapped arrays, this flag has to be set. + * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear + * filtering optimizations. Trilinear optimizations improve texture filtering + * performance by allowing bilinear filtering on textures in scenarios where + * it can closely approximate the expected results. * * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be * clamped to the range [1,16]. @@ -13710,7 +18138,6 @@ CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); /** @} */ /* END CUDA_SURFOBJECT */ -#endif /* __CUDA_API_VERSION >= 5000 */ /** * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access @@ -13724,8 +18151,6 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur * @{ */ -#if __CUDA_API_VERSION >= 4000 - /** * \brief Queries if a device may directly access a peer device's memory. * @@ -13767,7 +18192,9 @@ CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevic * memory from the current context in \p peerContext, a separate symmetric call * to ::cuCtxEnablePeerAccess() is required. * - * There is a system-wide maximum of eight peer connections per device. + * Note that there are both device-wide and system-wide limitations per system + * configuration, as noted in the CUDA Programming Guide under the section + * "Peer-to-Peer Memory Access". * * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates * that the ::CUdevice of the current context cannot directly access memory @@ -13832,10 +18259,6 @@ CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags */ CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); -#endif /* __CUDA_API_VERSION >= 4000 */ - -#if __CUDA_API_VERSION >= 8000 - /** * \brief Queries attributes of the link between two devices. * @@ -13876,8 +18299,6 @@ CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); */ CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); -#endif /* __CUDA_API_VERSION >= 8000 */ - /** @} */ /* END CUDA_PEER_ACCESS */ /** @@ -13962,8 +18383,6 @@ CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); */ CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); -#if __CUDA_API_VERSION >= 5000 - /** * \brief Get a mipmapped array through which to access a mapped graphics resource. * @@ -13995,9 +18414,6 @@ CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphics */ CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); -#endif /* __CUDA_API_VERSION >= 5000 */ - -#if __CUDA_API_VERSION >= 3020 /** * \brief Get a device pointer through which to access a mapped graphics resource. * @@ -14031,7 +18447,6 @@ CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMi * ::cudaGraphicsResourceGetMappedPointer */ CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); -#endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Set usage flags for mapping a graphics resource @@ -14153,8 +18568,73 @@ CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource /** @} */ /* END CUDA_GRAPHICS */ -CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); +/** + * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access + * + * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the driver entry point access functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ +/** + * \brief Returns the requested driver API function pointer + * + * Returns in \p **pfn the address of the CUDA driver function for the requested + * CUDA version and flags. + * + * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 + * should be specified as 11020. For a requested driver symbol, if the specified + * CUDA version is greater than or equal to the CUDA version in which the driver symbol + * was introduced, this API will return the function pointer to the corresponding + * versioned function. + * + * The pointer returned by the API should be cast to a function pointer matching the + * requested driver function's definition in the API header file. The function pointer + * typedef can be picked up from the corresponding typedefs header file. For example, + * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. + * + * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not + * supported on the platform, no ABI compatible driver function exists for the specified + * \p cudaVersion or if the driver symbol is invalid. + * + * The requested flags can be: + * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to + * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with + * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM + * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. + * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols + * that match the requested driver symbol name except the corresponding per-thread versions. + * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all + * driver symbols that match the requested driver symbol name including the per-thread + * versions. If a per-thread version is not found, the API will return the legacy version + * of the driver function. + * + * \param symbol - The base name of the driver API function to look for. As an example, + * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and + * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. + * \param pfn - Location to return the function pointer to the requested driver function + * \param cudaVersion - The CUDA version to look for the requested driver symbol + * \param flags - Flags to specify search options. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_NOT_FOUND + * \note_version_mixing + * + * \sa + * ::cudaGetDriverEntryPoint + */ +CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); + +/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); /** * CUDA API versioning support @@ -14233,6 +18713,7 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp #undef cuStreamQuery #undef cuStreamSynchronize #undef cuEventRecord + #undef cuEventRecordWithFlags #undef cuLaunchKernel #undef cuLaunchHostFunc #undef cuGraphicsMapResources @@ -14249,184 +18730,160 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp #undef cuStreamBeginCapture #undef cuStreamEndCapture #undef cuStreamIsCapturing + #undef cuStreamGetCaptureInfo + #undef cuStreamGetCaptureInfo_v2 + #undef cuGraphUpload #undef cuGraphLaunch -#endif /* __CUDA_API_VERSION_INTERNAL */ + #undef cuDevicePrimaryCtxRelease + #undef cuDevicePrimaryCtxReset + #undef cuDevicePrimaryCtxSetFlags + #undef cuIpcOpenMemHandle + #undef cuStreamCopyAttributes + #undef cuStreamSetAttribute + #undef cuStreamGetAttribute + #undef cuGraphInstantiate + #undef cuMemMapArrayAsync + #undef cuMemFreeAsync + #undef cuMemAllocAsync + #undef cuMemAllocFromPoolAsync + #undef cuStreamUpdateCaptureDependencies -#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) -CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); -#endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */ + CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 -CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); -#endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */ + typedef unsigned int CUdeviceptr_v1; -#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) -CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); -CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); -CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); -#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */ + typedef struct CUDA_MEMCPY2D_v1_st + { + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr_v1 srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ -#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) -CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); -#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */ + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ -/** - * CUDA API made obselete at API version 3020 - */ -#if defined(__CUDA_API_VERSION_INTERNAL) - #define CUdeviceptr CUdeviceptr_v1 - #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st - #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1 - #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st - #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1 - #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st - #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1 - #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st - #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1 -#endif /* CUDA_FORCE_LEGACY32_INTERNAL */ + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ + } CUDA_MEMCPY2D_v1; -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 + typedef struct CUDA_MEMCPY3D_v1_st + { + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr_v1 srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ -typedef unsigned int CUdeviceptr; + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ -typedef struct CUDA_MEMCPY2D_st -{ - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ + } CUDA_MEMCPY3D_v1; - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st + { + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ - unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ - unsigned int Height; /**< Height of 2D memory copy */ -} CUDA_MEMCPY2D; + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + } CUDA_ARRAY_DESCRIPTOR_v1; -typedef struct CUDA_MEMCPY3D_st -{ - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - unsigned int srcZ; /**< Source Z */ - unsigned int srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st + { + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - unsigned int dstZ; /**< Destination Z */ - unsigned int dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ + } CUDA_ARRAY3D_DESCRIPTOR_v1; - unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ - unsigned int Height; /**< Height of 3D memory copy */ - unsigned int Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D; + CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); + CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); + CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); + CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); + CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); + CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); + CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); + CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); + CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); + CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); + CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); + CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); + CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); + CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); + CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); + CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); + CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); + CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); + CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); + CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); -typedef struct CUDA_ARRAY_DESCRIPTOR_st -{ - unsigned int Width; /**< Width of array */ - unsigned int Height; /**< Height of array */ + CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); + CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + CUresult CUDAAPI cuStreamDestroy(CUstream hStream); + CUresult CUDAAPI cuEventDestroy(CUevent hEvent); + CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ -} CUDA_ARRAY_DESCRIPTOR; - -typedef struct CUDA_ARRAY3D_DESCRIPTOR_st -{ - unsigned int Width; /**< Width of 3D array */ - unsigned int Height; /**< Height of 3D array */ - unsigned int Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ -} CUDA_ARRAY3D_DESCRIPTOR; - -CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); -CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); -CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); -CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); -CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize); -CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); -CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); -CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr); -CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); -CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); -CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); -CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); -CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); -CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); -CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); -CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); -CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); -CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); -CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); -CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); -CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); -CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N); -CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N); -CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N); -CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); -CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); -CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); -CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); -CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); -CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); -CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); -CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes); -CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); -CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); -CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */ -#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000 -CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); -CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); -CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); -CUresult CUDAAPI cuStreamDestroy(CUstream hStream); -CUresult CUDAAPI cuEventDestroy(CUevent hEvent); -#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */ -#if defined(__CUDA_API_VERSION_INTERNAL) - #undef CUdeviceptr - #undef CUDA_MEMCPY2D_st - #undef CUDA_MEMCPY2D - #undef CUDA_MEMCPY3D_st - #undef CUDA_MEMCPY3D - #undef CUDA_ARRAY_DESCRIPTOR_st - #undef CUDA_ARRAY_DESCRIPTOR - #undef CUDA_ARRAY3D_DESCRIPTOR_st - #undef CUDA_ARRAY3D_DESCRIPTOR -#endif /* __CUDA_API_VERSION_INTERNAL */ - -#if defined(__CUDA_API_VERSION_INTERNAL) CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); @@ -14474,6 +18931,7 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); CUresult CUDAAPI cuStreamQuery(CUstream hStream); CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); @@ -14488,16 +18946,49 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); + CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); + CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); + CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); + CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); + CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); + + CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); + + CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); + CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); + CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); + + CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); +#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) +static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { + const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| + CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); + if ((flags & procAddressMask) == 0) { + flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; + } + return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); +} +#define cuGetProcAddress cuGetProcAddress_ptsz #endif #ifdef __cplusplus } #endif -#undef __CUDA_API_VERSION +#if defined(__GNUC__) + #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) + #pragma GCC visibility pop + #endif +#endif + #undef __CUDA_DEPRECATED #endif /* __cuda_cuda_h__ */ diff --git a/python/src/triton.cc b/python/src/triton.cc index 4d7df76ff..cb6923afe 100644 --- a/python/src/triton.cc +++ b/python/src/triton.cc @@ -101,12 +101,12 @@ void hip_enqueue(uint64_t stream, uint64_t kernel, } -std::string pow2_divisor(long N){ - if(N % 16 == 0) return "16"; - if(N % 8 == 0) return "8"; - if(N % 4 == 0) return "4"; - if(N % 2 == 0) return "2"; - return "1"; +long pow2_divisor(long N){ + if(N % 16 == 0) return 16; + if(N % 8 == 0) return 8; + if(N % 4 == 0) return 4; + if(N % 2 == 0) return 2; + return 1; } // Returns something like "int16", whether dtype is a torch.dtype or @@ -127,6 +127,14 @@ std::string dtype_cache_key_part(const py::object& dtype) { } } +size_t get_pointer_range_size(uint64_t addr){ + if(addr == 0) + return 0; + size_t size; + drv::dispatch::cuPointerGetAttribute(&size, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)addr); + return size; +} + // Launch void parse_args(py::list& args, py::list do_not_specialize, const std::string& func_key, py::list& arg_names, std::string& cache_key, std::string& params, size_t& params_size, py::dict constants, @@ -187,7 +195,7 @@ void parse_args(py::list& args, py::list do_not_specialize, const std::string& f continue; // values divisible by small powers of 2 are specialized cache_key += "[multipleof("; - cache_key += pow2_divisor(value); + cache_key += std::to_string(pow2_divisor(value)); cache_key += ")]"; continue; } @@ -213,12 +221,15 @@ void parse_args(py::list& args, py::list do_not_specialize, const std::string& f py::object data_ptr = arg.attr("data_ptr")(); long value = data_ptr.cast(); params_ptr = (char*)(((uintptr_t)params_ptr + 7) & (-8)); + // copy param std::memcpy(params_ptr, &value, 8); params_ptr += 8; + // udpate cache key cache_key += dtype_cache_key_part(arg.attr("dtype")); cache_key += "*"; cache_key += "[multipleof("; - cache_key += pow2_divisor(value); + size_t range_size = get_pointer_range_size(value); + cache_key += std::to_string(std::min(pow2_divisor(value), pow2_divisor(range_size))); cache_key += ")]"; continue; } @@ -268,6 +279,10 @@ void init_triton_runtime(py::module &&m) { } ); + // get range size for the given pointer + m.def("get_pointer_range_size", &get_pointer_range_size); + + // cache key m.def("launch", [](py::list args, py::list do_not_specialize, const std::string& func_key, py::list& arg_names, py::object device, py::int_ stream, py::dict bin_cache, py::int_ num_warps, py::int_ num_stages, diff --git a/python/triton/code_gen.py b/python/triton/code_gen.py index b7da2047e..63e3d0aa0 100644 --- a/python/triton/code_gen.py +++ b/python/triton/code_gen.py @@ -674,9 +674,17 @@ class Kernel: def add_to_cache(self, key, wargs, device_idx, num_warps, num_stages): tensor_idxs = [i for i, arg in enumerate(wargs) if hasattr(arg, 'data_ptr')] # attributes - args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)] - attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args) - if isinstance(a, int) and i not in self.fn.do_not_specialize} + attributes = dict() + for i, arg in enumerate(wargs): + if i in self.fn.do_not_specialize: + continue + if isinstance(arg, int): + attributes[i] = Kernel.pow2_divisor(arg) + elif i in tensor_idxs: + addr = arg.data_ptr() + range_size = _triton.runtime.get_pointer_range_size(addr) + attributes[i] = min(Kernel.pow2_divisor(addr), + Kernel.pow2_divisor(range_size)) # transforms ints whose value is one into constants for just-in-time compilation constants = {i: arg for i, arg in enumerate(wargs) if isinstance(arg, int) and arg == 1 and i not in self.fn.do_not_specialize} diff --git a/python/triton/language/core.py b/python/triton/language/core.py index 4ef6408d0..6f1bf3ba7 100644 --- a/python/triton/language/core.py +++ b/python/triton/language/core.py @@ -165,6 +165,7 @@ class block: self.numel = 1 for s in self.shape: self.numel *= s + self.numel = constexpr(self.numel) # Data-type wrapper self.dtype = block._init_dtype(self.handle.type.scalar) @@ -873,7 +874,7 @@ def ravel(x): :param x: the input block :type x: Block """ - return triton.language.reshape(x, [x.type.numel]) + return triton.language.reshape(x, [x.numel]) @triton.jit