diff --git a/CMakeLists.txt b/CMakeLists.txt index 2fb182135..e921f7275 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,7 +184,6 @@ target_link_libraries(triton TritonAnalysis TritonTransforms TritonGPUTransforms - TritonDriver TritonLLVMIR TritonPTX ${dialect_libs} diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index ac7e877dc..7fb68f48a 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -26,35 +26,35 @@ target_link_libraries(triton-opt PRIVATE mlir_check_all_link_libraries(triton-opt) -add_llvm_executable(triton-translate triton-translate.cpp PARTIAL_SOURCES_INTENDED) -llvm_update_compile_flags(triton-translate) -target_link_libraries(triton-translate PRIVATE - TritonAnalysis - TritonTransforms - TritonGPUTransforms - TritonLLVMIR - TritonDriver - ${dialect_libs} - ${conversion_libs} - # tests - TritonTestAnalysis +# add_llvm_executable(triton-translate triton-translate.cpp PARTIAL_SOURCES_INTENDED) +#llvm_update_compile_flags(triton-translate) +# target_link_libraries(triton-translate PRIVATE +# TritonAnalysis +# TritonTransforms +# TritonGPUTransforms +# TritonLLVMIR +# TritonDriver +# ${dialect_libs} +# ${conversion_libs} +# # tests +# TritonTestAnalysis - LLVMCore - LLVMSupport - LLVMOption - LLVMCodeGen - LLVMAsmParser +# LLVMCore +# LLVMSupport +# LLVMOption +# LLVMCodeGen +# LLVMAsmParser - # MLIR core - MLIROptLib - MLIRIR - MLIRPass - MLIRSupport - MLIRTransforms - MLIRExecutionEngine - MLIRMathToLLVM - MLIRTransformUtils - MLIRLLVMToLLVMIRTranslation - MLIRNVVMToLLVMIRTranslation - ) -mlir_check_all_link_libraries(triton-translate) +# # MLIR core +# MLIROptLib +# MLIRIR +# MLIRPass +# MLIRSupport +# MLIRTransforms +# MLIRExecutionEngine +# MLIRMathToLLVM +# MLIRTransformUtils +# MLIRLLVMToLLVMIRTranslation +# MLIRNVVMToLLVMIRTranslation +# ) +# mlir_check_all_link_libraries(triton-translate) diff --git a/include/triton/Target/PTX/PTXTranslation.h b/include/triton/Target/PTX/PTXTranslation.h index 45f8e5240..df15edc73 100644 --- 
a/include/triton/Target/PTX/PTXTranslation.h +++ b/include/triton/Target/PTX/PTXTranslation.h @@ -1,34 +1,17 @@ #ifndef TRITON_TARGET_PTXTRANSLATION_H #define TRITON_TARGET_PTXTRANSLATION_H -#include "triton/driver/dispatch.h" - +#include #include -namespace mlir { - -class ModuleOp; - -} // namespace mlir +namespace llvm { +class Module; +} // namespace llvm namespace triton { -template int cuGetInfo(CUdevice device) { - int res; - driver::dispatch::cuDeviceGetAttribute(&res, attr, device); - return res; -} - -void getCuCCAndVersionFromDevice(uint64_t device, int *cc, int *version, - std::string *ptxasPath); - // Translate TritonGPU IR to PTX code. -std::tuple -translateTritonGPUToPTX(mlir::ModuleOp module, uint64_t device); +std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version); } // namespace triton diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h deleted file mode 100644 index de0fa403c..000000000 --- a/include/triton/driver/dispatch.h +++ /dev/null @@ -1,376 +0,0 @@ -#pragma once - -#ifndef _TRITON_DRIVER_DISPATCH_H_ -#define _TRITON_DRIVER_DISPATCH_H_ - -#include -#include - -// CUDA Backend -#include "triton/external/CUDA/cuda.h" -#include "triton/external/CUDA/nvml.h" - -//// HIP backend -//#define __HIP_PLATFORM_AMD__ -#include "triton/external/hip.h" - -// Exceptions -#include -#include - -namespace llvm { -class PassRegistry; -class Module; -} // namespace llvm - -namespace triton { -namespace driver { - -class cu_context; - -template void check(T) {} -void check(CUresult err); -void check(hipError_t err); - -class dispatch { -protected: - template struct return_type; - - template struct return_type { - typedef R type; - }; - - typedef bool (*f_init_t)(); - - template - static typename return_type::type - f_impl(void *&lib_h, FunPtrT, void *&cache, const char *name, Args... 
args) { - initializer(); - if (cache == nullptr) { - cache = dlsym(lib_h, name); - if (cache == 0) { -#ifdef __EXCEPTIONS - throw std::runtime_error("dlsym unable to load function"); -#else - std::cerr << "Triton: dlsym unable to load function `" << name << "`" - << std::endl; - std::abort(); -#endif - } - } - FunPtrT fptr; - *reinterpret_cast(&fptr) = cache; - typename return_type::type res = (*fptr)(args...); - check(res); - return res; - } - -public: - static void release(); - // Nvidia - static bool nvmlinit(); - static bool cuinit(); - // AMD - static bool hipinit(); - - /* ------------------- * - * CUDA - * ------------------- */ - // context management - static CUresult cuInit(unsigned int Flags); - static CUresult cuCtxDestroy_v2(CUcontext ctx); - static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, - CUdevice dev); - static CUresult cuCtxPushCurrent_v2(CUcontext ctx); - static CUresult cuCtxPopCurrent_v2(CUcontext *pctx); - static CUresult cuCtxGetDevice(CUdevice *result); - static CUresult cuCtxEnablePeerAccess(CUcontext peerContext, - unsigned int flags); - static CUresult cuDriverGetVersion(int *driverVersion); - // device management - static CUresult cuDeviceGet(CUdevice *device, int ordinal); - static CUresult cuDeviceGetName(char *name, int len, CUdevice dev); - static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev); - static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, - CUdevice dev); - static CUresult cuDeviceGetCount(int *count); - // link management - static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type, - void *data, size_t size, const char *name, - unsigned int numOptions, - CUjit_option *options, void **optionValues); - static CUresult cuLinkCreate_v2(unsigned int numOptions, - CUjit_option *options, void **optionValues, - CUlinkState *stateOut); - static CUresult cuLinkComplete(CUlinkState state, void **cubinOut, - size_t *sizeOut); - static CUresult 
cuLinkDestroy(CUlinkState state); - // module management - static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t *bytes, - CUmodule hmod, const char *name); - static CUresult cuModuleLoad(CUmodule *module, const char *fname); - static CUresult cuModuleLoadData(CUmodule *module, const void *image); - static CUresult cuModuleUnload(CUmodule hmod); - static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, - unsigned int numOptions, - CUjit_option *options, - void **optionValues); - static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, - const char *name); - // stream management - static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); - static CUresult cuStreamSynchronize(CUstream hStream); - static CUresult cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - static CUresult cuStreamDestroy_v2(CUstream hStream); - static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, - unsigned int gridDimY, unsigned int gridDimZ, - unsigned int blockDimX, unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, CUstream hStream, - void **kernelParams, void **extra); - // function management - static CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, - CUfunction hfunc); - static CUresult cuFuncSetAttribute(CUfunction hfunc, - CUfunction_attribute attrib, int value); - static CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); - // memory management - static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize); - static CUresult cuPointerGetAttribute(void *data, - CUpointer_attribute attribute, - CUdeviceptr ptr); - static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, - CUstream stream); - static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, - size_t ByteCount); - static CUresult cuMemFree_v2(CUdeviceptr dptr); - static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, - size_t ByteCount, 
CUstream hStream); - static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, - const void *srcHost, size_t ByteCount, - CUstream hStream); - static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, - size_t ByteCount); - // event management - static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); - static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, - CUevent hEnd); - static CUresult cuEventRecord(CUevent hEvent, CUstream hStream); - static CUresult cuEventDestroy_v2(CUevent hEvent); - - /* ------------------- * - * NVML - * ------------------- */ - static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId, - nvmlDevice_t *device); - static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, - nvmlClockType_t type, - unsigned int *clock); - static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, - nvmlClockType_t type, - unsigned int *clock); - static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, - unsigned int mem_clock, - unsigned int sm_clock); - - /* ------------------- * - * HIP - * ------------------- */ - // context management - static hipError_t hipInit(unsigned int Flags); - static hipError_t hipCtxDestroy(hipCtx_t ctx); - static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, - hipDevice_t dev); - static hipError_t hipCtxPushCurrent(hipCtx_t ctx); - static hipError_t hipCtxPopCurrent(hipCtx_t *pctx); - static hipError_t hipCtxGetDevice(hipDevice_t *result); - static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, - unsigned int flags); - static hipError_t hipDriverGetVersion(int *driverVersion); - // device management - static hipError_t hipGetDevice(hipDevice_t *device, int ordinal); - static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev); - static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev); - static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, - hipDevice_t 
dev); - static hipError_t hipGetDeviceCount(int *count); - // module management - static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, - hipModule_t hmod, const char *name); - static hipError_t hipModuleLoad(hipModule_t *module, const char *fname); - static hipError_t hipModuleLoadData(hipModule_t *module, const void *image); - static hipError_t hipModuleUnload(hipModule_t hmod); - static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, - unsigned int numOptions, - hipJitOption *options, - void **optionValues); - static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, - const char *name); - // stream management - static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags); - static hipError_t hipStreamSynchronize(hipStream_t hStream); - static hipError_t hipStreamDestroy(hipStream_t hStream); - static hipError_t - hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, - unsigned int gridDimY, unsigned int gridDimZ, - unsigned int blockDimX, unsigned int blockDimY, - unsigned int blockDimZ, unsigned int sharedMemBytes, - hipStream_t hStream, void **kernelParams, void **extra); - // function management - static hipError_t hipFuncGetAttributes(hipFuncAttributes *attrib, - void *hfunc); - static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, - hipFuncAttribute attrib, int value); - static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, - hipFuncCache_t config); - // memory management - static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize); - static hipError_t hipPointerGetAttribute(void *data, - CUpointer_attribute attribute, - hipDeviceptr_t ptr); - static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, - size_t N, hipStream_t stream); - static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, - size_t ByteCount); - static hipError_t hipFree(hipDeviceptr_t dptr); - static hipError_t hipMemcpyDtoHAsync(void *dstHost, 
hipDeviceptr_t srcDevice, - size_t ByteCount, hipStream_t hStream); - static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, - const void *srcHost, size_t ByteCount, - hipStream_t hStream); - static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, - size_t ByteCount); - // event management - static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags); - static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, - hipEvent_t hEnd); - static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream); - static hipError_t hipEventDestroy(hipEvent_t hEvent); - -private: - // Libraries - static void *cuda_; - static void *nvml_; - static void *hip_; - - /* ------------------- * - * CUDA - * ------------------- */ - // context management - static void *cuCtxGetCurrent_; - static void *cuCtxSetCurrent_; - static void *cuCtxDestroy_v2_; - static void *cuCtxCreate_v2_; - static void *cuCtxGetDevice_; - static void *cuCtxPushCurrent_v2_; - static void *cuCtxPopCurrent_v2_; - static void *cuCtxEnablePeerAccess_; - static void *cuDriverGetVersion_; - static void *cuInit_; - // device management - static void *cuDeviceGet_; - static void *cuDeviceGetName_; - static void *cuDeviceGetPCIBusId_; - static void *cuDeviceGetAttribute_; - static void *cuDeviceGetCount_; - // link management - static void *cuLinkAddData_v2_; - static void *cuLinkCreate_v2_; - static void *cuLinkDestroy_; - static void *cuLinkComplete_; - // module management - static void *cuModuleGetGlobal_v2_; - static void *cuModuleLoad_; - static void *cuModuleUnload_; - static void *cuModuleLoadDataEx_; - static void *cuModuleLoadData_; - static void *cuModuleGetFunction_; - // stream management - static void *cuStreamCreate_; - static void *cuStreamSynchronize_; - static void *cuStreamDestroy_v2_; - static void *cuStreamGetCtx_; - static void *cuLaunchKernel_; - // function management - static void *cuFuncGetAttribute_; - static void 
*cuFuncSetAttribute_; - static void *cuFuncSetCacheConfig_; - // memory management - static void *cuMemcpyDtoH_v2_; - static void *cuMemFree_v2_; - static void *cuMemcpyDtoHAsync_v2_; - static void *cuMemcpyHtoDAsync_v2_; - static void *cuMemcpyHtoD_v2_; - static void *cuMemAlloc_v2_; - static void *cuMemsetD8Async_; - static void *cuPointerGetAttribute_; - // event management - static void *cuEventCreate_; - static void *cuEventElapsedTime_; - static void *cuEventRecord_; - static void *cuEventDestroy_v2_; - - /* ------------------- * - * NVML - * ------------------- */ - static void *nvmlInit_v2_; - static void *nvmlDeviceGetHandleByPciBusId_v2_; - static void *nvmlDeviceGetClockInfo_; - static void *nvmlDeviceGetMaxClockInfo_; - static void *nvmlDeviceSetApplicationsClocks_; - - /* ------------------- * - * HIP - * ------------------- */ - // context management - static void *hipInit_; - static void *hipCtxDestroy_; - static void *hipCtxCreate_; - static void *hipCtxPushCurrent_; - static void *hipCtxPopCurrent_; - static void *hipCtxGetDevice_; - static void *hipCtxEnablePeerAccess_; - static void *hipDriverGetVersion_; - // device management - static void *hipGetDevice_; - static void *hipDeviceGetName_; - static void *hipDeviceGetPCIBusId_; - static void *hipDeviceGetAttribute_; - static void *hipGetDeviceCount_; - // module management - static void *hipModuleGetGlobal_; - static void *hipModuleLoad_; - static void *hipModuleLoadData_; - static void *hipModuleUnload_; - static void *hipModuleLoadDataEx_; - static void *hipModuleGetFunction_; - // stream management - static void *hipStreamCreate_; - static void *hipStreamSynchronize_; - static void *hipStreamDestroy_; - static void *hipModuleLaunchKernel_; - ; - // function management - static void *hipFuncGetAttributes_; - static void *hipFuncSetAttribute_; - static void *hipFuncSetCacheConfig_; - // memory management - static void *hipMalloc_; - static void *hipPointerGetAttribute_; - static void 
*hipMemsetD8Async_; - static void *hipMemcpyDtoH_; - static void *hipFree_; - static void *hipMemcpyDtoHAsync_; - static void *hipMemcpyHtoDAsync_; - static void *hipMemcpyHtoD_; - // event management - static void *hipEventCreate_; - static void *hipEventElapsedTime_; - static void *hipEventRecord_; - static void *hipEventDestroy_; -}; - -} // namespace driver -} // namespace triton - -#endif diff --git a/include/triton/driver/error.h b/include/triton/driver/error.h deleted file mode 100644 index 229e1dee4..000000000 --- a/include/triton/driver/error.h +++ /dev/null @@ -1,254 +0,0 @@ -#pragma once - -#ifndef _TRITON_DRIVER_ERROR_H_ -#define _TRITON_DRIVER_ERROR_H_ - -#include "triton/driver/dispatch.h" -#include - -namespace triton { - -namespace driver { - -namespace exception { - -namespace nvrtc { - -#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg) \ - class name : public std::exception { \ - public: \ - const char *what() const throw() override { return "NVRTC: Error- " msg; } \ - } - -TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory, "out of memory"); -TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure, - "program creation failure"); -TRITON_CREATE_NVRTC_EXCEPTION(invalid_input, "invalid input"); -TRITON_CREATE_NVRTC_EXCEPTION(invalid_program, "invalid program"); -TRITON_CREATE_NVRTC_EXCEPTION(invalid_option, "invalid option"); -TRITON_CREATE_NVRTC_EXCEPTION(compilation, "compilation"); -TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure, - "builtin operation failure"); -TRITON_CREATE_NVRTC_EXCEPTION(unknown_error, "unknown error"); - -#undef TRITON_CREATE_NVRTC_EXCEPTION -} // namespace nvrtc - -namespace cuda { -class base : public std::exception {}; - -#define TRITON_CREATE_CUDA_EXCEPTION(name, msg) \ - class name : public base { \ - public: \ - const char *what() const throw() override { return "CUDA: Error- " msg; } \ - } - -TRITON_CREATE_CUDA_EXCEPTION(invalid_value, "invalid value"); -TRITON_CREATE_CUDA_EXCEPTION(out_of_memory, "out of memory"); 
-TRITON_CREATE_CUDA_EXCEPTION(not_initialized, "not initialized"); -TRITON_CREATE_CUDA_EXCEPTION(deinitialized, "deinitialized"); -TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled, "profiler disabled"); -TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized, - "profiler not initialized"); -TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started, - "profiler already started"); -TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped, - "profiler already stopped"); -TRITON_CREATE_CUDA_EXCEPTION(no_device, "no device"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_device, "invalid device"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_image, "invalid image"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_context, "invalid context"); -TRITON_CREATE_CUDA_EXCEPTION(context_already_current, - "context already current"); -TRITON_CREATE_CUDA_EXCEPTION(map_failed, "map failed"); -TRITON_CREATE_CUDA_EXCEPTION(unmap_failed, "unmap failed"); -TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped, "array is mapped"); -TRITON_CREATE_CUDA_EXCEPTION(already_mapped, "already mapped"); -TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu, "no binary for gpu"); -TRITON_CREATE_CUDA_EXCEPTION(already_acquired, "already acquired"); -TRITON_CREATE_CUDA_EXCEPTION(not_mapped, "not mapped"); -TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array, "not mapped as array"); -TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer"); -TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable"); -TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit, "unsupported limit"); -TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use, "context already in use"); -TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported, - "peer access unsupported"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx, "invalid ptx"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context, - "invalid graphics context"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_source, "invalid source"); -TRITON_CREATE_CUDA_EXCEPTION(file_not_found, "file not found"); 
-TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found, - "shared object symbol not found"); -TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed, - "shared object init failed"); -TRITON_CREATE_CUDA_EXCEPTION(operating_system, "operating system"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_handle, "invalid handle"); -TRITON_CREATE_CUDA_EXCEPTION(not_found, "not found"); -TRITON_CREATE_CUDA_EXCEPTION(not_ready, "not ready"); -TRITON_CREATE_CUDA_EXCEPTION(illegal_address, "illegal address"); -TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources, - "launch out of resources"); -TRITON_CREATE_CUDA_EXCEPTION(launch_timeout, "launch timeout"); -TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing, - "launch incompatible texturing"); -TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled, - "peer access already enabled"); -TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled, - "peer access not enabled"); -TRITON_CREATE_CUDA_EXCEPTION(primary_context_active, "primary context active"); -TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed, "context is destroyed"); -TRITON_CREATE_CUDA_EXCEPTION(assert_error, "assert"); -TRITON_CREATE_CUDA_EXCEPTION(too_many_peers, "too many peers"); -TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered, - "host memory already registered"); -TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered, - "hot memory not registered"); -TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error, "hardware stack error"); -TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction, "illegal instruction"); -TRITON_CREATE_CUDA_EXCEPTION(misaligned_address, "misaligned address"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space, "invalid address space"); -TRITON_CREATE_CUDA_EXCEPTION(invalid_pc, "invalid pc"); -TRITON_CREATE_CUDA_EXCEPTION(launch_failed, "launch failed"); -TRITON_CREATE_CUDA_EXCEPTION(not_permitted, "not permitted"); -TRITON_CREATE_CUDA_EXCEPTION(not_supported, "not supported"); -TRITON_CREATE_CUDA_EXCEPTION(unknown, "unknown"); - 
-#undef TRITON_CREATE_CUDA_EXCEPTION -} // namespace cuda - -namespace cublas { -class base : public std::exception {}; - -#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg) \ - class name : public base { \ - public: \ - const char *what() const throw() override { \ - return "CUBLAS: Error- " msg; \ - } \ - } - -TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized, "not initialized"); -TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed, "alloc failed"); -TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value, "invalid value"); -TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch, "arch mismatch"); -TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error, "mapping error"); -TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed"); -TRITON_CREATE_CUBLAS_EXCEPTION(internal_error, "internal error"); -TRITON_CREATE_CUBLAS_EXCEPTION(not_supported, "not supported"); -TRITON_CREATE_CUBLAS_EXCEPTION(license_error, "license error"); -TRITON_CREATE_CUBLAS_EXCEPTION(unknown, "unknown"); - -#undef TRITON_CREATE_CUBLAS_EXCEPTION -} // namespace cublas - -namespace cudnn { -#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg) \ - class name : public std::exception { \ - public: \ - const char *what() const throw() override { return "CUDNN: Error- " msg; } \ - } - -TRITON_CREATE_CUDNN_EXCEPTION(not_initialized, "not initialized"); -TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed, "allocation failed"); -TRITON_CREATE_CUDNN_EXCEPTION(bad_param, "bad param"); -TRITON_CREATE_CUDNN_EXCEPTION(internal_error, "internal error"); -TRITON_CREATE_CUDNN_EXCEPTION(invalid_value, "invalid value"); -TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch, "arch mismatch"); -TRITON_CREATE_CUDNN_EXCEPTION(mapping_error, "mapping error"); -TRITON_CREATE_CUDNN_EXCEPTION(execution_failed, "execution failed"); -TRITON_CREATE_CUDNN_EXCEPTION(not_supported, "not supported"); -TRITON_CREATE_CUDNN_EXCEPTION(license_error, "license error"); -TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing, - "prerequisite missing"); 
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress, "runtime in progress"); -TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow, "runtime fp overflow"); -} // namespace cudnn - -namespace hip { -class base : public std::exception {}; - -#define TRITON_CREATE_HIP_EXCEPTION(name, msg) \ - class name : public base { \ - public: \ - const char *what() const throw() override { return "HIP: Error- " msg; } \ - } - -TRITON_CREATE_HIP_EXCEPTION(invalid_value, "invalid value"); -TRITON_CREATE_HIP_EXCEPTION(out_of_memory, "out of memory"); -TRITON_CREATE_HIP_EXCEPTION(not_initialized, "not initialized"); -TRITON_CREATE_HIP_EXCEPTION(deinitialized, "deinitialized"); -TRITON_CREATE_HIP_EXCEPTION(profiler_disabled, "profiler disabled"); -TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized, - "profiler not initialized"); -TRITON_CREATE_HIP_EXCEPTION(profiler_already_started, - "profiler already started"); -TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped, - "profiler already stopped"); -TRITON_CREATE_HIP_EXCEPTION(no_device, "no device"); -TRITON_CREATE_HIP_EXCEPTION(invalid_device, "invalid device"); -TRITON_CREATE_HIP_EXCEPTION(invalid_image, "invalid image"); -TRITON_CREATE_HIP_EXCEPTION(invalid_context, "invalid context"); -TRITON_CREATE_HIP_EXCEPTION(context_already_current, "context already current"); -TRITON_CREATE_HIP_EXCEPTION(map_failed, "map failed"); -TRITON_CREATE_HIP_EXCEPTION(unmap_failed, "unmap failed"); -TRITON_CREATE_HIP_EXCEPTION(array_is_mapped, "array is mapped"); -TRITON_CREATE_HIP_EXCEPTION(already_mapped, "already mapped"); -TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu, "no binary for gpu"); -TRITON_CREATE_HIP_EXCEPTION(already_acquired, "already acquired"); -TRITON_CREATE_HIP_EXCEPTION(not_mapped, "not mapped"); -TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array, "not mapped as array"); -TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer"); -TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable"); 
-TRITON_CREATE_HIP_EXCEPTION(unsupported_limit, "unsupported limit"); -TRITON_CREATE_HIP_EXCEPTION(context_already_in_use, "context already in use"); -TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported, "peer access unsupported"); -TRITON_CREATE_HIP_EXCEPTION(invalid_ptx, "invalid ptx"); -TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context, - "invalid graphics context"); -TRITON_CREATE_HIP_EXCEPTION(invalid_source, "invalid source"); -TRITON_CREATE_HIP_EXCEPTION(file_not_found, "file not found"); -TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found, - "shared object symbol not found"); -TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed, - "shared object init failed"); -TRITON_CREATE_HIP_EXCEPTION(operating_system, "operating system"); -TRITON_CREATE_HIP_EXCEPTION(invalid_handle, "invalid handle"); -TRITON_CREATE_HIP_EXCEPTION(not_found, "not found"); -TRITON_CREATE_HIP_EXCEPTION(not_ready, "not ready"); -TRITON_CREATE_HIP_EXCEPTION(illegal_address, "illegal address"); -TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources, "launch out of resources"); -TRITON_CREATE_HIP_EXCEPTION(launch_timeout, "launch timeout"); -TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing, - "launch incompatible texturing"); -TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled, - "peer access already enabled"); -TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled, "peer access not enabled"); -TRITON_CREATE_HIP_EXCEPTION(primary_context_active, "primary context active"); -TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed, "context is destroyed"); -TRITON_CREATE_HIP_EXCEPTION(assert_error, "assert"); -TRITON_CREATE_HIP_EXCEPTION(too_many_peers, "too many peers"); -TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered, - "host memory already registered"); -TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered, - "hot memory not registered"); -TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error, "hardware stack error"); -TRITON_CREATE_HIP_EXCEPTION(illegal_instruction, 
"illegal instruction"); -TRITON_CREATE_HIP_EXCEPTION(misaligned_address, "misaligned address"); -TRITON_CREATE_HIP_EXCEPTION(invalid_address_space, "invalid address space"); -TRITON_CREATE_HIP_EXCEPTION(invalid_pc, "invalid pc"); -TRITON_CREATE_HIP_EXCEPTION(launch_failed, "launch failed"); -TRITON_CREATE_HIP_EXCEPTION(not_permitted, "not permitted"); -TRITON_CREATE_HIP_EXCEPTION(not_supported, "not supported"); -TRITON_CREATE_HIP_EXCEPTION(invalid_symbol, "invalid symbol"); -TRITON_CREATE_HIP_EXCEPTION(unknown, "unknown"); - -#undef TRITON_CREATE_CUDA_EXCEPTION -} // namespace hip - -} // namespace exception -} // namespace driver -} // namespace triton - -#endif diff --git a/include/triton/driver/llvm.h b/include/triton/driver/llvm.h deleted file mode 100644 index a46eb66b3..000000000 --- a/include/triton/driver/llvm.h +++ /dev/null @@ -1,22 +0,0 @@ -#include "triton/external/CUDA/cuda.h" -#include "triton/external/hip.h" -#include - -namespace llvm { -class Module; -} - -namespace triton { -namespace driver { - -void init_llvm(); -std::string path_to_ptxas(int &version); -std::string llir_to_ptx(llvm::Module *module, int cc, int version); -std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas_path, - int cc); -CUmodule ptx_to_cumodule(const std::string &ptx, int cc); -std::string llir_to_amdgpu(llvm::Module *module, const std::string &proc); -hipModule_t amdgpu_to_hipmodule(const std::string &path); - -} // namespace driver -} // namespace triton diff --git a/include/triton/external/CUDA/cuda.h b/include/triton/external/CUDA/cuda.h deleted file mode 100755 index f7bf9fc12..000000000 --- a/include/triton/external/CUDA/cuda.h +++ /dev/null @@ -1,18994 +0,0 @@ -/* - * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. 
- * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. 
Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#ifndef __cuda_cuda_h__ -#define __cuda_cuda_h__ - -#include -#ifdef _MSC_VER -typedef unsigned __int32 cuuint32_t; -typedef unsigned __int64 cuuint64_t; -#else -#include -typedef uint32_t cuuint32_t; -typedef uint64_t cuuint64_t; -#endif - -#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) -#define __CUDA_DEPRECATED -#elif defined(_MSC_VER) -#define __CUDA_DEPRECATED __declspec(deprecated) -#elif defined(__GNUC__) -#define __CUDA_DEPRECATED __attribute__((deprecated)) -#else -#define __CUDA_DEPRECATED -#endif - -#if defined(CUDA_FORCE_API_VERSION) -#error "CUDA_FORCE_API_VERSION is no longer supported." 
-#endif - -#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) - #define __CUDA_API_PER_THREAD_DEFAULT_STREAM - #define __CUDA_API_PTDS(api) api ## _ptds - #define __CUDA_API_PTSZ(api) api ## _ptsz -#else - #define __CUDA_API_PTDS(api) api - #define __CUDA_API_PTSZ(api) api -#endif - -#define cuDeviceTotalMem cuDeviceTotalMem_v2 -#define cuCtxCreate cuCtxCreate_v2 -#define cuCtxCreate_v3 cuCtxCreate_v3 -#define cuModuleGetGlobal cuModuleGetGlobal_v2 -#define cuMemGetInfo cuMemGetInfo_v2 -#define cuMemAlloc cuMemAlloc_v2 -#define cuMemAllocPitch cuMemAllocPitch_v2 -#define cuMemFree cuMemFree_v2 -#define cuMemGetAddressRange cuMemGetAddressRange_v2 -#define cuMemAllocHost cuMemAllocHost_v2 -#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 -#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) -#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) -#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) -#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) -#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) -#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) -#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) -#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) -#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) -#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) -#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) -#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) -#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) -#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) -#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) -#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) -#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) -#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) -#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) -#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) -#define cuMemsetD32 
__CUDA_API_PTDS(cuMemsetD32_v2) -#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) -#define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) -#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) -#define cuArrayCreate cuArrayCreate_v2 -#define cuArrayGetDescriptor cuArrayGetDescriptor_v2 -#define cuArray3DCreate cuArray3DCreate_v2 -#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 -#define cuTexRefSetAddress cuTexRefSetAddress_v2 -#define cuTexRefGetAddress cuTexRefGetAddress_v2 -#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 -#define cuCtxDestroy cuCtxDestroy_v2 -#define cuCtxPopCurrent cuCtxPopCurrent_v2 -#define cuCtxPushCurrent cuCtxPushCurrent_v2 -#define cuStreamDestroy cuStreamDestroy_v2 -#define cuEventDestroy cuEventDestroy_v2 -#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 -#define cuLinkCreate cuLinkCreate_v2 -#define cuLinkAddData cuLinkAddData_v2 -#define cuLinkAddFile cuLinkAddFile_v2 -#define cuMemHostRegister cuMemHostRegister_v2 -#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 -#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) -#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 -#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 -#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 -#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 -#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 -#define cuGraphInstantiate cuGraphInstantiate_v2 - -#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) - #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) - #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) - #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) - #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) - #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) - #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) - #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) - - #define cuMemsetD8Async 
__CUDA_API_PTSZ(cuMemsetD8Async) - #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) - #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) - #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) - #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) - #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) - - #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) - #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) - #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) - #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) - #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) - #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) - #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) - #define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) - #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) - #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) - #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) - #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) - #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) - #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) - #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) - #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) - #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) - #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) - #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) - - #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) - #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) - #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) - #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) - #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) - - #define 
cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) - - #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) - #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) - - #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) - #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) - #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) - #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) - #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) - #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) - - #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) - #define cuMemAllocAsync __CUDA_API_PTSZ(cuMemAllocAsync) - #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) -#endif - -/** - * \file cuda.h - * \brief Header file for the CUDA Toolkit application programming interface. - * - * \file cudaGL.h - * \brief Header file for the OpenGL interoperability functions of the - * low-level CUDA driver application programming interface. - * - * \file cudaD3D9.h - * \brief Header file for the Direct3D 9 interoperability functions of the - * low-level CUDA driver application programming interface. - */ - -/** - * \defgroup CUDA_TYPES Data types used by CUDA driver - * @{ - */ - -/** - * CUDA API version number - */ -#define CUDA_VERSION 11050 - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * CUDA device pointer - * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. 
- */ -#if defined(_WIN64) || defined(__LP64__) -typedef unsigned long long CUdeviceptr_v2; -#else -typedef unsigned int CUdeviceptr_v2; -#endif -typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ - -typedef int CUdevice_v1; /**< CUDA device */ -typedef CUdevice_v1 CUdevice; /**< CUDA device */ -typedef struct CUctx_st *CUcontext; /**< CUDA context */ -typedef struct CUmod_st *CUmodule; /**< CUDA module */ -typedef struct CUfunc_st *CUfunction; /**< CUDA function */ -typedef struct CUarray_st *CUarray; /**< CUDA array */ -typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ -typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ -typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ -typedef struct CUevent_st *CUevent; /**< CUDA event */ -typedef struct CUstream_st *CUstream; /**< CUDA stream */ -typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ -typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ -typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ -typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ -typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ -typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ -typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ -typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ -typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ -typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ -typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ -typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ - -#ifndef CU_UUID_HAS_BEEN_DEFINED -#define CU_UUID_HAS_BEEN_DEFINED -typedef 
struct CUuuid_st { /**< CUDA definition of UUID */ - char bytes[16]; -} CUuuid; -#endif - -/** - * CUDA IPC handle size - */ -#define CU_IPC_HANDLE_SIZE 64 - -/** - * CUDA IPC event handle - */ -typedef struct CUipcEventHandle_st { - char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcEventHandle_v1; -typedef CUipcEventHandle_v1 CUipcEventHandle; - -/** - * CUDA IPC mem handle - */ -typedef struct CUipcMemHandle_st { - char reserved[CU_IPC_HANDLE_SIZE]; -} CUipcMemHandle_v1; -typedef CUipcMemHandle_v1 CUipcMemHandle; - -/** - * CUDA Ipc Mem Flags - */ -typedef enum CUipcMem_flags_enum { - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ -} CUipcMem_flags; - - -/** - * CUDA Mem Attach Flags - */ -typedef enum CUmemAttach_flags_enum { - CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ - CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ - CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ -} CUmemAttach_flags; - -/** - * Context creation flags - */ -typedef enum CUctx_flags_enum { - CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ - CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ - CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ - CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ - CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling - * \deprecated This flag was deprecated as of CUDA 4.0 - * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ - CU_CTX_SCHED_MASK = 0x07, - CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 - * and it no longer has any effect. All contexts - * as of CUDA 3.2 behave as though the flag is enabled. 
*/ - CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ - CU_CTX_FLAGS_MASK = 0x1f -} CUctx_flags; - -/** - * Stream creation flags - */ -typedef enum CUstream_flags_enum { - CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ - CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ -} CUstream_flags; - -/** - * Legacy stream handle - * - * Stream handle that can be passed as a CUstream to use an implicit stream - * with legacy synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define CU_STREAM_LEGACY ((CUstream)0x1) - -/** - * Per-thread stream handle - * - * Stream handle that can be passed as a CUstream to use an implicit stream - * with per-thread synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define CU_STREAM_PER_THREAD ((CUstream)0x2) - -/** - * Event creation flags - */ -typedef enum CUevent_flags_enum { - CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ - CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ - CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ - CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ -} CUevent_flags; - -/** - * Event record flags - */ -typedef enum CUevent_record_flags_enum { - CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ - CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node - * instead of the default behavior. This flag is invalid - * when used outside of capture. */ -} CUevent_record_flags; - -/** - * Event wait flags - */ -typedef enum CUevent_wait_flags_enum { - CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ - CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node - * instead of the default behavior. 
This flag is invalid - * when used outside of capture.*/ -} CUevent_wait_flags; - -/** - * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 - */ -typedef enum CUstreamWaitValue_flags_enum { - CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit - values). Note this is a cyclic comparison which ignores wraparound. - (Default behavior.) */ - CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ - CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ - CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be - queried with ::cuDeviceGetAttribute() and - ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ - CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This - means that, if a remote write operation is guaranteed to have reached the - device before the wait can be satisfied, that write is guaranteed to be - visible to downstream device work. The device is permitted to reorder - remote writes internally. For example, this flag would be required if - two remote writes arrive in a defined order, the wait is satisfied by the - second write, and downstream work needs to observe the first write. - Support for this operation is restricted to selected platforms and can be - queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ -} CUstreamWaitValue_flags; - -/** - * Flags for ::cuStreamWriteValue32 - */ -typedef enum CUstreamWriteValue_flags_enum { - CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ - CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued - before it, as a performance optimization. Normally, - ::cuStreamWriteValue32 will provide a memory fence before the - write, which has similar semantics to - __threadfence_system() but is scoped to the stream - rather than a CUDA thread. 
*/ -} CUstreamWriteValue_flags; - -/** - * Operations for ::cuStreamBatchMemOp - */ -typedef enum CUstreamBatchMemOpType_enum { - CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ - CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ - CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ - CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ - CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a - standalone operation. */ -} CUstreamBatchMemOpType; - -/** - * Per-operation parameters for ::cuStreamBatchMemOp - */ -typedef union CUstreamBatchMemOpParams_union { - CUstreamBatchMemOpType operation; - struct CUstreamMemOpWaitValueParams_st { - CUstreamBatchMemOpType operation; - CUdeviceptr address; - union { - cuuint32_t value; - cuuint64_t value64; - }; - unsigned int flags; - CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ - } waitValue; - struct CUstreamMemOpWriteValueParams_st { - CUstreamBatchMemOpType operation; - CUdeviceptr address; - union { - cuuint32_t value; - cuuint64_t value64; - }; - unsigned int flags; - CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. 
*/ - } writeValue; - struct CUstreamMemOpFlushRemoteWritesParams_st { - CUstreamBatchMemOpType operation; - unsigned int flags; - } flushRemoteWrites; - cuuint64_t pad[6]; -} CUstreamBatchMemOpParams_v1; -typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; - -/** - * Occupancy calculator flag - */ -typedef enum CUoccupancy_flags_enum { - CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ - CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ -} CUoccupancy_flags; - -/** - * Flags for ::cuStreamUpdateCaptureDependencies - */ -typedef enum CUstreamUpdateCaptureDependencies_flags_enum { - CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ - CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ -} CUstreamUpdateCaptureDependencies_flags; - -/** - * Array formats - */ -typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ - CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ - CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ - CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ - CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */ - CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ - CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit 
normalized integer */ - CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ - CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ - CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ - CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ - CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ - CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ - CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ - CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ - CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed 
half-float block-compressed (BC6H compression) format */ - CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ - CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ -} CUarray_format; - -/** - * Texture reference addressing modes - */ -typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ - CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ - CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ - CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ -} CUaddress_mode; - -/** - * Texture reference filtering modes - */ -typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ - CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ -} CUfilter_mode; - -/** - * Device properties - */ -typedef enum CUdevice_attribute_enum { - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ - CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ - CU_DEVICE_ATTRIBUTE_MAX_PITCH = 
11, /**< Maximum pitch in bytes allowed by memory copies */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ - CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ - 
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ - CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ - CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ - CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ - CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ - CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ - CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. 
*/ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ - CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D 
layered surface height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ - CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ - CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ - CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers 
available per multiprocessor */ - CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ - CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ - CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ - CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ - CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. 
*/ - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ - CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ - CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ - CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ - CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ - CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ - CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ - CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines 
capacity setting in bytes. */ - CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ - CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ - CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ - CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ - CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ - CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ - CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. 
*/ - CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ - CU_DEVICE_ATTRIBUTE_MAX -} CUdevice_attribute; - -/** - * Legacy device properties - */ -typedef struct CUdevprop_st { - int maxThreadsPerBlock; /**< Maximum number of threads per block */ - int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ - int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ - int sharedMemPerBlock; /**< Shared memory available per block in bytes */ - int totalConstantMemory; /**< Constant memory available on device in bytes */ - int SIMDWidth; /**< Warp size in threads */ - int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ - int regsPerBlock; /**< 32-bit registers available per block */ - int clockRate; /**< Clock frequency in kilohertz */ - int textureAlign; /**< Alignment requirement for textures */ -} CUdevprop_v1; -typedef CUdevprop_v1 CUdevprop; - -/** - * Pointer information - */ -typedef enum CUpointer_attribute_enum { - CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ - CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ - CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ - CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ - CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ - CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a 
device on which a pointer was allocated or registered */ - CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ - CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ - CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ - CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ - CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ - CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ - CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ - CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/ -} CUpointer_attribute; - -/** - * Function properties - */ -typedef enum CUfunction_attribute_enum { - /** - * The maximum number of threads per block, beyond which a launch of the - * function would fail. This number depends on both the function and the - * device on which the function is currently loaded. - */ - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, - - /** - * The size in bytes of statically-allocated shared memory required by - * this function. This does not include dynamically-allocated shared - * memory requested by the user at runtime. - */ - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, - - /** - * The size in bytes of user-allocated constant memory required by this - * function. 
- */ - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, - - /** - * The size in bytes of local memory used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, - - /** - * The number of registers used by each thread of this function. - */ - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, - - /** - * The PTX virtual architecture version for which the function was - * compiled. This value is the major PTX version * 10 + the minor PTX - * version, so a PTX version 1.3 function would return the value 13. - * Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - */ - CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, - - /** - * The binary architecture version for which the function was compiled. - * This value is the major binary version * 10 + the minor binary version, - * so a binary version 1.3 function would return the value 13. Note that - * this will return a value of 10 for legacy cubins that do not have a - * properly-encoded binary architecture version. - */ - CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - - /** - * The attribute to indicate whether the function has been compiled with - * user specified option "-Xptxas --dlcm=ca" set . - */ - CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, - - /** - * The maximum size in bytes of dynamically-allocated shared memory that can be used by - * this function. If the user-specified dynamic shared memory size is larger than this - * value, the launch will fail. - * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, - - /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the total shared memory. - * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. - * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
- * See ::cuFuncSetAttribute - */ - CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, - - CU_FUNC_ATTRIBUTE_MAX -} CUfunction_attribute; - -/** - * Function cache configurations - */ -typedef enum CUfunc_cache_enum { - CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ - CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ - CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ - CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ -} CUfunc_cache; - -/** - * Shared memory configurations - */ -typedef enum CUsharedconfig_enum { - CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ - CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ - CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ -} CUsharedconfig; - -/** - * Shared memory carveout configurations. 
These may be passed to ::cuFuncSetAttribute - */ -typedef enum CUshared_carveout_enum { - CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ - CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ - CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ -} CUshared_carveout; - -/** - * Memory types - */ -typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ - CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ - CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ - CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ -} CUmemorytype; - -/** - * Compute Modes - */ -typedef enum CUcomputemode_enum { - CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ - CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ - CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ -} CUcomputemode; - -/** - * Memory advise values - */ -typedef enum CUmem_advise_enum { - CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occassionally be written to */ - CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ - CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ - CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ - CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ -} CUmem_advise; - -typedef enum CUmem_range_attribute_enum 
{ - CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occassionally be written to */ - CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ - CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ - CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ -} CUmem_range_attribute; - -/** - * Online compiler and linker options - */ -typedef enum CUjit_option_enum -{ - /** - * Max number of registers that a thread may use.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_MAX_REGISTERS = 0, - - /** - * IN: Specifies minimum number of threads per block to target compilation - * for\n - * OUT: Returns the number of threads the compiler actually targeted. - * This restricts the resource utilization fo the compiler (e.g. max - * registers) such that a block with the given number of threads should be - * able to launch based on register limitations. Note, this option does not - * currently take into account any other resource limitations, such as - * shared memory utilization.\n - * Cannot be combined with ::CU_JIT_TARGET.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_THREADS_PER_BLOCK, - - /** - * Overwrites the option value with the total wall clock time, in - * milliseconds, spent in the compiler and linker\n - * Option type: float\n - * Applies to: compiler and linker - */ - CU_JIT_WALL_TIME, - - /** - * Pointer to a buffer in which to print any log messages - * that are informational in nature (the buffer size is specified via - * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. 
Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - - /** - * Pointer to a buffer in which to print any log messages that - * reflect errors (the buffer size is specified via option - * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n - * Option type: char *\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER, - - /** - * IN: Log buffer size in bytes. Log messages will be capped at this size - * (including null terminator)\n - * OUT: Amount of log buffer filled with messages\n - * Option type: unsigned int\n - * Applies to: compiler and linker - */ - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - - /** - * Level of optimizations to apply to generated code (0 - 4), with 4 - * being the default and highest level of optimizations.\n - * Option type: unsigned int\n - * Applies to: compiler only - */ - CU_JIT_OPTIMIZATION_LEVEL, - - /** - * No option value required. Determines the target based on the current - * attached context (default)\n - * Option type: No option value needed\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET_FROM_CUCONTEXT, - - /** - * Target is chosen based on supplied ::CUjit_target. Cannot be - * combined with ::CU_JIT_THREADS_PER_BLOCK.\n - * Option type: unsigned int for enumerated type ::CUjit_target\n - * Applies to: compiler and linker - */ - CU_JIT_TARGET, - - /** - * Specifies choice of fallback strategy if matching cubin is not found. - * Choice is based on supplied ::CUjit_fallback. 
This option cannot be - * used with cuLink* APIs as the linker requires exact matches.\n - * Option type: unsigned int for enumerated type ::CUjit_fallback\n - * Applies to: compiler only - */ - CU_JIT_FALLBACK_STRATEGY, - - /** - * Specifies whether to create debug information in output (-g) - * (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_GENERATE_DEBUG_INFO, - - /** - * Generate verbose log messages (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_LOG_VERBOSE, - - /** - * Generate line number information (-lineinfo) (0: false, default)\n - * Option type: int\n - * Applies to: compiler only - */ - CU_JIT_GENERATE_LINE_INFO, - - /** - * Specifies whether to enable caching explicitly (-dlcm) \n - * Choice is based on supplied ::CUjit_cacheMode_enum.\n - * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n - * Applies to: compiler only - */ - CU_JIT_CACHE_MODE, - - /** - * The below jit options are used for internal purposes only, in this version of CUDA - */ - CU_JIT_NEW_SM3X_OPT, - CU_JIT_FAST_COMPILE, - - /** - * Array of device symbol names that will be relocated to the corresponing - * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n - * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n - * When loding a device module, driver will relocate all encountered - * unresolved symbols to the host addresses.\n - * It is only allowed to register symbols that correspond to unresolved - * global variables.\n - * It is illegal to register the same device symbol at multiple addresses.\n - * Option type: const char **\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_NAMES, - - /** - * Array of host addresses that will be used to relocate corresponding - * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n - * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n - * Option type: void **\n - * Applies to: dynamic linker only 
- */ - CU_JIT_GLOBAL_SYMBOL_ADDRESSES, - - /** - * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and - * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n - * Option type: unsigned int\n - * Applies to: dynamic linker only - */ - CU_JIT_GLOBAL_SYMBOL_COUNT, - - /** - * Enable link-time optimization (-dlto) for device code (0: false, default)\n - * Option type: int\n - * Applies to: compiler and linker - */ - CU_JIT_LTO, - - /** - * Control single-precision denormals (-ftz) support (0: false, default). - * 1 : flushes denormal values to zero - * 0 : preserves denormal values - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_FTZ, - - /** - * Control single-precision floating-point division and reciprocals - * (-prec-div) support (1: true, default). - * 1 : Enables the IEEE round-to-nearest mode - * 0 : Enables the fast approximation mode - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_PREC_DIV, - - /** - * Control single-precision floating-point square root - * (-prec-sqrt) support (1: true, default). - * 1 : Enables the IEEE round-to-nearest mode - * 0 : Enables the fast approximation mode - * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_PREC_SQRT, - - /** - * Enable/Disable the contraction of floating-point multiplies - * and adds/subtracts into floating-point multiply-add (-fma) - * operations (1: Enable, default; 0: Disable). 
- * Option type: int\n - * Applies to: link-time optimization specified with CU_JIT_LTO - */ - CU_JIT_FMA, - - CU_JIT_NUM_OPTIONS - -} CUjit_option; - -/** - * Online compilation targets - */ -typedef enum CUjit_target_enum -{ - CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ - CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ - CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ - CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ - CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ - CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ - CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ - CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ - CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ - CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ - CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ - CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ - CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ - CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ - CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ - CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ - CU_TARGET_COMPUTE_86 = 86 /**< Compute device class 8.6.*/ -} CUjit_target; - -/** - * Cubin matching fallback strategies - */ -typedef enum CUjit_fallback_enum -{ - CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ - - CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ - -} CUjit_fallback; - -/** - * Caching modes for dlcm - */ -typedef enum CUjit_cacheMode_enum -{ - CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ - CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ - CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ -} CUjit_cacheMode; - -/** - * Device code formats - */ -typedef enum CUjitInputType_enum -{ - /** - * Compiled 
device-class-specific device code\n - * Applicable options: none - */ - CU_JIT_INPUT_CUBIN = 0, - - /** - * PTX source code\n - * Applicable options: PTX compiler options - */ - CU_JIT_INPUT_PTX, - - /** - * Bundle of multiple cubins and/or PTX of some device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_FATBINARY, - - /** - * Host object with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_OBJECT, - - /** - * Archive of host objects with embedded device code\n - * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY - */ - CU_JIT_INPUT_LIBRARY, - - /** - * High-level intermediate code for link-time optimization\n - * Applicable options: NVVM compiler options, PTX compiler options - */ - CU_JIT_INPUT_NVVM, - - CU_JIT_NUM_INPUT_TYPES -} CUjitInputType; - -typedef struct CUlinkState_st *CUlinkState; - -/** - * Flags to register a graphics resource - */ -typedef enum CUgraphicsRegisterFlags_enum { - CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, - CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, - CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, - CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 -} CUgraphicsRegisterFlags; - -/** - * Flags for mapping and unmapping interop resources - */ -typedef enum CUgraphicsMapResourceFlags_enum { - CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, - CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 -} CUgraphicsMapResourceFlags; - -/** - * Array indices for cube faces - */ -typedef enum CUarray_cubemap_face_enum { - CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ - CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ - 
CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ - CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ -} CUarray_cubemap_face; - -/** - * Limits - */ -typedef enum CUlimit_enum { - CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ - CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ - CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ - CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ - CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ - CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */ - CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ - CU_LIMIT_MAX -} CUlimit; - -/** - * Resource types - */ -typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ - CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ - CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ -} CUresourcetype; - -#ifdef _WIN32 -#define CUDA_CB __stdcall -#else -#define CUDA_CB -#endif - -/** - * CUDA host function - * \param userData Argument value passed to the function - */ -typedef void (CUDA_CB *CUhostFn)(void *userData); - -/** - * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. - */ -typedef enum CUaccessProperty_enum { - CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ - CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persit from cache. 
*/ - CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ -} CUaccessProperty; - -/** - * Specifies an access policy for a window, a contiguous extent of memory - * beginning at base_ptr and ending at base_ptr + num_bytes. - * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. - * Partition into many segments and assign segments such that: - * sum of "hit segments" / window == approx. ratio. - * sum of "miss segments" / window == approx 1-ratio. - * Segments and ratio specifications are fitted to the capabilities of - * the architecture. - * Accesses in a hit segment apply the hitProp access policy. - * Accesses in a miss segment apply the missProp access policy. - */ -typedef struct CUaccessPolicyWindow_st { - void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ - size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ - float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ - CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ - CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. 
Must be either NORMAL or STREAMING */ -} CUaccessPolicyWindow_v1; -typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; - -/** - * GPU kernel node parameters - */ -typedef struct CUDA_KERNEL_NODE_PARAMS_st { - CUfunction func; /**< Kernel to launch */ - unsigned int gridDimX; /**< Width of grid in blocks */ - unsigned int gridDimY; /**< Height of grid in blocks */ - unsigned int gridDimZ; /**< Depth of grid in blocks */ - unsigned int blockDimX; /**< X dimension of each thread block */ - unsigned int blockDimY; /**< Y dimension of each thread block */ - unsigned int blockDimZ; /**< Z dimension of each thread block */ - unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ - void **kernelParams; /**< Array of pointers to kernel parameters */ - void **extra; /**< Extra options */ -} CUDA_KERNEL_NODE_PARAMS_v1; -typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; - -/** - * Memset node parameters - */ -typedef struct CUDA_MEMSET_NODE_PARAMS_st { - CUdeviceptr dst; /**< Destination device pointer */ - size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ - unsigned int value; /**< Value to be set */ - unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. 
*/ - size_t width; /**< Width of the row in elements */ - size_t height; /**< Number of rows */ -} CUDA_MEMSET_NODE_PARAMS_v1; -typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; - -/** - * Host node parameters - */ -typedef struct CUDA_HOST_NODE_PARAMS_st { - CUhostFn fn; /**< The function to call when the node executes */ - void* userData; /**< Argument to pass to the function */ -} CUDA_HOST_NODE_PARAMS_v1; -typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; - -/** - * Graph node types - */ -typedef enum CUgraphNodeType_enum { - CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ - CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ - CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ - CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ - CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ - CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ - CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ - CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ - CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ - CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ - CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ - CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ -} CUgraphNodeType; - -typedef enum CUsynchronizationPolicy_enum { - CU_SYNC_POLICY_AUTO = 1, - CU_SYNC_POLICY_SPIN = 2, - CU_SYNC_POLICY_YIELD = 3, - CU_SYNC_POLICY_BLOCKING_SYNC = 4 -} CUsynchronizationPolicy; - -/** - * Graph kernel node Attributes - */ -typedef enum CUkernelNodeAttrID_enum { - CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. */ - CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 /**< Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). 
*/ -} CUkernelNodeAttrID; - -/** - * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute - */ -typedef union CUkernelNodeAttrValue_union { - CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ - int cooperative; /**< Nonzero indicates a cooperative kernel (see ::cuLaunchCooperativeKernel). */ -} CUkernelNodeAttrValue_v1; -typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; - -/** - * Possible stream capture statuses returned by ::cuStreamIsCapturing - */ -typedef enum CUstreamCaptureStatus_enum { - CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ - CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ - CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that - has been invalidated, but not terminated */ -} CUstreamCaptureStatus; - -/** - * Possible modes for stream capture thread interactions. For more details see - * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode - */ -typedef enum CUstreamCaptureMode_enum { - CU_STREAM_CAPTURE_MODE_GLOBAL = 0, - CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, - CU_STREAM_CAPTURE_MODE_RELAXED = 2 -} CUstreamCaptureMode; - -/** - * Stream Attributes - */ -typedef enum CUstreamAttrID_enum { - CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, /**< Identifier for ::CUstreamAttrValue::accessPolicyWindow. */ - CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< ::CUsynchronizationPolicy for work queued up in this stream */ -} CUstreamAttrID; - -/** - * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute - */ -typedef union CUstreamAttrValue_union { - CUaccessPolicyWindow accessPolicyWindow; /**< Attribute ::CUaccessPolicyWindow. */ - CUsynchronizationPolicy syncPolicy; /**< Value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY. */ -} CUstreamAttrValue_v1; -typedef CUstreamAttrValue_v1 CUstreamAttrValue; - -/** - * Flags to specify search options. 
For more details see ::cuGetProcAddress - */ -typedef enum CUdriverProcAddress_flags_enum { - CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ - CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ - CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ -} CUdriverProcAddress_flags; - -/** - * Execution Affinity Types - */ -typedef enum CUexecAffinityType_enum { - CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ - CU_EXEC_AFFINITY_TYPE_MAX -} CUexecAffinityType; - -/** - * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT - */ -typedef struct CUexecAffinitySmCount_st { - unsigned int val; /**< The number of SMs the context is limited to use. */ -} CUexecAffinitySmCount_v1; -typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; - -/** - * Execution Affinity Parameters - */ -typedef struct CUexecAffinityParam_st { - CUexecAffinityType type; - union { - CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ - } param; -} CUexecAffinityParam_v1; -typedef CUexecAffinityParam_v1 CUexecAffinityParam; - -/** - * Error codes - */ -typedef enum cudaError_enum { - /** - * The API call returned with no errors. In the case of query calls, this - * also means that the operation being queried is complete (see - * ::cuEventQuery() and ::cuStreamQuery()). - */ - CUDA_SUCCESS = 0, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. - */ - CUDA_ERROR_INVALID_VALUE = 1, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - CUDA_ERROR_OUT_OF_MEMORY = 2, - - /** - * This indicates that the CUDA driver has not been initialized with - * ::cuInit() or that initialization has failed. 
- */ - CUDA_ERROR_NOT_INITIALIZED = 3, - - /** - * This indicates that the CUDA driver is in the process of shutting down. - */ - CUDA_ERROR_DEINITIALIZED = 4, - - /** - * This indicates profiler is not initialized for this run. This can - * happen when the application is running with external profiling tools - * like visual profiler. - */ - CUDA_ERROR_PROFILER_DISABLED = 5, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to attempt to enable/disable the profiling via ::cuProfilerStart or - * ::cuProfilerStop without initialization. - */ - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStart() when profiling is already enabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, - - /** - * \deprecated - * This error return is deprecated as of CUDA 5.0. It is no longer an error - * to call cuProfilerStop() when profiling is already disabled. - */ - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, - - /** - * This indicates that the CUDA driver that the application has loaded is a - * stub library. Applications that run with the stub rather than a real - * driver loaded will result in CUDA API returning this error. - */ - CUDA_ERROR_STUB_LIBRARY = 34, - - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - CUDA_ERROR_NO_DEVICE = 100, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device or that the action requested is - * invalid for the specified device. - */ - CUDA_ERROR_INVALID_DEVICE = 101, - - /** - * This error indicates that the Grid license is not applied. - */ - CUDA_ERROR_DEVICE_NOT_LICENSED = 102, - - /** - * This indicates that the device kernel image is invalid. This can also - * indicate an invalid CUDA module. 
- */ - CUDA_ERROR_INVALID_IMAGE = 200, - - /** - * This most frequently indicates that there is no context bound to the - * current thread. This can also be returned if the context passed to an - * API call is not a valid handle (such as a context that has had - * ::cuCtxDestroy() invoked on it). This can also be returned if a user - * mixes different API versions (i.e. 3010 context with 3020 API calls). - * See ::cuCtxGetApiVersion() for more details. - */ - CUDA_ERROR_INVALID_CONTEXT = 201, - - /** - * This indicated that the context being supplied as a parameter to the - * API call was already the active context. - * \deprecated - * This error return is deprecated as of CUDA 3.2. It is no longer an - * error to attempt to push the active context via ::cuCtxPushCurrent(). - */ - CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - - /** - * This indicates that a map or register operation has failed. - */ - CUDA_ERROR_MAP_FAILED = 205, - - /** - * This indicates that an unmap or unregister operation has failed. - */ - CUDA_ERROR_UNMAP_FAILED = 206, - - /** - * This indicates that the specified array is currently mapped and thus - * cannot be destroyed. - */ - CUDA_ERROR_ARRAY_IS_MAPPED = 207, - - /** - * This indicates that the resource is already mapped. - */ - CUDA_ERROR_ALREADY_MAPPED = 208, - - /** - * This indicates that there is no kernel image available that is suitable - * for the device. This can occur when a user specifies code generation - * options for a particular CUDA source file that do not include the - * corresponding device configuration. - */ - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - - /** - * This indicates that a resource has already been acquired. - */ - CUDA_ERROR_ALREADY_ACQUIRED = 210, - - /** - * This indicates that a resource is not mapped. - */ - CUDA_ERROR_NOT_MAPPED = 211, - - /** - * This indicates that a mapped resource is not available for access as an - * array. 
- */ - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - - /** - * This indicates that a mapped resource is not available for access as a - * pointer. - */ - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - CUDA_ERROR_ECC_UNCORRECTABLE = 214, - - /** - * This indicates that the ::CUlimit passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - - /** - * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already - * bound to a CPU thread. - */ - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, - - /** - * This indicates that peer access is not supported across the given - * devices. - */ - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, - - /** - * This indicates that a PTX JIT compilation failed. - */ - CUDA_ERROR_INVALID_PTX = 218, - - /** - * This indicates an error with OpenGL or DirectX context. - */ - CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, - - /** - * This indicates that an uncorrectable NVLink error was detected during the - * execution. - */ - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, - - /** - * This indicates that the PTX JIT compiler library was not found. - */ - CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, - - /** - * This indicates that the provided PTX was compiled with an unsupported toolchain. - */ - - CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, - - /** - * This indicates that the PTX JIT compilation was disabled. - */ - CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, - - /** - * This indicates that the ::CUexecAffinityType passed to the API call is not - * supported by the active device. - */ - CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, - - /** - * This indicates that the device kernel source is invalid. This includes - * compilation/linker errors encountered in device code or user error. 
- */ - CUDA_ERROR_INVALID_SOURCE = 300, - - /** - * This indicates that the file specified was not found. - */ - CUDA_ERROR_FILE_NOT_FOUND = 301, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - - /** - * This indicates that initialization of a shared object failed. - */ - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - - /** - * This indicates that an OS call failed. - */ - CUDA_ERROR_OPERATING_SYSTEM = 304, - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::CUstream and ::CUevent. - */ - CUDA_ERROR_INVALID_HANDLE = 400, - - /** - * This indicates that a resource required by the API call is not in a - * valid state to perform the requested operation. - */ - CUDA_ERROR_ILLEGAL_STATE = 401, - - /** - * This indicates that a named symbol was not found. Examples of symbols - * are global/constant variable names, driver function names, texture names, - * and surface names. - */ - CUDA_ERROR_NOT_FOUND = 500, - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::CUDA_SUCCESS (which indicates completion). Calls that - * may return this value include ::cuEventQuery() and ::cuStreamQuery(). - */ - CUDA_ERROR_NOT_READY = 600, - - /** - * While executing a kernel, the device encountered a - * load or store instruction on an invalid memory address. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_ADDRESS = 700, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. 
This error usually indicates that the user has - * attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register - * count. Passing arguments of the wrong size (i.e. a 64-bit pointer - * when a 32-bit int is expected) is equivalent to passing too many - * arguments and can also result in this error. - */ - CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device attribute - * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_TIMEOUT = 702, - - /** - * This error indicates a kernel launch that uses an incompatible texturing - * mode. - */ - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - - /** - * This error indicates that a call to ::cuCtxEnablePeerAccess() is - * trying to re-enable peer access to a context which has already - * had peer access to it enabled. - */ - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, - - /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, - - /** - * This error indicates that the primary context for the specified device - * has already been initialized. - */ - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, - - /** - * This error indicates that the context current to the calling thread - * has been destroyed using ::cuCtxDestroy, or is a primary context which - * has not yet been initialized. - */ - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, - - /** - * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. 
All existing device - * memory allocations from this context are invalid and must be - * reconstructed if the program is to continue using CUDA. - */ - CUDA_ERROR_ASSERT = 710, - - /** - * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices - * passed to ::cuCtxEnablePeerAccess(). - */ - CUDA_ERROR_TOO_MANY_PEERS = 711, - - /** - * This error indicates that the memory range passed to ::cuMemHostRegister() - * has already been registered. - */ - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, - - /** - * This error indicates that the pointer passed to ::cuMemHostUnregister() - * does not correspond to any currently registered memory region. - */ - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, - - /** - * While executing a kernel, the device encountered a stack error. - * This can be due to stack corruption or exceeding the stack size limit. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_HARDWARE_STACK_ERROR = 714, - - /** - * While executing a kernel, the device encountered an illegal instruction. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, - - /** - * While executing a kernel, the device encountered a load or store instruction - * on a memory address which is not aligned. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. 
- */ - CUDA_ERROR_MISALIGNED_ADDRESS = 716, - - /** - * While executing a kernel, the device encountered an instruction - * which can only operate on memory locations in certain address spaces - * (global, shared, or local), but was supplied a memory address not - * belonging to an allowed address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, - - /** - * While executing a kernel, the device program counter wrapped its address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_INVALID_PC = 718, - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. Less common cases can be system specific - more - * information about these cases can be found in the system specific user guide. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - CUDA_ERROR_LAUNCH_FAILED = 719, - - /** - * This error indicates that the number of blocks launched per grid for a kernel that was - * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice - * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor - * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. 
- */ - CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, - - /** - * This error indicates that the attempted operation is not permitted. - */ - CUDA_ERROR_NOT_PERMITTED = 800, - - /** - * This error indicates that the attempted operation is not supported - * on the current system or device. - */ - CUDA_ERROR_NOT_SUPPORTED = 801, - - /** - * This error indicates that the system is not yet ready to start any CUDA - * work. To continue using CUDA, verify the system configuration is in a - * valid state and all required driver daemons are actively running. - * More information about this error can be found in the system specific - * user guide. - */ - CUDA_ERROR_SYSTEM_NOT_READY = 802, - - /** - * This error indicates that there is a mismatch between the versions of - * the display driver and the CUDA driver. Refer to the compatibility documentation - * for supported versions. - */ - CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, - - /** - * This error indicates that the system was upgraded to run with forward compatibility - * but the visible hardware detected by CUDA does not support this configuration. - * Refer to the compatibility documentation for the supported hardware matrix or ensure - * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES - * environment variable. - */ - CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, - - /** - * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. - */ - CUDA_ERROR_MPS_CONNECTION_FAILED = 805, - - /** - * This error indicates that the remote procedural call between the MPS server and the MPS client failed. - */ - CUDA_ERROR_MPS_RPC_FAILURE = 806, - - /** - * This error indicates that the MPS server is not ready to accept new MPS client requests. - * This error can be returned when the MPS server is in the process of recovering from a fatal failure. 
- */ - CUDA_ERROR_MPS_SERVER_NOT_READY = 807, - - /** - * This error indicates that the hardware resources required to create MPS client have been exhausted. - */ - CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808, - - /** - * This error indicates the the hardware resources required to support device connections have been exhausted. - */ - CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809, - - /** - * This error indicates that the operation is not permitted when - * the stream is capturing. - */ - CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, - - /** - * This error indicates that the current capture sequence on the stream - * has been invalidated due to a previous error. - */ - CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, - - /** - * This error indicates that the operation would have resulted in a merge - * of two independent capture sequences. - */ - CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, - - /** - * This error indicates that the capture was not initiated in this stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, - - /** - * This error indicates that the capture sequence contains a fork that was - * not joined to the primary stream. - */ - CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, - - /** - * This error indicates that a dependency would have been created which - * crosses the capture sequence boundary. Only implicit in-stream ordering - * dependencies are allowed to cross the boundary. - */ - CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, - - /** - * This error indicates a disallowed implicit dependency on a current capture - * sequence from cudaStreamLegacy. - */ - CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, - - /** - * This error indicates that the operation is not permitted on an event which - * was last recorded in a capturing stream. - */ - CUDA_ERROR_CAPTURED_EVENT = 907, - - /** - * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED - * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a - * different thread. 
- */ - CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, - - /** - * This error indicates that the timeout specified for the wait operation has lapsed. - */ - CUDA_ERROR_TIMEOUT = 909, - - /** - * This error indicates that the graph update was not performed because it included - * changes which violated constraints specific to instantiated graph update. - */ - CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, - - /** - * This indicates that an async error has occurred in a device outside of CUDA. - * If CUDA was waiting for an external device's signal before consuming shared data, - * the external device signaled an error indicating that the data is not valid for - * consumption. This leaves the process in an inconsistent state and any further CUDA - * work will return the same error. To continue using CUDA, the process must be - * terminated and relaunched. - */ - CUDA_ERROR_EXTERNAL_DEVICE = 911, - - /** - * This indicates that an unknown internal error has occurred. - */ - CUDA_ERROR_UNKNOWN = 999 -} CUresult; - -/** - * P2P Attributes - */ -typedef enum CUdevice_P2PAttribute_enum { - CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ - CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ - CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ -} CUdevice_P2PAttribute; - -/** - * CUDA stream callback - * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. - * \param status ::CUDA_SUCCESS or any persistent error on the stream. - * \param userData User parameter provided at registration. 
- */ -typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); - -/** - * Block size to per-block dynamic shared memory mapping for a certain - * kernel \param blockSize Block size of the kernel. - * - * \return The dynamic shared memory needed by a block. - */ -typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); - -/** - * If set, host memory is portable between CUDA contexts. - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_PORTABLE 0x01 - -/** - * If set, host memory is mapped into CUDA address space and - * ::cuMemHostGetDevicePointer() may be called on the host pointer. - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 - -/** - * If set, host memory is allocated as write-combined - fast to write, - * faster to DMA, slow to read except via SSE4 streaming load instruction - * (MOVNTDQA). - * Flag for ::cuMemHostAlloc() - */ -#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 - -/** - * If set, host memory is portable between CUDA contexts. - * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_PORTABLE 0x01 - -/** - * If set, host memory is mapped into CUDA address space and - * ::cuMemHostGetDevicePointer() may be called on the host pointer. - * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 - -/** - * If set, the passed memory pointer is treated as pointing to some - * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. - * On Windows the flag is a no-op. - * On Linux that memory is marked as non cache-coherent for the GPU and - * is expected to be physically contiguous. It may return - * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, - * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. - * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED - * is returned. 
- * Flag for ::cuMemHostRegister() - */ -#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 - -/** -* If set, the passed memory pointer is treated as pointing to memory that is -* considered read-only by the device. On platforms without -* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is -* required in order to register memory mapped to the CPU as read-only. Support -* for the use of this flag can be queried from the device attribute -* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with -* a current context associated with a device that does not have this attribute -* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED. -*/ -#define CU_MEMHOSTREGISTER_READ_ONLY 0x08 - -/** - * 2D memory copy parameters - */ -typedef struct CUDA_MEMCPY2D_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - - size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ - size_t Height; /**< Height of 2D memory copy */ -} CUDA_MEMCPY2D_v2; -typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; - -/** - * 3D memory copy parameters - */ -typedef struct CUDA_MEMCPY3D_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - 
size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_v2; -typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; - -/** - * 3D memory cross-context copy parameters - */ -typedef struct CUDA_MEMCPY3D_PEER_st { - size_t srcXInBytes; /**< Source X in bytes */ - size_t srcY; /**< Source Y */ - size_t srcZ; /**< Source Z */ - size_t srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t srcPitch; /**< Source pitch (ignored when src is array) */ - size_t srcHeight; /**< 
Source height (ignored when src is array; may be 0 if Depth==1) */ - - size_t dstXInBytes; /**< Destination X in bytes */ - size_t dstY; /**< Destination Y */ - size_t dstZ; /**< Destination Z */ - size_t dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ - size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ - size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ - size_t Height; /**< Height of 3D memory copy */ - size_t Depth; /**< Depth of 3D memory copy */ -} CUDA_MEMCPY3D_PEER_v1; -typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; - -/** - * Array descriptor - */ -typedef struct CUDA_ARRAY_DESCRIPTOR_st -{ - size_t Width; /**< Width of array */ - size_t Height; /**< Height of array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ -} CUDA_ARRAY_DESCRIPTOR_v2; -typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; - -/** - * 3D array descriptor - */ -typedef struct CUDA_ARRAY3D_DESCRIPTOR_st -{ - size_t Width; /**< Width of 3D array */ - size_t Height; /**< Height of 3D array */ - size_t Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ -} CUDA_ARRAY3D_DESCRIPTOR_v2; -typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; - -/** - * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers - */ -#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 - -/** - 
* CUDA array sparse properties - */ -typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { - struct { - unsigned int width; /**< Width of sparse tile in elements */ - unsigned int height; /**< Height of sparse tile in elements */ - unsigned int depth; /**< Depth of sparse tile in elements */ - } tileExtent; - - /** - * First mip level at which the mip tail begins. - */ - unsigned int miptailFirstLevel; - /** - * Total size of the mip tail. - */ - unsigned long long miptailSize; - /** - * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL - */ - unsigned int flags; - unsigned int reserved[4]; -} CUDA_ARRAY_SPARSE_PROPERTIES_v1; -typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; - -/** - * CUDA Resource descriptor - */ -typedef struct CUDA_RESOURCE_DESC_st -{ - CUresourcetype resType; /**< Resource type */ - - union { - struct { - CUarray hArray; /**< CUDA array */ - } array; - struct { - CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ - } mipmap; - struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t sizeInBytes; /**< Size in bytes */ - } linear; - struct { - CUdeviceptr devPtr; /**< Device pointer */ - CUarray_format format; /**< Array format */ - unsigned int numChannels; /**< Channels per array element */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ - } pitch2D; - struct { - int reserved[32]; - } reserved; - } res; - - unsigned int flags; /**< Flags (must be zero) */ -} CUDA_RESOURCE_DESC_v1; -typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; - -/** - * Texture descriptor - */ -typedef struct CUDA_TEXTURE_DESC_st { - CUaddress_mode addressMode[3]; /**< Address modes */ - CUfilter_mode filterMode; /**< Filter mode */ - unsigned int flags; /**< Flags */ - unsigned int 
maxAnisotropy; /**< Maximum anisotropy ratio */ - CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ - float mipmapLevelBias; /**< Mipmap level bias */ - float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ - float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ - float borderColor[4]; /**< Border Color */ - int reserved[12]; -} CUDA_TEXTURE_DESC_v1; -typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; - -/** - * Resource view format - */ -typedef enum CUresourceViewFormat_enum -{ - CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ - CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ - CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ - 
CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ - CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ - CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ - CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ - CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ -} CUresourceViewFormat; - -/** - * Resource view descriptor - */ -typedef struct CUDA_RESOURCE_VIEW_DESC_st -{ - CUresourceViewFormat format; /**< Resource view format */ - size_t width; /**< Width of the resource view */ - size_t height; /**< Height of the resource view */ - size_t depth; /**< Depth of the resource view */ - unsigned int firstMipmapLevel; /**< First defined mipmap level */ - unsigned int lastMipmapLevel; /**< Last defined mipmap level */ - unsigned int firstLayer; /**< First layer index */ - unsigned int lastLayer; /**< Last layer index */ - unsigned int reserved[16]; -} CUDA_RESOURCE_VIEW_DESC_v1; -typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; 
- -/** - * GPU Direct v3 tokens - */ -typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { - unsigned long long p2pToken; - unsigned int vaSpaceToken; -} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; -typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; - -/** -* Access flags that specify the level of access the current context's device has -* on the memory referenced. -*/ -typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. */ - CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ -} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; - -/** - * Kernel launch parameters - */ -typedef struct CUDA_LAUNCH_PARAMS_st { - CUfunction function; /**< Kernel to launch */ - unsigned int gridDimX; /**< Width of grid in blocks */ - unsigned int gridDimY; /**< Height of grid in blocks */ - unsigned int gridDimZ; /**< Depth of grid in blocks */ - unsigned int blockDimX; /**< X dimension of each thread block */ - unsigned int blockDimY; /**< Y dimension of each thread block */ - unsigned int blockDimZ; /**< Z dimension of each thread block */ - unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ - CUstream hStream; /**< Stream identifier */ - void **kernelParams; /**< Array of pointers to kernel parameters */ -} CUDA_LAUNCH_PARAMS_v1; -typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; - -/** - * External memory handle types - */ -typedef enum CUexternalMemoryHandleType_enum { - /** - * Handle is an opaque file descriptor - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - /** - * Handle is 
an opaque shared NT handle - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - /** - * Handle is an opaque, globally shared handle - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - /** - * Handle is a D3D12 heap object - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - /** - * Handle is a D3D12 committed resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, - /** - * Handle is a shared NT handle to a D3D11 resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, - /** - * Handle is a globally shared handle to a D3D11 resource - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, - /** - * Handle is an NvSciBuf object - */ - CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 -} CUexternalMemoryHandleType; - -/** - * Indicates that the external memory object is a dedicated resource - */ -#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 - -/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS - * contains this flag, it indicates that signaling an external semaphore object - * should skip performing appropriate memory synchronization operations over all - * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, - * which otherwise are performed by default to ensure data coherency with other - * importers of the same NvSciBuf memory objects. - */ -#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 - -/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS - * contains this flag, it indicates that waiting on an external semaphore object - * should skip performing appropriate memory synchronization operations over all - * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, - * which otherwise are performed by default to ensure data coherency with other - * importers of the same NvSciBuf memory objects. 
- */ -#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 - -/** - * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, - * it indicates that application needs signaler specific NvSciSyncAttr - * to be filled by ::cuDeviceGetNvSciSyncAttributes. - */ -#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 - -/** - * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, - * it indicates that application needs waiter specific NvSciSyncAttr - * to be filled by ::cuDeviceGetNvSciSyncAttributes. - */ -#define CUDA_NVSCISYNC_ATTR_WAIT 0x2 -/** - * External memory handle descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - /** - * Type of the handle - */ - CUexternalMemoryHandleType type; - union { - /** - * File descriptor referencing the memory object. Valid - * when type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD - */ - int fd; - /** - * Win32 handle referencing the semaphore object. Valid when - * type is one of the following: - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE - * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is one of the following: - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * then 'name' must be NULL. - */ - struct { - /** - * Valid NT handle. Must be NULL if 'name' is non-NULL - */ - void *handle; - /** - * Name of a valid memory object. - * Must be NULL if 'handle' is non-NULL. - */ - const void *name; - } win32; - /** - * A handle representing an NvSciBuf Object. 
Valid when type - * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF - */ - const void *nvSciBufObject; - } handle; - /** - * Size of the memory allocation - */ - unsigned long long size; - /** - * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; - -/** - * External memory buffer descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - /** - * Offset into the memory object where the buffer's base is - */ - unsigned long long offset; - /** - * Size of the buffer - */ - unsigned long long size; - /** - * Flags reserved for future use. Must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; - -/** - * External memory mipmap descriptor - */ -typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - /** - * Offset into the memory object where the base level of the - * mipmap chain is. 
- */ - unsigned long long offset; - /** - * Format, dimension and type of base level of the mipmap chain - */ - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - /** - * Total number of levels in the mipmap chain - */ - unsigned int numLevels; - unsigned int reserved[16]; -} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; -typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; - -/** - * External semaphore handle types - */ -typedef enum CUexternalSemaphoreHandleType_enum { - /** - * Handle is an opaque file descriptor - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - /** - * Handle is an opaque shared NT handle - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - /** - * Handle is an opaque, globally shared handle - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - /** - * Handle is a shared NT handle referencing a D3D12 fence object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, - /** - * Handle is a shared NT handle referencing a D3D11 fence object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, - /** - * Opaque handle to NvSciSync Object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, - /** - * Handle is a shared NT handle referencing a D3D11 keyed mutex object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, - /** - * Handle is a globally shared handle referencing a D3D11 keyed mutex object - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, - /** - * Handle is an opaque file descriptor referencing a timeline semaphore - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, - /** - * Handle is an opaque shared NT handle referencing a timeline semaphore - */ - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 -} CUexternalSemaphoreHandleType; - -/** - * External semaphore handle descriptor - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - /** - * Type of the handle - */ - CUexternalSemaphoreHandleType type; - 
union { - /** - * File descriptor referencing the semaphore object. Valid - * when type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD - */ - int fd; - /** - * Win32 handle referencing the semaphore object. Valid when - * type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * Exactly one of 'handle' and 'name' must be non-NULL. If - * type is one of the following: - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then 'name' must be NULL. - */ - struct { - /** - * Valid NT handle. Must be NULL if 'name' is non-NULL - */ - void *handle; - /** - * Name of a valid synchronization primitive. - * Must be NULL if 'handle' is non-NULL. - */ - const void *name; - } win32; - /** - * Valid NvSciSyncObj. Must be non NULL - */ - const void* nvSciSyncObj; - } handle; - /** - * Flags reserved for the future. Must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; - -/** - * External semaphore signal parameters - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { - struct { - /** - * Parameters for fence objects - */ - struct { - /** - * Value of fence to be signaled - */ - unsigned long long value; - } fence; - union { - /** - * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType - * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. 
- */ - void *fence; - unsigned long long reserved; - } nvSciSync; - /** - * Parameters for keyed mutex objects - */ - struct { - /** - * Value of key to release the mutex with - */ - unsigned long long key; - } keyedMutex; - unsigned int reserved[12]; - } params; - /** - * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to - * signal a ::CUexternalSemaphore of type - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates - * that while signaling the ::CUexternalSemaphore, no memory synchronization - * operations should be performed for any external memory object imported - * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. - * For all other types of ::CUexternalSemaphore, flags must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; - -/** - * External semaphore wait parameters - */ -typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { - struct { - /** - * Parameters for fence objects - */ - struct { - /** - * Value of fence to be waited on - */ - unsigned long long value; - } fence; - /** - * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType - * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. 
- */ - union { - void *fence; - unsigned long long reserved; - } nvSciSync; - /** - * Parameters for keyed mutex objects - */ - struct { - /** - * Value of key to acquire the mutex with - */ - unsigned long long key; - /** - * Timeout in milliseconds to wait to acquire the mutex - */ - unsigned int timeoutMs; - } keyedMutex; - unsigned int reserved[10]; - } params; - /** - * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on - * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC - * which indicates that while waiting for the ::CUexternalSemaphore, no memory - * synchronization operations should be performed for any external memory - * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. - * For all other types of ::CUexternalSemaphore, flags must be zero. - */ - unsigned int flags; - unsigned int reserved[16]; -} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; -typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; - -/** - * Semaphore signal node parameters - */ -typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { - CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ - const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ - unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ -} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; -typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; - -/** - * Semaphore wait node parameters - */ -typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { - CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ - const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ - unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. 
*/ -} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; -typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; - -typedef unsigned long long CUmemGenericAllocationHandle_v1; -typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; - -/** - * Flags for specifying particular handle types - */ -typedef enum CUmemAllocationHandleType_enum { - CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ - CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ - CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ - CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF -} CUmemAllocationHandleType; - -/** - * Specifies the memory protection flags for mapping. - */ -typedef enum CUmemAccess_flags_enum { - CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ - CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ - CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ - CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF -} CUmemAccess_flags; - -/** - * Specifies the type of location - */ -typedef enum CUmemLocationType_enum { - CU_MEM_LOCATION_TYPE_INVALID = 0x0, - CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ - CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF -} CUmemLocationType; - -/** -* Defines the allocation types available -*/ -typedef enum CUmemAllocationType_enum { - CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, - - /** This allocation type is 'pinned', i.e. 
cannot migrate from its current - * location while the application is actively using it - */ - CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, - CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF -} CUmemAllocationType; - -/** -* Flag for requesting different optimal and required granularities for an allocation. -*/ -typedef enum CUmemAllocationGranularity_flags_enum { - CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */ - CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */ -} CUmemAllocationGranularity_flags; - -/** - * Sparse subresource types - */ -typedef enum CUarraySparseSubresourceType_enum { - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 -} CUarraySparseSubresourceType; - -/** - * Memory operation types - */ -typedef enum CUmemOperationType_enum { - CU_MEM_OPERATION_TYPE_MAP = 1, - CU_MEM_OPERATION_TYPE_UNMAP = 2 -} CUmemOperationType; - -/** - * Memory handle types - */ -typedef enum CUmemHandleType_enum { - CU_MEM_HANDLE_TYPE_GENERIC = 0 -} CUmemHandleType; - -/** - * Specifies the CUDA array or CUDA mipmapped array memory mapping information - */ -typedef struct CUarrayMapInfo_st { - CUresourcetype resourceType; /**< Resource type */ - - union { - CUmipmappedArray mipmap; - CUarray array; - } resource; - - CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */ - - union { - struct { - unsigned int level; /**< For CUDA mipmapped arrays must a valid mipmap level. For CUDA arrays must be zero */ - unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. 
Otherwise, must be zero */ - unsigned int offsetX; /**< Starting X offset in elements */ - unsigned int offsetY; /**< Starting Y offset in elements */ - unsigned int offsetZ; /**< Starting Z offset in elements */ - unsigned int extentWidth; /**< Width in elements */ - unsigned int extentHeight; /**< Height in elements */ - unsigned int extentDepth; /**< Depth in elements */ - } sparseLevel; - struct { - unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */ - unsigned long long offset; /**< Offset within mip tail */ - unsigned long long size; /**< Extent in bytes */ - } miptail; - } subresource; - - CUmemOperationType memOperationType; /**< Memory operation type */ - CUmemHandleType memHandleType; /**< Memory handle type */ - - union { - CUmemGenericAllocationHandle memHandle; - } memHandle; - - unsigned long long offset; /**< Offset within the memory */ - unsigned int deviceBitMask; /**< Device ordinal bit mask */ - unsigned int flags; /**< flags for future use, must be zero now. */ - unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */ -} CUarrayMapInfo_v1; -typedef CUarrayMapInfo_v1 CUarrayMapInfo; - -/** - * Specifies a memory location. - */ -typedef struct CUmemLocation_st { - CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ - int id; /**< identifier for a given this location's ::CUmemLocationType. */ -} CUmemLocation_v1; -typedef CUmemLocation_v1 CUmemLocation; - -/** - * Specifies compression attribute for an allocation. - */ -typedef enum CUmemAllocationCompType_enum { - CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */ - CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */ -} CUmemAllocationCompType; - -/** - * This flag if set indicates that the memory will be used as a tile pool. 
- */ -#define CU_MEM_CREATE_USAGE_TILE_POOL 0x1 - -/** -* Specifies the allocation properties for a allocation. -*/ -typedef struct CUmemAllocationProp_st { - /** Allocation type */ - CUmemAllocationType type; - /** requested ::CUmemAllocationHandleType */ - CUmemAllocationHandleType requestedHandleTypes; - /** Location of allocation */ - CUmemLocation location; - /** - * Windows-specific POBJECT_ATTRIBUTES required when - * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object atributes structure - * includes security attributes that define - * the scope of which exported allocations may be tranferred to other - * processes. In all other cases, this field is required to be zero. - */ - void *win32HandleMetaData; - struct { - /** - * Allocation hint for requesting compressible memory. - * On devices that support Compute Data Compression, compressible - * memory can be used to accelerate accesses to data with unstructured - * sparsity and other compressible data patterns. Applications are - * expected to query allocation property of the handle obtained with - * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to - * validate if the obtained allocation is compressible or not. Note that - * compressed memory may not be mappable on all devices. 
- */ - unsigned char compressionType; - unsigned char gpuDirectRDMACapable; - /** Bitmask indicating intended usage for this allocation */ - unsigned short usage; - unsigned char reserved[4]; - } allocFlags; -} CUmemAllocationProp_v1; -typedef CUmemAllocationProp_v1 CUmemAllocationProp; - -/** - * Memory access descriptor - */ -typedef struct CUmemAccessDesc_st { - CUmemLocation location; /**< Location on which the request is to change it's accessibility */ - CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */ -} CUmemAccessDesc_v1; -typedef CUmemAccessDesc_v1 CUmemAccessDesc; - -typedef enum CUgraphExecUpdateResult_enum { - CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */ - CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */ - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */ - CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */ - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7 /**< The update failed because the function of a kernel node changed in an unsupported way */ -} CUgraphExecUpdateResult; - -/** - * CUDA memory pool attributes - */ -typedef enum CUmemPool_attribute_enum { - /** - * (value type = int) - * Allow cuMemAllocAsync to use memory asynchronously freed - * in another streams as long as a stream ordering dependency - * of the allocating stream on the free action exists. 
- * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - */ - CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1, - - /** - * (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - */ - CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, - - /** - * (value type = int) - * Allow cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by cuFreeAsync (default enabled). - */ - CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, - - /** - * (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - */ - CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - - /** - * (value type = cuuint64_t) - * Amount of backing memory currently allocated for the mempool. - */ - CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of backing memory allocated for the mempool since the - * last time it was reset. High watermark can only be reset to zero. - */ - CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, - - /** - * (value type = cuuint64_t) - * Amount of memory from the pool that is currently in use by the application. - */ - CU_MEMPOOL_ATTR_USED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of the amount of memory from the pool that was in use by the application since - * the last time it was reset. High watermark can only be reset to zero. - */ - CU_MEMPOOL_ATTR_USED_MEM_HIGH -} CUmemPool_attribute; - -/** - * Specifies the properties of allocations made from the pool. 
- */ -typedef struct CUmemPoolProps_st { - CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */ - CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ - CUmemLocation location; /**< Location where allocations should reside. */ - /** - * Windows-specific LPSECURITYATTRIBUTES required when - * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines - * the scope of which exported allocations may be tranferred to other - * processes. In all other cases, this field is required to be zero. - */ - void *win32SecurityAttributes; - unsigned char reserved[64]; /**< reserved for future use, must be 0 */ -} CUmemPoolProps_v1; -typedef CUmemPoolProps_v1 CUmemPoolProps; - -/** - * Opaque data for exporting a pool allocation - */ -typedef struct CUmemPoolPtrExportData_st { - unsigned char reserved[64]; -} CUmemPoolPtrExportData_v1; -typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData; - -/** - * Memory allocation node parameters - */ -typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st { - /** - * in: location where the allocation should reside (specified in ::location). - * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported. - */ - CUmemPoolProps poolProps; - const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */ - size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. 
*/ - size_t bytesize; /**< in: size in bytes of the requested allocation */ - CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ -} CUDA_MEM_ALLOC_NODE_PARAMS; - -typedef enum CUgraphMem_attribute_enum { - /** - * (value type = cuuint64_t) - * Amount of memory, in bytes, currently associated with graphs - */ - CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - */ - CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, - - /** - * (value type = cuuint64_t) - * Amount of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - */ - CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, - - /** - * (value type = cuuint64_t) - * High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - */ - CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH -} CUgraphMem_attribute; - -/** - * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only - * waits for prior work in the stream corresponding to that GPU to complete before the - * kernel begins execution. - */ -#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 - -/** - * If set, any subsequent work pushed in a stream that participated in a call to - * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on - * the GPU corresponding to that stream to complete before it begins execution. - */ -#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 - -/** - * If set, the CUDA array is a collection of layers, where each layer is either a 1D - * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number - * of layers, not the depth of a 3D array. 
- */ -#define CUDA_ARRAY3D_LAYERED 0x01 - -/** - * Deprecated, use CUDA_ARRAY3D_LAYERED - */ -#define CUDA_ARRAY3D_2DARRAY 0x01 - -/** - * This flag must be set in order to bind a surface reference - * to the CUDA array - */ -#define CUDA_ARRAY3D_SURFACE_LDST 0x02 - -/** - * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The - * width of such a CUDA array must be equal to its height, and Depth must be six. - * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps - * and Depth must be a multiple of six. - */ -#define CUDA_ARRAY3D_CUBEMAP 0x04 - -/** - * This flag must be set in order to perform texture gather operations - * on a CUDA array. - */ -#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 - -/** - * This flag if set indicates that the CUDA - * array is a DEPTH_TEXTURE. - */ -#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 - -/** - * This flag indicates that the CUDA array may be bound as a color target - * in an external graphics API - */ -#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 - -/** - * This flag if set indicates that the CUDA array or CUDA mipmapped array - * is a sparse CUDA array or CUDA mipmapped array respectively - */ -#define CUDA_ARRAY3D_SPARSE 0x40 - -/** - * Override the texref format with a format inferred from the array. - * Flag for ::cuTexRefSetArray() - */ -#define CU_TRSA_OVERRIDE_FORMAT 0x01 - -/** - * Read the texture as integers rather than promoting the values to floats - * in the range [0,1]. - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_READ_AS_INTEGER 0x01 - -/** - * Use normalized texture coordinates in the range [0,1) instead of [0,dim). - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_NORMALIZED_COORDINATES 0x02 - -/** - * Perform sRGB->linear conversion during texture read. 
- * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_SRGB 0x10 - - /** - * Disable any trilinear filtering optimizations. - * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() - */ -#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 - -/** - * End of array terminator for the \p extra parameter to - * ::cuLaunchKernel - */ -#define CU_LAUNCH_PARAM_END ((void*)0x00) - -/** - * Indicator that the next value in the \p extra parameter to - * ::cuLaunchKernel will be a pointer to a buffer containing all kernel - * parameters used for launching kernel \p f. This buffer needs to - * honor all alignment/padding requirements of the individual parameters. - * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the - * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no - * effect. - */ -#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) - -/** - * Indicator that the next value in the \p extra parameter to - * ::cuLaunchKernel will be a pointer to a size_t which contains the - * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. - * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified - * in the \p extra array if the value associated with - * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. - */ -#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) - -/** - * For texture references loaded into the module, use default texunit from - * texture reference. - */ -#define CU_PARAM_TR_DEFAULT -1 - -/** - * Device that represents the CPU - */ -#define CU_DEVICE_CPU ((CUdevice)-1) - -/** - * Device that represents an invalid device - */ -#define CU_DEVICE_INVALID ((CUdevice)-2) - -/** - * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS - */ -typedef enum CUflushGPUDirectRDMAWritesOptions_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. 
*/ - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ -} CUflushGPUDirectRDMAWritesOptions; - -/** - * Platform native ordering for GPUDirect RDMA writes - */ -typedef enum CUGPUDirectRDMAWritesOrdering_enum { - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */ - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ - CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. */ -} CUGPUDirectRDMAWritesOrdering; - -/** - * The scopes for ::cuFlushGPUDirectRDMAWrites - */ -typedef enum CUflushGPUDirectRDMAWritesScope_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ -} CUflushGPUDirectRDMAWritesScope; - -/** - * The targets for ::cuFlushGPUDirectRDMAWrites - */ -typedef enum CUflushGPUDirectRDMAWritesTarget_enum { - CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. 
*/ -} CUflushGPUDirectRDMAWritesTarget; - -/** - * The additional write options for ::cuGraphDebugDotPrint - */ -typedef enum CUgraphDebugDot_flags_enum { - CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /** Output all debug data as if every debug flag is enabled */ - CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /** Use CUDA Runtime structures for output */ - CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /** Adds CUDA_KERNEL_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /** Adds CUDA_MEMCPY3D values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /** Adds CUDA_MEMSET_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /** Adds CUDA_HOST_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /** Adds CUevent handle from record and wait nodes to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /** Adds CUkernelNodeAttrValue values to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /** Adds node handles and every kernel function handle to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /** Adds memory alloc node parameters to output */ - CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /** Adds memory free node parameters to output */ -} CUgraphDebugDot_flags; - -/** - * Flags for user objects for graphs - */ -typedef enum CUuserObject_flags_enum { - CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. 
*/ -} CUuserObject_flags; - -/** - * Flags for retaining user object references for graphs - */ -typedef enum CUuserObjectRetain_flags_enum { - CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ -} CUuserObjectRetain_flags; - -/** - * Flags for instantiating a graph - */ -typedef enum CUgraphInstantiate_flags_enum { - CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. */ -} CUgraphInstantiate_flags; - -/** @} */ /* END CUDA_TYPES */ - -#if defined(__GNUC__) - #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) - #pragma GCC visibility push(default) - #endif -#endif - -#ifdef _WIN32 -#define CUDAAPI __stdcall -#else -#define CUDAAPI -#endif - -/** - * \defgroup CUDA_ERROR Error Handling - * - * ___MANBRIEF___ error handling functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the error handling functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Gets the string description of an error code - * - * Sets \p *pStr to the address of a NULL-terminated string description - * of the error code \p error. - * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE - * will be returned and \p *pStr will be set to the NULL address. - * - * \param error - Error code to convert to string - * \param pStr - Address of the string pointer. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::CUresult, - * ::cudaGetErrorString - */ -CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); - -/** - * \brief Gets the string representation of an error code enum name - * - * Sets \p *pStr to the address of a NULL-terminated string representation - * of the name of the enum error code \p error. 
- * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE - * will be returned and \p *pStr will be set to the NULL address. - * - * \param error - Error code to convert to string - * \param pStr - Address of the string pointer. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::CUresult, - * ::cudaGetErrorName - */ -CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); - -/** @} */ /* END CUDA_ERROR */ - -/** - * \defgroup CUDA_INITIALIZE Initialization - * - * ___MANBRIEF___ initialization functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the initialization functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Initialize the CUDA driver API - * - * Initializes the driver API and must be called before any other function from - * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() - * has not been called, any function from the driver API will return - * ::CUDA_ERROR_NOT_INITIALIZED. - * - * \param Flags - Initialization flag for CUDA. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, - * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE - * \notefnerr - */ -CUresult CUDAAPI cuInit(unsigned int Flags); - -/** @} */ /* END CUDA_INITIALIZE */ - -/** - * \defgroup CUDA_VERSION Version Management - * - * ___MANBRIEF___ version management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the version management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns the latest CUDA version supported by driver - * - * Returns in \p *driverVersion the version of CUDA supported by - * the driver. The version is returned as - * (1000 × major + 10 × minor). 
For example, CUDA 9.2 - * would be represented by 9020. - * - * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if - * \p driverVersion is NULL. - * - * \param driverVersion - Returns the CUDA driver version - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cudaDriverGetVersion, - * ::cudaRuntimeGetVersion - */ -CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); - -/** @} */ /* END CUDA_VERSION */ - -/** - * \defgroup CUDA_DEVICE Device Management - * - * ___MANBRIEF___ device management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns a handle to a compute device - * - * Returns in \p *device a device handle given an ordinal in the range [0, - * ::cuDeviceGetCount()-1]. - * - * \param device - Returned device handle - * \param ordinal - Device number to get handle for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport - */ -CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); - -/** - * \brief Returns the number of compute-capable devices - * - * Returns in \p *count the number of devices with compute capability greater - * than or equal to 2.0 that are available for execution. If there is no such - * device, ::cuDeviceGetCount() returns 0. 
- * - * \param count - Returned number of compute-capable devices - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceCount - */ -CUresult CUDAAPI cuDeviceGetCount(int *count); - -/** - * \brief Returns an identifer string for the device - * - * Returns an ASCII string identifying the device \p dev in the NULL-terminated - * string pointed to by \p name. \p len specifies the maximum length of the - * string that may be returned. - * - * \param name - Returned identifier string for the device - * \param len - Maximum length of string to store in \p name - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetUuid, - * ::cuDeviceGetLuid, - * ::cuDeviceGetCount, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); - -/** - * \brief Return an UUID for the device - * - * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will - * supplant this version in 12.0, which is retained for minor version compatibility. - * - * Returns 16-octets identifing the device \p dev in the structure - * pointed by the \p uuid. 
- * - * \param uuid - Returned UUID - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetUuid_v2 - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); - -/** - * \brief Return an UUID for the device (11.4+) - * - * Returns 16-octets identifing the device \p dev in the structure - * pointed by the \p uuid. If the device is in MIG mode, returns its - * MIG UUID which uniquely identifies the subscribed MIG compute instance. - * - * \param uuid - Returned UUID - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetLuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); - -/** - * \brief Return an LUID and device node mask for the device - * - * Return identifying information (\p luid and \p deviceNodeMask) to allow - * matching device with graphics APIs. 
- * - * \param luid - Returned LUID - * \param deviceNodeMask - Returned device node mask - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); - -/** - * \brief Returns the total amount of memory on the device - * - * Returns in \p *bytes the total amount of memory available on the device - * \p dev in bytes. - * - * \param bytes - Returned memory available on device in bytes - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaMemGetInfo - */ -CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); - -/** - * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. - * - * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture - * for given \p format and \p numChannels. - * - * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. - * \param format - Texture format. - * \param numChannels - Number of channels per texture element. - * \param dev - Device handle. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cudaMemGetInfo, - * ::cuDeviceTotalMem - */ -CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); - -/** - * \brief Returns information about the device - * - * Returns in \p *pi the integer value of the attribute \p attrib on device - * \p dev. The supported attributes are: - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per - * block; - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared memory available to a thread block in bytes - * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for - * __constant__ variables in a CUDA C kernel in bytes - * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads - * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the - * memory copy functions that involve memory regions allocated through - * ::cuMemAllocPitch() - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width - * for a 1D texture bound to linear memory - * - 
::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum - * mipmapped 1D texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D - * texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width - * for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height - * for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch - * in bytes for a 2D texture bound to linear memory - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum - * mipmapped 2D texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum - * mipmapped 2D texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D - * texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D - * texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D - * texture depth - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: - * Alternate maximum 3D texture width, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: - * Alternate maximum 3D texture height, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: - * Alternate maximum 3D texture depth, 0 if no alternate - * maximum 3D texture size is supported - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: - * Maximum cubemap texture width or height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: - * Maximum 1D layered texture width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: - * Maximum 2D layered texture width - * - 
::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: - * Maximum 2D layered texture height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered texture width or height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered texture - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: - * Maximum 1D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: - * Maximum 2D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: - * Maximum 2D surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: - * Maximum 3D surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: - * Maximum 3D surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: - * Maximum 3D surface depth - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: - * Maximum 1D layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: - * Maximum layers in a 1D layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: - * Maximum 2D layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: - * Maximum 2D layered surface height - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: - * Maximum layers in a 2D layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: - * Maximum cubemap surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: - * Maximum cubemap layered surface width - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: - * Maximum layers in a cubemap layered surface - * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers available to a thread block - * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz - * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: 
Alignment requirement; texture - * base addresses aligned to ::textureAlign bytes do not need an offset - * applied to texture fetches - * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement - * for 2D texture references bound to pitched memory - * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device - * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently - * in. Available modes are as follows: - * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and - * can have multiple CUDA contexts present at a single time. - * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is - * prohibited from creating new CUDA contexts. - * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device - * can have only one context used by a single process at a time. - * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports - * executing multiple kernels within the same context simultaneously, or 0 if - * not. It is not guaranteed that multiple kernels will be resident - * on the device concurrently so this feature should not be relied upon for - * correctness. 
- * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the - * device, 0 if error correction is disabled or not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier - * of the device - * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device - * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC - * is only available on Tesla hardware running Windows Vista or later - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits - * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache - * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with - * the host, or 0 if not - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals - * in L1 cache, 0 if caching globals in L1 cache is not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals - * in L1 cache, 0 if caching locals in L1 cache is not supported by the device - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of - * shared memory available to a multiprocessor in bytes; this amount is shared - * by all thread blocks simultaneously resident on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit - * registers available to a multiprocessor; this number is shared by 
all thread - * blocks simultaneously resident on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory - * on this system, 0 if allocating managed memory is not supported by the device on this system. - * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. - * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices - * associated with the same board. Devices on the same multi-GPU board will share the same identifier. - * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host - * supports native atomic operations. - * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance - * (in floating-point operations per second) to double precision performance. - * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device suppports coherently accessing - * pageable memory without calling cudaHostRegister on it. - * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory - * concurrently with the CPU. - * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. - * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered - * memory at the same virtual address as the CPU. - * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size - * suported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. - * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES - * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's - * page tables. - * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. 
- * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor - * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate - * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes - * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. - * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes - * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
- * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU - * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum - * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. - * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC - * - * \param pi - Returned device attribute value - * \param attrib - Device attribute to query - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem, - * ::cuDeviceGetExecAffinitySupport, - * ::cudaDeviceGetAttribute, - * ::cudaGetDeviceProperties - */ -CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); - -/** - * \brief Return NvSciSync attributes that this device can support. - * - * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that - * this CUDA device, \p dev can support. 
The returned \p nvSciSyncAttrList - * can be used to create an NvSciSync object that matches this device's capabilities. - * - * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is - * already set this API will return ::CUDA_ERROR_INVALID_VALUE. - * - * The applications should set \p nvSciSyncAttrList to a valid - * NvSciSyncAttrList failing which this API will return - * ::CUDA_ERROR_INVALID_HANDLE. - * - * The \p flags controls how applications intends to use - * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: - * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to - * signal an NvSciSync on this CUDA device. - * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to - * wait on an NvSciSync on this CUDA device. - * - * At least one of these flags must be set, failing which the API - * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal - * to one another: a developer may set both these flags that allows to - * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. - * - * \param nvSciSyncAttrList - Return NvSciSync attributes supported. - * \param dev - Valid Cuda Device to get NvSciSync attributes for. - * \param flags - flags describing NvSciSync usage. - * - * \return - * - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa - * ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); - -/** - * \brief Sets the current memory pool of a device - * - * The memory pool must be local to the specified device. 
- * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. - * By default, a device's current memory pool is its default memory pool. - * - * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different - * than the one the stream runs on. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync - */ -CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); - -/** - * \brief Gets the current mempool for a device - * - * Returns the last pool provided to ::cuDeviceSetMemPool for this device - * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. - * By default the current mempool is the default mempool for a device. - * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool - */ -CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); - -/** - * \brief Returns the default mempool of a device - * - * The default mempool of a device contains device memory from that device. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); - -/** - * \brief Blocks until remote writes are visible to the specified scope - * - * Blocks until GPUDirect RDMA writes to the target context via mappings - * created through APIs like nvidia_p2p_get_pages (see - * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are - * visible to the specified scope. - * - * If the scope equals or lies within the scope indicated by - * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call - * will be a no-op and can be safely omitted for performance. This can be - * determined by comparing the numerical values between the two enums, with - * smaller scopes having smaller values. - * - * Users may query support for this API via - * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. 
- * - * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget - * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - */ -CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); - -/** @} */ /* END CUDA_DEVICE */ - -/** - * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated device management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns properties for a selected device - * - * \deprecated - * - * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). - * - * Returns in \p *prop the properties of device \p dev. 
The ::CUdevprop - * structure is defined as: - * - * \code - typedef struct CUdevprop_st { - int maxThreadsPerBlock; - int maxThreadsDim[3]; - int maxGridSize[3]; - int sharedMemPerBlock; - int totalConstantMemory; - int SIMDWidth; - int memPitch; - int regsPerBlock; - int clockRate; - int textureAlign; - } CUdevprop; - * \endcode - * where: - * - * - ::maxThreadsPerBlock is the maximum number of threads per block; - * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; - * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; - * - ::sharedMemPerBlock is the total amount of shared memory available per - * block in bytes; - * - ::totalConstantMemory is the total amount of constant memory available on - * the device in bytes; - * - ::SIMDWidth is the warp size; - * - ::memPitch is the maximum pitch allowed by the memory copy functions that - * involve memory regions allocated through ::cuMemAllocPitch(); - * - ::regsPerBlock is the total number of registers available per block; - * - ::clockRate is the clock frequency in kilohertz; - * - ::textureAlign is the alignment requirement; texture base addresses that - * are aligned to ::textureAlign bytes do not need an offset applied to - * texture fetches. 
- * - * \param prop - Returned properties of device - * \param dev - Device to get properties for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); - -/** - * \brief Returns the compute capability of the device - * - * \deprecated - * - * This function was deprecated as of CUDA 5.0 and its functionality superseded - * by ::cuDeviceGetAttribute(). - * - * Returns in \p *major and \p *minor the major and minor revision numbers that - * define the compute capability of the device \p dev. - * - * \param major - Major revision number - * \param minor - Minor revision number - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); - -/** @} */ /* END CUDA_DEVICE_DEPRECATED */ - -/** - * \defgroup CUDA_PRIMARY_CTX Primary Context Management - * - * ___MANBRIEF___ primary context management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the primary context management functions of the low-level - * CUDA driver application programming interface. - * - * The primary context is unique per device and shared with the CUDA runtime API. 
- * These functions allow integration with other libraries using CUDA. - * - * @{ - */ - -/** - * \brief Retain the primary context on the GPU - * - * Retains the primary context on the device. - * Once the user successfully retains the primary context, the primary context - * will be active and available to the user until the user releases it - * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). - * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. - * - * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN - * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function - * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to - * determine the compute mode of the device. - * The nvidia-smi tool can be used to set the compute mode for - * devices. Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * Please note that the primary context always supports pinned allocations. Other - * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). 
- * - * \param pctx - Returned context handle of the new context - * \param dev - Device for which primary context is requested - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRelease, - * ::cuDevicePrimaryCtxSetFlags, - * ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); - -/** - * \brief Release the primary context on the GPU - * - * Releases the primary context interop on the device. - * A retained context should always be released once the user is done using - * it. The context is automatically reset once the last reference to it is - * released. This behavior is different when the primary context was retained - * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary - * context remains always active. - * - * Releasing a primary context that has not been previously retained will - * fail with ::CUDA_ERROR_INVALID_CONTEXT. - * - * Please note that unlike ::cuCtxDestroy() this method does not pop the context - * from stack in any circumstances. 
- * - * \param dev - Device which primary context is released - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); - -/** - * \brief Set flags for the primary context - * - * Sets the flags for the primary context on the device overwriting previously - * set ones. - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * - * \param dev - Device for which the primary context flags are set - * \param flags - New flags for the device - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuDevicePrimaryCtxGetState, - * ::cuCtxCreate, - * ::cuCtxGetFlags, - * ::cudaSetDeviceFlags - */ -CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - -/** - * \brief Get the state of the primary context - * - * Returns in \p *flags the flags for the primary context of \p dev, and in - * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag - * values. - * - * \param dev - Device to get primary context flags for - * \param flags - Pointer to store flags - * \param active - Pointer to store context state; 0 = inactive, 1 = active - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa - * ::cuDevicePrimaryCtxSetFlags, - * ::cuCtxGetFlags, - * ::cudaGetDeviceFlags - */ -CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); - -/** - * \brief Destroy all allocations and reset all state on the primary context - * - * Explicitly destroys and cleans up all resources associated with the current - * device in the current process. - * - * Note that it is responsibility of the calling function to ensure that no - * other module in the process is using the device any more. For that reason - * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. - * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() - * even after resetting the device. 
- * Resetting the primary context does not release it, an application that has - * retained the primary context should explicitly release its usage. - * - * \param dev - Device for which primary context is destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE - * \notefnerr - * - * \sa ::cuDevicePrimaryCtxRetain, - * ::cuDevicePrimaryCtxRelease, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceReset - */ -CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); - -/** @} */ /* END CUDA_PRIMARY_CTX */ - -/** - * \brief Returns information about the execution affinity support of the device. - * - * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. 
- * The supported types are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, - * or 0 if not; - * - * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not - * \param type - Execution affinity type to query - * \param dev - Device handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGetAttribute, - * ::cuDeviceGetCount, - * ::cuDeviceGetName, - * ::cuDeviceGetUuid, - * ::cuDeviceGet, - * ::cuDeviceTotalMem - */ -CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); - -/** - * \defgroup CUDA_CTX Context Management - * - * ___MANBRIEF___ context management functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the context management functions of the low-level - * CUDA driver application programming interface. - * - * Please note that some functions are described in - * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. - * - * @{ - */ - -/** - * \brief Create a CUDA context - * - * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. - * - * Creates a new CUDA context and associates it with the calling thread. The - * \p flags parameter is described below. The context is created with a usage - * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() - * when done using the context. If a context is already current to the thread, - * it is supplanted by the newly created context and may be restored by a subsequent - * call to ::cuCtxPopCurrent(). 
- * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory that is - * accessible to the GPU. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). - * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for devices. - * Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * \param pctx - Returned context handle of the new context - * \param flags - Context creation flags - * \param dev - Device to create context on - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); - -/** - * \brief Create a CUDA context with execution affinity - * - * Creates a new CUDA context with execution affinity and associates it with - * the calling thread. The \p paramsArray and \p flags parameter are described below. - * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must - * call ::cuCtxDestroy() when done using the context. If a context is already - * current to the thread, it is supplanted by the newly created context and may - * be restored by a subsequent call to ::cuCtxPopCurrent(). 
- * - * The type and the amount of execution resource the context can use is limited by \p paramsArray - * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams - * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, - * the latter execution affinity parameter overrides the former execution affinity parameter. - * The supported execution affinity types are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion - * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally - * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution - * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute - * is only supported under Volta+ MPS. - * - * The three LSBs of the \p flags parameter can be used to control how the OS - * thread, which owns the CUDA context at the time of an API call, interacts - * with the OS scheduler when waiting for results from the GPU. Only one of - * the scheduling flags can be set when creating a context. - * - * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can decrease latency when waiting for the GPU, - * but may lower the performance of CPU threads if they are performing work in - * parallel with the CUDA thread. - * - * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for - * results from the GPU. This can increase latency when waiting for the GPU, - * but can increase the performance of CPU threads performing work in parallel - * with the GPU. - * - * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. 
- * - * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work.
- * Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. - * - * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in the - * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on - * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC - * for low-powered devices. - * - * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory that is - * accessible to the GPU. - * - * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory - * after resizing local memory for a kernel. This can prevent thrashing by - * local memory allocations when launching many kernels with high local - * memory usage at the cost of potentially increased memory usage.
- * Deprecated: This flag is deprecated and the behavior enabled - * by this flag is now the default and cannot be disabled. - * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit(). - * - * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for devices. - * Documentation for nvidia-smi can be obtained by passing a - * -h option to it. - * - * \param pctx - Returned context handle of the new context - * \param paramsArray - Execution affinity parameters - * \param numParams - Number of execution affinity parameters - * \param flags - Context creation flags - * \param dev - Device to create context on - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::CUexecAffinityParam - */ -CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); - -/** - * \brief Destroy a CUDA context - * - * Destroys the CUDA context specified by \p ctx. The context \p ctx will be - * destroyed regardless of how many threads it is current to. - * It is the responsibility of the calling function to ensure that no API - * call issues using \p ctx while ::cuCtxDestroy() is executing. 
- * - * Destroys and cleans up all resources associated with the context. - * It is the caller's responsibility to ensure that the context or its resources - * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. - * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, - * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, - * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. - * - * If \p ctx is current to the calling thread then \p ctx will also be - * popped from the current thread's context stack (as though ::cuCtxPopCurrent() - * were called). If \p ctx is current to other threads, then \p ctx will - * remain current to those threads, and attempting to access \p ctx from - * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. - * - * \param ctx - Context to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); - -/** - * \brief Pushes a context on the current CPU thread - * - * Pushes the given context \p ctx onto the CPU thread's stack of current - * contexts. The specified context becomes the CPU thread's current context, so - * all CUDA functions that operate on the current context are affected. - * - * The previous current context may be made current again by calling - * ::cuCtxDestroy() or ::cuCtxPopCurrent(). 
- * - * \param ctx - Context to push - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); - -/** - * \brief Pops the current CUDA context from the current CPU thread. - * - * Pops the current CUDA context from the CPU thread and passes back the - * old context handle in \p *pctx. That context may then be made current - * to a different CPU thread by calling ::cuCtxPushCurrent(). - * - * If a context was current to the CPU thread before ::cuCtxCreate() or - * ::cuCtxPushCurrent() was called, this function makes that context current to - * the CPU thread again. - * - * \param pctx - Returned new context handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); - -/** - * \brief Binds the specified CUDA context to the calling CPU thread - * - * Binds the specified CUDA context to the calling CPU thread. - * If \p ctx is NULL then the CUDA context previously bound to the - * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. - * - * If there exists a CUDA context stack on the calling CPU thread, this - * will replace the top of that stack with \p ctx. 
- * If \p ctx is NULL then this will be equivalent to popping the top - * of the calling CPU thread's CUDA context stack (or a no-op if the - * calling CPU thread's CUDA context stack is empty). - * - * \param ctx - Context to bind to the calling CPU thread - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa - * ::cuCtxGetCurrent, - * ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cudaSetDevice - */ -CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); - -/** - * \brief Returns the CUDA context bound to the calling CPU thread. - * - * Returns in \p *pctx the CUDA context bound to the calling CPU thread. - * If no context is bound to the calling CPU thread then \p *pctx is - * set to NULL and ::CUDA_SUCCESS is returned. - * - * \param pctx - Returned context handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * \notefnerr - * - * \sa - * ::cuCtxSetCurrent, - * ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cudaGetDevice - */ -CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); - -/** - * \brief Returns the device ID for the current context - * - * Returns in \p *device the ordinal of the current context's device. - * - * \param device - Returned device ID for the current context - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaGetDevice - */ -CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); - -/** - * \brief Returns the flags for the current context - * - * Returns in \p *flags the flags of the current context. 
See ::cuCtxCreate - * for flag values. - * - * \param flags - Pointer to store flags of current context - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetCurrent, - * ::cuCtxGetDevice, - * ::cuCtxGetLimit, - * ::cuCtxGetSharedMemConfig, - * ::cuCtxGetStreamPriorityRange, - * ::cudaGetDeviceFlags - */ -CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); - -/** - * \brief Block for a context's tasks to complete - * - * Blocks until the device has completed all preceding requested tasks. - * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. - * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the - * CPU thread will block until the GPU context has finished its work. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cudaDeviceSynchronize - */ -CUresult CUDAAPI cuCtxSynchronize(void); - -/** - * \brief Set resource limits - * - * Setting \p limit to \p value is a request by the application to update - * the current limit maintained by the context. The driver is free to - * modify the requested value to meet h/w requirements (this could be - * clamping to minimum or maximum values, rounding up to nearest element - * size, etc). The application can use ::cuCtxGetLimit() to find out exactly - * what the limit has been set to. - * - * Setting each ::CUlimit has its own specific restrictions, so each is - * discussed here. 
- * - * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. - * The driver automatically increases the per-thread stack size - * for each kernel launch as needed. This size isn't reset back to the - * original value after each launch. Setting this value will take effect - * immediately, and if necessary, the device will block until all preceding - * requested tasks are complete. - * - * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used - * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE - * must be performed before launching any kernel that uses the ::printf() - * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used - * by the ::malloc() and ::free() device system calls. Setting - * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel - * that uses the ::malloc() or ::free() device system calls, otherwise - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of - * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting - * this limit must be performed before any launch of a kernel that uses the - * device runtime and calls ::cudaDeviceSynchronize() above the default sync - * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail - * with error code ::cudaErrorSyncDepthExceeded if the limitation is - * violated. This limit can be set smaller than the default or up the maximum - * launch depth of 24. When setting this limit, keep in mind that additional - * levels of sync depth require the driver to reserve large amounts of device - * memory which can no longer be used for user allocations. If these - * reservations of device memory fail, ::cuCtxSetLimit() will return - * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. 
- * This limit is only applicable to devices of compute capability 3.5 and - * higher. Attempting to set this limit on devices of compute capability less - * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being - * returned. - * - * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of - * outstanding device runtime launches that can be made from the current - * context. A grid is outstanding from the point of launch up until the grid - * is known to have been completed. Device runtime launches which violate - * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when - * ::cudaGetLastError() is called after launch. If more pending launches than - * the default (2048 launches) are needed for a module using the device - * runtime, this limit can be increased. Keep in mind that being able to - * sustain additional pending launches will require the driver to reserve - * larger amounts of device memory upfront which can no longer be used for - * allocations. If these reservations fail, ::cuCtxSetLimit() will return - * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. - * This limit is only applicable to devices of compute capability 3.5 and - * higher. Attempting to set this limit on devices of compute capability less - * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being - * returned. - * - * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. - * Values can range from 0B to 128B. This is purely a performence hint and - * it can be ignored or clamped depending on the platform. - * - * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes availabe for - * persisting L2 cache. This is purely a performance hint and it can be - * ignored or clamped depending on the platform. 
- * - * \param limit - Limit to set - * \param value - Size of limit - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_LIMIT, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSynchronize, - * ::cudaDeviceSetLimit - */ -CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); - -/** - * \brief Returns resource limits - * - * Returns in \p *pvalue the current size of \p limit. The supported - * ::CUlimit values are: - * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. - * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the - * ::printf() device system call. - * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the - * ::malloc() and ::free() device system calls. - * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread - * can issue the device runtime call ::cudaDeviceSynchronize() to wait on - * child grid launches to complete. - * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding - * device runtime launches that can be made from this context. - * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. 
- * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes - * - * \param limit - Limit to query - * \param pvalue - Returned size of limit - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_LIMIT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceGetLimit - */ -CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); - -/** - * \brief Returns the preferred cache configuration for the current context. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this function returns through \p pconfig the preferred cache configuration - * for the current context. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute functions. - * - * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices - * where the size of the L1 cache and shared memory are fixed. 
- * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param pconfig - Returned cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceGetCacheConfig - */ -CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); - -/** - * \brief Sets the preferred cache configuration for the current context. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p config the preferred cache configuration for - * the current context. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute the function. Any function preference - * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide - * setting. Setting the context-wide cache configuration to - * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer - * to not change the cache configuration unless required to launch the kernel. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. 
- * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. - * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param config - Requested cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceSetCacheConfig - */ -CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); - -/** - * \brief Returns the current shared memory configuration for the current context. - * - * This function will return in \p pConfig the current size of shared memory banks - * in the current context. On devices with configurable shared memory banks, - * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all - * subsequent kernel launches will by default use the new bank size. When - * ::cuCtxGetSharedMemConfig is called on devices without configurable shared - * memory, it will return the fixed bank size of the hardware. - * - * The returned bank configurations can be either: - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is - * four bytes. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will - * eight bytes. 
- * - * \param pConfig - returned shared memory configuration - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuCtxGetSharedMemConfig, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceGetSharedMemConfig - */ -CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); - -/** - * \brief Sets the shared memory configuration for the current context. - * - * On devices with configurable shared memory banks, this function will set - * the context's shared memory bank size which is used for subsequent kernel - * launches. - * - * Changed the shared memory configuration between launches may insert a device - * side synchronization point between those launches. - * - * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. - * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank - * conflicts. - * - * This function will do nothing on devices with fixed shared memory bank size. - * - * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial - * setting (currently, four bytes). - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to - * be natively four bytes. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to - * be natively eight bytes. 
- * - * \param config - requested shared memory configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cuCtxGetSharedMemConfig, - * ::cuFuncSetCacheConfig, - * ::cudaDeviceSetSharedMemConfig - */ -CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); - -/** - * \brief Gets the context's API version. - * - * Returns a version number in \p version corresponding to the capabilities of - * the context (e.g. 3010 or 3020), which library developers can use to direct - * callers to a specific API version. If \p ctx is NULL, returns the API version - * used to create the currently bound context. - * - * Note that new API versions are only introduced when context capabilities are - * changed that break binary compatibility, so the API version and driver version - * may be different. For example, it is valid for the API version to be 3020 while - * the driver version is 4020. - * - * \param ctx - Context to check - * \param version - Pointer to version - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); - -/** - * \brief Returns numerical values that correspond to the least and - * greatest stream priorities. 
- * - * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond - * to the least and greatest stream priorities respectively. Stream priorities - * follow a convention where lower numbers imply greater priorities. The range of - * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. - * If the user attempts to create a stream with a priority value that is - * outside the meaningful range as specified by this API, the priority is - * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority - * respectively. See ::cuStreamCreateWithPriority for details on creating a - * priority stream. - * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value - * is not desired. - * - * This function will return '0' in both \p *leastPriority and \p *greatestPriority if - * the current context's device does not support stream priorities - * (see ::cuDeviceGetAttribute). - * - * \param leastPriority - Pointer to an int in which the numerical value for least - * stream priority is returned - * \param greatestPriority - Pointer to an int in which the numerical value for greatest - * stream priority is returned - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \notefnerr - * - * \sa ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize, - * ::cudaDeviceGetStreamPriorityRange - */ -CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); - -/** - * \brief Resets all persisting lines in cache to normal status. - * - * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal - * status. Takes effect on function return. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); - -/** - * \brief Returns the execution affinity setting for the current context. - * - * Returns in \p *pExecAffinity the current value of \p type. The supported - * ::CUexecAffinityType values are: - * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. - * - * \param type - Execution affinity type to query - * \param pExecAffinity - Returned execution affinity - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY - * \notefnerr - * - * \sa - * ::CUexecAffinityParam - */ -CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); - - -/** @} */ /* END CUDA_CTX */ - -/** - * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated context management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated context management functions of the low-level - * CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Increment a context's usage-count - * - * \deprecated - * - * Note that this function is deprecated and should not be used. - * - * Increments the usage count of the context and passes back a context handle - * in \p *pctx that must be passed to ::cuCtxDetach() when the application is - * done with the context. ::cuCtxAttach() fails if there is no context current - * to the thread. - * - * Currently, the \p flags parameter must be 0. 
- * - * \param pctx - Returned context handle of the current context - * \param flags - Context attach flags (must be 0) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxDetach, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); - -/** - * \brief Decrement a context's usage-count - * - * \deprecated - * - * Note that this function is deprecated and should not be used. - * - * Decrements the usage count of the context \p ctx, and destroys the context - * if the usage count goes to 0. The context must be a handle that was passed - * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the - * calling thread. - * - * \param ctx - Context to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxCreate, - * ::cuCtxDestroy, - * ::cuCtxGetApiVersion, - * ::cuCtxGetCacheConfig, - * ::cuCtxGetDevice, - * ::cuCtxGetFlags, - * ::cuCtxGetLimit, - * ::cuCtxPopCurrent, - * ::cuCtxPushCurrent, - * ::cuCtxSetCacheConfig, - * ::cuCtxSetLimit, - * ::cuCtxSynchronize - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); - -/** @} */ /* END CUDA_CTX_DEPRECATED */ - - -/** - * \defgroup CUDA_MODULE Module Management - * - * ___MANBRIEF___ module management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the module management functions of the low-level CUDA - * driver application programming interface. 
- * - * @{ - */ - -/** - * \brief Loads a compute module - * - * Takes a filename \p fname and loads the corresponding module \p module into - * the current context. The CUDA driver API does not attempt to lazily - * allocate the resources needed by a module; if the memory for functions and - * data (constant and global) needed by the module cannot be allocated, - * ::cuModuleLoad() fails. The file should be a \e cubin file as output by - * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or - * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. - * - * \param module - Returned module - * \param fname - Filename of module to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_FILE_NOT_FOUND, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); - -/** - * \brief Load a module's data - * - * Takes a pointer \p image and loads the corresponding module \p module into - * the current context. The pointer may be obtained by mapping a \e cubin or - * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file - * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin - * object into the executable resources and using operating system calls such - * as Windows \c FindResource() to obtain the pointer. 
- * - * \param module - Returned module - * \param image - Module data to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); - -/** - * \brief Load a module's data with options - * - * Takes a pointer \p image and loads the corresponding module \p module into - * the current context. The pointer may be obtained by mapping a \e cubin or - * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file - * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin - * object into the executable resources and using operating system calls such - * as Windows \c FindResource() to obtain the pointer. Options are passed as - * an array via \p options and any corresponding parameters are passed in - * \p optionValues. The number of total options is supplied via \p numOptions. - * Any outputs will be returned via \p optionValues. 
- * - * \param module - Returned module - * \param image - Module data to load - * \param numOptions - Number of options - * \param options - Options for JIT - * \param optionValues - Option values for JIT - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Load a module's data - * - * Takes a pointer \p fatCubin and loads the corresponding module \p module - * into the current context. The pointer represents a fat binary object, - * which is a collection of different \e cubin and/or \e PTX files, all - * representing the same device code, but compiled and optimized for different - * architectures. - * - * Prior to CUDA 4.0, there was no documented API for constructing and using - * fat binary objects by programmers. Starting with CUDA 4.0, fat binary - * objects can be constructed by providing the -fatbin option to \b nvcc. - * More information can be found in the \b nvcc document. 
- * - * \param module - Returned module - * \param fatCubin - Fat binary to load - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU, - * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); - -/** - * \brief Unloads a module - * - * Unloads a module \p hmod from the current context. - * - * \param hmod - Module to unload - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_destroy_ub - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary - */ -CUresult CUDAAPI cuModuleUnload(CUmodule hmod); - -/** - * \brief Returns a function handle - * - * Returns in \p *hfunc the handle of the function of name \p name located in - * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() - * returns ::CUDA_ERROR_NOT_FOUND. 
- * - * \param hfunc - Returned function handle - * \param hmod - Module to retrieve function from - * \param name - Name of function to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload - */ -CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); - -/** - * \brief Returns a global pointer from a module - * - * Returns in \p *dptr and \p *bytes the base pointer and size of the - * global of name \p name located in module \p hmod. If no variable of that name - * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both - * parameters \p dptr and \p bytes are optional. If one of them is - * NULL, it is ignored. - * - * \param dptr - Returned global device pointer - * \param bytes - Returned global size in bytes - * \param hmod - Module to retrieve global from - * \param name - Name of global to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetSymbolAddress, - * ::cudaGetSymbolSize - */ -CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); - -/** - * \brief Returns a handle to a texture reference - * - * Returns in \p *pTexRef the handle of the texture reference of name \p name - * in the module \p hmod. 
If no texture reference of that name exists, - * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference - * handle should not be destroyed, since it will be destroyed when the module - * is unloaded. - * - * \param pTexRef - Returned texture reference - * \param hmod - Module to retrieve texture reference from - * \param name - Name of texture reference to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetSurfRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetTextureReference - */ -CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); - -/** - * \brief Returns a handle to a surface reference - * - * Returns in \p *pSurfRef the handle of the surface reference of name \p name - * in the module \p hmod. If no surface reference of that name exists, - * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. - * - * \param pSurfRef - Returned surface reference - * \param hmod - Module to retrieve surface reference from - * \param name - Name of surface reference to retrieve - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - * \sa ::cuModuleGetFunction, - * ::cuModuleGetGlobal, - * ::cuModuleGetTexRef, - * ::cuModuleLoad, - * ::cuModuleLoadData, - * ::cuModuleLoadDataEx, - * ::cuModuleLoadFatBinary, - * ::cuModuleUnload, - * ::cudaGetSurfaceReference - */ -CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); - -/** - * \brief Creates a pending JIT linker invocation. 
- * - * If the call is successful, the caller owns the returned CUlinkState, which - * should eventually be destroyed with ::cuLinkDestroy. The - * device code machine size (32 or 64 bit) will match the calling application. - * - * Both linker and compiler options may be specified. Compiler options will - * be applied to inputs to this linker action which must be compiled from PTX. - * The options ::CU_JIT_WALL_TIME, - * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES - * will accumulate data until the CUlinkState is destroyed. - * - * \p optionValues must remain valid for the life of the CUlinkState if output - * options are used. No other references to inputs are maintained after this - * call returns. - * - * \param numOptions Size of options arrays - * \param options Array of linker and compiler options - * \param optionValues Array of option values, each cast to void * - * \param stateOut On success, this will contain a CUlinkState to specify - * and complete this action - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND - * \notefnerr - * - * \sa ::cuLinkAddData, - * ::cuLinkAddFile, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); - -/** - * \brief Add an input to a pending linker invocation - * - * Ownership of \p data is retained by the caller. No reference is retained to any - * inputs after this call returns. - * - * This method accepts only compiler options, which are used if the data must - * be compiled from PTX, and does not accept any of - * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, - * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - * - * \param state A pending linker action. 
- * \param type The type of the input data. - * \param data The input data. PTX must be NULL-terminated. - * \param size The length of the input data. - * \param name An optional name for this input in log messages. - * \param numOptions Size of options. - * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). - * \param optionValues Array of option values, each cast to void *. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU - * - * \sa ::cuLinkCreate, - * ::cuLinkAddFile, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Add a file input to a pending linker invocation - * - * No reference is retained to any inputs after this call returns. - * - * This method accepts only compiler options, which are used if the input - * must be compiled from PTX, and does not accept any of - * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, - * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - * - * This method is equivalent to invoking ::cuLinkAddData on the contents - * of the file. 
- * - * \param state A pending linker action - * \param type The type of the input data - * \param path Path to the input file - * \param numOptions Size of options - * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) - * \param optionValues Array of option values, each cast to void * - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_FILE_NOT_FOUND - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_PTX, - * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NO_BINARY_FOR_GPU - * - * \sa ::cuLinkCreate, - * ::cuLinkAddData, - * ::cuLinkComplete, - * ::cuLinkDestroy - */ -CUresult CUDAAPI -cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); - -/** - * \brief Complete a pending linker invocation - * - * Completes the pending linker action and returns the cubin image for the linked - * device code, which can be used with ::cuModuleLoadData. The cubin is owned by - * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. - * This call does not destroy \p state. - * - * \param state A pending linker invocation - * \param cubinOut On success, this will point to the output image - * \param sizeOut Optional parameter to receive the size of the generated image - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuLinkCreate, - * ::cuLinkAddData, - * ::cuLinkAddFile, - * ::cuLinkDestroy, - * ::cuModuleLoadData - */ -CUresult CUDAAPI -cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); - -/** - * \brief Destroys state for a JIT linker invocation. 
- * - * \param state State object for the linker invocation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE - * - * \sa ::cuLinkCreate - */ -CUresult CUDAAPI -cuLinkDestroy(CUlinkState state); - -/** @} */ /* END CUDA_MODULE */ - - -/** - * \defgroup CUDA_MEM Memory Management - * - * ___MANBRIEF___ memory management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the memory management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Gets free and total memory - * - * Returns in \p *total the total amount of memory available to the the current context. - * Returns in \p *free the amount of memory on the device that is free according to the OS. - * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free. - * - * \param free - Returned free memory in bytes - * \param total - Returned total memory in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemGetInfo - */ -CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); - -/** 
- * \brief Allocates device memory - * - * Allocates \p bytesize bytes of linear memory on the device and returns in - * \p *dptr a pointer to the allocated memory. The allocated memory is suitably - * aligned for any kind of variable. The memory is not cleared. If \p bytesize - * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. - * - * \param dptr - Returned device pointer - * \param bytesize - Requested allocation size in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMalloc - */ -CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); - -/** - * \brief Allocates pitched device memory - * - * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on - * the device and returns in \p *dptr a pointer to the allocated memory. The - * function may pad the allocation to ensure that corresponding pointers in - * any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. 
\p ElementSizeBytes - * specifies the size of the largest reads and writes that will be performed - * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced - * memory transactions are not possible on other data sizes). If - * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, - * the kernel will run correctly, but possibly at reduced speed. The pitch - * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the - * allocation. The intended usage of pitch is as a separate parameter of the - * allocation, used to compute addresses within the 2D array. Given the row - * and column of an array element of type \b T, the address is computed as: - * \code - T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; - * \endcode - * - * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with - * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is - * recommended that programmers consider performing pitch allocations using - * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is - * especially true if the application will be performing 2D memory copies - * between different regions of device memory (whether linear memory or CUDA - * arrays). - * - * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed - * to match or exceed the alignment requirement for texture binding with - * ::cuTexRefSetAddress2D(). 
- * - * \param dptr - Returned device pointer - * \param pPitch - Returned pitch of allocation in bytes - * \param WidthInBytes - Requested allocation width in bytes - * \param Height - Requested allocation height in rows - * \param ElementSizeBytes - Size of largest reads/writes for range - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocPitch - */ -CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); - -/** - * \brief Frees device memory - * - * Frees the memory space pointed to by \p dptr, which must have been returned - * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). 
- * - * \param dptr - Pointer to memory to free - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFree - */ -CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); - -/** - * \brief Get information on memory allocations - * - * Returns the base address in \p *pbase and size in \p *psize of the - * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input - * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one - * of them is NULL, it is ignored. 
- * - * \param pbase - Returned base address - * \param psize - Returned size of device memory allocation - * \param dptr - Device pointer to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_NOT_FOUND, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 - */ -CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); - -/** - * \brief Allocates page-locked host memory - * - * Allocates \p bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the virtual memory ranges - * allocated with this function and automatically accelerates calls to - * functions such as ::cuMemcpy(). Since the memory can be accessed directly by - * the device, it can be read or written with much higher bandwidth than - * pageable memory obtained with functions such as ::malloc(). Allocating - * excessive amounts of memory with ::cuMemAllocHost() may degrade system - * performance, since it reduces the amount of memory available to the system - * for paging. As a result, this function is best used sparingly to allocate - * staging areas for data exchange between host and device. 
- * - * Note all host memory allocated using ::cuMemHostAlloc() will automatically - * be immediately accessible to all contexts on all devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * The device pointer that may be used to access this host memory from those - * contexts is always equal to the returned host pointer \p *pp. - * See \ref CUDA_UNIFIED for additional details. - * - * \param pp - Returned host pointer to page-locked memory - * \param bytesize - Requested allocation size in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocHost - */ -CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); - -/** - * \brief Frees page-locked host memory - * - * Frees the memory space pointed to by \p p, which must have been returned by - * a previous call to ::cuMemAllocHost(). 
- * - * \param p - Pointer to memory to free - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFreeHost - */ -CUresult CUDAAPI cuMemFreeHost(void *p); - -/** - * \brief Allocates page-locked host memory - * - * Allocates \p bytesize bytes of host memory that is page-locked and accessible - * to the device. The driver tracks the virtual memory ranges allocated with - * this function and automatically accelerates calls to functions such as - * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, - * it can be read or written with much higher bandwidth than pageable memory - * obtained with functions such as ::malloc(). Allocating excessive amounts of - * pinned memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to allocate staging areas for data exchange between - * host and device. - * - * The \p Flags parameter enables different options to be specified that - * affect the allocation, as follows. 
- * - * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address - * space. The device pointer to the memory may be obtained by calling - * ::cuMemHostGetDevicePointer(). - * - * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined - * (WC). WC memory can be transferred across the PCI Express bus more - * quickly on some system configurations, but cannot be read efficiently by - * most CPUs. WC memory is a good option for buffers that will be written by - * the CPU and read by the GPU via mapped pinned memory or host->device - * transfers. - * - * All of these flags are orthogonal to one another: a developer may allocate - * memory that is portable, mapped and/or write-combined with no restrictions. - * - * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is deferred - * to ::cuMemHostGetDevicePointer() because the memory may be mapped into - * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. - * - * The memory allocated by this function must be freed with ::cuMemFreeHost(). - * - * Note all host memory allocated using ::cuMemHostAlloc() will automatically - * be immediately accessible to all contexts on all devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer - * that may be used to access this host memory from those contexts is always equal - * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED - * is specified, then the function ::cuMemHostGetDevicePointer() must be used - * to query the device pointer, even if the context supports unified addressing. 
- * See \ref CUDA_UNIFIED for additional details. - * - * \param pp - Returned host pointer to page-locked memory - * \param bytesize - Requested allocation size in bytes - * \param Flags - Flags for allocation request - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaHostAlloc - */ -CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); - -/** - * \brief Passes back device pointer of mapped pinned memory - * - * Passes back the device pointer \p pdptr corresponding to the mapped, pinned - * host buffer \p p allocated by ::cuMemHostAlloc. - * - * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP - * flag was not specified at the time the memory was allocated, or if the - * function is called on a GPU that does not support mapped pinned memory. - * - * For devices that have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory - * can also be accessed from the device using the host pointer \p p. 
- * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not - * match the original host pointer \p p and depends on the devices visible to the - * application. If all devices visible to the application have a non-zero value for the - * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() - * will match the original pointer \p p. If any device visible to the application - * has a zero value for the device attribute, the device pointer returned by - * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, - * but it will be suitable for use on all devices provided Unified Virtual Addressing - * is enabled. In such systems, it is valid to access the memory using either pointer - * on devices that have a non-zero value for the device attribute. Note however that - * such devices should access the memory using only of the two pointers and not both. - * - * \p Flags provides for future releases. For now, it must be set to 0. - * - * \param pdptr - Returned device pointer - * \param p - Host pointer - * \param Flags - Options (must be 0) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, 
::cuMemsetD32, - * ::cudaHostGetDevicePointer - */ -CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); - -/** - * \brief Passes back flags that were used for a pinned allocation - * - * Passes back the flags \p pFlags that were specified when allocating - * the pinned host buffer \p p allocated by ::cuMemHostAlloc. - * - * ::cuMemHostGetFlags() will fail if the pointer does not reside in - * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). - * - * \param pFlags - Returned flags word - * \param p - Host pointer - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuMemAllocHost, - * ::cuMemHostAlloc, - * ::cudaHostGetFlags - */ -CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); - -/** - * \brief Allocates memory that will be automatically managed by the Unified Memory system - * - * Allocates \p bytesize bytes of managed memory on the device and returns in - * \p *dptr a pointer to the allocated memory. If the device doesn't support - * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support - * for managed memory can be queried using the device attribute - * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably - * aligned for any kind of variable. The memory is not cleared. If \p bytesize - * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer - * is valid on the CPU and on all GPUs in the system that support managed memory. - * All accesses to this pointer must obey the Unified Memory programming model. - * - * \p flags specifies the default stream association for this allocation. - * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If - * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from - * any stream on any device. 
If ::CU_MEM_ATTACH_HOST is specified, then the - * allocation should not be accessed from devices that have a zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to - * ::cuStreamAttachMemAsync will be required to enable access on such devices. - * - * If the association is later changed via ::cuStreamAttachMemAsync to - * a single stream, the default association as specifed during ::cuMemAllocManaged - * is restored when that stream is destroyed. For __managed__ variables, the - * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a - * stream is an asynchronous operation, and as a result, the change to default - * association won't happen until all work in the stream has completed. - * - * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. - * - * Device memory oversubscription is possible for GPUs that have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on - * such GPUs may be evicted from device memory to host memory at any time by the Unified - * Memory driver in order to make room for other allocations. - * - * In a multi-GPU system where all GPUs have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this - * API returns and instead may be populated on access. In such systems, managed memory can - * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to - * maintain data locality and prevent excessive page faults to the extent possible. The application - * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application - * can also explicitly migrate memory to a desired processor's memory via - * ::cuMemPrefetchAsync. 
- * - * In a multi-GPU system where all of the GPUs have a zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support - * with each other, the physical storage for managed memory is created on the GPU which is active - * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced - * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate - * memory among such GPUs. - * - * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and - * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - * is zero for at least one of those GPUs, the location chosen for physical storage of managed - * memory is system-dependent. - * - On Linux, the location chosen will be device memory as long as the current set of active - * contexts are on devices that either have peer-to-peer support with each other or have a - * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * If there is an active context on a GPU that does not have a non-zero value for that device - * attribute and it does not have peer-to-peer support with the other devices that have active - * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. - * Note that this means that managed memory that is located in device memory is migrated to - * host memory if a new context is created on a GPU that doesn't have a non-zero value for - * the device attribute and does not support peer-to-peer with at least one of the other devices - * that has an active context. This in turn implies that context creation may fail if there is - * insufficient host memory to migrate all managed allocations. - * - On Windows, the physical storage is always created in 'zero-copy' or host memory. - * All GPUs will reference the data at reduced bandwidth over the PCIe bus. 
In these - * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to - * restrict CUDA to only use those GPUs that have peer-to-peer support. - * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a - * non-zero value to force the driver to always use device memory for physical storage. - * When this environment variable is set to a non-zero value, all contexts created in - * that process on devices that support managed memory have to be peer-to-peer compatible - * with each other. Context creation will fail if a context is created on a device that - * supports managed memory and is not peer-to-peer compatible with any of the other - * managed memory supporting devices on which contexts were previously created, even if - * those contexts have been destroyed. These environment variables are described - * in the CUDA programming guide under the "CUDA environment variables" section. - * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. 
- * - * \param dptr - Returned device pointer - * \param bytesize - Requested allocation size in bytes - * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, - * ::cudaMallocManaged - */ -CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); - -/** - * \brief Returns a handle to a compute device - * - * Returns in \p *device a device handle given a PCI bus ID string. 
- * - * \param dev - Returned device handle - * - * \param pciBusId - String in one of the following forms: - * [domain]:[bus]:[device].[function] - * [domain]:[bus]:[device] - * [bus]:[device].[function] - * where \p domain, \p bus, \p device, and \p function are all hexadecimal values - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGet, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetPCIBusId, - * ::cudaDeviceGetByPCIBusId - */ -CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); - -/** - * \brief Returns a PCI Bus Id string for the device - * - * Returns an ASCII string identifying the device \p dev in the NULL-terminated - * string pointed to by \p pciBusId. \p len specifies the maximum length of the - * string that may be returned. - * - * \param pciBusId - Returned identifier string for the device in the following format - * [domain]:[bus]:[device].[function] - * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. - * pciBusId should be large enough to store 13 characters including the NULL-terminator. - * - * \param len - Maximum length of string to store in \p name - * - * \param dev - Device to get identifier string for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuDeviceGet, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetByPCIBusId, - * ::cudaDeviceGetPCIBusId - */ -CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); - -/** - * \brief Gets an interprocess handle for a previously allocated event - * - * Takes as input a previously allocated event. This event must have been - * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING - * flags set. 
This opaque handle may be copied into other processes and - * opened with ::cuIpcOpenEventHandle to allow efficient hardware - * synchronization between GPU work in different processes. - * - * After the event has been opened in the importing process, - * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and - * ::cuEventQuery may be used in either process. Performing operations - * on the imported event after the exported event has been freed - * with ::cuEventDestroy will result in undefined behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pHandle - Pointer to a user allocated CUipcEventHandle - * in which to return the opaque event handle - * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and - * ::CU_EVENT_DISABLE_TIMING flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, - * ::cuEventSynchronize, - * ::cuEventQuery, - * ::cuStreamWaitEvent, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcGetEventHandle - */ -CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); - -/** - * \brief Opens an interprocess event handle for use in the current process - * - * Opens an interprocess event handle exported from another process with - * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like - * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. - * This event must be freed with ::cuEventDestroy. - * - * Performing operations on the imported event after the exported event has - * been freed with ::cuEventDestroy will result in undefined behavior. 
- * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param phEvent - Returns the imported event - * \param handle - Interprocess handle to open - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, - * ::cuEventSynchronize, - * ::cuEventQuery, - * ::cuStreamWaitEvent, - * ::cuIpcGetEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcOpenEventHandle - */ -CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); - -/** - * \brief Gets an interprocess memory handle for an existing device memory - * allocation - * - * Takes a pointer to the base of an existing device memory allocation created - * with ::cuMemAlloc and exports it for use in another process. This is a - * lightweight operation and may be called multiple times on an allocation - * without adverse effects. - * - * If a region of memory is freed with ::cuMemFree and a subsequent call - * to ::cuMemAlloc returns memory with the same device address, - * ::cuIpcGetMemHandle will return a unique handle for the - * new memory. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. - * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return - * the handle in. 
- * \param dptr - Base pointer to previously allocated device memory - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcOpenMemHandle, - * ::cuIpcCloseMemHandle, - * ::cudaIpcGetMemHandle - */ -CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); - -/** - * \brief Opens an interprocess memory handle exported from another process - * and returns a device pointer usable in the local process. - * - * Maps memory exported from another process with ::cuIpcGetMemHandle into - * the current device address space. For contexts on different devices - * ::cuIpcOpenMemHandle can attempt to enable peer access between the - * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is - * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. - * ::cuDeviceCanAccessPeer can determine if a mapping is possible. - * - * Contexts that may open ::CUipcMemHandles are restricted in the following way. - * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened - * by one ::CUcontext per ::CUdevice per other process. - * - * If the memory handle has already been opened by the current context, the - * reference count on the handle is incremented by 1 and the existing device pointer - * is returned. - * - * Memory returned from ::cuIpcOpenMemHandle must be freed with - * ::cuIpcCloseMemHandle. - * - * Calling ::cuMemFree on an exported memory region before calling - * ::cuIpcCloseMemHandle in the importing context will result in undefined - * behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. 
- * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param pdptr - Returned device pointer - * \param handle - ::CUipcMemHandle to open - * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_TOO_MANY_PEERS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \note No guarantees are made about the address returned in \p *pdptr. - * In particular, multiple processes may not receive the same address for the same \p handle. - * - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcCloseMemHandle, - * ::cuCtxEnablePeerAccess, - * ::cuDeviceCanAccessPeer, - * ::cudaIpcOpenMemHandle - */ -CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); - -/** - * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle - * - * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. - * When the reference count reaches 0, this API unmaps the memory. The original allocation - * in the exporting process as well as imported mappings in other processes - * will be unaffected. - * - * Any resources used to enable peer access will be freed if this is the - * last mapping using them. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux and Windows operating systems. 
- * IPC functionality on Windows is restricted to GPUs in TCC mode - * - * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_MAP_FAILED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \sa - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuIpcGetEventHandle, - * ::cuIpcOpenEventHandle, - * ::cuIpcGetMemHandle, - * ::cuIpcOpenMemHandle, - * ::cudaIpcCloseMemHandle - */ -CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); - -/** - * \brief Registers an existing host memory range for use by CUDA - * - * Page-locks the memory range specified by \p p and \p bytesize and maps it - * for the device(s) as specified by \p Flags. This memory range also is added - * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher bandwidth - * than pageable memory that has not been registered. Page-locking excessive - * amounts of memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to register staging areas for data exchange between - * host and device. - * - * This function has limited support on Mac OS X. OS 10.7 or higher is required. - * - * The \p Flags parameter enables different options to be specified that - * affect the allocation, as follows. - * - * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address - * space. The device pointer to the memory may be obtained by calling - * ::cuMemHostGetDevicePointer(). 
- * - * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some - * I/O memory space, e.g. the PCI Express resource of a 3rd party device. - * - * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory - * that is considered read-only by the device. On platforms without - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is - * required in order to register memory mapped to the CPU as read-only. Support - * for the use of this flag can be queried from the device attribute - * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with - * a current context associated with a device that does not have this attribute - * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. - * - * All of these flags are orthogonal to one another: a developer may page-lock - * memory that is portable or mapped with no restrictions. - * - * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is deferred - * to ::cuMemHostGetDevicePointer() because the memory may be mapped into - * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. - * - * For devices that have a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory - * can also be accessed from the device using the host pointer \p p. - * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not - * match the original host pointer \p ptr and depends on the devices visible to the - * application. If all devices visible to the application have a non-zero value for the - * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() - * will match the original pointer \p ptr. 
If any device visible to the application - * has a zero value for the device attribute, the device pointer returned by - * ::cuMemHostGetDevicePointer() will not match the original host pointer \p ptr, - * but it will be suitable for use on all devices provided Unified Virtual Addressing - * is enabled. In such systems, it is valid to access the memory using either pointer - * on devices that have a non-zero value for the device attribute. Note however that - * such devices should access the memory using only of the two pointers and not both. - * - * The memory page-locked by this function must be unregistered with - * ::cuMemHostUnregister(). - * - * \param p - Host pointer to memory to page-lock - * \param bytesize - Size in bytes of the address range to page-lock - * \param Flags - Flags for allocation request - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, - * ::CUDA_ERROR_NOT_PERMITTED, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa - * ::cuMemHostUnregister, - * ::cuMemHostGetFlags, - * ::cuMemHostGetDevicePointer, - * ::cudaHostRegister - */ -CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); - -/** - * \brief Unregisters a memory range that was registered with cuMemHostRegister. - * - * Unmaps the memory range whose base address is specified by \p p, and makes - * it pageable again. - * - * The base address must be the same one specified to ::cuMemHostRegister(). 
- * - * \param p - Host pointer to memory to unregister - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - * \notefnerr - * - * \sa - * ::cuMemHostRegister, - * ::cudaHostUnregister - */ -CUresult CUDAAPI cuMemHostUnregister(void *p); - -/** - * \brief Copies memory - * - * Copies data between two pointers. - * \p dst and \p src are base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. - * Note that this function infers the type of the transfer (host to host, host to - * device, device to device, or device to host) from the pointer values. This - * function is only allowed in contexts which support unified addressing. - * - * \param dst - Destination unified virtual address space pointer - * \param src - Source unified virtual address space pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, 
- * ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); - -/** - * \brief Copies device memory between two contexts - * - * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstDevice - Destination device pointer - * \param dstContext - Destination context - * \param srcDevice - Source device pointer - * \param srcContext - Source context - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpyPeer - */ -CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); - -/** - * \brief Copies memory from Host to Device - * - * Copies from host memory to device memory. \p dstDevice and \p srcHost are - * the base addresses of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstDevice - Destination device pointer - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol - */ -CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); - -/** - * \brief Copies memory from Device to Host - * - * Copies from device to host memory. \p dstHost and \p srcDevice specify the - * base pointers of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstHost - Destination host pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Device to Device - * - * Copies from device memory to device memory. \p dstDevice and \p srcDevice - * are the base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
- * - * \param dstDevice - Destination device pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy, - * ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol - */ -CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Device to Array - * - * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting index of the destination data. - * \p srcDevice specifies the base pointer of the source. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyToArray - */ -CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); - -/** - * \brief Copies memory from Array to Device - * - * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the - * base pointer of the destination and must be naturally aligned with the CUDA - * array elements. \p srcArray and \p srcOffset specify the CUDA array handle - * and the offset in bytes into the array where the copy is to begin. - * \p ByteCount specifies the number of bytes to copy and must be evenly - * divisible by the array element size. 
- * - * \param dstDevice - Destination device pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyFromArray - */ -CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory from Host to Array - * - * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting offset in bytes of the destination - * data. \p pSrc specifies the base address of the source. \p ByteCount specifies - * the number of bytes to copy. 
- * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyToArray - */ -CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); - -/** - * \brief Copies memory from Array to Host - * - * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base - * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA - * array handle and starting offset in bytes of the source data. - * \p ByteCount specifies the number of bytes to copy. 
- * - * \param dstHost - Destination device pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyFromArray - */ -CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory from Array to Array - * - * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray - * specify the handles of the destination and source CUDA arrays for the copy, - * respectively. \p dstOffset and \p srcOffset specify the destination and - * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of - * bytes to be copied. The size of the elements in the CUDA arrays need not be - * the same format, but the elements must be the same size; and count must be - * evenly divisible by that size. 
- * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpyArrayToArray - */ -CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. 
- * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. 
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - * \par - * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). - * ::cuMemcpy2DUnaligned() does not have this restriction, but may run - * significantly slower in the cases where ::cuMemcpy2D() would have returned - * an error code. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy2D, - * ::cudaMemcpy2DToArray, - * ::cudaMemcpy2DFromArray - */ -CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. 
- * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. 
- * - * \par - * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). - * ::cuMemcpy2DUnaligned() does not have this restriction, but may run - * significantly slower in the cases where ::cuMemcpy2D() would have returned - * an error code. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy2D, - * ::cudaMemcpy2DToArray, - * ::cudaMemcpy2DFromArray - */ -CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); - -/** - * \brief Copies memory for 3D arrays - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. 
The ::CUDA_MEMCPY3D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY3D_st { - - unsigned int srcXInBytes, srcY, srcZ; - unsigned int srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; // ignored when src is array - unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - - unsigned int dstXInBytes, dstY, dstZ; - unsigned int dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; // ignored when dst is array - unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - - unsigned int WidthInBytes; - unsigned int Height; - unsigned int Depth; - } CUDA_MEMCPY3D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and - * ::srcHeight specify the (host) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and - * ::srcHeight specify the (device) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. 
- * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and - * ::srcHeight are ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data, the bytes per row, - * and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and - * ::dstHeight are ignored. - * - * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source - * data for the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, ::dstY and ::dstZ specify the base address of the - * destination data for the copy. 
- * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be - * set to 0. 
- * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMemcpy3D - */ -CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); - -/** - * \brief Copies memory between contexts - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure - * for documentation of its parameters. - * - * \param pCopy - Parameters for the memory copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_sync - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpy3DPeer - */ -CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); - -/** - * \brief Copies memory asynchronously - * - * Copies data between two pointers. - * \p dst and \p src are base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
- * Note that this function infers the type of the transfer (host to host, host to - * device, device to device, or device to host) from the pointer values. This - * function is only allowed in contexts which support unified addressing. - * - * \param dst - Destination unified virtual address space pointer - * \param src - Source unified virtual address space pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies device memory between two contexts asynchronously. - * - * Copies from device memory in one context to device memory in another - * context. 
\p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. - * \p ByteCount specifies the number of bytes to copy. - * - * \param dstDevice - Destination device pointer - * \param dstContext - Destination context - * \param srcDevice - Source device pointer - * \param srcContext - Source context - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpyPeerAsync - */ -CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Host to Device - * - * Copies from host memory to device memory. \p dstDevice and \p srcHost are - * the base addresses of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstDevice - Destination device pointer - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync - */ -CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Device to Host - * - * Copies from device to host memory. \p dstHost and \p srcDevice specify the - * base pointers of the destination and source, respectively. \p ByteCount - * specifies the number of bytes to copy. 
- * - * \param dstHost - Destination host pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Device to Device - * - * Copies from device memory to device memory. \p dstDevice and \p srcDevice - * are the base pointers of the destination and source, respectively. - * \p ByteCount specifies the number of bytes to copy. 
- * - * \param dstDevice - Destination device pointer - * \param srcDevice - Source device pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cudaMemcpyFromSymbolAsync - */ -CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Host to Array - * - * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset - * specify the CUDA array handle and starting offset in bytes of the - * destination data. \p srcHost specifies the base address of the source. - * \p ByteCount specifies the number of bytes to copy. 
- * - * \param dstArray - Destination array - * \param dstOffset - Offset in bytes of destination array - * \param srcHost - Source host pointer - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyToArrayAsync - */ -CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory from Array to Host - * - * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base - * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA - * array handle and starting offset in bytes of the source data. - * \p ByteCount specifies the number of bytes to copy. 
- * - * \param dstHost - Destination pointer - * \param srcArray - Source array - * \param srcOffset - Offset in bytes of source array - * \param ByteCount - Size of memory copy in bytes - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * \note_memcpy - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpyFromArrayAsync - */ -CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); - -/** - * \brief Copies memory for 2D arrays - * - * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
- * The ::CUDA_MEMCPY2D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY2D_st { - unsigned int srcXInBytes, srcY; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; - unsigned int dstXInBytes, dstY; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; - unsigned int WidthInBytes; - unsigned int Height; - } CUDA_MEMCPY2D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch - * specify the (host) base address of the source data and the bytes per row to - * apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch - * specify the (device) base address of the source data and the bytes per row - * to apply. ::srcArray is ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are - * ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. 
- * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data and the bytes per - * row to apply. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are - * ignored. - * - * - ::srcXInBytes and ::srcY specify the base address of the source data for - * the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. - * - * - ::dstXInBytes and ::dstY specify the base address of the destination data - * for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes and ::Height specify the width (in bytes) and height of - * the 2D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. 
- * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back - * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies - * (device to device, CUDA array to device, CUDA array to CUDA array), - * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). - * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpy2DAsync, - * ::cudaMemcpy2DToArrayAsync, - 
* ::cudaMemcpy2DFromArrayAsync - */ -CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); - -/** - * \brief Copies memory for 3D arrays - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: - * - * \code - typedef struct CUDA_MEMCPY3D_st { - - unsigned int srcXInBytes, srcY, srcZ; - unsigned int srcLOD; - CUmemorytype srcMemoryType; - const void *srcHost; - CUdeviceptr srcDevice; - CUarray srcArray; - unsigned int srcPitch; // ignored when src is array - unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - - unsigned int dstXInBytes, dstY, dstZ; - unsigned int dstLOD; - CUmemorytype dstMemoryType; - void *dstHost; - CUdeviceptr dstDevice; - CUarray dstArray; - unsigned int dstPitch; // ignored when dst is array - unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - - unsigned int WidthInBytes; - unsigned int Height; - unsigned int Depth; - } CUDA_MEMCPY3D; - * \endcode - * where: - * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the - * source and destination, respectively; ::CUmemorytype_enum is defined as: - * - * \code - typedef enum CUmemorytype_enum { - CU_MEMORYTYPE_HOST = 0x01, - CU_MEMORYTYPE_DEVICE = 0x02, - CU_MEMORYTYPE_ARRAY = 0x03, - CU_MEMORYTYPE_UNIFIED = 0x04 - } CUmemorytype; - * \endcode - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and - * ::srcHeight specify the (host) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. 
- * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and - * ::srcHeight specify the (device) base address of the source data, the bytes - * per row, and the height of each 2D slice of the 3D array. ::srcArray is - * ignored. - * - * \par - * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the - * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and - * ::srcHeight are ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling - * context. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch - * specify the (host) base address of the destination data, the bytes per row, - * and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch - * specify the (device) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. - * - * \par - * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the - * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and - * ::dstHeight are ignored. - * - * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source - * data for the copy. - * - * \par - * For host pointers, the starting address is - * \code - void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array - * element size. 
- * - * - dstXInBytes, ::dstY and ::dstZ specify the base address of the - * destination data for the copy. - * - * \par - * For host pointers, the base address is - * \code - void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * \endcode - * - * \par - * For device pointers, the starting address is - * \code - CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * \endcode - * - * \par - * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array - * element size. - * - * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. - * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + - * ::srcXInBytes, and ::dstPitch must be greater than or equal to - * ::WidthInBytes + dstXInBytes. - * - If specified, ::srcHeight must be greater than or equal to ::Height + - * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. - * - * \par - * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum - * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be - * set to 0. 
- * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemcpy3DAsync - */ -CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); - -/** - * \brief Copies memory between contexts asynchronously. - * - * Perform a 3D memory copy according to the parameters specified in - * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure - * for documentation of its parameters. 
- * - * \param pCopy - Parameters for the memory copy - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, - * ::cuMemcpy3DPeerAsync, - * ::cudaMemcpy3DPeerAsync - */ -CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 8-bit values to the specified value - * \p uc. - * - * \param dstDevice - Destination device pointer - * \param uc - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); - -/** - * \brief Initializes device memory - * 
- * Sets the memory range of \p N 16-bit values to the specified value - * \p us. The \p dstDevice pointer must be two byte aligned. - * - * \param dstDevice - Destination device pointer - * \param us - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the memory range of \p N 32-bit values to the specified value - * \p ui. The \p dstDevice pointer must be four byte aligned. 
- * - * \param dstDevice - Destination device pointer - * \param ui - Value to set - * \param N - Number of elements - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32Async, - * ::cudaMemset - */ -CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 8-bit values to the specified value - * \p uc. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param uc - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 16-bit values to the specified value - * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be two byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param us - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); - -/** - * \brief Initializes device memory - * - * Sets the 2D memory range of \p Width 32-bit values to the specified value - * \p ui. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be four byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param ui - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2D - */ -CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 8-bit values to the specified value - * \p uc. 
- * - * \param dstDevice - Destination device pointer - * \param uc - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 16-bit values to the specified value - * \p us. The \p dstDevice pointer must be two byte aligned. 
- * - * \param dstDevice - Destination device pointer - * \param us - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the memory range of \p N 32-bit values to the specified value - * \p ui. The \p dstDevice pointer must be four byte aligned. 
- * - * \param dstDevice - Destination device pointer - * \param ui - Value to set - * \param N - Number of elements - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, - * ::cudaMemsetAsync - */ -CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 8-bit values to the specified value - * \p uc. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param uc - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 16-bit values to the specified value - * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be two byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param us - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Sets device memory - * - * Sets the 2D memory range of \p Width 32-bit values to the specified value - * \p ui. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer - * and \p dstPitch offset must be four byte aligned. This function performs - * fastest when the pitch is one that has been passed back by - * ::cuMemAllocPitch(). 
- * - * \param dstDevice - Destination device pointer - * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) - * \param ui - Value to set - * \param Width - Width of row - * \param Height - Number of rows - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, - * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, - * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, - * ::cuMemsetD32, ::cuMemsetD32Async, - * ::cudaMemset2DAsync - */ -CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); - -/** - * \brief Creates a 1D or 2D CUDA array - * - * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure - * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
- * The ::CUDA_ARRAY_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - CUarray_format Format; - unsigned int NumChannels; - } CUDA_ARRAY_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, and \p Height are the width, and height of the CUDA array (in - * elements); the CUDA array is one-dimensional if height is 0, two-dimensional - * otherwise; - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 2048; - desc.Height = 1; - * \endcode - * - * Description for a 64 x 64 CUDA array of floats: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 64; - desc.Height = 64; - * \endcode - * - * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit - * float16's: - * \code - CUDA_ARRAY_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; - desc.NumChannels = 4; - desc.Width = width; - desc.Height = height; - * \endcode - * - * Description for a \p width x \p height CUDA array of 16-bit elements, each - * of which is two 8-bit unsigned chars: - * \code - CUDA_ARRAY_DESCRIPTOR arrayDesc; - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - desc.NumChannels = 2; - desc.Width = width; - desc.Height = height; - * \endcode - * - * \param 
pHandle - Returned array - * \param pAllocateArray - Array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMallocArray - */ -CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); - -/** - * \brief Get a 1D or 2D CUDA array descriptor - * - * Returns in \p *pArrayDescriptor a descriptor containing information on the - * format and dimensions of the CUDA array \p hArray. It is useful for - * subroutines that have been passed a CUDA array, but need to know the CUDA - * array parameters for validation or other purposes. 
- * - * \param pArrayDescriptor - Returned array descriptor - * \param hArray - Array to get descriptor of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaArrayGetInfo - */ -CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); - -/** - * \brief Returns the layout properties of a sparse CUDA array - * - * Returns the layout properties of a sparse CUDA array in \p sparseProperties - * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, - * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero. - * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero. - * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained - * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. 
Instead, ::cuMipmappedArrayGetSparseProperties - * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. - * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES - * \param[in] array - CUDA array to get the sparse properties of - * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); - -/** - * \brief Returns the layout properties of a sparse CUDA mipmapped array - * - * Returns the sparse array layout properties in \p sparseProperties - * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE - * ::CUDA_ERROR_INVALID_VALUE will be returned. - * - * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the - * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth - * is less than that of the tile. - * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, - * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. - * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. - * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. 
- * - * \return - * ::CUDA_SUCCESS - * ::CUDA_ERROR_INVALID_VALUE - * - * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES - * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of - * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync - */ -CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); - -/** - * \brief Gets a CUDA array plane from a CUDA array - * - * Returns in \p pPlaneArray a CUDA array that represents a single format plane - * of the CUDA array \p hArray. - * - * If \p planeIdx is greater than the maximum number of planes in this array or if the array does - * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns - * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width - * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - * - * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx - * \param hArray - Multiplanar CUDA array - * \param planeIdx - Plane index - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::cuArrayCreate, - * ::cudaGetArrayPlane - */ -CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); - -/** - * \brief Destroys a CUDA array - * - * Destroys the CUDA array \p hArray. 
- * - * \param hArray - Array to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaFreeArray - */ -CUresult CUDAAPI cuArrayDestroy(CUarray hArray); - -/** - * \brief Creates a 3D CUDA array - * - * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure - * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. - * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - unsigned int Depth; - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; - } CUDA_ARRAY3D_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, \p Height, and \p Depth are the width, height, and depth of the - * CUDA array (in elements); the following types of CUDA arrays can be allocated: - * - A 1D array is allocated if \p Height and \p Depth extents are both zero. - * - A 2D array is allocated if only \p Depth extent is zero. - * - A 3D array is allocated if all three extents are non-zero. 
- * - A 1D layered CUDA array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number - * of layers is determined by the depth extent. - * - A 2D layered CUDA array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number - * of layers is determined by the depth extent. - * - A cubemap CUDA array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six - * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. The first six layers represent the first - * cubemap, the next six layers form the second cubemap, and so on. - * - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. 
If this flag is set, - * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. - * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array - * to a surface reference. - * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be - * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, - * then \p Depth must be a multiple of six. - * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. - * Texture gather can only be performed on 2D CUDA arrays. - * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute - * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. - * - * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag - * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH - * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
CUDA array typeValid extents that must always be met
{(width range in elements), (height range), - * (depth range)}
Valid extents with CUDA_ARRAY3D_SURFACE_LDST set
- * {(width range in elements), (height range), (depth range)}
1D{ (1,TEXTURE1D_WIDTH), 0, 0 }{ (1,SURFACE1D_WIDTH), 0, 0 }
2D{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
3D{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } - *
OR
{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), - * (1,TEXTURE3D_DEPTH_ALTERNATE) }
{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), - * (1,SURFACE3D_DEPTH) }
1D Layered{ (1,TEXTURE1D_LAYERED_WIDTH), 0, - * (1,TEXTURE1D_LAYERED_LAYERS) }{ (1,SURFACE1D_LAYERED_WIDTH), 0, - * (1,SURFACE1D_LAYERED_LAYERS) }
2D Layered{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), - * (1,TEXTURE2D_LAYERED_LAYERS) }{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), - * (1,SURFACE2D_LAYERED_LAYERS) }
Cubemap{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }{ (1,SURFACECUBEMAP_WIDTH), - * (1,SURFACECUBEMAP_WIDTH), 6 }
Cubemap Layered{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), - * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), - * (1,SURFACECUBEMAP_LAYERED_LAYERS) }
- * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 2048; - desc.Height = 0; - desc.Depth = 0; - * \endcode - * - * Description for a 64 x 64 CUDA array of floats: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_FLOAT; - desc.NumChannels = 1; - desc.Width = 64; - desc.Height = 64; - desc.Depth = 0; - * \endcode - * - * Description for a \p width x \p height x \p depth CUDA array of 64-bit, - * 4x16-bit float16's: - * \code - CUDA_ARRAY3D_DESCRIPTOR desc; - desc.Format = CU_AD_FORMAT_HALF; - desc.NumChannels = 4; - desc.Width = width; - desc.Height = height; - desc.Depth = depth; - * \endcode - * - * \param pHandle - Returned array - * \param pAllocateArray - 3D array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaMalloc3DArray - */ -CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); - -/** - * \brief Get a 3D CUDA 
array descriptor - * - * Returns in \p *pArrayDescriptor a descriptor containing information on the - * format and dimensions of the CUDA array \p hArray. It is useful for - * subroutines that have been passed a CUDA array, but need to know the CUDA - * array parameters for validation or other purposes. - * - * This function may be called on 1D and 2D arrays, in which case the \p Height - * and/or \p Depth members of the descriptor struct will be set to 0. - * - * \param pArrayDescriptor - Returned 3D array descriptor - * \param hArray - 3D array to get descriptor of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa ::cuArray3DCreate, ::cuArrayCreate, - * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, - * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, - * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, - * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, - * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, - * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, - * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, - * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, - * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, - * ::cudaArrayGetInfo - */ -CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); - -/** - * \brief Creates a CUDA mipmapped array - * - * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure - * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. - * \p numMipmapLevels specifies the number of mipmap levels to be allocated. 
This value is - * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. - * - * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * \code - typedef struct { - unsigned int Width; - unsigned int Height; - unsigned int Depth; - CUarray_format Format; - unsigned int NumChannels; - unsigned int Flags; - } CUDA_ARRAY3D_DESCRIPTOR; - * \endcode - * where: - * - * - \p Width, \p Height, and \p Depth are the width, height, and depth of the - * CUDA array (in elements); the following types of CUDA arrays can be allocated: - * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. - * - A 2D mipmapped array is allocated if only \p Depth extent is zero. - * - A 3D mipmapped array is allocated if all three extents are non-zero. - * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number - * of layers is determined by the depth extent. - * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number - * of layers is determined by the depth extent. - * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six - * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. 
The first six layers represent the first - * cubemap, the next six layers form the second cubemap, and so on. - * - * - ::Format specifies the format of the elements; ::CUarray_format is - * defined as: - * \code - typedef enum CUarray_format_enum { - CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - CU_AD_FORMAT_SIGNED_INT8 = 0x08, - CU_AD_FORMAT_SIGNED_INT16 = 0x09, - CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - CU_AD_FORMAT_HALF = 0x10, - CU_AD_FORMAT_FLOAT = 0x20 - } CUarray_format; - * \endcode - * - * - \p NumChannels specifies the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, - * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of - * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to - * bind a mipmap level of the CUDA mipmapped array to a surface reference. - * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be - * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, - * then \p Depth must be a multiple of six. - * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. - * Texture gather can only be performed on 2D CUDA mipmapped arrays. - * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute - * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. 
- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
CUDA array typeValid extents that must always be met
{(width range in elements), (height range), - * (depth range)}
Valid extents with CUDA_ARRAY3D_SURFACE_LDST set
- * {(width range in elements), (height range), (depth range)}
1D{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }{ (1,SURFACE1D_WIDTH), 0, 0 }
2D{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
3D{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } - *
OR
{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), - * (1,TEXTURE3D_DEPTH_ALTERNATE) }
{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), - * (1,SURFACE3D_DEPTH) }
1D Layered{ (1,TEXTURE1D_LAYERED_WIDTH), 0, - * (1,TEXTURE1D_LAYERED_LAYERS) }{ (1,SURFACE1D_LAYERED_WIDTH), 0, - * (1,SURFACE1D_LAYERED_LAYERS) }
2D Layered{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), - * (1,TEXTURE2D_LAYERED_LAYERS) }{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), - * (1,SURFACE2D_LAYERED_LAYERS) }
Cubemap{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }{ (1,SURFACECUBEMAP_WIDTH), - * (1,SURFACECUBEMAP_WIDTH), 6 }
Cubemap Layered{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), - * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), - * (1,SURFACECUBEMAP_LAYERED_LAYERS) }
- * - * - * \param pHandle - Returned mipmapped array - * \param pMipmappedArrayDesc - mipmapped array descriptor - * \param numMipmapLevels - Number of mipmap levels - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cuMipmappedArrayDestroy, - * ::cuMipmappedArrayGetLevel, - * ::cuArrayCreate, - * ::cudaMallocMipmappedArray - */ -CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); - -/** - * \brief Gets a mipmap level of a CUDA mipmapped array - * - * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level - * of the CUDA mipmapped array \p hMipmappedArray. - * - * If \p level is greater than the maximum number of levels in this mipmapped array, - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * \param pLevelArray - Returned mipmap level CUDA array - * \param hMipmappedArray - CUDA mipmapped array - * \param level - Mipmap level - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::cuMipmappedArrayCreate, - * ::cuMipmappedArrayDestroy, - * ::cuArrayCreate, - * ::cudaGetMipmappedArrayLevel - */ -CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); - -/** - * \brief Destroys a CUDA mipmapped array - * - * Destroys the CUDA mipmapped array \p hMipmappedArray. 
- * - * \param hMipmappedArray - Mipmapped array to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED, - * ::CUDA_ERROR_CONTEXT_IS_DESTROYED - * \notefnerr - * - * \sa - * ::cuMipmappedArrayCreate, - * ::cuMipmappedArrayGetLevel, - * ::cuArrayCreate, - * ::cudaFreeMipmappedArray - */ -CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); - -/** @} */ /* END CUDA_MEM */ - -/** - * \defgroup CUDA_VA Virtual Memory Management - * - * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the virtual memory management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** -* \brief Allocate an address range reservation. -* -* Reserves a virtual address range based on the given parameters, giving -* the starting address of the range in \p ptr. This API requires a system that -* supports UVA. The size and address parameters must be a multiple of the -* host page size and the alignment must be a power of two or zero for default -* alignment. 
-* -* \param[out] ptr - Resulting pointer to start of virtual address range allocated -* \param[in] size - Size of the reserved virtual address range requested -* \param[in] alignment - Alignment of the reserved virtual address range requested -* \param[in] addr - Fixed starting address range requested -* \param[in] flags - Currently unused, must be zero -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemAddressFree -*/ -CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); - -/** -* \brief Free an address range reservation. -* -* Frees a virtual address range reserved by cuMemAddressReserve. The size -* must match what was given to memAddressReserve and the ptr given must -* match what was returned from memAddressReserve. -* -* \param[in] ptr - Starting address of the virtual address range to free -* \param[in] size - Size of the virtual address region to free -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemAddressReserve -*/ -CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size); - -/** -* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties -* -* This creates a memory allocation on the target device specified through the -* \p prop strcuture. The created allocation will not have any device or host -* mappings. The generic memory \p handle for the allocation can be -* mapped to the address space of calling process via ::cuMemMap. This handle -* cannot be transmitted directly to other processes (see -* ::cuMemExportToShareableHandle). 
On Windows, the caller must also pass -* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which -* limits or allows access to this handle for a recepient process (see -* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this -* allocation must be a multiple of the the value given via -* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM -* flag. -* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then -* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays -* and sparse CUDA mipmapped arrays. -* (see ::cuMemMapArrayAsync). -* -* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle. -* \param[in] size - Size of the allocation requested -* \param[in] prop - Properties of the allocation to create. -* \param[in] flags - flags for future use, must be zero now. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); - -/** -* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. -* -* Frees the memory that was allocated on a device through cuMemCreate. -* -* The memory allocation will be freed when all outstanding mappings to the memory -* are unmapped and when all outstanding references to the handle (including it's -* shareable counterparts) are also released. The generic memory handle can be -* freed when there are still outstanding mappings made with this handle. 
Each -* time a recepient process imports a shareable handle, it needs to pair it with -* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle -* the behavior is undefined. -* -* \param[in] handle Value of handle which was returned previously by cuMemCreate. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemCreate -*/ -CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle); - -/** -* \brief Maps an allocation handle to a reserved virtual address range. -* -* Maps bytes of memory represented by \p handle starting from byte \p offset to -* \p size to address range [\p addr, \p addr + \p size]. This range must be an -* address reservation previously reserved with ::cuMemAddressReserve, and -* \p offset + \p size must be less than the size of the memory allocation. -* Both \p ptr, \p size, and \p offset must be a multiple of the value given via -* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. -* -* Please note calling ::cuMemMap does not make the address accessible, -* the caller needs to update accessibility of a contiguous mapped VA -* range by calling ::cuMemSetAccess. -* -* Once a recipient process obtains a shareable memory handle -* from ::cuMemImportFromShareableHandle, the process must -* use ::cuMemMap to map the memory into its address ranges before -* setting accessibility with ::cuMemSetAccess. -* -* ::cuMemMap can only create mappings on VA range reservations -* that are not currently mapped. -* -* \param[in] ptr - Address where memory will be mapped. -* \param[in] size - Size of the memory mapping. -* \param[in] offset - Offset into the memory represented by -* - \p handle from which to start mapping -* - Note: currently must be zero. 
-* \param[in] handle - Handle to a shareable memory -* \param[in] flags - flags for future use, must be zero now. -* \return -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_OUT_OF_MEMORY, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* -* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); - -/** - * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays - * - * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. - * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. - * The structure ::CUarrayMapInfo is defined as follow: - \code - typedef struct CUarrayMapInfo_st { - CUresourcetype resourceType; - union { - CUmipmappedArray mipmap; - CUarray array; - } resource; - - CUarraySparseSubresourceType subresourceType; - union { - struct { - unsigned int level; - unsigned int layer; - unsigned int offsetX; - unsigned int offsetY; - unsigned int offsetZ; - unsigned int extentWidth; - unsigned int extentHeight; - unsigned int extentDepth; - } sparseLevel; - struct { - unsigned int layer; - unsigned long long offset; - unsigned long long size; - } miptail; - } subresource; - - CUmemOperationType memOperationType; - - CUmemHandleType memHandleType; - union { - CUmemGenericAllocationHandle memHandle; - } memHandle; - - unsigned long long offset; - unsigned int deviceBitMask; - unsigned int flags; - unsigned int reserved[2]; - } CUarrayMapInfo; - \endcode - * - * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. 
- * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then - * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. - * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using - * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE. - * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. - * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY - * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. - * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been - * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE. - * - * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. - * ::CUarraySparseSubresourceType_enum is defined as: - \code - typedef enum CUarraySparseSubresourceType_enum { - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, - CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 - } CUarraySparseSubresourceType; - \endcode - * - * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a - * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which - * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by - * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. - * - * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL - * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. 
- * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY - * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. - * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight - * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. - * These offsets and extents must be aligned to the corresponding tile dimension. - * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, - * must be zero. - * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, - * must be zero. - * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth - * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. - * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties - * - * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL - * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in - * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. - * Both, mip tail offset and mip tail size must be aligned to the tile size. - * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags - * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. - * Otherwise, must be zero. 
- * - * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: - \code - typedef enum CUmemOperationType_enum { - CU_MEM_OPERATION_TYPE_MAP = 1, - CU_MEM_OPERATION_TYPE_UNMAP = 2 - } CUmemOperationType; - \endcode - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource - * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. - * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, - * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. - * - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation - * is performed. ::CUarrayMapInfo::memHandle must be NULL. - * - * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. - * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. - * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match - * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. - * - * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * - * \param[in] mapInfoList - List of ::CUarrayMapInfo - * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList - * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations - * - * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties - */ -CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); - -/** -* \brief Unmap the backing memory of a given address range. -* -* The range must be the entire contiguous address range that was mapped to. In -* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped -* by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed -* if there are no existing mappings and there are no unreleased memory handles. -* -* When ::cuMemUnmap returns successfully the address range is converted to an -* address reservation and can be used for a future calls to ::cuMemMap. Any new -* mapping to this virtual address will need to have access granted through -* ::cuMemSetAccess, as all mappings start with no accessibility setup. 
-* -* \param[in] ptr - Starting address for the virtual address range to unmap -* \param[in] size - Size of the virtual address range to unmap -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* \note_sync -* -* \sa ::cuMemCreate, ::cuMemAddressReserve -*/ -CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); - -/** -* \brief Set the access flags for each location specified in \p desc for the given virtual address range -* -* Given the virtual address range via \p ptr and \p size, and the locations -* in the array given by \p desc and \p count, set the access flags for the -* target locations. The range must be a fully mapped address range -* containing all allocations created by ::cuMemMap / ::cuMemCreate. -* -* \param[in] ptr - Starting address for the virtual address range -* \param[in] size - Length of the virtual address range -* \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the -* - mapping for each location specified -* \param[in] count - Number of ::CUmemAccessDesc in \p desc -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_SUPPORTED -* \notefnerr -* \note_sync -* -* \sa ::cuMemSetAccess, ::cuMemCreate, :cuMemMap -*/ -CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); - -/** -* \brief Get the access \p flags set for the given \p location and \p ptr -* -* \param[out] flags - Flags set for this location -* \param[in] location - Location in which to check the flags for -* \param[in] ptr - Address in which to check the access flags for -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_INVALID_DEVICE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa 
::cuMemSetAccess -*/ -CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr); - -/** -* \brief Exports an allocation to a requested shareable handle type -* -* Given a CUDA memory handle, create a shareable memory -* allocation handle that can be used to share the memory with other -* processes. The recipient process can convert the shareable handle back into a -* CUDA memory handle using ::cuMemImportFromShareableHandle and map -* it with ::cuMemMap. The implementation of what this handle is and how it -* can be transferred is defined by the requested handle type in \p handleType -* -* Once all shareable handles are closed and the allocation is released, the allocated -* memory referenced will be released back to the OS and uses of the CUDA handle afterward -* will lead to undefined behavior. -* -* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) -* that support importing memory from the shareable type -* -* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type -* \param[in] handle - CUDA handle for the memory allocation -* \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) -* \param[in] flags - Reserved, must be zero -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); - -/** -* \brief Imports an allocation from a requested shareable handle type. -* -* If the current process cannot support the memory described by this shareable -* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. 
-* -* \note Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) -* created on devices under an SLI group may not be supported, and thus this API will -* return CUDA_ERROR_NOT_SUPPORTED. -* There is no guarantee that the contents of \p handle will be the same CUDA memory handle -* for the same given OS shareable handle, or the same underlying allocation. -* -* \param[out] handle - CUDA Memory handle for the memory allocation. -* \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. -* \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease -*/ -CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); - -/** -* \brief Calculates either the minimal or recommended granularity -* -* Calculates either the minimal or recommended granularity -* for a given allocation specification and returns it in granularity. This -* granularity can be used as a multiple for alignment, size, or address mapping. -* -* \param[out] granularity Returned granularity. 
-* \param[in] prop Property for which to determine the granularity for -* \param[in] option Determines which granularity to return -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemMap -*/ -CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); - -/** -* \brief Retrieve the contents of the property structure defining properties for this handle -* -* \param[out] prop - Pointer to a properties structure which will hold the information about this handle -* \param[in] handle - Handle which to perform the query on -* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle -*/ -CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); - -/** -* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. -* -* The handle is guaranteed to be the same handle value used to map the memory. If the address -* requested is not mapped, the function will fail. The returned handle must be released with -* corresponding number of calls to ::cuMemRelease. -* -* \note The address \p addr, can be any address in a range previously mapped -* by ::cuMemMap, and not necessarily the start address. -* -* \param[out] handle CUDA Memory handle for the backing memory allocation. -* \param[in] addr Memory address to query, that has been mapped previously. 
-* \returns -* ::CUDA_SUCCESS, -* ::CUDA_ERROR_INVALID_VALUE, -* ::CUDA_ERROR_NOT_INITIALIZED, -* ::CUDA_ERROR_DEINITIALIZED, -* ::CUDA_ERROR_NOT_PERMITTED, -* ::CUDA_ERROR_NOT_SUPPORTED -* -* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap -*/ -CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); - -/** @} */ /* END CUDA_VA */ - -/** - * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator - * - * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. - * Functions for controlling the behavior of the underlying allocator. - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream ordered memory allocator exposed by the - * low-level CUDA driver application programming interface. - * - * @{ - * - * \section CUDA_MALLOC_ASYNC_overview overview - * - * The asynchronous allocator allows the user to allocate and free in stream order. - * All asynchronous accesses of the allocation must happen between - * the stream executions of the allocation and the free. If the memory is accessed - * outside of the promised stream order, a use before allocation / use after free error - * will cause undefined behavior. - * - * The allocator is free to reallocate the memory as long as it can guarantee - * that compliant memory accesses will not overlap temporally. - * The allocator may refer to internal stream ordering as well as inter-stream dependencies - * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. - * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
- * - * \section CUDA_MALLOC_ASYNC_support Supported Platforms - * - * Whether or not a device supports the integrated stream ordered memory allocator - * may be queried by calling ::cuDeviceGetAttribute() with the device attribute - * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED - */ - -/** - * \brief Frees memory with stream ordered semantics - * - * Inserts a free operation into \p hStream. - * The allocation must not be accessed after stream execution reaches the free. - * After this API returns, accessing the memory from any subsequent work launched on the GPU - * or querying its pointer attributes results in undefined behavior. - * - * \note During stream capture, this function results in the creation of a free node and - * must therefore be passed the address of a graph allocation. - * - * \param dptr - memory to free - * \param hStream - The stream establishing the stream ordering contract. - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED - */ -CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); - -/** - * \brief Allocates memory with stream ordered semantics - * - * Inserts an allocation operation into \p hStream. - * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. - * The allocation comes from the memory pool current to the stream's device. - * - * \note The default memory pool of a device contains device memory from that device. - * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. - * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation - * operation completes before work submitted in a separate stream runs. 
- * \note During stream capture, this function results in the creation of an allocation node. In this case, - * the allocation is owned by the graph instead of the memory pool. The memory pool's properties - * are used to set the node's creation parameters. - * - * \param[out] dptr - Returned device pointer - * \param[in] bytesize - Number of bytes to allocate - * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, - * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, - * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); - -/** - * \brief Tries to release memory back to the OS - * - * Releases memory back to the OS until the pool contains fewer than minBytesToKeep - * reserved bytes, or there is no more memory that the allocator can safely release. - * The allocator cannot release OS allocations that back outstanding asynchronous allocations. - * The OS allocations may happen at different granularity from the user allocations. - * - * \note: Allocations that have not been freed count as outstanding. - * \note: Allocations that have been asynchronously freed but whose completion has - * not been observed on the host (eg. by a synchronize) can count as outstanding. - * - * \param[in] pool - The memory pool to trim - * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, - * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have - * at least minBytesToKeep bytes reserved after the operation. 
- * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); - -/** - * \brief Sets attributes of a memory pool - * - * Supported attributes are: - * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by ::cuMemFreeAsync (default enabled). - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) - * Reset the high watermark that tracks the amount of backing memory that was - * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. 
- * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) - * Reset the high watermark that tracks the amount of used memory that was - * allocated for the memory pool. - * - * \param[in] pool - The memory pool to modify - * \param[in] attr - The attribute to modify - * \param[in] value - Pointer to the value to assign - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); - -/** - * \brief Gets attributes of a memory pool - * - * Supported attributes are: - * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) - * Amount of reserved memory in bytes to hold onto before trying - * to release memory back to the OS. When more than the release - * threshold bytes of memory are held by the memory pool, the - * allocator will try to release memory back to the OS on the - * next call to stream, event or context synchronize. (default 0) - * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to use memory asynchronously freed - * in another stream as long as a stream ordering dependency - * of the allocating stream on the free action exists. - * Cuda events and null stream interactions can create the required - * stream ordered dependencies. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) - * Allow reuse of already completed frees when there is no dependency - * between the free and allocation. (default enabled) - * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) - * Allow ::cuMemAllocAsync to insert new stream dependencies - * in order to establish the stream ordering required to reuse - * a piece of memory released by ::cuMemFreeAsync (default enabled). 
- * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) - * Amount of backing memory currently allocated for the mempool - * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) - * High watermark of backing memory allocated for the mempool since the - * last time it was reset. - * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) - * Amount of memory from the pool that is currently in use by the application. - * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) - * High watermark of the amount of memory from the pool that was in use by the application. - * - * \param[in] pool - The memory pool to get attributes of - * \param[in] attr - The attribute to get - * \param[out] value - Retrieved value - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); - -/** - * \brief Controls visibility of pools between devices - * - * \param[in] pool - The pool being modified - * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. - * \param[in] count - Number of descriptors in the map array. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); - -/** - * \brief Returns the accessibility of a pool from a device - * - * Returns the accessibility of the pool's memory from the specified location. 
- * - * \param[out] flags - the accessibility of the pool from the specified location - * \param[in] memPool - the pool being queried - * \param[in] location - the location accessing the pool - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); - -/** - * \brief Creates a memory pool - * - * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines - * the properties of the pool such as the backing device and IPC capabilities. - * - * By default, the pool's memory will be accessible from the device it is allocated on. - * - * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_NOT_SUPPORTED - * - * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, - * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle - */ -CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); - -/** - * \brief Destroys the specified memory pool - * - * If any pointers obtained from this pool haven't been freed or - * the pool has free operations that haven't completed - * when ::cuMemPoolDestroy is invoked, the function will return immediately and the - * resources associated with the pool will be released automatically - * once there are no more outstanding allocations. - * - * Destroying the current mempool of a device sets the default mempool of - * that device as the current mempool for that device. - * - * \note A device's default memory pool cannot be destroyed. 
- * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, - * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate - */ -CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); - -/** - * \brief Allocates memory from a specified pool with stream ordered semantics. - * - * Inserts an allocation operation into \p hStream. - * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. - * The allocation comes from the specified memory pool. - * - * \note - * - The specified memory pool may be from a device different than that of the specified \p hStream. - * - * - Basic stream ordering allows future work submitted into the same stream to use the allocation. - * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation - * operation completes before work submitted in a separate stream runs. - * - * \note During stream capture, this function results in the creation of an allocation node. In this case, - * the allocation is owned by the graph instead of the memory pool. The memory pool's properties - * are used to set the node's creation parameters. 
- * - * \param[out] dptr - Returned device pointer - * \param[in] bytesize - Number of bytes to allocate - * \param[in] pool - The pool to allocate from - * \param[in] hStream - The stream establishing the stream ordering semantic - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, - * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, - * ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); - -/** - * \brief Exports a memory pool to the requested handle type. - * - * Given an IPC capable mempool, create an OS handle to share the pool with another process. - * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. - * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. - * The implementation of what the shareable handle is and how it can be transferred is defined by the requested - * handle type. - * - * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. 
- * - * \param[out] handle_out - Returned OS handle - * \param[in] pool - pool to export - * \param[in] handleType - the type of handle to create - * \param[in] flags - must be 0 - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, - * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, - * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, - * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute - */ -CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); - -/** - * \brief imports a memory pool from a shared handle. - * - * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. - * - * \note Imported memory pools do not support creating new allocations. - * As such imported memory pools may not be used in cuDeviceSetMemPool - * or ::cuMemAllocFromPoolAsync calls. - * - * \param[out] pool_out - Returned memory pool - * \param[in] handle - OS handle of the pool to open - * \param[in] handleType - The type of handle being imported - * \param[in] flags - must be 0 - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer - */ -CUresult CUDAAPI cuMemPoolImportFromShareableHandle( - CUmemoryPool *pool_out, - void *handle, - CUmemAllocationHandleType handleType, - unsigned long long flags); - -/** - * \brief Export data to share a memory pool allocation between processes. - * - * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. - * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. 
- * The data is not a handle and may be shared through any IPC mechanism. - * - * \param[out] shareData_out - Returned export data - * \param[in] ptr - pointer to memory being exported - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer - */ -CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); - -/** - * \brief Import a memory pool allocation from another process. - * - * Returns in \p ptr_out a pointer to the imported memory. - * The imported memory must not be accessed before the allocation operation completes - * in the exporting process. The imported memory must be freed from all importing processes before - * being freed in the exporting process. The pointer may be freed with cuMemFree - * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed - * on the importing process before the free operation on the exporting process. - * - * \note The cuMemFreeAsync api may be used in the exporting process before - * the cuMemFreeAsync operation completes in its stream as long as the - * cuMemFreeAsync in the exporting process specifies a stream with - * a stream dependency on the importing process's cuMemFreeAsync. 
- * - * \param[out] ptr_out - pointer to imported memory - * \param[in] pool - pool from which to import - * \param[in] shareData - data specifying the memory to import - * - * \returns - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_OUT_OF_MEMORY - * - * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer - */ -CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); - -/** @} */ /* END CUDA_MALLOC_ASYNC */ - -/** - * \defgroup CUDA_UNIFIED Unified Addressing - * - * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the unified addressing functions of the - * low-level CUDA driver application programming interface. - * - * @{ - * - * \section CUDA_UNIFIED_overview Overview - * - * CUDA devices can share a unified address space with the host. - * For these devices there is no distinction between a device - * pointer and a host pointer -- the same pointer value may be - * used to access memory from the host program and from a kernel - * running on the device (with exceptions enumerated below). - * - * \section CUDA_UNIFIED_support Supported Platforms - * - * Whether or not a device supports unified addressing may be - * queried by calling ::cuDeviceGetAttribute() with the device - * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. - * - * Unified addressing is automatically enabled in 64-bit processes - * - * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values - * - * It is possible to look up information about the memory which backs a - * pointer value. For instance, one may want to know if a pointer points - * to host or device memory. As another example, in the case of device - * memory, one may want to know on which CUDA device the memory - * resides. 
These properties may be queried using the function - * ::cuPointerGetAttribute() - * - * Since pointers are unique, it is not necessary to specify information - * about the pointers specified to the various copy functions in the - * CUDA API. The function ::cuMemcpy() may be used to perform a copy - * between two pointers, ignoring whether they point to host or device - * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() - * unnecessary for devices supporting unified addressing). For - * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be - * used to specify that the CUDA driver should infer the location of the - * pointer from its value. - * - * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory - * - * All host memory allocated in all contexts using ::cuMemAllocHost() and - * ::cuMemHostAlloc() is always directly accessible from all contexts on - * all devices that support unified addressing. This is the case regardless - * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and - * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. - * - * The pointer value through which allocated host memory may be accessed - * in kernels on all devices that support unified addressing is the same - * as the pointer value through which that memory is accessed on the host, - * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device - * pointer for these allocations. - * - * Note that this is not the case for memory allocated using the flag - * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. 
- * - * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory - * - * Upon enabling direct access from a context that supports unified addressing - * to another peer context that supports unified addressing using - * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using - * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible - * by the current context. The device pointer value through - * which any peer memory may be accessed in the current context - * is the same pointer value through which that memory may be - * accessed in the peer context. - * - * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing - * - * Not all memory may be accessed on devices through the same pointer - * value through which they are accessed on the host. These exceptions - * are host memory registered using ::cuMemHostRegister() and host memory - * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these - * exceptions, there exists a distinct host and device address for the - * memory. The device address is guaranteed to not overlap any valid host - * pointer range and is guaranteed to have the same value across all - * contexts that support unified addressing. - * - * This device address may be queried using ::cuMemHostGetDevicePointer() - * when a context using unified addressing is current. Either the host - * or the unified device pointer value may be used to refer to this memory - * through ::cuMemcpy() and similar functions using the - * ::CU_MEMORYTYPE_UNIFIED memory type. - * - */ - -/** - * \brief Returns information about a pointer - * - * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT: - * - * Returns in \p *data the ::CUcontext in which \p ptr was allocated or - * registered. - * The type of \p data must be ::CUcontext *. 
- * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: - * - * Returns in \p *data the physical memory type of the memory that - * \p ptr addresses as a ::CUmemorytype enumerated value. - * The type of \p data must be unsigned int. - * - * If \p ptr addresses device memory then \p *data is set to - * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the - * memory resides is the ::CUdevice of the ::CUcontext returned by the - * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. - * - * If \p ptr addresses host memory then \p *data is set to - * ::CU_MEMORYTYPE_HOST. - * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If the current ::CUcontext does not support unified virtual - * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: - * - * Returns in \p *data the device pointer value through which - * \p ptr may be accessed by kernels running in the current - * ::CUcontext. - * The type of \p data must be CUdeviceptr *. - * - * If there exists no device pointer value through which - * kernels running in the current ::CUcontext may access - * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If there is no current ::CUcontext then - * ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input - * value \p ptr. - * - * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: - * - * Returns in \p *data the host pointer value through which - * \p ptr may be accessed by by the host program. - * The type of \p data must be void **. 
- * If there exists no host pointer value through which - * the host program may directly access \p ptr then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input - * value \p ptr. - * - * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: - * - * Returns in \p *data two tokens for use with the nv-p2p.h Linux - * kernel interface. \p data must be a struct of type - * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. - * - * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). - * Note that p2pToken and vaSpaceToken are only valid for the - * lifetime of the source allocation. A subsequent allocation at - * the same address may return completely different tokens. - * Querying this attribute has a side effect of setting the attribute - * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that - * \p ptr points to. - * - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: - * - * A boolean attribute which when set, ensures that synchronous memory operations - * initiated on the region of memory that \p ptr points to will always synchronize. - * See further documentation in the section titled "API synchronization behavior" - * to learn more about cases when synchronous memory operations can - * exhibit asynchronous behavior. - * - * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: - * - * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. - * \p data must point to an unsigned long long. - * - * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. - * Every memory allocation from any of the CUDA memory allocation APIs will - * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs - * from previous freed allocations. IDs are only unique within a single process. 
- * - * - * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: - * - * Returns in \p *data a boolean that indicates whether the pointer points to - * managed memory or not. - * - * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: - * - * Returns in \p *data an integer representing a device ordinal of a device against - * which the memory was allocated or registered. - * - * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: - * - * Returns in \p *data a boolean that indicates if this pointer maps to - * an allocation that is suitable for ::cudaIpcGetMemHandle. - * - * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: - * - * Returns in \p *data the starting address for the allocation referenced - * by the device pointer \p ptr. Note that this is not necessarily the - * address of the mapped region, but the address of the mappable address - * range \p ptr references (e.g. from ::cuMemAddressReserve). - * - * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: - * - * Returns in \p *data the size for the allocation referenced by the device - * pointer \p ptr. Note that this is not necessarily the size of the mapped - * region, but the size of the mappable address range \p ptr references - * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped - * region, see ::cuMemGetAddressRange - * - * - ::CU_POINTER_ATTRIBUTE_MAPPED: - * - * Returns in \p *data a boolean that indicates if this pointer is in a - * valid address range that is mapped to a backing allocation. - * - * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: - * - * Returns a bitmask of the allowed handle types for an allocation that may - * be passed to ::cuMemExportToShareableHandle. - * - * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: - * - * Returns in \p *data the handle to the mempool that the allocation was obtained from. 
- * - * \par - * - * Note that for most allocations in the unified virtual address space - * the host and device pointer for accessing the allocation will be the - * same. The exceptions to this are - * - user memory registered using ::cuMemHostRegister - * - host memory allocated using ::cuMemHostAlloc with the - * ::CU_MEMHOSTALLOC_WRITECOMBINED flag - * For these types of allocation there will exist separate, disjoint host - * and device addresses for accessing the allocation. In particular - * - The host address will correspond to an invalid unmapped device address - * (which will result in an exception if accessed from the device) - * - The device address will correspond to an invalid unmapped host address - * (which will result in an exception if accessed from the host). - * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host - * and device addresses from either address. - * - * \param data - Returned pointer attribute value - * \param attribute - Pointer attribute to query - * \param ptr - Pointer - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuPointerSetAttribute, - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuMemAllocHost, - * ::cuMemFreeHost, - * ::cuMemHostAlloc, - * ::cuMemHostRegister, - * ::cuMemHostUnregister, - * ::cudaPointerGetAttributes - */ -CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); - -/** - * \brief Prefetches memory to the specified destination device - * - * Prefetches memory to the specified destination device. \p devPtr is the - * base device pointer of the memory to be prefetched and \p dstDevice is the - * destination device. \p count specifies the number of bytes to copy. 
\p hStream - * is the stream in which the operation is enqueued. The memory range must refer - * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. - * - * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If - * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - * must be non-zero. Additionally, \p hStream must be associated with a device that has a - * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * - * The start address and end address of the memory range will be rounded down and rounded up - * respectively to be aligned to CPU page size before the prefetch operation is enqueued - * in the stream. - * - * If no physical memory has been allocated for this region, then this memory region - * will be populated and mapped on the destination device. If there's insufficient - * memory to prefetch the desired region, the Unified Memory driver may evict pages from other - * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory - * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. - * - * By default, any mappings to the previous location of the migrated pages are removed and - * mappings for the new location are only setup on \p dstDevice. The exact behavior however - * also depends on the settings applied to this memory range via ::cuMemAdvise as described - * below: - * - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, - * then that subset will create a read-only copy of the pages on \p dstDevice. - * - * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory - * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the - * preferred location of any pages in the memory range. 
- * - * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, - * then mappings to those pages from all the appropriate processors are updated to - * refer to the new location if establishing such a mapping is possible. Otherwise, - * those mappings are cleared. - * - * Note that this API is not required for functionality and only serves to improve performance - * by allowing the application to migrate data to a suitable location before it is accessed. - * Memory accesses to this range are always coherent and are allowed even when the data is - * actively being migrated. - * - * Note that this function is asynchronous with respect to the host and all work - * on other devices. - * - * \param devPtr - Pointer to be prefetched - * \param count - Size in bytes - * \param dstDevice - Destination device to prefetch to - * \param hStream - Stream to enqueue prefetch operation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, - * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, - * ::cudaMemPrefetchAsync - */ -CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); - -/** - * \brief Advise about the usage of a given memory range - * - * Advise the Unified Memory subsystem about the usage pattern for the memory range - * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory - * range will be rounded down and rounded up respectively to be aligned to CPU page size before the - * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged - * or declared via __managed__ variables. 
The memory range could also refer to system-allocated pageable - * memory provided it represents a valid, host-accessible region of memory and all additional constraints - * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable - * memory range results in an error being returned. - * - * The \p advice parameter can take the following values: - * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - * from and only occasionally written to. Any read accesses from any processor to this region will create a - * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync - * is called on this region, it will create a read-only copy of the data on the destination processor. - * If any processor writes to this region, all copies of the corresponding page will be invalidated - * except for the one where the write occurred. The \p device argument is ignored for this advice. - * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU - * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * Also, if a context is created on a device that does not have the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until - * all such contexts are destroyed. - * If the memory region refers to valid system-allocated pageable memory, then the accessing device must - * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only - * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice - * will not create a read-only copy when that device accesses this memory region. 
- * - * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the - * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated - * copies of the data will be collapsed into a single copy. The location for the collapsed - * copy will be the preferred location if the page has a preferred location and one of the read-duplicated - * copies was resident at that location. Otherwise, the location chosen is arbitrary. - * - * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location - * does not cause data to migrate to that location immediately. Instead, it guides the migration policy - * when a fault occurs on that memory region. If the data is already in its preferred location and the - * faulting processor can establish a mapping without requiring the data to be migrated, then - * data migration will be avoided. On the other hand, if the data is not in its preferred location - * or if a direct mapping cannot be established, then it will be migrated to the processor accessing - * it. It is important to note that setting the preferred location does not prevent data prefetching - * done using ::cuMemPrefetchAsync. - * Having a preferred location can override the page thrash detection and resolution logic in the Unified - * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device - * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But - * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. 
- * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice, unless read accesses from - * \p device will not result in a read-only copy being created on that device as outlined in description for - * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. Note however that this behavior may change in the future. - * - * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - * and changes the preferred location to none. - * - * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then - * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. - * This advice does not cause data migration and has no impact on the location of the data per se. Instead, - * it causes the data to always be mapped in the specified processor's page tables, as long as the - * location of the data permits a mapping to be established. If the data gets migrated for any reason, - * the mappings are updated accordingly. - * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. - * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - * data located on one GPU is occasionally accessed by peer GPUs. 
In such scenarios, migrating data - * over to the other GPUs is not as important because the accesses are infrequent and the overhead of - * migration may be too high. But preventing faults can still help improve performance, and so having - * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the - * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - * page in host memory. - * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice. Additionally, if the - * preferred location of this memory region or any subset of it is also \p device, then the policies - * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. - * - * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to - * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. - * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero - * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has - * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, - * then this call has no effect. 
- * - * \param devPtr - Pointer to memory to set the advice for - * \param count - Size in bytes of the memory range - * \param advice - Advice to be applied for the specified memory range - * \param device - Device to apply the advice for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, - * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, - * ::cudaMemAdvise - */ -CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); - -/** - * \brief Query an attribute of a given memory range - * - * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The - * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via - * __managed__ variables. - * - * The \p attribute parameter can take the following values: - * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted - * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given - * memory range have read-duplication enabled, or 0 otherwise. - * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be - * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device - * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU - * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID - * if either all the pages don't have the same preferred location or some of the pages don't have a - * preferred location at all. Note that the actual location of the pages in the memory range at the time of - * the query may be different from the preferred location. 
- * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted - * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned - * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. - * If any device does not have that advice set for the entire memory range, that device will not be included. - * If \p data is larger than the number of devices that have that advice set for that memory range, - * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 - * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be - * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have - * that advice set, then only as many devices will be returned as can fit in the array. There is no - * guarantee on which specific devices will be returned, however. - * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be - * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location - * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be - * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU - * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not - * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the - * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to - * whether the prefetch operation to that location has completed or even begun. - * - * \param data - A pointers to a memory location where the result - * of each attribute query will be written to. 
- * \param dataSize - Array containing the size of data - * \param attribute - The attribute to query - * \param devPtr - Start of the range to query - * \param count - Size of the range to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, - * ::cuMemAdvise, - * ::cudaMemRangeGetAttribute - */ -CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); - -/** - * \brief Query attributes of a given memory range. - * - * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The - * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via - * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes - * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. - * The results of the query will be stored in \p data. - * - * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for - * attribute descriptions and restrictions. - * - * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY - * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION - * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY - * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION - * - * \param data - A two-dimensional array containing pointers to memory - * locations where the result of each attribute query will be written to. 
- * \param dataSizes - Array containing the sizes of each result - * \param attributes - An array of attributes to query - * (numAttributes and the number of attributes in this array should match) - * \param numAttributes - Number of attributes to query - * \param devPtr - Start of the range to query - * \param count - Size of the range to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, - * ::cuMemPrefetchAsync, - * ::cudaMemRangeGetAttributes - */ -CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); - -/** - * \brief Set attributes on a previously allocated memory region - * - * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: - * - * A boolean attribute that can either be set (1) or unset (0). When set, - * the region of memory that \p ptr points to is guaranteed to always synchronize - * memory operations that are synchronous. If there are some previously initiated - * synchronous memory operations that are pending when this attribute is set, the - * function does not return until those memory operations are complete. - * See further documentation in the section titled "API synchronization behavior" - * to learn more about cases when synchronous memory operations can - * exhibit asynchronous behavior. - * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. 
- * - * \param value - Pointer to memory containing the value to be set - * \param attribute - Pointer attribute to set - * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa ::cuPointerGetAttribute, - * ::cuPointerGetAttributes, - * ::cuMemAlloc, - * ::cuMemFree, - * ::cuMemAllocHost, - * ::cuMemFreeHost, - * ::cuMemHostAlloc, - * ::cuMemHostRegister, - * ::cuMemHostUnregister - */ -CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); - -/** - * \brief Returns information about a pointer. - * - * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE - * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER - * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS - * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID - * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED - * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR - * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE - * - ::CU_POINTER_ATTRIBUTE_MAPPED - * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE - * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES - * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE - * - * \param numAttributes - Number of attributes to query - * \param attributes - An array of attributes to query - * (numAttributes and the number of attributes in this array should match) - * \param data - A two-dimensional array containing pointers to memory - * locations where the result of each attribute query will be written to. 
- * \param ptr - Pointer to query - * - * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr - * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values - * and CUDA_SUCCESS is returned. - * - * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA - * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuPointerGetAttribute, - * ::cuPointerSetAttribute, - * ::cudaPointerGetAttributes - */ -CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); - -/** @} */ /* END CUDA_UNIFIED */ - -/** - * \defgroup CUDA_STREAM Stream Management - * - * ___MANBRIEF___ stream management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Create a stream - * - * Creates a stream and returns a handle in \p phStream. The \p Flags argument - * determines behaviors of the stream. - * - * Valid values for \p Flags are: - * - ::CU_STREAM_DEFAULT: Default stream creation flag. - * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created - * stream may run concurrently with work in stream 0 (the NULL stream), and that - * the created stream should perform no implicit synchronization with stream 0. 
- * - * \param phStream - Returned newly created stream - * \param Flags - Parameters for stream creation - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags - */ -CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); - -/** - * \brief Create a stream with the given priority - * - * Creates a stream with the specified priority and returns a handle in \p phStream. - * This API alters the scheduler priority of work in the stream. Work in a higher - * priority stream may preempt work already executing in a low priority stream. - * - * \p priority follows a convention where lower numbers represent higher priorities. - * '0' represents default priority. The range of meaningful numerical priorities can - * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is - * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, - * it will automatically be clamped to the lowest or the highest number in the range. - * - * \param phStream - Returned newly created stream - * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of - * valid flags - * \param priority - Stream priority. Lower numbers represent higher priorities. - * See ::cuCtxGetStreamPriorityRange for more information about - * meaningful stream priorities that can be passed. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \note Stream priorities are supported only on GPUs - * with compute capability 3.5 or higher. - * - * \note In the current implementation, only compute kernels launched in - * priority streams are affected by the stream's priority. Stream priorities have - * no effect on host-to-device and device-to-host memory operations. - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamGetPriority, - * ::cuCtxGetStreamPriorityRange, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreateWithPriority - */ -CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); - - -/** - * \brief Query the priority of a given stream - * - * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority - * and return the priority in \p priority. Note that if the stream was created with a - * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, - * this function returns the clamped priority. - * See ::cuStreamCreateWithPriority for details about priority clamping. 
- * - * \param hStream - Handle to the stream to be queried - * \param priority - Pointer to a signed integer in which the stream's priority is returned - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamCreateWithPriority, - * ::cuCtxGetStreamPriorityRange, - * ::cuStreamGetFlags, - * ::cudaStreamGetPriority - */ -CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); - -/** - * \brief Query the flags of a given stream - * - * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority - * and return the flags in \p flags. - * - * \param hStream - Handle to the stream to be queried - * \param flags - Pointer to an unsigned integer in which the stream's flags are returned - * The value returned in \p flags is a logical 'OR' of all flags that - * were used while creating this stream. See ::cuStreamCreate for the list - * of valid flags - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreate, - * ::cuStreamGetPriority, - * ::cudaStreamGetFlags - */ -CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); - -/** - * \brief Query the context associated with a stream - * - * Returns the CUDA context that the stream is associated with. - * - * The stream handle \p hStream can refer to any of the following: - *
    - *
  • a stream created via any of the CUDA driver APIs such as ::cuStreamCreate - * and ::cuStreamCreateWithPriority, or their runtime API equivalents such as - * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. - * The returned context is the context that was active in the calling thread when the - * stream was created. Passing an invalid handle will result in undefined behavior.
  • - *
  • any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and - * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, - * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. - * Specifying any of the special handles will return the context current to the - * calling thread. If no context is current to the calling thread, - * ::CUDA_ERROR_INVALID_CONTEXT is returned.
  • - *
- * - * \param hStream - Handle to the stream to be queried - * \param pctx - Returned context associated with the stream - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * \notefnerr - * - * \sa ::cuStreamDestroy, - * ::cuStreamCreateWithPriority, - * ::cuStreamGetPriority, - * ::cuStreamGetFlags, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags - */ -CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - -/** - * \brief Make a compute stream wait on an event - * - * Makes all future work submitted to \p hStream wait for all work captured in - * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. - * The synchronization will be performed efficiently on the device when applicable. - * \p hEvent may be from a different context or device than \p hStream. - * - * flags include: - * - ::CU_EVENT_WAIT_DEFAULT: Default event wait flag. - * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external - * event node when performing stream capture. This flag is invalid outside - * of stream capture.
- * - * \param hStream - Stream to wait - * \param hEvent - Event to wait on (may not be NULL) - * \param Flags - See ::CUevent_capture_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuEventRecord, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cuStreamDestroy, - * ::cudaStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); - -/** - * \brief Add a callback to a compute stream - * - * \note This function is slated for eventual deprecation and removal. If - * you do not require the callback to execute in case of a device error, - * consider using ::cuLaunchHostFunc. Additionally, this function is not - * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike - * ::cuLaunchHostFunc. - * - * Adds a callback to be called on the host after all currently enqueued - * items in the stream have completed. For each - * cuStreamAddCallback call, the callback will be executed exactly once. - * The callback will block later work in the stream until it is finished. - * - * The callback may be passed ::CUDA_SUCCESS or an error code. In the event - * of a device error, all subsequently executed callbacks will receive an - * appropriate ::CUresult. - * - * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API - * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any - * synchronization that may depend on outstanding device work or other callbacks - * that are not mandated to run earlier. Callbacks without a mandated order - * (in independent streams) execute in undefined order and may be serialized. - * - * For the purposes of Unified Memory, callback execution makes a number of - * guarantees: - *
    - *
  • The callback stream is considered idle for the duration of the - * callback. Thus, for example, a callback may always use memory attached - * to the callback stream.
  • - *
  • The start of execution of a callback has the same effect as - * synchronizing an event recorded in the same stream immediately prior to - * the callback. It thus synchronizes streams which have been "joined" - * prior to the callback.
  • - *
  • Adding device work to any stream does not have the effect of making - * the stream active until all preceding host functions and stream callbacks - * have executed. Thus, for - * example, a callback might use global attached memory even if work has - * been added to another stream, if the work has been ordered behind the - * callback with an event.
  • - *
  • Completion of a callback does not cause a stream to become - * active except as described above. The callback stream will remain idle - * if no device work follows the callback, and will remain idle across - * consecutive callbacks without device work in between. Thus, for example, - * stream synchronization can be done by signaling from a callback at the - * end of the stream.
  • - *
- * - * \param hStream - Stream to add callback to - * \param callback - The function to call once preceding stream operations are complete - * \param userData - User specified data to be passed to the callback function - * \param flags - Reserved for future use, must be 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cuStreamAttachMemAsync, - * ::cuLaunchHostFunc, - * ::cudaStreamAddCallback - */ -CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); - -/** - * \brief Begins graph capture on a stream - * - * Begin graph capture on \p hStream. When a stream is in capture mode, all operations - * pushed into the stream will not be executed, but will instead be captured into - * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated - * if \p hStream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which - * it was initiated, and it may only be initiated if the stream is not already in capture - * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id - * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. - * - * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be - * called on this stream from the same thread. - * - * \param hStream - Stream in which to initiate capture - * \param mode - Controls the interaction of this capture sequence with other API - * calls that are potentially unsafe. For more details see - * ::cuThreadExchangeStreamCaptureMode.
- * - * \note Kernels captured using this API must not use texture and surface references. - * Reading or writing through any texture or surface reference is undefined - * behavior. This restriction does not apply to texture and surface objects. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamIsCapturing, - * ::cuStreamEndCapture, - * ::cuThreadExchangeStreamCaptureMode - */ -CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); - -/** - * \brief Swaps the stream capture interaction mode for a thread - * - * Sets the calling thread's stream capture interaction mode to the value contained - * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To - * facilitate deterministic behavior across function or module boundaries, callers - * are encouraged to use this API in a push-pop fashion: \code - CUstreamCaptureMode mode = desiredMode; - cuThreadExchangeStreamCaptureMode(&mode); - ... - cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode - * \endcode - * - * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call - * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is - * not enqueued asynchronously to a stream, and is not observed by stream capture. - * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture - * depended on the allocation being replayed whenever the graph is launched, the - * captured graph would be invalid. - * - * Therefore, stream capture places restrictions on API calls that can be made within - * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This - * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. 
- * - * A thread's mode is one of the following: - * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has - * an ongoing capture sequence that was not initiated with - * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread - * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, - * this thread is prohibited from potentially unsafe API calls. - * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture - * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited - * from potentially unsafe API calls. Concurrent capture sequences in other threads - * are ignored. - * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially - * unsafe API calls. Note that the thread is still prohibited from API calls which - * necessarily conflict with stream capture, for example, attempting ::cuEventQuery - * on an event that was last recorded inside a capture sequence. - * - * \param mode - Pointer to mode value to swap with the current mode - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuStreamBeginCapture - */ -CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); - -/** - * \brief Ends capture on a stream, returning the captured graph - * - * End capture on \p hStream, returning the captured graph via \p phGraph. - * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. - * If capture was invalidated, due to a violation of the rules of stream capture, then - * a NULL graph will be returned. - * - * If the \p mode argument to ::cuStreamBeginCapture was not - * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as - * ::cuStreamBeginCapture. 
- * - * \param hStream - Stream to query - * \param phGraph - The captured graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing - */ -CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); - -/** - * \brief Returns a stream's capture status - * - * Return the capture status of \p hStream via \p captureStatus. After a successful - * call, \p *captureStatus will contain one of the following: - * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. - * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. - * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error - * has invalidated the capture sequence. The capture sequence must be terminated - * with ::cuStreamEndCapture on the stream where it was initiated in order to - * continue using \p hStream. - * - * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while - * a blocking stream in the same context is capturing, it will return - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified - * after the call. The blocking stream capture is not invalidated. - * - * When a blocking stream is capturing, the legacy stream is in an - * unusable state until the blocking stream capture is terminated. The legacy - * stream is not supported for stream capture, but attempted use would have an - * implicit dependency on the capturing stream(s). 
- * - * \param hStream - Stream to query - * \param captureStatus - Returns the stream's capture status - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \notefnerr - * - * \sa - * ::cuStreamCreate, - * ::cuStreamBeginCapture, - * ::cuStreamEndCapture - */ -CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); - -/** - * \brief Query capture status of a stream - * - * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will - * supplant this version in 12.0, which is retained for minor version compatibility. - * - * Query the capture status of a stream and get an id for - * the capture sequence, which is unique over the lifetime of the process. - * - * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created - * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. - * - * A valid id is returned only if both of the following are true: - * - the call returns CUDA_SUCCESS - * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \notefnerr - * - * \sa - * ::cuStreamGetCaptureInfo_v2, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing - */ -CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); - -/** - * \brief Query a stream's capture state (11.3+) - * - * Query stream state related to stream capture. - * - * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created - * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
- * - * Valid data (other than capture status) is returned only if both of the following are true: - * - the call returns CUDA_SUCCESS - * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE - * - * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the - * previous version in 12.0. Developers requiring compatibility across minor versions to - * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback - * path. - * - * \param hStream - The stream to query - * \param captureStatus_out - Location to return the capture status of the stream; required - * \param id_out - Optional location to return an id for the capture sequence, which is - * unique over the lifetime of the process - * \param graph_out - Optional location to return the graph being captured into. All - * operations other than destroy and node removal are permitted on the graph - * while the capture sequence is in progress. This API does not transfer - * ownership of the graph, which is transferred or destroyed at - * ::cuStreamEndCapture. Note that the graph handle may be invalidated before - * end of capture for certain errors. Nodes that are or become - * unreachable from the original stream at ::cuStreamEndCapture due to direct - * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. - * \param dependencies_out - Optional location to store a pointer to an array of nodes. - * The next node to be captured in the stream will depend on this set of nodes, - * absent operations such as event wait which modify this set. The array pointer - * is valid until the next API call which operates on the stream or until end of - * capture. The node handles may be copied out and are valid until they or the - * graph is destroyed. The driver-owned array may also be passed directly to - * APIs that operate on the graph (not the stream) without copying. 
- * \param numDependencies_out - Optional location to store the size of the array - * returned in dependencies_out. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuStreamGetCaptureInfo, - * ::cuStreamBeginCapture, - * ::cuStreamIsCapturing, - * ::cuStreamUpdateCaptureDependencies - */ -CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, - cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - -/** - * \brief Update the set of dependencies in a capturing stream (11.3+) - * - * Modifies the dependency set of a capturing stream. The dependency set is the set - * of nodes that the next captured node in the stream will depend on. - * - * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and - * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to - * the API is added to the existing set or replaces it. A flags value of 0 defaults - * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. - * - * Nodes that are removed from the dependency set via this API do not result in - * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at - * ::cuStreamEndCapture. - * - * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. - * - * This API is new in CUDA 11.3. Developers requiring compatibility across minor - * versions to CUDA 11.0 should not use this API or provide a fallback. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_ILLEGAL_STATE - * - * \sa - * ::cuStreamBeginCapture, - * ::cuStreamGetCaptureInfo, - * ::cuStreamGetCaptureInfo_v2 - */ -CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); - -/** - * \brief Attach memory to a stream asynchronously - * - * Enqueues an operation in \p hStream to specify stream association of - * \p length bytes of memory starting from \p dptr. This function is a - * stream-ordered operation, meaning that it is dependent on, and will - * only take effect when, previous work in stream has completed. Any - * previous association is automatically replaced. - * - * \p dptr must point to one of the following types of memories: - * - managed memory declared using the __managed__ keyword or allocated with - * ::cuMemAllocManaged. - * - a valid host-accessible region of system-allocated pageable memory. This - * type of memory may only be specified if the device associated with the - * stream reports a non-zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. - * - * For managed allocations, \p length must be either zero or the entire - * allocation's size. Both indicate that the entire allocation's stream - * association is being changed. Currently, it is not possible to change stream - * association for a portion of a managed allocation. - * - * For pageable host allocations, \p length must be non-zero. - * - * The stream association is specified using \p flags which must be - * one of ::CUmemAttach_flags. - * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed - * by any stream on any device. 
- * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee - * that it won't access the memory on the device from any stream on a device that - * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. - * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with - * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, - * the program makes a guarantee that it will only access the memory on the device - * from \p hStream. It is illegal to attach singly to the NULL stream, because the - * NULL stream is a virtual global stream and not a specific stream. An error will - * be returned in this case. - * - * When memory is associated with a single stream, the Unified Memory system will - * allow CPU access to this memory region so long as all operations in \p hStream - * have completed, regardless of whether other streams are active. In effect, - * this constrains exclusive ownership of the managed memory region by - * an active GPU to per-stream activity instead of whole-GPU activity. - * - * Accessing memory on the device from streams that are not associated with - * it will produce undefined results. No error checking is performed by the - * Unified Memory system to ensure that kernels launched into other streams - * do not access this region. - * - * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync - * via events, synchronization or other means to ensure legal access to memory - * at all times. Data visibility and coherency will be changed appropriately - * for all kernels which follow a stream-association change. - * - * If \p hStream is destroyed while data is associated with it, the association is - * removed and the association reverts to the default visibility of the allocation - * as specified at ::cuMemAllocManaged. 
For __managed__ variables, the default - * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an - * asynchronous operation, and as a result, the change to default association won't - * happen until all work in the stream has completed. - * - * \param hStream - Stream in which to enqueue the attach operation - * \param dptr - Pointer to memory (must be a pointer to managed memory or - * to a valid host-accessible region of system-allocated - * pageable memory) - * \param length - Length of memory - * \param flags - Must be one of ::CUmemAttach_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cudaStreamAttachMemAsync - */ -CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); - -/** - * \brief Determine status of a compute stream - * - * Returns ::CUDA_SUCCESS if all operations in the stream specified by - * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. - * - * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS - * is equivalent to having called ::cuStreamSynchronize(). 
- * - * \param hStream - Stream to query status of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_READY - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamQuery - */ -CUresult CUDAAPI cuStreamQuery(CUstream hStream); - -/** - * \brief Wait until a stream's tasks are completed - * - * Waits until the device has completed all operations in the stream specified - * by \p hStream. If the context was created with the - * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the - * stream is finished with all of its tasks. - * - * \param hStream - Stream to wait for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamDestroy, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamAddCallback, - * ::cudaStreamSynchronize - */ -CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); - -/** - * \brief Destroys a stream - * - * Destroys the stream specified by \p hStream. - * - * In case the device is still doing work in the stream \p hStream - * when ::cuStreamDestroy() is called, the function will return immediately - * and the resources associated with \p hStream will be released automatically - * once the device has completed all work in \p hStream. 
- * - * \param hStream - Stream to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamWaitEvent, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamAddCallback, - * ::cudaStreamDestroy - */ -CUresult CUDAAPI cuStreamDestroy(CUstream hStream); - -/** - * \brief Copies attributes from source stream to destination stream. - * - * Copies attributes from source stream \p src to destination stream \p dst. - * Both streams must have the same context. - * - * \param[out] dst Destination stream - * \param[in] src Source stream - * For list of attributes see ::CUstreamAttrID - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); - -/** - * \brief Queries stream attribute. - * - * Queries attribute \p attr from \p hStream and stores it in corresponding - * member of \p value_out. - * - * \param[in] hStream - * \param[in] attr - * \param[out] value_out - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, - CUstreamAttrValue *value_out); - -/** - * \brief Sets stream attribute. - * - * Sets attribute \p attr on \p hStream from corresponding attribute of - * \p value. The updated attribute will be applied to subsequent work - * submitted to the stream. It will not affect previously submitted work. 
- * - * \param[out] hStream - * \param[in] attr - * \param[in] value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, - const CUstreamAttrValue *value); - -/** @} */ /* END CUDA_STREAM */ - - -/** - * \defgroup CUDA_EVENT Event Management - * - * ___MANBRIEF___ event management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the event management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Creates an event - * - * Creates an event *phEvent for the current context with the flags specified via - * \p Flags. Valid flags include: - * - ::CU_EVENT_DEFAULT: Default event creation flag. - * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking - * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on - * an event created with this flag will block until the event has actually - * been recorded. - * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need - * to record timing data. Events created with this flag specified and - * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best - * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). - * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an - * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must - * be specified along with ::CU_EVENT_DISABLE_TIMING. 
- * - * \param phEvent - Returns newly created event - * \param Flags - Event creation flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \notefnerr - * - * \sa - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventCreate, - * ::cudaEventCreateWithFlags - */ -CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); - -/** - * \brief Records an event - * - * Captures in \p hEvent the contents of \p hStream at the time of this call. - * \p hEvent and \p hStream must be from the same context. - * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then - * examine or wait for completion of the work that was captured. Uses of - * \p hStream after this call do not modify \p hEvent. See note on default - * stream behavior for what is captured in the default case. - * - * ::cuEventRecord() can be called multiple times on the same event and - * will overwrite the previously captured state. Other APIs such as - * ::cuStreamWaitEvent() use the most recently captured state at the time - * of the API call, and are not affected by later calls to - * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an - * event represents an empty set of work, so for example ::cuEventQuery() - * would return ::CUDA_SUCCESS. 
- * - * \param hEvent - Event to record - * \param hStream - Stream to record event for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \note_null_stream - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuStreamWaitEvent, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventRecord, - * ::cuEventRecordWithFlags - */ -CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); - -/** - * \brief Records an event - * - * Captures in \p hEvent the contents of \p hStream at the time of this call. - * \p hEvent and \p hStream must be from the same context. - * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then - * examine or wait for completion of the work that was captured. Uses of - * \p hStream after this call do not modify \p hEvent. See note on default - * stream behavior for what is captured in the default case. - * - * ::cuEventRecordWithFlags() can be called multiple times on the same event and - * will overwrite the previously captured state. Other APIs such as - * ::cuStreamWaitEvent() use the most recently captured state at the time - * of the API call, and are not affected by later calls to - * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an - * event represents an empty set of work, so for example ::cuEventQuery() - * would return ::CUDA_SUCCESS. - * - * flags include: - * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. - * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external - * event node when performing stream capture. This flag is invalid outside - * of stream capture. 
- * - * \param hEvent - Event to record - * \param hStream - Stream to record event for - * \param flags - See ::CUevent_capture_flags - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \note_null_stream - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuStreamWaitEvent, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cuEventRecord, - * ::cudaEventRecord - */ -CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); - -/** - * \brief Queries an event's status - * - * Queries the status of all work currently captured by \p hEvent. See - * ::cuEventRecord() for details on what is captured by an event. - * - * Returns ::CUDA_SUCCESS if all captured work has been completed, or - * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. - * - * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS - * is equivalent to having called ::cuEventSynchronize(). - * - * \param hEvent - Event to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_READY - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventQuery - */ -CUresult CUDAAPI cuEventQuery(CUevent hEvent); - -/** - * \brief Waits for an event to complete - * - * Waits until the completion of all work currently captured in \p hEvent. - * See ::cuEventRecord() for details on what is captured by an event. - * - * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC - * flag will cause the calling CPU thread to block until the event has - * been completed by the device. 
If the ::CU_EVENT_BLOCKING_SYNC flag has - * not been set, then the CPU thread will busy-wait until the event has - * been completed by the device. - * - * \param hEvent - Event to wait for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventDestroy, - * ::cuEventElapsedTime, - * ::cudaEventSynchronize - */ -CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); - -/** - * \brief Destroys an event - * - * Destroys the event specified by \p hEvent. - * - * An event may be destroyed before it is complete (i.e., while - * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the - * call does not block on completion of the event, and any associated - * resources will automatically be released asynchronously at completion. - * - * \param hEvent - Event to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventElapsedTime, - * ::cudaEventDestroy - */ -CUresult CUDAAPI cuEventDestroy(CUevent hEvent); - -/** - * \brief Computes the elapsed time between two events - * - * Computes the elapsed time between two events (in milliseconds with a - * resolution of around 0.5 microseconds). - * - * If either event was last recorded in a non-NULL stream, the resulting time - * may be greater than expected (even if both used the same stream handle). This - * happens because the ::cuEventRecord() operation takes place asynchronously - * and there is no guarantee that the measured latency is actually just between - * the two events. 
Any number of other different stream operations could execute - * in between the two measured events, thus altering the timing in a significant - * way. - * - * If ::cuEventRecord() has not been called on either event then - * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called - * on both events but one or both of them has not yet been completed (that is, - * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the - * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with - * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return - * ::CUDA_ERROR_INVALID_HANDLE. - * - * \param pMilliseconds - Time between \p hStart and \p hEnd in ms - * \param hStart - Starting event - * \param hEnd - Ending event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_READY - * \notefnerr - * - * \sa ::cuEventCreate, - * ::cuEventRecord, - * ::cuEventQuery, - * ::cuEventSynchronize, - * ::cuEventDestroy, - * ::cudaEventElapsedTime - */ -CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); - -/** @} */ /* END CUDA_EVENT */ - -/** - * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability - * - * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the external resource interoperability functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - - /** - * \brief Imports an external memory object - * - * Imports an externally allocated memory object and returns - * a handle to that in \p extMem_out. - * - * The properties of the handle being imported must be described in - * \p memHandleDesc. 
The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure - * is defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { - CUexternalMemoryHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - const void *nvSciBufObject; - } handle; - unsigned long long size; - unsigned int flags; - } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type - * of handle being imported. ::CUexternalMemoryHandleType is - * defined as: - * - * \code - typedef enum CUexternalMemoryHandleType_enum { - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, - CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 - } CUexternalMemoryHandleType; - * \endcode - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a memory object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a memory object. 
Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a memory object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * be non-NULL and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * must be NULL. The handle specified must be a globally shared KMT - * handle. This handle does not hold a reference to the underlying - * object, and thus will be invalid when all references to the - * memory object are destroyed. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Heap object. This handle holds a reference to the underlying - * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D12Heap object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one - * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be - * NULL. 
If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Resource object. This handle holds a reference to the - * underlying object. If - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D12Resource object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * represent a valid shared NT handle that is returned by - * IDXGIResource1::CreateSharedHandle when referring to a - * ID3D11Resource object. If - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * is not NULL, then it must point to a NULL-terminated array of - * UTF-16 characters that refers to a ID3D11Resource object. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must - * represent a valid shared KMT handle that is returned by - * IDXGIResource::GetSharedHandle when referring to a - * ID3D11Resource object and - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name - * must be NULL. - * - * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL - * and reference a valid NvSciBuf object. - * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the - * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync - * as appropriate barriers to maintain coherence between CUDA and the other drivers. 
- * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC - * for memory synchronization. - * - * - * The size of the memory object must be specified in - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. - * - * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in - * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the - * resource is a dedicated resource. The definition of what a - * dedicated resource is outside the scope of this extension. - * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type - * is one of the following: - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT - * - * \param extMem_out - Returned handle to an external memory object - * \param memHandleDesc - Memory import handle descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the - * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges - * as well as appropriate Vulkan pipeline barriers to maintain coherence between - * CPU and GPU. For more information on these APIs, please refer to "Synchronization - * and Cache Control" chapter from Vulkan specification. - * - * \sa ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedBuffer, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); - -/** - * \brief Maps a buffer onto an imported memory object - * - * Maps a buffer onto an imported memory object and returns a device - * pointer in \p devPtr. - * - * The properties of the buffer being mapped must be described in - * \p bufferDesc. 
The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is - * defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { - unsigned long long offset; - unsigned long long size; - unsigned int flags; - } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in - * the memory object where the buffer's base address is. - * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. - * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. - * - * The offset and size have to be suitably aligned to match the - * requirements of the external API. Mapping two buffers whose ranges - * overlap may or may not result in the same virtual address being - * returned for the overlapped portion. In such cases, the application - * must ensure that all accesses to that region from the GPU are - * volatile. Otherwise writes made via one address are not guaranteed - * to be visible via the other address, even if they're issued by the - * same thread. It is recommended that applications map the combined - * range instead of mapping separate buffers and then apply the - * appropriate offsets to the returned pointer to derive the - * individual buffers. - * - * The returned pointer \p devPtr must be freed using ::cuMemFree. 
- * - * \param devPtr - Returned device pointer to buffer - * \param extMem - Handle to external memory object - * \param bufferDesc - Buffer descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); - -/** - * \brief Maps a CUDA mipmapped array onto an external memory object - * - * Maps a CUDA mipmapped array onto an external object and returns a - * handle to it in \p mipmap. - * - * The properties of the CUDA mipmapped array being mapped must be - * described in \p mipmapDesc. The structure - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { - unsigned long long offset; - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - unsigned int numLevels; - } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the - * offset in the memory object where the base level of the mipmap - * chain is. - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes - * the format, dimensions and type of the base level of the mipmap - * chain. For further details on these parameters, please refer to the - * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped - * array is bound as a color target in the graphics API, then the flag - * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies - * the total number of levels in the mipmap chain. 
- * - * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then - * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. - * - * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. - * - * \param mipmap - Returned CUDA mipmapped array - * \param extMem - Handle to external memory object - * \param mipmapDesc - CUDA array descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuDestroyExternalMemory, - * ::cuExternalMemoryGetMappedBuffer - */ -CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); - -/** - * \brief Destroys an external memory object. - * - * Destroys the specified external memory object. Any existing buffers - * and CUDA mipmapped arrays mapped onto this object must no longer be - * used and must be explicitly freed using ::cuMemFree and - * ::cuMipmappedArrayDestroy respectively. - * - * \param extMem - External memory object to be destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalMemory, - * ::cuExternalMemoryGetMappedBuffer, - * ::cuExternalMemoryGetMappedMipmappedArray - */ -CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); - -/** - * \brief Imports an external semaphore - * - * Imports an externally allocated synchronization object and returns - * a handle to that in \p extSem_out. - * - * The properties of the handle being imported must be described in - * \p semHandleDesc. 
The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is - * defined as follows: - * - * \code - typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { - CUexternalSemaphoreHandleType type; - union { - int fd; - struct { - void *handle; - const void *name; - } win32; - const void* NvSciSyncObj; - } handle; - unsigned int flags; - } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; - * \endcode - * - * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of - * handle being imported. ::CUexternalSemaphoreHandleType is defined - * as: - * - * \code - typedef enum CUexternalSemaphoreHandleType_enum { - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, - CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 - } CUexternalSemaphoreHandleType; - * \endcode - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a synchronization object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. 
If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a synchronization object. Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must - * be non-NULL and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * must be NULL. The handle specified must be a globally shared KMT - * handle. This handle does not hold a reference to the underlying - * object, and thus will be invalid when all references to the - * synchronization object are destroyed. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * is returned by ID3D12Device::CreateSharedHandle when referring to a - * ID3D12Fence object. This handle holds a reference to the underlying - * object. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid ID3D12Fence object. 
- * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared NT handle that is returned by - * ID3D11Fence::CreateSharedHandle. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid ID3D11Fence object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj - * represents a valid NvSciSyncObj. - * - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared NT handle that - * is returned by IDXGIResource1::CreateSharedHandle when referring to - * a IDXGIKeyedMutex object. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object that - * refers to a valid IDXGIKeyedMutex object. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * represents a valid shared KMT handle that - * is returned by IDXGIResource::GetSharedHandle when referring to - * a IDXGIKeyedMutex object and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid - * file descriptor referencing a synchronization object. Ownership of - * the file descriptor is transferred to the CUDA driver when the - * handle is imported successfully. 
Performing any operations on the - * file descriptor after it is imported results in undefined behavior. - * - * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one - * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be - * NULL. If - * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle - * is not NULL, then it must represent a valid shared NT handle that - * references a synchronization object. Ownership of this handle is - * not transferred to CUDA after the import operation, so the - * application must release the handle using the appropriate system - * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name - * is not NULL, then it must name a valid synchronization object. - * - * \param extSem_out - Returned handle to an external semaphore - * \param semHandleDesc - Semaphore import handle descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); - -/** - * \brief Signals a set of external semaphore objects - * - * Enqueues a signal operation on a set of externally allocated - * semaphore object in the specified stream. The operations will be - * executed when all prior operations in the stream complete. - * - * The exact semantics of signaling a semaphore depends on the type of - * the object. 
- * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * then signaling the semaphore will set it to the signaled state. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * then the semaphore will be set to the value specified in - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. - * - * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC - * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence - * to a value that can be used by subsequent waiters of the same NvSciSync object - * to order operations with those currently submitted in \p stream. Such an update - * will overwrite previous contents of - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, - * signaling such an external semaphore object causes appropriate memory synchronization - * operations to be performed over all external memory objects that are imported as - * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses - * made by other importers of the same set of NvSciBuf memory object(s) are coherent. - * These operations can be skipped by specifying the flag - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a - * performance optimization when data coherency is not required. But specifying this - * flag in scenarios where data coherency is required results in undefined behavior. 
- * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in - * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return - * CUDA_ERROR_NOT_SUPPORTED. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then the keyed mutex will be released with the key specified in - * ::CUDA_EXTERNAL_SEMAPHORE_PARAMS::params::keyedmutex::key. - * - * \param extSemArray - Set of external semaphores to be signaled - * \param paramsArray - Array of semaphore parameters - * \param numExtSems - Number of semaphores to signal - * \param stream - Stream to enqueue the signal operations in - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - -/** - * \brief Waits on a set of external semaphore objects - * - * Enqueues a wait operation on a set of externally allocated - * semaphore object in the specified stream. The operations will be - * executed when all prior operations in the stream complete. - * - * The exact semantics of waiting on a semaphore depends on the type - * of the object. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - * then waiting on the semaphore will wait until the semaphore reaches - * the signaled state. 
The semaphore will then be reset to the - * unsignaled state. Therefore for every signal operation, there can - * only be one wait operation. - * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 - * then waiting on the semaphore will wait until the value of the - * semaphore is greater than or equal to - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. - * - * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC - * then, waiting on the semaphore will wait until the - * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the - * signaler of the NvSciSyncObj that was associated with this semaphore object. - * By default, waiting on such an external semaphore object causes appropriate - * memory synchronization operations to be performed over all external memory objects - * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that - * any subsequent accesses made by other importers of the same set of NvSciBuf memory - * object(s) are coherent. These operations can be skipped by specifying the flag - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a - * performance optimization when data coherency is not required. But specifying this - * flag in scenarios where data coherency is required results in undefined behavior. - * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, - * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in - * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return - * CUDA_ERROR_NOT_SUPPORTED. 
- * - * If the semaphore object is any one of the following types: - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, - * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT - * then the keyed mutex will be acquired when it is released with the key - * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key - * or until the timeout specified by - * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs - * has lapsed. The timeout interval can either be a finite value - * specified in milliseconds or an infinite value. In case an infinite - * value is specified the timeout never elapses. The windows INFINITE - * macro must be used to specify infinite timeout. - * - * \param extSemArray - External semaphores to be waited on - * \param paramsArray - Array of semaphore parameters - * \param numExtSems - Number of semaphores to wait on - * \param stream - Stream to enqueue the wait operations in - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_TIMEOUT - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuDestroyExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync - */ -CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - -/** - * \brief Destroys an external semaphore - * - * Destroys an external semaphore object and releases any references - * to the underlying resource. Any outstanding signals or waits must - * have completed before the semaphore is destroyed. 
- * - * \param extSem - External semaphore to be destroyed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); - -/** @} */ /* END CUDA_EXTRES_INTEROP */ - -/** - * \defgroup CUDA_MEMOP Stream memory operations - * - * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream memory operations of the low-level CUDA - * driver application programming interface. - * - * The whole set of operations is disabled by default. Users are required - * to explicitly enable them, e.g. on Linux by passing the kernel module - * parameter shown below: - * modprobe nvidia NVreg_EnableStreamMemOPs=1 - * There is currently no way to enable these operations on other operating - * systems. - * - * Users can programmatically query whether the device supports these - * operations with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. - * - * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() - * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and - * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and - * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform - * hardware features and can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. - * - * Note that all memory pointers passed as parameters to these operations - * are device pointers. 
Where necessary a device pointer should be - * obtained, for example with ::cuMemHostGetDevicePointer(). - * - * None of the operations accepts pointers to managed memory buffers - * (::cuMemAllocManaged). - * - * @{ - */ - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. By default, the condition is to wait for - * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue64, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - -/** - * \brief Wait on a memory location - * - * Enqueues a synchronization of the stream on the given memory location. Work - * ordered after the operation will block until the given condition on the - * memory is satisfied. 
By default, the condition is to wait for - * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. - * Other condition types can be specified via \p flags. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * \param stream The stream to synchronize on the memory location. - * \param addr The memory location to wait on. - * \param value The value to compare with the memory location. - * \param flags See ::CUstreamWaitValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - * flag is passed, the write is preceded by a system-wide memory fence, - * equivalent to a __threadfence_system() but scoped to the stream - * rather than a CUDA thread. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot - * be used with managed memory (::cuMemAllocManaged). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue64, - * ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - -/** - * \brief Write a value to memory - * - * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER - * flag is passed, the write is preceded by a system-wide memory fence, - * equivalent to a __threadfence_system() but scoped to the stream - * rather than a CUDA thread. - * - * If the memory was registered via ::cuMemHostRegister(), the device pointer - * should be obtained with ::cuMemHostGetDevicePointer(). - * - * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. - * - * \param stream The stream to do the write in. - * \param addr The device address to write to. - * \param value The value to write. - * \param flags See ::CUstreamWriteValue_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWriteValue32, - * ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamBatchMemOp, - * ::cuMemHostRegister, - * ::cuEventRecord - */ -CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - -/** - * \brief Batch operations to synchronize the stream via memory operations - * - * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). - * Batching operations may avoid some performance overhead in both the API call - * and the device execution versus adding them to the stream in separate API - * calls. The operations are enqueued in the order they appear in the array. 
- * - * See ::CUstreamBatchMemOpType for the full set of supported operations, and - * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), - * and ::cuStreamWriteValue64() for details of specific operations. - * - * Basic support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details - * on querying support for specific operations. - * - * \param stream The stream to enqueue the operations in. - * \param count The number of operations in the array. Must be less than 256. - * \param paramArray The types and parameters of the individual operations. - * \param flags Reserved for future expansion; must be 0. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \notefnerr - * - * \sa ::cuStreamWaitValue32, - * ::cuStreamWaitValue64, - * ::cuStreamWriteValue32, - * ::cuStreamWriteValue64, - * ::cuMemHostRegister - */ -CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - -/** @} */ /* END CUDA_MEMOP */ - -/** - * \defgroup CUDA_EXEC Execution Control - * - * ___MANBRIEF___ execution control functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the execution control functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns information about a function - * - * Returns in \p *pi the integer value of the attribute \p attrib on the kernel - * given by \p hfunc. The supported attributes are: - * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads - * per block, beyond which a launch of the function would fail. This number - * depends on both the function and the device on which the function is - * currently loaded. 
- * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of - * statically-allocated shared memory per block required by this function. - * This does not include dynamically-allocated shared memory requested by - * the user at runtime. - * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated - * constant memory required by this function. - * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory - * used by each thread of this function. - * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread - * of this function. - * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for - * which the function was compiled. This value is the major PTX version * 10 - * + the minor PTX version, so a PTX version 1.3 function would return the - * value 13. Note that this may return the undefined value of 0 for cubins - * compiled prior to CUDA 3.0. - * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for - * which the function was compiled. This value is the major binary - * version * 10 + the minor binary version, so a binary version 1.3 function - * would return the value 13. Note that this will return a value of 10 for - * legacy cubins that do not have a properly-encoded binary architecture - * version. - * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the function has - * been compiled with user specified option "-Xptxas --dlcm=ca" set. - * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of - * dynamically-allocated shared memory. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 - * cache split ratio in percent of total shared memory.
- * - * \param pi - Returned attribute value - * \param attrib - Attribute requested - * \param hfunc - Function to query attribute of - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuLaunchKernel, - * ::cudaFuncGetAttributes, - * ::cudaFuncSetAttribute - */ -CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); - -/** - * \brief Sets information about a function - * - * This call sets the value of a specified attribute \p attrib on the kernel given - * by \p hfunc to an integer value specified by \p val - * This function returns CUDA_SUCCESS if the new value of the attribute could be - * successfully set. If the set fails, this call will return an error. - * Not all attributes can have values set. Attempting to set a value on a read-only - * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) - * - * Supported attributes for the cuFuncSetAttribute call are: - * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of - * dynamically-allocated shared memory. The value should contain the requested - * maximum size of dynamically-allocated shared memory. The sum of this value and - * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the - * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. - * The maximal size of requestable dynamic shared memory may differ by GPU - * architecture. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 - * cache and shared memory use the same hardware resources, this sets the shared memory - * carveout preference, in percent of the total shared memory. 
- * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR - * This is only a hint, and the driver can choose a different ratio if required to execute the function. - * - * \param hfunc - Function to query attribute of - * \param attrib - Attribute requested - * \param value - The value to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuLaunchKernel, - * ::cudaFuncGetAttributes, - * ::cudaFuncSetAttribute - */ -CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); - -/** - * \brief Sets the preferred cache configuration for a device function - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p config the preferred cache configuration for - * the device function \p hfunc. This is only a preference. The driver will use - * the requested configuration if possible, but it is free to choose a different - * configuration if required to execute \p hfunc. Any context-wide preference - * set via ::cuCtxSetCacheConfig() will be overridden by this per-function - * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In - * that case, the current context-wide setting will be used. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. - * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. 
- * - * - * The supported cache configurations are: - * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) - * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache - * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory - * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory - * - * \param hfunc - Kernel to configure cache for - * \param config - Requested cache configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchKernel, - * ::cudaFuncSetCacheConfig - */ -CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); - -/** - * \brief Sets the shared memory configuration for a device function. - * - * On devices with configurable shared memory banks, this function will - * force all subsequent launches of the specified device function to have - * the given shared memory bank size configuration. On any given launch of the - * function, the shared memory configuration of the device will be temporarily - * changed if needed to suit the function's preferred configuration. Changes in - * shared memory configuration between subsequent launches of functions, - * may introduce a device side synchronization point. - * - * Any per-function setting of shared memory bank size set via - * ::cuFuncSetSharedMemConfig will override the context wide setting set with - * ::cuCtxSetSharedMemConfig. - * - * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. 
- * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank - * conflicts. - * - * This function will do nothing on devices with fixed shared memory bank size. - * - * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory - * configuration when launching this function. - * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to - * be natively four bytes when launching this function. - * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to - * be natively eight bytes when launching this function. - * - * \param hfunc - kernel to be given a shared memory config - * \param config - requested shared memory configuration - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuCtxGetSharedMemConfig, - * ::cuCtxSetSharedMemConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchKernel, - * ::cudaFuncSetSharedMemConfig - */ -CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); - -/** - * \brief Returns a module handle - * - * Returns in \p *hmod the handle of the module that function \p hfunc - * is located in. The lifetime of the module corresponds to the lifetime of - * the context it was loaded in or until the module is explicitly unloaded. - * - * The CUDA runtime manages its own modules loaded into the primary context. - * If the handle returned by this API refers to a module loaded by the CUDA runtime, - * calling ::cuModuleUnload() on that module will result in undefined behavior. 
- * - * \param hmod - Returned module handle - * \param hfunc - Function to retrieve module for - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_FOUND - * \notefnerr - * - */ -CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); - -/** - * \brief Launches a CUDA function - * - * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ - * grid of blocks. Each block contains \p blockDimX x \p blockDimY x - * \p blockDimZ threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * Kernel parameters to \p f can be specified in one of two ways: - * - * 1) Kernel parameters can be specified via \p kernelParams. If \p f - * has N parameters, then \p kernelParams needs to be an array of N - * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] - * must point to a region of memory from which the actual kernel - * parameter will be copied. The number of kernel parameters and their - * offsets and sizes do not need to be specified as that information is - * retrieved directly from the kernel's image. - * - * 2) Kernel parameters can also be packaged by the application into - * a single buffer that is passed in via the \p extra parameter. - * This places the burden on the application of knowing each kernel - * parameter's size and alignment/padding within the buffer. 
Here is - * an example of using the \p extra parameter in this manner: - * \code - size_t argBufferSize; - char argBuffer[256]; - - // populate argBuffer and argBufferSize - - void *config[] = { - CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, - CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, - CU_LAUNCH_PARAM_END - }; - status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); - * \endcode - * - * The \p extra parameter exists to allow ::cuLaunchKernel to take - * additional less commonly used arguments. \p extra specifies a list of - * names of extra settings and their corresponding values. Each extra - * setting name is immediately followed by the corresponding value. The - * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. - * - * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra - * array; - * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next - * value in \p extra will be a pointer to a buffer containing all - * the kernel parameters for launching kernel \p f; - * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next - * value in \p extra will be a pointer to a size_t containing the - * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; - * - * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel - * parameters are specified with both \p kernelParams and \p extra - * (i.e. both \p kernelParams and \p extra are non-NULL). - * - * Calling ::cuLaunchKernel() invalidates the persistent function state - * set through the following deprecated APIs: - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), - * ::cuParamSetv(). - * - * Note that to use ::cuLaunchKernel(), the kernel \p f must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. 
- * If either of these conditions is not met, then ::cuLaunchKernel() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param f - Kernel to launch - * \param gridDimX - Width of grid in blocks - * \param gridDimY - Height of grid in blocks - * \param gridDimZ - Depth of grid in blocks - * \param blockDimX - X dimension of each thread block - * \param blockDimY - Y dimension of each thread block - * \param blockDimZ - Z dimension of each thread block - * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes - * \param hStream - Stream identifier - * \param kernelParams - Array of pointers to kernel parameters - * \param extra - Extra options - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cudaLaunchKernel - */ -CUresult CUDAAPI cuLaunchKernel(CUfunction f, - unsigned int gridDimX, - unsigned int gridDimY, - unsigned int gridDimZ, - unsigned int blockDimX, - unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, - CUstream hStream, - void **kernelParams, - void **extra); - -/** - * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute - * - * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ - * grid of blocks. Each block contains \p blockDimX x \p blockDimY x - * \p blockDimZ threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. 
- * - * The device on which this kernel is invoked must have a non-zero value for - * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. - * - * The total number of blocks launched cannot exceed the maximum number of blocks per - * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. - * - * The kernel cannot make use of CUDA dynamic parallelism. - * - * Kernel parameters must be specified via \p kernelParams. If \p f - * has N parameters, then \p kernelParams needs to be an array of N - * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] - * must point to a region of memory from which the actual kernel - * parameter will be copied. The number of kernel parameters and their - * offsets and sizes do not need to be specified as that information is - * retrieved directly from the kernel's image. - * - * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is - * the same as function state set through ::cuLaunchKernel API - * - * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous - * block shape, shared size and parameter info associated with \p f - * is overwritten. - * - * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. - * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will - * return ::CUDA_ERROR_INVALID_IMAGE. 
- * - * \param f - Kernel to launch - * \param gridDimX - Width of grid in blocks - * \param gridDimY - Height of grid in blocks - * \param gridDimZ - Depth of grid in blocks - * \param blockDimX - X dimension of each thread block - * \param blockDimY - Y dimension of each thread block - * \param blockDimZ - Z dimension of each thread block - * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes - * \param hStream - Stream identifier - * \param kernelParams - Array of pointers to kernel parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchCooperativeKernelMultiDevice, - * ::cudaLaunchCooperativeKernel - */ -CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, - unsigned int gridDimX, - unsigned int gridDimY, - unsigned int gridDimZ, - unsigned int blockDimX, - unsigned int blockDimY, - unsigned int blockDimZ, - unsigned int sharedMemBytes, - CUstream hStream, - void **kernelParams); - -/** - * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute - * - * \deprecated This function is deprecated as of CUDA 11.3. - * - * Invokes kernels as specified in the \p launchParamsList array where each element - * of the array specifies all the parameters required to perform a single kernel launch. - * These kernels can cooperate and synchronize as they execute. 
The size of the array is - * specified by \p numDevices. - * - * No two kernels can be launched on the same device. All the devices targeted by this - * multi-device launch must be identical. All devices must have a non-zero value for the - * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. - * - * All kernels launched must be identical with respect to the compiled code. Note that - * any __device__, __constant__ or __managed__ variables present in the module that owns - * the kernel launched on each device, are independently instantiated on every device. - * It is the application's responsibility to ensure these variables are initialized and - * used appropriately. - * - * The size of the grids as specified in blocks, the size of the blocks themselves - * and the amount of shared memory used by each thread block must also match across - * all launched kernels. - * - * The streams used to launch these kernels must have been created via either ::cuStreamCreate - * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD - * cannot be used. - * - * The total number of blocks launched per kernel cannot exceed the maximum number of blocks - * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the - * total number of blocks launched per device has to match across all devices, the maximum - * number of blocks that can be launched per device will be limited by the device with the - * least number of multiprocessors. - * - * The kernels cannot make use of CUDA dynamic parallelism.
- * - * The ::CUDA_LAUNCH_PARAMS structure is defined as: - * \code - typedef struct CUDA_LAUNCH_PARAMS_st - { - CUfunction function; - unsigned int gridDimX; - unsigned int gridDimY; - unsigned int gridDimZ; - unsigned int blockDimX; - unsigned int blockDimY; - unsigned int blockDimZ; - unsigned int sharedMemBytes; - CUstream hStream; - void **kernelParams; - } CUDA_LAUNCH_PARAMS; - * \endcode - * where: - * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must - * be identical with respect to the compiled code. - * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across - * all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. - * This must match across all kernels launched. - * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot - * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated - * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. - * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If - * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams - * needs to be an array of N pointers. 
Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through - * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual - * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes - * do not need to be specified as that information is retrieved directly from the kernel's image. - * - * By default, the kernel won't begin execution on any GPU until all prior work in all the specified - * streams has completed. This behavior can be overridden by specifying the flag - * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel - * will only wait for prior work in the stream corresponding to that GPU to complete before it begins - * execution. - * - * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin - * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying - * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, - * any subsequent work pushed in any of the specified streams will only wait for the kernel launched - * on the GPU corresponding to that stream to complete before it begins execution. - * - * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is - * the same as function state set through ::cuLaunchKernel API when called individually for each - * element in \p launchParamsList. - * - * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous - * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function - * in \p launchParamsList is overwritten. - * - * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have - * been compiled with toolchain version 3.2 or later so that it will - * contain kernel parameter information, or have no kernel parameters. 
- * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will - * return ::CUDA_ERROR_INVALID_IMAGE. - * - * \param launchParamsList - List of launch parameters, one per device - * \param numDevices - Size of the \p launchParamsList array - * \param flags - Flags to control launch behavior - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_IMAGE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \note_null_stream - * \notefnerr - * - * \sa ::cuCtxGetCacheConfig, - * ::cuCtxSetCacheConfig, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuLaunchCooperativeKernel, - * ::cudaLaunchCooperativeKernelMultiDevice - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); - -/** - * \brief Enqueues a host function call in a stream - * - * Enqueues a host function to run in a stream. The function will be called - * after currently enqueued work and will block work added after it. - * - * The host function must not make any CUDA API calls. Attempting to use a - * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. - * The host function must not perform any synchronization that may depend on - * outstanding CUDA work not mandated to run earlier. Host functions without a - * mandated order (such as in independent streams) execute in undefined order - * and may be serialized. - * - * For the purposes of Unified Memory, execution makes a number of guarantees: - *
<ul>
- *   <li>The stream is considered idle for the duration of the function's - * execution. Thus, for example, the function may always use memory attached - * to the stream it was enqueued in.</li>
- *   <li>The start of execution of the function has the same effect as - * synchronizing an event recorded in the same stream immediately prior to - * the function. It thus synchronizes streams which have been "joined" - * prior to the function.</li>
- *   <li>Adding device work to any stream does not have the effect of making - * the stream active until all preceding host functions and stream callbacks - * have executed. Thus, for - * example, a function might use global attached memory even if work has - * been added to another stream, if the work has been ordered behind the - * function call with an event.</li>
- *   <li>Completion of the function does not cause a stream to become - * active except as described above. The stream will remain idle - * if no device work follows the function, and will remain idle across - * consecutive host functions or stream callbacks without device work in - * between. Thus, for example, - * stream synchronization can be done by signaling from a host function at the - * end of the stream.</li>
- * </ul>
- * - * Note that, in contrast to ::cuStreamAddCallback, the function will not be - * called in the event of an error in the CUDA context. - * - * \param hStream - Stream to enqueue function call in - * \param fn - The function to call once preceding stream operations are complete - * \param userData - User-specified data to be passed to the function - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_SUPPORTED - * \note_null_stream - * \notefnerr - * - * \sa ::cuStreamCreate, - * ::cuStreamQuery, - * ::cuStreamSynchronize, - * ::cuStreamWaitEvent, - * ::cuStreamDestroy, - * ::cuMemAllocManaged, - * ::cuStreamAttachMemAsync, - * ::cuStreamAddCallback - */ -CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); - -/** @} */ /* END CUDA_EXEC */ - -/** - * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] - * - * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated execution control functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Sets the block-dimensions for the function - * - * \deprecated - * - * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are - * created when the kernel given by \p hfunc is launched. 
- * - * \param hfunc - Kernel to specify dimensions of - * \param x - X dimension - * \param y - Y dimension - * \param z - Z dimension - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetSharedSize, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); - -/** - * \brief Sets the dynamic shared-memory size for the function - * - * \deprecated - * - * Sets through \p bytes the amount of dynamic shared memory that will be - * available to each thread block when the kernel given by \p hfunc is launched. - * - * \param hfunc - Kernel to specify dynamic shared-memory size for - * \param bytes - Dynamic shared-memory size per thread block in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetCacheConfig, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); - -/** - * \brief Sets the parameter size for the function - * - * \deprecated - * - * Sets through \p numbytes the total size in bytes needed by the function - * parameters of the kernel corresponding to \p hfunc. 
- * - * \param hfunc - Kernel to set parameter size for - * \param numbytes - Size of parameter list in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); - -/** - * \brief Adds an integer parameter to the function's argument list - * - * \deprecated - * - * Sets an integer parameter that will be specified the next time the - * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. - * - * \param hfunc - Kernel to add parameter to - * \param offset - Offset to add parameter to argument list - * \param value - Value of parameter - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); - -/** - * \brief Adds a floating-point parameter to the function's argument list - * - * \deprecated - * - * Sets a floating-point parameter that will be specified the next time the - * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. 
- * - * \param hfunc - Kernel to add parameter to - * \param offset - Offset to add parameter to argument list - * \param value - Value of parameter - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); - -/** - * \brief Adds arbitrary data to the function's argument list - * - * \deprecated - * - * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr - * into the parameter space of the kernel corresponding to \p hfunc. \p offset - * is a byte offset. - * - * \param hfunc - Kernel to add data to - * \param offset - Offset to add data to argument list - * \param ptr - Pointer to arbitrary data - * \param numbytes - Size of data to copy in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block - * contains the number of threads specified by a previous call to - * ::cuFuncSetBlockShape(). 
- * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. - * - * \param f - Kernel to launch - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunchGrid, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of - * blocks. Each block contains the number of threads specified by a previous - * call to ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. 
After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. - * - * \param f - Kernel to launch - * \param grid_width - Width of grid in blocks - * \param grid_height - Height of grid in blocks - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGridAsync, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); - -/** - * \brief Launches a CUDA function - * - * \deprecated - * - * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of - * blocks. Each block contains the number of threads specified by a previous - * call to ::cuFuncSetBlockShape(). - * - * The block shape, dynamic shared memory size, and parameter information - * must be set using - * ::cuFuncSetBlockShape(), - * ::cuFuncSetSharedSize(), - * ::cuParamSetSize(), - * ::cuParamSeti(), - * ::cuParamSetf(), and - * ::cuParamSetv() - * prior to calling this function. - * - * Launching a function via ::cuLaunchKernel() invalidates the function's - * block shape, dynamic shared memory size, and parameter information. After - * launching via cuLaunchKernel, this state must be re-initialized prior to - * calling this function. Failure to do so results in undefined behavior. 
- * - * \param f - Kernel to launch - * \param grid_width - Width of grid in blocks - * \param grid_height - Height of grid in blocks - * \param hStream - Stream identifier - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_LAUNCH_FAILED, - * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, - * ::CUDA_ERROR_LAUNCH_TIMEOUT, - * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, - * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED - * - * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), - * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by - * growing the per-thread stack as needed per launch and not shrinking it afterwards. - * - * \note_null_stream - * \notefnerr - * - * \sa ::cuFuncSetBlockShape, - * ::cuFuncSetSharedSize, - * ::cuFuncGetAttribute, - * ::cuParamSetSize, - * ::cuParamSetf, - * ::cuParamSeti, - * ::cuParamSetv, - * ::cuLaunch, - * ::cuLaunchGrid, - * ::cuLaunchKernel - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); - - -/** - * \brief Adds a texture-reference to the function's argument list - * - * \deprecated - * - * Makes the CUDA array or linear memory bound to the texture reference - * \p hTexRef available to a device program as a texture. In this version of - * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and - * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. 
- * - * \param hfunc - Kernel to add texture-reference to - * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) - * \param hTexRef - Texture-reference to add to argument list - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); -/** @} */ /* END CUDA_EXEC_DEPRECATED */ - -/** - * \defgroup CUDA_GRAPH Graph Management - * - * ___MANBRIEF___ graph management functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graph management functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Creates a graph - * - * Creates an empty graph, which is returned via \p phGraph. - * - * \param phGraph - Returns newly created graph - * \param flags - Graph creation flags, must be 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - * ::cuGraphInstantiate, - * ::cuGraphDestroy, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); - -/** - * \brief Creates a kernel execution node and adds it to a graph - * - * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. 
- * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The CUDA_KERNEL_NODE_PARAMS structure is defined as: - * - * \code - * typedef struct CUDA_KERNEL_NODE_PARAMS_st { - * CUfunction func; - * unsigned int gridDimX; - * unsigned int gridDimY; - * unsigned int gridDimZ; - * unsigned int blockDimX; - * unsigned int blockDimY; - * unsigned int blockDimZ; - * unsigned int sharedMemBytes; - * void **kernelParams; - * void **extra; - * } CUDA_KERNEL_NODE_PARAMS; - * \endcode - * - * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x - * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains - * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. - * - * \p sharedMemBytes sets the amount of dynamic shared memory that will be - * available to each thread block. - * - * Kernel parameters to \p func can be specified in one of two ways: - * - * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N - * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, - * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual - * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need - * to be specified as that information is retrieved directly from the kernel's image. - * - * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single - * buffer that is passed in via \p extra. This places the burden on the application of knowing each - * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists - * to allow this function to take additional less commonly used arguments. 
\p extra specifies - * a list of names of extra settings and their corresponding values. Each extra setting name is - * immediately followed by the corresponding value. The list must be terminated with either NULL or - * CU_LAUNCH_PARAM_END. - * - * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra - * array; - * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next - * value in \p extra will be a pointer to a buffer - * containing all the kernel parameters for launching kernel - * \p func; - * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next - * value in \p extra will be a pointer to a size_t - * containing the size of the buffer specified with - * ::CU_LAUNCH_PARAM_BUFFER_POINTER; - * - * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both - * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). - * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. - * - * The \p kernelParams or \p extra array, as well as the argument values it points to, - * are copied during this call. - * - * \note Kernels launched using graphs must not use texture and surface references. Reading or - * writing through any texture or surface reference is undefined behavior. - * This restriction does not apply to texture and surface objects. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the GPU execution node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuLaunchCooperativeKernel, - * ::cuGraphKernelNodeGetParams, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a kernel node's parameters - * - * Returns the parameters of kernel node \p hNode in \p nodeParams. - * The \p kernelParams or \p extra array returned in \p nodeParams, - * as well as the argument values it points to, are owned by the node. - * This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphKernelNodeSetParams to update the - * parameters of this node. - * - * The params will contain either \p kernelParams or \p extra, - * according to which of these was most recently set on the node. 
- * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeSetParams - */ -CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a kernel node's parameters - * - * Sets the parameters of kernel node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeGetParams - */ -CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a memcpy node and adds it to a graph - * - * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * When the graph is launched, the node will perform the memcpy described by \p copyParams. - * See ::cuMemcpy3D() for a description of the structure and its restrictions. - * - * Memcpy nodes have some additional restrictions with regards to managed memory, if the - * system contains at least one device which has a zero value for the device attribute - * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
If one or more of the operands refer - * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed - * for those operand(s). The managed memory will be treated as residing on either the - * host or the device, depending on which memory type is specified. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param copyParams - Parameters for the memory copy - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphMemcpyNodeGetParams, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); - -/** - * \brief Returns a memcpy node's parameters - * - * Returns the parameters of memcpy node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeSetParams - */ -CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); - -/** - * \brief Sets a memcpy node's parameters - * - * Sets the parameters of memcpy node \p hNode to \p nodeParams. 
- * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemcpy3D, - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeGetParams - */ -CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); - -/** - * \brief Creates a memset node and adds it to a graph - * - * Creates a new memset node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The element size must be 1, 2, or 4 bytes. - * When the graph is launched, the node will perform the memset described by \p memsetParams. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param memsetParams - Parameters for the memory set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_CONTEXT - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphMemsetNodeGetParams, - * ::cuGraphMemsetNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode - */ -CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); - -/** - * \brief Returns a memset node's parameters - * - * Returns the parameters of memset node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeSetParams - */ -CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a memset node's parameters - * - * Sets the parameters of memset node \p hNode to \p nodeParams. 
- * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuMemsetD2D32, - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeGetParams - */ -CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a host execution node and adds it to a graph - * - * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * When the graph is launched, the node will invoke the specified CPU function. - * Host nodes are not supported under MPS with pre-Volta GPUs. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the host node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphHostNodeGetParams, - * ::cuGraphHostNodeSetParams, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a host node's parameters - * - * Returns the parameters of host node \p hNode in \p nodeParams. - * - * \param hNode - Node to get the parameters for - * \param nodeParams - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeSetParams - */ -CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Sets a host node's parameters - * - * Sets the parameters of host node \p hNode to \p nodeParams. 
- * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchHostFunc, - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeGetParams - */ -CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Creates a child graph node and adds it to a graph - * - * Creates a new node which executes an embedded graph, and adds it to \p hGraph with - * \p numDependencies dependencies specified via \p dependencies. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * If \p hGraph contains allocation or free nodes, this call will return an error. - * - * The node executes an embedded child graph. The child graph is cloned in this call. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param childGraph - The graph to clone into this node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); - -/** - * \brief Gets a handle to the embedded graph of a child graph node - * - * Gets a handle to the embedded graph in a child graph node. This call - * does not clone the graph. Changes to the graph will be reflected in - * the node, and the node retains ownership of the graph. - * - * Allocation and free nodes cannot be added to the returned graph. - * Attempting to do so will return an error. - * - * \param hNode - Node to get the embedded graph for - * \param phGraph - Location to store a handle to the graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphNodeFindInClone - */ -CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); - -/** - * \brief Creates an empty node and adds it to a graph - * - * Creates a new node which performs no operation, and adds it to \p hGraph with - * \p numDependencies dependencies specified via \p dependencies. 
- * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * An empty node performs no operation during execution, but can be used for - * transitive ordering. For example, a phased execution graph with 2 groups of n - * nodes with a barrier between them can be represented using an empty node and - * 2*n dependency edges, rather than no empty node and n^2 dependency edges. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); - -/** - * \brief Creates an event record node and adds it to a graph - * - * Creates a new event record node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and event specified in \p event. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * Each launch of the graph will record \p event to capture execution of the - * node's dependencies. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param event - Event for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); - -/** - * \brief Returns the event associated with an event record node - * - * Returns the event of event record node \p hNode in \p event_out. - * - * \param hNode - Node to get the event for - * \param event_out - Pointer to return the event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); - -/** - * \brief Sets an event record node's event - * - * Sets the event of event record node \p hNode to \p event. 
- * - * \param hNode - Node to set the event for - * \param event - Event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); - -/** - * \brief Creates an event wait node and adds it to a graph - * - * Creates a new event wait node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and event specified in \p event. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. - * A handle to the new node will be returned in \p phGraphNode. - * - * The graph node will wait for all work captured in \p event. See ::cuEventRecord() - * for details on what is captured by an event. \p event may be from a different context - * or device than the launch stream. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param event - Event for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); - -/** - * \brief Returns the event associated with an event wait node - * - * Returns the event of event wait node \p hNode in \p event_out. - * - * \param hNode - Node to get the event for - * \param event_out - Pointer to return the event - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); - -/** - * \brief Sets an event wait node's event - * - * Sets the event of event wait node \p hNode to \p event. 
- * - * \param hNode - Node to set the event for - * \param event - Event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent - */ -CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); - -/** - * \brief Creates an external semaphore signal node and adds it to a graph - * - * Creates a new external semaphore signal node and adds it to \p hGraph with \p - * numDependencies dependencies specified via \p dependencies and arguments specified - * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the - * node will be placed at the root of the graph. \p dependencies may not have any - * duplicate entries. A handle to the new node will be returned in \p phGraphNode. - * - * Performs a signal operation on a set of externally allocated semaphore objects - * when the node is launched. The operation(s) will occur after all of the node's - * dependencies have completed. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphExternalSemaphoresSignalNodeGetParams, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Returns an external semaphore signal node's parameters - * - * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. - * The \p extSemArray and \p paramsArray returned in \p params_out, - * are owned by the node. This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the - * parameters of this node. 
- * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); - -/** - * \brief Sets an external semaphore signal node's parameters - * - * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphExternalSemaphoresSignalNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Creates an external semaphore wait node and adds it to a graph - * - * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. 
- * - * Performs a wait operation on a set of externally allocated semaphore objects - * when the node is launched. The node's dependencies will not be launched until - * the wait operation has completed. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphExternalSemaphoresWaitNodeGetParams, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode, - */ -CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Returns an external semaphore wait node's parameters - * - * Returns the parameters of an external semaphore wait node \p hNode in \p params_out. - * The \p extSemArray and \p paramsArray returned in \p params_out, - * are owned by the node. This memory remains valid until the node is destroyed or its - * parameters are modified, and should not be modified - * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the - * parameters of this node. 
- * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuLaunchKernel, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out); - -/** - * \brief Sets an external semaphore wait node's parameters - * - * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams. - * - * \param hNode - Node to set the parameters for - * \param nodeParams - Parameters to copy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphExternalSemaphoresWaitNodeSetParams, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync - */ -CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Creates an allocation node and adds it to a graph - * - * Creates a new allocation node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. 
- * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param nodeParams - Parameters for the node - * - * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in - * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. - * - * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, - * the allocation can be accessed by nodes ordered after the allocation node but before the free node. - * These allocations cannot be freed outside the owning graph, and they can only be freed once in the - * owning graph. - * - * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the - * graph which are ordered after the allocation node, but also by stream operations ordered after the - * graph's execution but before the allocation is freed. - * - * Allocations which are not freed in the same graph can be freed by: - * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; - * - launching a graph with a free node for that allocation; or - * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes - * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. - * - * It is not possible to free an allocation in both the owning graph and another graph. If the allocation - * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed - * in another graph, a free node can no longer be added to the owning graph. - * - * The following restrictions apply to graphs which contain allocation and/or memory free nodes: - * - Nodes and edges of the graph cannot be deleted. - * - The graph cannot be used in a child node. 
- * - Only one instantiation of the graph may exist at any point in time. - * - The graph cannot be cloned. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemFreeNode, - * ::cuGraphMemAllocNodeGetParams, - * ::cuDeviceGraphMemTrim, - * ::cuDeviceGetGraphMemAttribute, - * ::cuDeviceSetGraphMemAttribute, - * ::cuMemAllocAsync, - * ::cuMemFreeAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); - -/** - * \brief Returns a memory alloc node's parameters - * - * Returns the parameters of a memory alloc node \p hNode in \p params_out. - * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the - * node. This memory remains valid until the node is destroyed. The returned - * parameters must not be modified. 
- * - * \param hNode - Node to get the parameters for - * \param params_out - Pointer to return the parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphMemFreeNodeGetParams - */ -CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); - -/** - * \brief Creates a memory free node and adds it to a graph - * - * Creates a new memory free node and adds it to \p hGraph with \p numDependencies - * dependencies specified via \p dependencies and arguments specified in \p nodeParams. - * It is possible for \p numDependencies to be 0, in which case the node will be placed - * at the root of the graph. \p dependencies may not have any duplicate entries. A handle - * to the new node will be returned in \p phGraphNode. - * - * \param phGraphNode - Returns newly created node - * \param hGraph - Graph to which to add the node - * \param dependencies - Dependencies of the node - * \param numDependencies - Number of dependencies - * \param dptr - Address of memory to free - * - * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: - * - an allocation twice in the same graph. - * - an address that was not returned by an allocation node. - * - an invalid address. - * - * The following restrictions apply to graphs which contain allocation and/or memory free nodes: - * - Nodes and edges of the graph cannot be deleted. - * - The graph cannot be used in a child node. - * - Only one instantiation of the graph may exist at any point in time. - * - The graph cannot be cloned. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphMemFreeNodeGetParams, - * ::cuDeviceGraphMemTrim, - * ::cuDeviceGetGraphMemAttribute, - * ::cuDeviceSetGraphMemAttribute, - * ::cuMemAllocAsync, - * ::cuMemFreeAsync, - * ::cuGraphCreate, - * ::cuGraphDestroyNode, - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddEventRecordNode, - * ::cuGraphAddEventWaitNode, - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); - -/** - * \brief Returns a memory free node's parameters - * - * Returns the address of a memory free node \p hNode in \p dptr_out. - * - * \param hNode - Node to get the parameters for - * \param dptr_out - Pointer to return the device address - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemFreeNode, - * ::cuGraphMemAllocNodeGetParams - */ -CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); - -/** - * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. - * - * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are - * freed back to the operating system. - * - * \param device - The device for which cached memory should be freed. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode, - * ::cuDeviceSetGraphMemAttribute, - * ::cuDeviceGetGraphMemAttribute - */ -CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); - -/** - * \brief Query asynchronous allocation attributes related to graphs - * - * Valid attributes are: - * - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. - * - * \param device - Specifies the scope of the query - * \param attr - attribute to get - * \param value - retrieved value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuDeviceSetGraphMemAttribute, - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode - */ -CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); - -/** - * \brief Set asynchronous allocation attributes related to graphs - * - * Valid attributes are: - * - * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the - * last time it was reset. High watermark can only be reset to zero. - * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by - * the CUDA graphs asynchronous allocator. 
- * - * \param device - Specifies the scope of the query - * \param attr - attribute to get - * \param value - pointer to value to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_DEVICE - * - * \sa - * ::cuDeviceGetGraphMemAttribute, - * ::cuGraphAddMemAllocNode, - * ::cuGraphAddMemFreeNode - */ -CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); - -/** - * \brief Clones a graph - * - * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. - * All parameters are copied into the cloned graph. The original graph may be modified - * after this call without affecting the clone. - * - * Child graph nodes in the original graph are recursively copied into the clone. - * - * \param phGraphClone - Returns newly created cloned graph - * \param originalGraph - Graph to clone - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OUT_OF_MEMORY - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphNodeFindInClone - */ -CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); - -/** - * \brief Finds a cloned version of a node - * - * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode - * in the original graph. - * - * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. - * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to - * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have - * been removed. The cloned node is then returned via \p phClonedNode. 
- * - * \param phNode - Returns handle to the cloned node - * \param hOriginalNode - Handle to the original node - * \param hClonedGraph - Cloned graph to query - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphClone - */ -CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); - -/** - * \brief Returns a node's type - * - * Returns the node type of \p hNode in \p type. - * - * \param hNode - Node to query - * \param type - Pointer to return the node type - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphKernelNodeGetParams, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphHostNodeGetParams, - * ::cuGraphHostNodeSetParams, - * ::cuGraphMemcpyNodeGetParams, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphMemsetNodeGetParams, - * ::cuGraphMemsetNodeSetParams - */ -CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); - -/** - * \brief Returns a graph's nodes - * - * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this - * function will return the number of nodes in \p numNodes. Otherwise, - * \p numNodes entries will be filled in. If \p numNodes is higher than the actual - * number of nodes, the remaining entries in \p nodes will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numNodes. 
- * - * \param hGraph - Graph to query - * \param nodes - Pointer to return the nodes - * \param numNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetType, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); - -/** - * \brief Returns a graph's root nodes - * - * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this - * function will return the number of root nodes in \p numRootNodes. Otherwise, - * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual - * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numRootNodes. - * - * \param hGraph - Graph to query - * \param rootNodes - Pointer to return the root nodes - * \param numRootNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate, - * ::cuGraphGetNodes, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetType, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); - -/** - * \brief Returns a graph's dependency edges - * - * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding - * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the - * node in \p from[i]. 
\p from and \p to may both be NULL, in which - * case this function only returns the number of edges in \p numEdges. Otherwise, - * \p numEdges entries will be filled in. If \p numEdges is higher than the actual - * number of edges, the remaining entries in \p from and \p to will be set to NULL, and - * the number of edges actually returned will be written to \p numEdges. - * - * \param hGraph - Graph to get the edges from - * \param from - Location to return edge endpoints - * \param to - Location to return edge endpoints - * \param numEdges - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); - -/** - * \brief Returns a node's dependencies - * - * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this - * function will return the number of dependencies in \p numDependencies. Otherwise, - * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual - * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the - * number of nodes actually obtained will be returned in \p numDependencies. 
- * - * \param hNode - Node to query - * \param dependencies - Pointer to return the dependencies - * \param numDependencies - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetDependentNodes, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies - */ -CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); - -/** - * \brief Returns a node's dependent nodes - * - * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which - * case this function will return the number of dependent nodes in \p numDependentNodes. - * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is - * higher than the actual number of dependent nodes, the remaining entries in - * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will - * be returned in \p numDependentNodes. 
- * - * \param hNode - Node to query - * \param dependentNodes - Pointer to return the dependent nodes - * \param numDependentNodes - See description - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphNodeGetDependencies, - * ::cuGraphGetNodes, - * ::cuGraphGetRootNodes, - * ::cuGraphGetEdges, - * ::cuGraphAddDependencies, - * ::cuGraphRemoveDependencies - */ -CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); - -/** - * \brief Adds dependency edges to a graph - * - * The number of dependencies to be added is defined by \p numDependencies - * Elements in \p from and \p to at corresponding indices define a dependency. - * Each node in \p from and \p to must belong to \p hGraph. - * - * If \p numDependencies is 0, elements in \p from and \p to will be ignored. - * Specifying an existing dependency will return an error. - * - * \param hGraph - Graph to which dependencies are added - * \param from - Array of nodes that provide the dependencies - * \param to - Array of dependent nodes - * \param numDependencies - Number of dependencies to be added - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphRemoveDependencies, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); - -/** - * \brief Removes dependency edges from a graph - * - * The number of \p dependencies to be removed is defined by \p numDependencies. - * Elements in \p from and \p to at corresponding indices define a dependency. - * Each node in \p from and \p to must belong to \p hGraph. 
- * - * If \p numDependencies is 0, elements in \p from and \p to will be ignored. - * Specifying a non-existing dependency will return an error. - * - * Dependencies cannot be removed from graphs which contain allocation or free nodes. - * Any attempt to do so will return an error. - * - * \param hGraph - Graph from which to remove dependencies - * \param from - Array of nodes that provide the dependencies - * \param to - Array of dependent nodes - * \param numDependencies - Number of dependencies to be removed - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddDependencies, - * ::cuGraphGetEdges, - * ::cuGraphNodeGetDependencies, - * ::cuGraphNodeGetDependentNodes - */ -CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); - -/** - * \brief Remove a node from the graph - * - * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes - * on \p hNode and vice versa. - * - * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. - * Any attempt to do so will return an error. - * - * \param hNode - Node to remove - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphAddEmptyNode, - * ::cuGraphAddKernelNode, - * ::cuGraphAddHostNode, - * ::cuGraphAddMemcpyNode, - * ::cuGraphAddMemsetNode - */ -CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); - -/** - * \brief Creates an executable graph from a graph - * - * Instantiates \p hGraph as an executable graph. The graph is validated for any - * structural constraints or intra-node constraints which were not previously - * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p phGraphExec. 
- * - * If there are any errors, diagnostic information may be returned in \p errorNode and - * \p logBuffer. This is the primary way to inspect instantiation errors. The output - * will be null terminated unless the diagnostics overflow - * the buffer. In this case, they will be truncated, and the last byte can be - * inspected to determine if truncation occurred. - * - * \param phGraphExec - Returns instantiated graph - * \param hGraph - Graph to instantiate - * \param phErrorNode - In case of an instantiation error, this may be modified to - * indicate a node contributing to the error - * \param logBuffer - A character buffer to store diagnostic messages - * \param bufferSize - Size of the log buffer in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiateWithFlags, - * ::cuGraphCreate, - * ::cuGraphUpload, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); - -/** - * \brief Creates an executable graph from a graph - * - * Instantiates \p hGraph as an executable graph. The graph is validated for any - * structural constraints or intra-node constraints which were not previously - * validated. If instantiation is successful, a handle to the instantiated graph - * is returned in \p phGraphExec. - * - * The \p flags parameter controls the behavior of instantiation and subsequent - * graph launches. Valid flags are: - * - * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a - * graph containing memory allocation nodes to automatically free any - * unfreed memory allocations before the graph is relaunched. 
- * - * If \p hGraph contains any allocation or free nodes, there can be at most one - * executable graph in existence for that graph at a time. - * - * An attempt to instantiate a second executable graph before destroying the first - * with ::cuGraphExecDestroy will result in an error. - * - * \param phGraphExec - Returns instantiated graph - * \param hGraph - Graph to instantiate - * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - * ::cuGraphCreate, - * ::cuGraphUpload, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); - -/** - * \brief Sets the parameters for a kernel node in the given graphExec - * - * Sets the parameters of a kernel node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. The \p func field - * of \p nodeParams cannot be modified and must match the original value. - * All other values can be modified. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - kernel node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddKernelNode, - * ::cuGraphKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for a memcpy node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p copyParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The source and destination memory in \p copyParams must be allocated from the same - * contexts as the original source and destination memory. Both the instantiation-time - * memory operands and the memory operands in \p copyParams must be 1-dimensional. - * Zero-length operations are not supported. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. - * - * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or - * either the original or new memory operands are multidimensional. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Memcpy node from the graph which was used to instantiate graphExec - * \param copyParams - The updated parameters to set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemcpyNode, - * ::cuGraphMemcpyNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); - -/** - * \brief Sets the parameters for a memset node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p memsetParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The destination memory in \p memsetParams must be allocated from the same - * contexts as the original destination memory. Both the instantiation-time - * memory operand and the memory operand in \p memsetParams must be 1-dimensional. - * Zero-length operations are not supported. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. - * - * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or - * either the original or new memory operand are multidimensional. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Memset node from the graph which was used to instantiate graphExec - * \param memsetParams - The updated parameters to set - * \param ctx - Context on which to run the node - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddMemsetNode, - * ::cuGraphMemsetNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); - -/** - * \brief Sets the parameters for a host node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had - * contained \p nodeParams at instantiation. hNode must remain in the graph which was - * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. hNode is also - * not modified by this call. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Host node from the graph which was used to instantiate graphExec - * \param nodeParams - The updated parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddHostNode, - * ::cuGraphHostNodeSetParams, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); - -/** - * \brief Updates node parameters in the child graph node in the given graphExec. - * - * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained - * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. - * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. - * Changed edges to and from \p hNode are ignored. - * - * The modifications only affect future launches of \p hGraphExec. Already enqueued - * or running launches of \p hGraphExec are not affected by this call. \p hNode is also - * not modified by this call. - * - * The topology of \p childGraph, as well as the node insertion order, must match that - * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions - * on what can be updated in an instantiated graph. The update is recursive, so child graph - * nodes contained within the top level child graph will also be updated. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - Host node from the graph which was used to instantiate graphExec - * \param childGraph - The graph supplying the updated parameters - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddChildGraphNode, - * ::cuGraphChildGraphNodeGetGraph, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); - -/** - * \brief Sets the event for an event record node in the given graphExec - * - * Sets the event of an event record node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - event record node from the graph from which graphExec was instantiated - * \param event - Updated event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventRecordNode, - * ::cuGraphEventRecordNodeGetEvent, - * ::cuGraphEventWaitNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); - -/** - * \brief Sets the event for an event wait node in the given graphExec - * - * Sets the event of an event wait node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - event wait node from the graph from which graphExec was instantiated - * \param event - Updated event to use - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddEventWaitNode, - * ::cuGraphEventWaitNodeGetEvent, - * ::cuGraphEventRecordNodeSetEvent, - * ::cuEventRecordWithFlags, - * ::cuStreamWaitEvent, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); - -/** - * \brief Sets the parameters for an external semaphore signal node in the given graphExec - * - * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * Changing \p nodeParams->numExtSems is not supported. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - semaphore signal node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresSignalNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); - -/** - * \brief Sets the parameters for an external semaphore wait node in the given graphExec - * - * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. - * The node is identified by the corresponding node \p hNode in the - * non-executable graph, from which the executable graph was instantiated. - * - * \p hNode must not have been removed from the original graph. - * - * The modifications only affect future launches of \p hGraphExec. Already - * enqueued or running launches of \p hGraphExec are not affected by this call. - * \p hNode is also not modified by this call. - * - * Changing \p nodeParams->numExtSems is not supported. 
- * - * \param hGraphExec - The executable graph in which to set the specified node - * \param hNode - semaphore wait node from the graph from which graphExec was instantiated - * \param nodeParams - Updated Parameters to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphAddExternalSemaphoresWaitNode, - * ::cuImportExternalSemaphore, - * ::cuSignalExternalSemaphoresAsync, - * ::cuWaitExternalSemaphoresAsync, - * ::cuGraphExecKernelNodeSetParams, - * ::cuGraphExecMemcpyNodeSetParams, - * ::cuGraphExecMemsetNodeSetParams, - * ::cuGraphExecHostNodeSetParams, - * ::cuGraphExecChildGraphNodeSetParams, - * ::cuGraphExecEventRecordNodeSetEvent, - * ::cuGraphExecEventWaitNodeSetEvent, - * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, - * ::cuGraphExecUpdate, - * ::cuGraphInstantiate - */ -CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); - -/** - * \brief Uploads an executable graph in a stream - * - * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of - * the same \p hGraphExec will be serialized. Each upload is ordered behind both any - * previous work in \p hStream and any previous launches of \p hGraphExec. - * Uses memory cached by \p stream to back the allocations owned by \p hGraphExec. 
- * - * \param hGraphExec - Executable graph to upload - * \param hStream - Stream in which to upload the graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - * ::cuGraphLaunch, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream); - -/** - * \brief Launches an executable graph in a stream - * - * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing - * at a time. Each launch is ordered behind both any previous work in \p hStream - * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be - * instantiated multiple times into multiple executable graphs. - * - * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and - * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, - * the launch will fail with ::CUDA_ERROR_INVALID_VALUE. - * - * \param hGraphExec - Executable graph to launch - * \param hStream - Stream in which to launch the graph - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - * ::cuGraphUpload, - * ::cuGraphExecDestroy - */ -CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); - -/** - * \brief Destroys an executable graph - * - * Destroys the executable graph specified by \p hGraphExec, as well - * as all of its executable nodes. If the executable graph is - * in-flight, it will not be terminated, but rather freed - * asynchronously on completion. 
- * - * \param hGraphExec - Executable graph to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - * ::cuGraphUpload, - * ::cuGraphLaunch - */ -CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); - -/** - * \brief Destroys a graph - * - * Destroys the graph specified by \p hGraph, as well as all of its nodes. - * - * \param hGraph - Graph to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_VALUE - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); - -/** - * \brief Check whether an executable graph can be updated with a graph and perform the update if possible - * - * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the - * node parameters in a topologically identical graph specified by \p hGraph. - * - * Limitations: - * - * - Kernel nodes: - * - The owning context of the function cannot change. - * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated - * to a function which uses CDP - * - Memset and memcpy nodes: - * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. - * - The source/destination memory must be allocated from the same contexts as the original - * source/destination memory. - * - Only 1D memsets can be changed. - * - Additional memcpy node restrictions: - * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE, - * CU_MEMORYTYPE_ARRAY, etc.) is not supported. - * - External semaphore wait nodes and record nodes: - * - Changing the number of semaphores is not supported. - * - * Note: The API may add further restrictions in future releases. 
The return code should always be checked. - * - * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under - * the following conditions: - * - * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out - * is NULL. - * - A node is deleted in \p hGraph but not not its pair from \p hGraphExec, in which case \p hErrorNode_out - * is NULL. - * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is - * the pairless node from \p hGraph. - * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph. - * - * cuGraphExecUpdate sets \p updateResult_out to: - * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. - * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed - * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case - * \p hErrorNode_out is set to the node from \p hGraph. - * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported - * way(see note above), in which case \p hErrorNode_out is set to the node from \p hGraph - * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way - * that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. - * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like - * the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph - * - * If \p updateResult_out isn't set in one of the situations described above, the update check passes - * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens - * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise, - * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS. 
- * - * cuGraphExecUpdate returns CUDA_SUCCESS when the updated was performed successfully. It returns - * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included - * changes which violated constraints specific to instantiated graph update. - * - * \param hGraphExec The instantiated graph to be updated - * \param hGraph The graph containing the updated parameters - * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any - * \param updateResult_out Whether the graph update was permitted. If was forbidden, the reason why - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE, - * \note_graph_thread_safety - * \notefnerr - * - * \sa - * ::cuGraphInstantiate, - */ -CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); - -/** - * \brief Copies attributes from source node to destination node. - * - * Copies attributes from source node \p src to destination node \p dst. - * Both node must have the same context. - * - * \param[out] dst Destination node - * \param[in] src Source node - * For list of attributes see ::CUkernelNodeAttrID - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); - -/** - * \brief Queries node attribute. - * - * Queries attribute \p attr from node \p hNode and stores it in corresponding - * member of \p value_out. 
- * - * \param[in] hNode - * \param[in] attr - * \param[out] value_out - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, - CUkernelNodeAttrValue *value_out); - -/** - * \brief Sets node attribute. - * - * Sets attribute \p attr on node \p hNode from corresponding attribute of - * \p value. - * - * \param[out] hNode - * \param[in] attr - * \param[out] value - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE - * \notefnerr - * - * \sa - * ::CUaccessPolicyWindow - */ -CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, - const CUkernelNodeAttrValue *value); - -/** - * \brief Write a DOT file describing graph structure - * - * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. - * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. - * \p flags can be specified to write more detailed information about each node type such as - * parameter values, kernel attributes, node and function handles. - * - * \param hGraph - The graph to create a DOT file from - * \param path - The path to write the DOT file to - * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_OPERATING_SYSTEM - */ -CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); - -/** - * \brief Create a user object - * - * Create a user object with the specified destructor callback and initial reference count. The - * initial references are owned by the caller. 
- * - * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they - * are executed by a shared internal thread. Another thread may be signaled to perform such - * actions, if it does not block forward progress of tasks scheduled through CUDA. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object_out - Location to return the user object handle - * \param ptr - The pointer to pass to the destroy function - * \param destroy - Callback to free the user object when it is no longer in use - * \param initialRefcount - The initial refcount to create the object with, typically 1. The - * initial references are owned by the calling thread. - * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, - * which is the only defined flag. This indicates that the destroy - * callback cannot be waited on by any CUDA API. Users requiring - * synchronization of the callback should signal its completion - * manually. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, - unsigned int initialRefcount, unsigned int flags); - -/** - * \brief Retain a reference to a user object - * - * Retains new references to a user object. The new references are owned by the caller. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object - The object to retain - * \param count - The number of references to retain, typically 1. Must be nonzero - * and not larger than INT_MAX. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); - -/** - * \brief Release a reference to a user object - * - * Releases user object references owned by the caller. The object's destructor is invoked if - * the reference count reaches zero. - * - * It is undefined behavior to release references not owned by the caller, or to use a user - * object handle after all references are released. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param object - The object to release - * \param count - The number of references to release, typically 1. Must be nonzero - * and not larger than INT_MAX. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuGraphRetainUserObject, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); - -/** - * \brief Retain a reference to a user object from a graph - * - * Creates or moves user object references that will be owned by a CUDA graph. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param graph - The graph to associate the reference with - * \param object - The user object to retain a reference for - * \param count - The number of references to add to the graph, typically 1. Must be - * nonzero and not larger than INT_MAX. - * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references - * from the calling thread, rather than create new references. Pass 0 - * to create new references. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphReleaseUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); - -/** - * \brief Release a user object reference from a graph - * - * Releases user object references owned by a graph. - * - * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. - * - * \param graph - The graph that will release the reference - * \param object - The user object to release a reference for - * \param count - The number of references to release, typically 1. Must be nonzero - * and not larger than INT_MAX. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuUserObjectCreate, - * ::cuUserObjectRetain, - * ::cuUserObjectRelease, - * ::cuGraphRetainUserObject, - * ::cuGraphCreate - */ -CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); - -/** @} */ /* END CUDA_GRAPH */ - -/** - * \defgroup CUDA_OCCUPANCY Occupancy - * - * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the occupancy calculation functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns occupancy of a function - * - * Returns in \p *numBlocks the number of the maximum active blocks per - * streaming multiprocessor. 
- * - * \param numBlocks - Returned occupancy - * \param func - Kernel for which occupancy is calculated - * \param blockSize - Block size the kernel is intended to be launched with - * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor - */ -CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); - -/** - * \brief Returns occupancy of a function - * - * Returns in \p *numBlocks the number of the maximum active blocks per - * streaming multiprocessor. - * - * The \p Flags parameter controls how special cases are handled. The - * valid flags are: - * - * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as - * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; - * - * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the - * default behavior on platform where global caching affects - * occupancy. On such platforms, if caching is enabled, but - * per-block SM resource usage would result in zero occupancy, the - * occupancy calculator will calculate the occupancy as if caching - * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes - * the occupancy calculator to return 0 in such cases. More information - * can be found about this feature in the "Unified L1/Texture Cache" - * section of the Maxwell tuning guide. 
- * - * \param numBlocks - Returned occupancy - * \param func - Kernel for which occupancy is calculated - * \param blockSize - Block size the kernel is intended to be launched with - * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes - * \param flags - Requested behavior for the occupancy calculator - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - */ -CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); - -/** - * \brief Suggest a launch configuration with reasonable occupancy - * - * Returns in \p *blockSize a reasonable block size that can achieve - * the maximum occupancy (or, the maximum number of active warps with - * the fewest blocks per multiprocessor), and in \p *minGridSize the - * minimum grid size to achieve the maximum occupancy. - * - * If \p blockSizeLimit is 0, the configurator will use the maximum - * block size permitted by the device / function instead. - * - * If per-block dynamic shared memory allocation is not needed, the - * user should leave both \p blockSizeToDynamicSMemSize and \p - * dynamicSMemSize as 0. - * - * If per-block dynamic shared memory allocation is needed, then if - * the dynamic shared memory size is constant regardless of block - * size, the size should be passed through \p dynamicSMemSize, and \p - * blockSizeToDynamicSMemSize should be NULL. - * - * Otherwise, if the per-block dynamic shared memory size varies with - * different block sizes, the user needs to provide a unary function - * through \p blockSizeToDynamicSMemSize that computes the dynamic - * shared memory needed by \p func for any given block size. \p - * dynamicSMemSize is ignored. 
An example signature is: - * - * \code - * // Take block size, returns dynamic shared memory needed - * size_t blockToSmem(int blockSize); - * \endcode - * - * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy - * \param blockSize - Returned maximum block size that can achieve the maximum occupancy - * \param func - Kernel for which launch configuration is calculated - * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size - * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes - * \param blockSizeLimit - The maximum block size \p func is designed to handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cudaOccupancyMaxPotentialBlockSize - */ -CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); - -/** - * \brief Suggest a launch configuration with reasonable occupancy - * - * An extended version of ::cuOccupancyMaxPotentialBlockSize. In - * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, - * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags - * parameter. - * - * The \p Flags parameter controls how special cases are handled. The - * valid flags are: - * - * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as - * ::cuOccupancyMaxPotentialBlockSize; - * - * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the - * default behavior on platform where global caching affects - * occupancy. On such platforms, the launch configurations that - * produces maximal occupancy might not support global - * caching. 
Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE - * guarantees that the the produced launch configuration is global - * caching compatible at a potential cost of occupancy. More information - * can be found about this feature in the "Unified L1/Texture Cache" - * section of the Maxwell tuning guide. - * - * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy - * \param blockSize - Returned maximum block size that can achieve the maximum occupancy - * \param func - Kernel for which launch configuration is calculated - * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size - * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes - * \param blockSizeLimit - The maximum block size \p func is designed to handle - * \param flags - Options - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cudaOccupancyMaxPotentialBlockSizeWithFlags - */ -CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); - -/** - * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM - * - * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
- * - * \param dynamicSmemSize - Returned maximum dynamic shared memory - * \param func - Kernel function for which occupancy is calculated - * \param numBlocks - Number of blocks to fit on SM - * \param blockSize - Size of the blocks - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - */ -CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); - -/** @} */ /* END CUDA_OCCUPANCY */ - -/** - * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] - * - * ___MANBRIEF___ deprecated texture reference management functions of the - * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the deprecated texture reference management - * functions of the low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Binds an array as a texture reference - * - * \deprecated - * - * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to - * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is - * unbound. 
- * - * \param hTexRef - Texture reference to bind - * \param hArray - Array to bind - * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); - -/** - * \brief Binds a mipmapped array to a texture reference - * - * \deprecated - * - * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. - * Any previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. - * Any CUDA array previously bound to \p hTexRef is unbound. 
- * - * \param hTexRef - Texture reference to bind - * \param hMipmappedArray - Mipmapped array to bind - * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); - -/** - * \brief Binds an address as a texture reference - * - * \deprecated - * - * Binds a linear address range to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. Any memory previously bound to \p hTexRef - * is unbound. - * - * Since the hardware enforces an alignment requirement on texture base - * addresses, ::cuTexRefSetAddress() passes back a byte offset in - * \p *ByteOffset that must be applied to texture fetches in order to read from - * the desired memory. This offset must be divided by the texel size and - * passed to kernels that read from the texture so they can be applied to the - * ::tex1Dfetch() function. - * - * If the device memory pointer was returned from ::cuMemAlloc(), the offset - * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. - * - * The total number of elements (or texels) in the linear address range - * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. 
- * The number of elements is computed as (\p bytes / bytesPerElement), - * where bytesPerElement is determined from the data format and number of - * components set using ::cuTexRefSetFormat(). - * - * \param ByteOffset - Returned byte offset - * \param hTexRef - Texture reference to bind - * \param dptr - Device pointer to bind - * \param bytes - Size of memory to bind in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); - -/** - * \brief Binds an address as a 2D texture reference - * - * \deprecated - * - * Binds a linear address range to the texture reference \p hTexRef. Any - * previous address or CUDA array state associated with the texture reference - * is superseded by this function. Any memory previously bound to \p hTexRef - * is unbound. - * - * Using a ::tex2D() function inside a kernel requires a call to either - * ::cuTexRefSetArray() to bind the corresponding texture reference to an - * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear - * memory. - * - * Function calls to ::cuTexRefSetFormat() cannot follow calls to - * ::cuTexRefSetAddress2D() for the same texture reference. - * - * It is required that \p dptr be aligned to the appropriate hardware-specific - * texture alignment. You can query this value using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is - * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. 
- * - * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. - * This value can be queried using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is - * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Width and Height, which are specified in elements (or texels), cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * \p Pitch, which is specified in bytes, cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. - * - * \param hTexRef - Texture reference to bind - * \param desc - Descriptor of CUDA array - * \param dptr - Device pointer to bind - * \param Pitch - Line pitch in bytes - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture2D - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); - -/** - * \brief Sets the format for a texture reference - * - * \deprecated - * - * Specifies the format of the data to be read by the texture reference - * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the - * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: - * They specify the format of each component and the number of components per - * array element. 
- * - * \param hTexRef - Texture reference - * \param fmt - Format to set - * \param NumPackedComponents - Number of components per array element - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaCreateChannelDesc, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); - -/** - * \brief Sets the addressing mode for a texture reference - * - * \deprecated - * - * Specifies the addressing mode \p am for the given dimension \p dim of the - * texture reference \p hTexRef. If \p dim is zero, the addressing mode is - * applied to the first parameter of the functions used to fetch from the - * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined - * as: - * \code - typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 - } CUaddress_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. - * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only - * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. 
- * - * \param hTexRef - Texture reference - * \param dim - Dimension - * \param am - Addressing mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); - -/** - * \brief Sets the filtering mode for a texture reference - * - * \deprecated - * - * Specifies the filtering mode \p fm to be used when reading memory through - * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: - * - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. 
- * - * \param hTexRef - Texture reference - * \param fm - Filtering mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); - -/** - * \brief Sets the mipmap filtering mode for a texture reference - * - * \deprecated - * - * Specifies the mipmap filtering mode \p fm to be used when reading memory through - * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: - * - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
- * - * \param hTexRef - Texture reference - * \param fm - Filtering mode to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); - -/** - * \brief Sets the mipmap level bias for a texture reference - * - * \deprecated - * - * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when - * reading memory through the texture reference \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
- * - * \param hTexRef - Texture reference - * \param bias - Mipmap level bias - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); - -/** - * \brief Sets the mipmap min/max mipmap level clamps for a texture reference - * - * \deprecated - * - * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp - * respectively, to be used when reading memory through the texture reference - * \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
- * - * \param hTexRef - Texture reference - * \param minMipmapLevelClamp - Mipmap min level clamp - * \param maxMipmapLevelClamp - Mipmap max level clamp - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); - -/** - * \brief Sets the maximum anisotropy for a texture reference - * - * \deprecated - * - * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through - * the texture reference \p hTexRef. - * - * Note that this call has no effect if \p hTexRef is bound to linear memory. 
- * - * \param hTexRef - Texture reference - * \param maxAniso - Maximum anisotropy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); - -/** - * \brief Sets the border color for a texture reference - * - * \deprecated - * - * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference - * \p hTexRef. The color value supports only float type and holds color components in - * the following sequence: - * pBorderColor[0] holds 'R' component - * pBorderColor[1] holds 'G' component - * pBorderColor[2] holds 'B' component - * pBorderColor[3] holds 'A' component - * - * Note that the color values can be set only when the Address mode is set to - * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. - * Applications using integer border color values have to "reinterpret_cast" their values to float. 
- * - * \param hTexRef - Texture reference - * \param pBorderColor - RGBA color - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddressMode, - * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); - -/** - * \brief Sets the flags for a texture reference - * - * \deprecated - * - * Specifies optional flags via \p Flags to specify the behavior of data - * returned through the texture reference \p hTexRef. The valid flags are: - * - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in the - * range [0, 1]. Note that texture with 32-bit integer format - * would not be promoted, regardless of whether or not this - * flag is specified; - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the - * default behavior of having the texture coordinates range - * from [0, Dim) where Dim is the width or height of the CUDA - * array. Instead, the texture coordinates [0, 1.0) reference - * the entire breadth of the array dimension; - * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear - * filtering optimizations. Trilinear optimizations improve texture filtering - * performance by allowing bilinear filtering on textures in scenarios where - * it can closely approximate the expected results. 
- * - * \param hTexRef - Texture reference - * \param Flags - Optional flags to set - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, - * ::cudaBindTexture, - * ::cudaBindTexture2D, - * ::cudaBindTextureToArray, - * ::cudaBindTextureToMipmappedArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); - -/** - * \brief Gets the address associated with a texture reference - * - * \deprecated - * - * Returns in \p *pdptr the base address bound to the texture reference - * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any device memory range. - * - * \param pdptr - Returned device address - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); - -/** - * \brief Gets the array bound to a texture reference - * - * \deprecated - * - * Returns in \p *phArray the CUDA array bound to the texture reference - * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA array. 
- * - * \param phArray - Returned array - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); - -/** - * \brief Gets the mipmapped array bound to a texture reference - * - * \deprecated - * - * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture - * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA mipmapped array. - * - * \param phMipmappedArray - Returned mipmapped array - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); - -/** - * \brief Gets the addressing mode used by a texture reference - * - * \deprecated - * - * Returns in \p *pam the addressing mode corresponding to the - * dimension \p dim of the texture reference \p hTexRef. Currently, the only - * valid value for \p dim are 0 and 1. 
- * - * \param pam - Returned addressing mode - * \param hTexRef - Texture reference - * \param dim - Dimension - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); - -/** - * \brief Gets the filter-mode used by a texture reference - * - * \deprecated - * - * Returns in \p *pfm the filtering mode of the texture reference - * \p hTexRef. - * - * \param pfm - Returned filtering mode - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); - -/** - * \brief Gets the format used by a texture reference - * - * \deprecated - * - * Returns in \p *pFormat and \p *pNumChannels the format and number - * of components of the CUDA array bound to the texture reference \p hTexRef. - * If \p pFormat or \p pNumChannels is NULL, it will be ignored. 
- * - * \param pFormat - Returned format - * \param pNumChannels - Returned number of components - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); - -/** - * \brief Gets the mipmap filtering mode for a texture reference - * - * \deprecated - * - * Returns the mipmap filtering mode in \p pfm that's used when reading memory through - * the texture reference \p hTexRef. - * - * \param pfm - Returned mipmap filtering mode - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); - -/** - * \brief Gets the mipmap level bias for a texture reference - * - * \deprecated - * - * Returns the mipmap level bias in \p pBias that's added to the specified mipmap - * level when reading memory through the texture reference \p hTexRef. 
- * - * \param pbias - Returned mipmap level bias - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); - -/** - * \brief Gets the min/max mipmap level clamps for a texture reference - * - * \deprecated - * - * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp - * that's used when reading memory through the texture reference \p hTexRef. - * - * \param pminMipmapLevelClamp - Returned mipmap min level clamp - * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); - -/** - * \brief Gets the maximum anisotropy for a texture reference - * - * \deprecated - * - * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through - * the texture reference \p hTexRef. 
- * - * \param pmaxAniso - Returned maximum anisotropy - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); - -/** - * \brief Gets the border color used by a texture reference - * - * \deprecated - * - * Returns in \p pBorderColor, values of the RGBA color used by - * the texture reference \p hTexRef. - * The color value is of type float and holds color components in - * the following sequence: - * pBorderColor[0] holds 'R' component - * pBorderColor[1] holds 'G' component - * pBorderColor[2] holds 'B' component - * pBorderColor[3] holds 'A' component - * - * \param hTexRef - Texture reference - * \param pBorderColor - Returned Type and Value of RGBA color - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddressMode, - * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); - -/** - * \brief Gets the flags used by a texture reference - * - * \deprecated - * - * Returns in \p *pFlags the flags of the texture reference \p hTexRef. 
- * - * \param pFlags - Returned flags - * \param hTexRef - Texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefSetAddress, - * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, - * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, - * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, - * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); - -/** - * \brief Creates a texture reference - * - * \deprecated - * - * Creates a texture reference and returns its handle in \p *pTexRef. Once - * created, the application must call ::cuTexRefSetArray() or - * ::cuTexRefSetAddress() to associate the reference with allocated memory. - * Other texture reference functions are used to specify the format and - * interpretation (addressing, filtering, etc.) to be used when the memory is - * read through this texture reference. - * - * \param pTexRef - Returned texture reference - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefDestroy - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); - -/** - * \brief Destroys a texture reference - * - * \deprecated - * - * Destroys the texture reference specified by \p hTexRef. 
- * - * \param hTexRef - Texture reference to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuTexRefCreate - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); - -/** @} */ /* END CUDA_TEXREF_DEPRECATED */ - - -/** - * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] - * - * ___MANBRIEF___ surface reference management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the surface reference management functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Sets the CUDA array for a surface reference. - * - * \deprecated - * - * Sets the CUDA array \p hArray to be read and written by the surface reference - * \p hSurfRef. Any previous CUDA array state associated with the surface - * reference is superseded by this function. \p Flags must be set to 0. - * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. - * Any CUDA array previously bound to \p hSurfRef is unbound. - - * \param hSurfRef - Surface reference handle - * \param hArray - CUDA array handle - * \param Flags - set to 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuModuleGetSurfRef, - * ::cuSurfRefGetArray, - * ::cudaBindSurfaceToArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); - -/** - * \brief Passes back the CUDA array bound to a surface reference. - * - * \deprecated - * - * Returns in \p *phArray the CUDA array bound to the surface reference - * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference - * is not bound to any CUDA array. 
- - * \param phArray - Surface reference handle - * \param hSurfRef - Surface reference handle - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray - */ -__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); - -/** @} */ /* END CUDA_SURFREF_DEPRECATED */ - -/** - * \defgroup CUDA_TEXOBJECT Texture Object Management - * - * ___MANBRIEF___ texture object management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the texture object management functions of the - * low-level CUDA driver application programming interface. The texture - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a texture object - * - * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes - * the data to texture from. \p pTexDesc describes how the data should be sampled. - * \p pResViewDesc is an optional argument that specifies an alternate format for - * the data described by \p pResDesc, and also describes the subresource region - * to restrict access to when texturing. \p pResViewDesc can only be specified if - * the type of resource is a CUDA array or a CUDA mipmapped array. - * - * Texture objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a texture object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. 
- * - * The ::CUDA_RESOURCE_DESC structure is defined as: - * \code - typedef struct CUDA_RESOURCE_DESC_st - { - CUresourcetype resType; - - union { - struct { - CUarray hArray; - } array; - struct { - CUmipmappedArray hMipmappedArray; - } mipmap; - struct { - CUdeviceptr devPtr; - CUarray_format format; - unsigned int numChannels; - size_t sizeInBytes; - } linear; - struct { - CUdeviceptr devPtr; - CUarray_format format; - unsigned int numChannels; - size_t width; - size_t height; - size_t pitchInBytes; - } pitch2D; - } res; - - unsigned int flags; - } CUDA_RESOURCE_DESC; - - * \endcode - * where: - * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. - * CUresourceType is defined as: - * \code - typedef enum CUresourcetype_enum { - CU_RESOURCE_TYPE_ARRAY = 0x00, - CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, - CU_RESOURCE_TYPE_LINEAR = 0x02, - CU_RESOURCE_TYPE_PITCH2D = 0x03 - } CUresourcetype; - * \endcode - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray - * must be set to a valid CUDA array handle. - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray - * must be set to a valid CUDA mipmapped array handle. - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr - * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. - * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels - * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes - * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. 
The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). - * - * \par - * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr - * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. - * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels - * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width - * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed - * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. - * - * - ::flags must be set to zero. - * - * - * The ::CUDA_TEXTURE_DESC struct is defined as - * \code - typedef struct CUDA_TEXTURE_DESC_st { - CUaddress_mode addressMode[3]; - CUfilter_mode filterMode; - unsigned int flags; - unsigned int maxAnisotropy; - CUfilter_mode mipmapFilterMode; - float mipmapLevelBias; - float minMipmapLevelClamp; - float maxMipmapLevelClamp; - } CUDA_TEXTURE_DESC; - * \endcode - * where - * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: - * \code - typedef enum CUaddress_mode_enum { - CU_TR_ADDRESS_MODE_WRAP = 0, - CU_TR_ADDRESS_MODE_CLAMP = 1, - CU_TR_ADDRESS_MODE_MIRROR = 2, - CU_TR_ADDRESS_MODE_BORDER = 3 - } CUaddress_mode; - * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. 
Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES - * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. - * - * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: - * \code - typedef enum CUfilter_mode_enum { - CU_TR_FILTER_MODE_POINT = 0, - CU_TR_FILTER_MODE_LINEAR = 1 - } CUfilter_mode; - * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. - * - * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: - * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in the - * range [0, 1]. Note that texture with 32-bit integer format would not be - * promoted, regardless of whether or not this flag is specified. - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior - * of having the texture coordinates range from [0, Dim) where Dim is the - * width or height of the CUDA array. Instead, the texture coordinates - * [0, 1.0) reference the entire breadth of the array dimension; Note that - * for CUDA mipmapped arrays, this flag has to be set. - * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear - * filtering optimizations. Trilinear optimizations improve texture filtering - * performance by allowing bilinear filtering on textures in scenarios where - * it can closely approximate the expected results. - * - * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be - * clamped to the range [1,16]. - * - * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. - * - * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. 
- * - * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. - * - * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. - * - * - * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as - * \code - typedef struct CUDA_RESOURCE_VIEW_DESC_st - { - CUresourceViewFormat format; - size_t width; - size_t height; - size_t depth; - unsigned int firstMipmapLevel; - unsigned int lastMipmapLevel; - unsigned int firstLayer; - unsigned int lastLayer; - } CUDA_RESOURCE_VIEW_DESC; - * \endcode - * where: - * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should - * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block - * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. - * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have - * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base - * format but with 4 channels. - * - * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block - * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, - * this value has to be equal to that of the original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block - * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, - * this value has to be equal to that of the original resource. 
- * - * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the - * original resource. - * - * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. - * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp - * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, - * then the actual minimum mipmap level clamp will be 3.2. - * - * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value - * has to be zero. - * - * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. - * For non-layered resources, this value has to be zero. - * - * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, - * this value has to be zero. - * - * - * \param pTexObject - Texture object to create - * \param pResDesc - Resource descriptor - * \param pTexDesc - Texture descriptor - * \param pResViewDesc - Resource view descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectDestroy, - * ::cudaCreateTextureObject - */ -CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); - -/** - * \brief Destroys a texture object - * - * Destroys the texture object specified by \p texObject. 
- * - * \param texObject - Texture object to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaDestroyTextureObject - */ -CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); - -/** - * \brief Returns a texture object's resource descriptor - * - * Returns the resource descriptor for the texture object specified by \p texObject. - * - * \param pResDesc - Resource descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectResourceDesc, - */ -CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); - -/** - * \brief Returns a texture object's texture descriptor - * - * Returns the texture descriptor for the texture object specified by \p texObject. - * - * \param pTexDesc - Texture descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectTextureDesc - */ -CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); - -/** - * \brief Returns a texture object's resource view descriptor - * - * Returns the resource view descriptor for the texture object specified by \p texObject. - * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. 
- * - * \param pResViewDesc - Resource view descriptor - * \param texObject - Texture object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuTexObjectCreate, - * ::cudaGetTextureObjectResourceViewDesc - */ -CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); - -/** @} */ /* END CUDA_TEXOBJECT */ - -/** - * \defgroup CUDA_SURFOBJECT Surface Object Management - * - * ___MANBRIEF___ surface object management functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the surface object management functions of the - * low-level CUDA driver application programming interface. The surface - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a surface object - * - * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes - * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be - * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray - * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. - * - * Surface objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a surface object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. 
- * - * \param pSurfObject - Surface object to create - * \param pResDesc - Resource descriptor - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectDestroy, - * ::cudaCreateSurfaceObject - */ -CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); - -/** - * \brief Destroys a surface object - * - * Destroys the surface object specified by \p surfObject. - * - * \param surfObject - Surface object to destroy - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectCreate, - * ::cudaDestroySurfaceObject - */ -CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); - -/** - * \brief Returns a surface object's resource descriptor - * - * Returns the resource descriptor for the surface object specified by \p surfObject. - * - * \param pResDesc - Resource descriptor - * \param surfObject - Surface object - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE - * - * \sa - * ::cuSurfObjectCreate, - * ::cudaGetSurfaceObjectResourceDesc - */ -CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); - -/** @} */ /* END CUDA_SURFOBJECT */ - -/** - * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access - * - * ___MANBRIEF___ direct peer context memory access functions of the low-level - * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the direct peer context memory access functions - * of the low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Queries if a device may directly access a peer device's memory. 
- * - * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of - * directly accessing memory from contexts on \p peerDev and 0 otherwise. - * If direct access of \p peerDev from \p dev is possible, then access may be - * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). - * - * \param canAccessPeer - Returned access capability - * \param dev - Device from which allocations on \p peerDev are to - * be directly accessed. - * \param peerDev - Device on which the allocations to be directly accessed - * by \p dev reside. - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE - * \notefnerr - * - * \sa - * ::cuCtxEnablePeerAccess, - * ::cuCtxDisablePeerAccess, - * ::cudaDeviceCanAccessPeer - */ -CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); - -/** - * \brief Enables direct access to memory allocations in a peer context. - * - * If both the current context and \p peerContext are on devices which support unified - * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same - * major compute capability, then on success all allocations from \p peerContext will - * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional - * details. - * - * Note that access granted by this call is unidirectional and that in order to access - * memory from the current context in \p peerContext, a separate symmetric call - * to ::cuCtxEnablePeerAccess() is required. - * - * Note that there are both device-wide and system-wide limitations per system - * configuration, as noted in the CUDA Programming Guide under the section - * "Peer-to-Peer Memory Access". - * - * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates - * that the ::CUdevice of the current context cannot directly access memory - * from the ::CUdevice of \p peerContext. 
- * - * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of - * \p peerContext from the current context has already been enabled. - * - * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible - * because hardware resources required for peer access have been exhausted. - * - * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext - * is not a valid context, or if the current context is \p peerContext. - * - * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. - * - * \param peerContext - Peer context to enable direct access to from the current context - * \param Flags - Reserved for future use and must be set to 0 - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, - * ::CUDA_ERROR_TOO_MANY_PEERS, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuDeviceCanAccessPeer, - * ::cuCtxDisablePeerAccess, - * ::cudaDeviceEnablePeerAccess - */ -CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); - -/** - * \brief Disables direct access to memory allocations in a peer context and - * unregisters any registered allocations. - * - Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has - * not yet been enabled from \p peerContext to the current context. - * - * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if - * \p peerContext is not a valid context. 
- * - * \param peerContext - Peer context to disable direct access to - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * \notefnerr - * - * \sa - * ::cuDeviceCanAccessPeer, - * ::cuCtxEnablePeerAccess, - * ::cudaDeviceDisablePeerAccess - */ -CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); - -/** - * \brief Queries attributes of the link between two devices. - * - * Returns in \p *value the value of the requested attribute \p attrib of the - * link between \p srcDevice and \p dstDevice. The supported attributes are: - * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the - * performance of the link between two devices. - * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. - * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over - * the link are supported. - * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can - * be accessed over the link. - * - * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid - * or if they represent the same device. - * - * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is - * a null pointer. - * - * \param value - Returned value of the requested attribute - * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. - * \param srcDevice - The source device of the target link. - * \param dstDevice - The destination device of the target link. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_DEVICE, - * ::CUDA_ERROR_INVALID_VALUE - * \notefnerr - * - * \sa - * ::cuCtxEnablePeerAccess, - * ::cuCtxDisablePeerAccess, - * ::cuDeviceCanAccessPeer, - * ::cudaDeviceGetP2PAttribute - */ -CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); - -/** @} */ /* END CUDA_PEER_ACCESS */ - -/** - * \defgroup CUDA_GRAPHICS Graphics Interoperability - * - * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA - * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graphics interoperability functions of the - * low-level CUDA driver application programming interface. - * - * @{ - */ - -/** - * \brief Unregisters a graphics resource for access by CUDA - * - * Unregisters the graphics resource \p resource so it is not accessible by - * CUDA unless registered again. - * - * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is - * returned. - * - * \param resource - Resource to unregister - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_UNKNOWN - * \notefnerr - * - * \sa - * ::cuGraphicsD3D9RegisterResource, - * ::cuGraphicsD3D10RegisterResource, - * ::cuGraphicsD3D11RegisterResource, - * ::cuGraphicsGLRegisterBuffer, - * ::cuGraphicsGLRegisterImage, - * ::cudaGraphicsUnregisterResource - */ -CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); - -/** - * \brief Get an array through which to access a subresource of a mapped graphics resource. - * - * Returns in \p *pArray an array through which the subresource of the mapped - * graphics resource \p resource which corresponds to array index \p arrayIndex - * and mipmap level \p mipLevel may be accessed. 
The value set in \p *pArray may - * change every time that \p resource is mapped. - * - * If \p resource is not a texture then it cannot be accessed via an array and - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. - * If \p arrayIndex is not a valid array index for \p resource then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * If \p mipLevel is not a valid mipmap level for \p resource then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. - * - * \param pArray - Returned array through which a subresource of \p resource may be accessed - * \param resource - Mapped resource to access - * \param arrayIndex - Array index for array textures or cubemap face - * index as defined by ::CUarray_cubemap_face for - * cubemap textures for the subresource to access - * \param mipLevel - Mipmap level for the subresource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cudaGraphicsSubResourceGetMappedArray - */ -CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); - -/** - * \brief Get a mipmapped array through which to access a mapped graphics resource. - * - * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics - * resource \p resource. The value set in \p *pMipmappedArray may change every time - * that \p resource is mapped. - * - * If \p resource is not a texture then it cannot be accessed via a mipmapped array and - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. 
- * - * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed - * \param resource - Mapped resource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cudaGraphicsResourceGetMappedMipmappedArray - */ -CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); - -/** - * \brief Get a device pointer through which to access a mapped graphics resource. - * - * Returns in \p *pDevPtr a pointer through which the mapped graphics resource - * \p resource may be accessed. - * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. - * The value set in \p pPointer may change every time that \p resource is mapped. - * - * If \p resource is not a buffer then it cannot be accessed via a pointer and - * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. - * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. 
- * * - * \param pDevPtr - Returned pointer through which \p resource may be accessed - * \param pSize - Returned size of the buffer accessible starting at \p *pPointer - * \param resource - Mapped resource to access - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cuGraphicsSubResourceGetMappedArray, - * ::cudaGraphicsResourceGetMappedPointer - */ -CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); - -/** - * \brief Set usage flags for mapping a graphics resource - * - * Set \p flags for mapping the graphics resource \p resource. - * - * Changes to \p flags will take effect the next time \p resource is mapped. - * The \p flags argument may be any of the following: - - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this - * resource will be used. It is therefore assumed that this resource will be - * read from and written to by CUDA kernels. This is the default value. - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which - * access this resource will not write to this resource. - * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels - * which access this resource will not read from this resource and will - * write over the entire contents of the resource, so none of the data - * previously stored in the resource will be preserved. - * - * If \p resource is presently mapped for access by CUDA then - * ::CUDA_ERROR_ALREADY_MAPPED is returned. - * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. 
- * - * \param resource - Registered resource to set flags for - * \param flags - Parameters for resource mapping - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ALREADY_MAPPED - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cudaGraphicsResourceSetMapFlags - */ -CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); - -/** - * \brief Map graphics resources for access by CUDA - * - * Maps the \p count graphics resources in \p resources for access by CUDA. - * - * The resources in \p resources may be accessed by CUDA until they - * are unmapped. The graphics API from which \p resources were registered - * should not access any resources while they are mapped by CUDA. If an - * application does so, the results are undefined. - * - * This function provides the synchronization guarantee that any graphics calls - * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA - * work issued in \p stream begins. - * - * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. - * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. 
- * - * \param count - Number of resources to map - * \param resources - Resources to map for CUDA usage - * \param hStream - Stream with which to synchronize - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ALREADY_MAPPED, - * ::CUDA_ERROR_UNKNOWN - * \note_null_stream - * \notefnerr - * - * \sa - * ::cuGraphicsResourceGetMappedPointer, - * ::cuGraphicsSubResourceGetMappedArray, - * ::cuGraphicsUnmapResources, - * ::cudaGraphicsMapResources - */ -CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - -/** - * \brief Unmap graphics resources. - * - * Unmaps the \p count graphics resources in \p resources. - * - * Once unmapped, the resources in \p resources may not be accessed by CUDA - * until they are mapped again. - * - * This function provides the synchronization guarantee that any CUDA work issued - * in \p stream before ::cuGraphicsUnmapResources() will complete before any - * subsequently issued graphics work begins. - * - * - * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. - * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. 
- * - * \param count - Number of resources to unmap - * \param resources - Resources to unmap - * \param hStream - Stream with which to synchronize - * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_DEINITIALIZED, - * ::CUDA_ERROR_NOT_INITIALIZED, - * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_NOT_MAPPED, - * ::CUDA_ERROR_UNKNOWN - * \note_null_stream - * \notefnerr - * - * \sa - * ::cuGraphicsMapResources, - * ::cudaGraphicsUnmapResources - */ -CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - -/** @} */ /* END CUDA_GRAPHICS */ - -/** - * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access - * - * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the driver entry point access functions of the low-level CUDA - * driver application programming interface. - * - * @{ - */ - -/** - * \brief Returns the requested driver API function pointer - * - * Returns in \p **pfn the address of the CUDA driver function for the requested - * CUDA version and flags. - * - * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 - * should be specified as 11020. For a requested driver symbol, if the specified - * CUDA version is greater than or equal to the CUDA version in which the driver symbol - * was introduced, this API will return the function pointer to the corresponding - * versioned function. - * - * The pointer returned by the API should be cast to a function pointer matching the - * requested driver function's definition in the API header file. The function pointer - * typedef can be picked up from the corresponding typedefs header file. For example, - * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. 
- * - * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not - * supported on the platform, no ABI compatible driver function exists for the specified - * \p cudaVersion or if the driver symbol is invalid. - * - * The requested flags can be: - * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to - * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with - * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM - * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. - * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols - * that match the requested driver symbol name except the corresponding per-thread versions. - * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all - * driver symbols that match the requested driver symbol name including the per-thread - * versions. If a per-thread version is not found, the API will return the legacy version - * of the driver function. - * - * \param symbol - The base name of the driver API function to look for. As an example, - * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and - * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. - * \param pfn - Location to return the function pointer to the requested driver function - * \param cudaVersion - The CUDA version to look for the requested driver symbol - * \param flags - Flags to specify search options. 
- * - * \return - * ::CUDA_SUCCESS, - * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_NOT_SUPPORTED, - * ::CUDA_ERROR_NOT_FOUND - * \note_version_mixing - * - * \sa - * ::cudaGetDriverEntryPoint - */ -CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); - -/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ - -CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); - -/** - * CUDA API versioning support - */ -#if defined(__CUDA_API_VERSION_INTERNAL) - #undef cuMemHostRegister - #undef cuGraphicsResourceSetMapFlags - #undef cuLinkCreate - #undef cuLinkAddData - #undef cuLinkAddFile - #undef cuDeviceTotalMem - #undef cuCtxCreate - #undef cuModuleGetGlobal - #undef cuMemGetInfo - #undef cuMemAlloc - #undef cuMemAllocPitch - #undef cuMemFree - #undef cuMemGetAddressRange - #undef cuMemAllocHost - #undef cuMemHostGetDevicePointer - #undef cuMemcpyHtoD - #undef cuMemcpyDtoH - #undef cuMemcpyDtoD - #undef cuMemcpyDtoA - #undef cuMemcpyAtoD - #undef cuMemcpyHtoA - #undef cuMemcpyAtoH - #undef cuMemcpyAtoA - #undef cuMemcpyHtoAAsync - #undef cuMemcpyAtoHAsync - #undef cuMemcpy2D - #undef cuMemcpy2DUnaligned - #undef cuMemcpy3D - #undef cuMemcpyHtoDAsync - #undef cuMemcpyDtoHAsync - #undef cuMemcpyDtoDAsync - #undef cuMemcpy2DAsync - #undef cuMemcpy3DAsync - #undef cuMemsetD8 - #undef cuMemsetD16 - #undef cuMemsetD32 - #undef cuMemsetD2D8 - #undef cuMemsetD2D16 - #undef cuMemsetD2D32 - #undef cuArrayCreate - #undef cuArrayGetDescriptor - #undef cuArray3DCreate - #undef cuArray3DGetDescriptor - #undef cuTexRefSetAddress - #undef cuTexRefSetAddress2D - #undef cuTexRefGetAddress - #undef cuGraphicsResourceGetMappedPointer - #undef cuCtxDestroy - #undef cuCtxPopCurrent - #undef cuCtxPushCurrent - #undef cuStreamDestroy - #undef cuEventDestroy - #undef cuMemcpy - #undef cuMemcpyAsync - #undef cuMemcpyPeer - #undef cuMemcpyPeerAsync - #undef cuMemcpy3DPeer - #undef cuMemcpy3DPeerAsync - #undef 
cuMemsetD8Async - #undef cuMemsetD16Async - #undef cuMemsetD32Async - #undef cuMemsetD2D8Async - #undef cuMemsetD2D16Async - #undef cuMemsetD2D32Async - #undef cuStreamGetPriority - #undef cuStreamGetFlags - #undef cuStreamGetCtx - #undef cuStreamWaitEvent - #undef cuStreamAddCallback - #undef cuStreamAttachMemAsync - #undef cuStreamQuery - #undef cuStreamSynchronize - #undef cuEventRecord - #undef cuEventRecordWithFlags - #undef cuLaunchKernel - #undef cuLaunchHostFunc - #undef cuGraphicsMapResources - #undef cuGraphicsUnmapResources - #undef cuStreamWriteValue32 - #undef cuStreamWaitValue32 - #undef cuStreamWriteValue64 - #undef cuStreamWaitValue64 - #undef cuStreamBatchMemOp - #undef cuMemPrefetchAsync - #undef cuLaunchCooperativeKernel - #undef cuSignalExternalSemaphoresAsync - #undef cuWaitExternalSemaphoresAsync - #undef cuStreamBeginCapture - #undef cuStreamEndCapture - #undef cuStreamIsCapturing - #undef cuStreamGetCaptureInfo - #undef cuStreamGetCaptureInfo_v2 - #undef cuGraphUpload - #undef cuGraphLaunch - #undef cuDevicePrimaryCtxRelease - #undef cuDevicePrimaryCtxReset - #undef cuDevicePrimaryCtxSetFlags - #undef cuIpcOpenMemHandle - #undef cuStreamCopyAttributes - #undef cuStreamSetAttribute - #undef cuStreamGetAttribute - #undef cuGraphInstantiate - #undef cuMemMapArrayAsync - #undef cuMemFreeAsync - #undef cuMemAllocAsync - #undef cuMemAllocFromPoolAsync - #undef cuStreamUpdateCaptureDependencies - - CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); - CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); - CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); - CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, - unsigned int numOptions, CUjit_option *options, void **optionValues); - CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType 
type, const char *path, - unsigned int numOptions, CUjit_option *options, void **optionValues); - CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); - - typedef unsigned int CUdeviceptr_v1; - - typedef struct CUDA_MEMCPY2D_v1_st - { - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr_v1 srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - - unsigned int dstXInBytes; /**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - - unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ - unsigned int Height; /**< Height of 2D memory copy */ - } CUDA_MEMCPY2D_v1; - - typedef struct CUDA_MEMCPY3D_v1_st - { - unsigned int srcXInBytes; /**< Source X in bytes */ - unsigned int srcY; /**< Source Y */ - unsigned int srcZ; /**< Source Z */ - unsigned int srcLOD; /**< Source LOD */ - CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ - const void *srcHost; /**< Source host pointer */ - CUdeviceptr_v1 srcDevice; /**< Source device pointer */ - CUarray srcArray; /**< Source array reference */ - void *reserved0; /**< Must be NULL */ - unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ - unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ - - unsigned int dstXInBytes; 
/**< Destination X in bytes */ - unsigned int dstY; /**< Destination Y */ - unsigned int dstZ; /**< Destination Z */ - unsigned int dstLOD; /**< Destination LOD */ - CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ - void *dstHost; /**< Destination host pointer */ - CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ - CUarray dstArray; /**< Destination array reference */ - void *reserved1; /**< Must be NULL */ - unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ - unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ - - unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ - unsigned int Height; /**< Height of 3D memory copy */ - unsigned int Depth; /**< Depth of 3D memory copy */ - } CUDA_MEMCPY3D_v1; - - typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st - { - unsigned int Width; /**< Width of array */ - unsigned int Height; /**< Height of array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - } CUDA_ARRAY_DESCRIPTOR_v1; - - typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st - { - unsigned int Width; /**< Width of 3D array */ - unsigned int Height; /**< Height of 3D array */ - unsigned int Depth; /**< Depth of 3D array */ - - CUarray_format Format; /**< Array format */ - unsigned int NumChannels; /**< Channels per array element */ - unsigned int Flags; /**< Flags */ - } CUDA_ARRAY3D_DESCRIPTOR_v1; - - CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); - CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); - CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); - CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); - CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); - CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int 
*pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); - CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); - CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); - CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); - CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); - CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); - CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); - CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); - CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); - CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoHAsync(void 
*dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); - CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); - CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); - CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); - CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); - CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); - CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); - CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); - CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); - CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); - CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); - CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); - - CUresult CUDAAPI 
cuCtxDestroy(CUcontext ctx); - CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); - CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); - CUresult CUDAAPI cuStreamDestroy(CUstream hStream); - CUresult CUDAAPI cuEventDestroy(CUevent hEvent); - CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); - CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); - CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); - - CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); - CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); - CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); - CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); - CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - 
CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); - CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); - CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); - CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); - CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); - CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); - CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); - CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); - CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); - CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); - CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); - CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); - - CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, 
size_t Height, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); - CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); - - CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); - CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); - CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); - CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); - CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); - CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); - CUresult CUDAAPI cuStreamQuery(CUstream hStream); - CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); - CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); - CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); - CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); - CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); - CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); - CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); - CUresult CUDAAPI 
cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); - CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); - CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); - CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); - CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); - CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); - CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); - CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); - CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); - CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); - CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); - CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); - CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); - CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); - CUresult CUDAAPI 
cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); - CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); - CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); - - CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); - CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); - CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); - - CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); - CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); - CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); - - CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); -#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) -static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { - const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| - CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); - if ((flags & procAddressMask) == 0) { - flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; - } - return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); -} -#define cuGetProcAddress cuGetProcAddress_ptsz -#endif - -#ifdef __cplusplus -} -#endif - -#if defined(__GNUC__) - #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) - #pragma GCC visibility pop - #endif -#endif - -#undef __CUDA_DEPRECATED - -#endif /* __cuda_cuda_h__ */ diff --git a/include/triton/external/CUDA/nvml.h b/include/triton/external/CUDA/nvml.h deleted file mode 100755 index 0b38f5f8a..000000000 --- a/include/triton/external/CUDA/nvml.h +++ /dev/null @@ -1,6281 
+0,0 @@ -/* - * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO USER: - * - * This source code is subject to NVIDIA ownership rights under U.S. and - * international Copyright laws. Users and possessors of this source code - * are hereby granted a nonexclusive, royalty-free license to use this code - * in individual and commercial software. - * - * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE - * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR - * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH - * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, - * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE - * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE - * OR PERFORMANCE OF THIS SOURCE CODE. - * - * U.S. Government End Users. This source code is a "commercial item" as - * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - * "commercial computer software" and "commercial computer software - * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) - * and is provided to the U.S. Government only as a commercial end item. - * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through - * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the - * source code with only those rights set forth herein. - * - * Any use of this source code in individual and commercial software must - * include, in the user documentation and internal comments to the code, - * the above Disclaimer and U.S. Government End Users Notice. 
- */ - -/* -NVML API Reference - -The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and -managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building -3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi -tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. - -API Documentation - -Supported platforms: -- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit -- Linux: 32-bit and 64-bit -- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 - -Supported products: -- Full Support - - All Tesla products, starting with the Fermi architecture - - All Quadro products, starting with the Fermi architecture - - All GRID products, starting with the Kepler architecture - - Selected GeForce Titan products -- Limited Support - - All Geforce products, starting with the Fermi architecture - -The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is -not be added to the system path by default. To dynamically link to NVML, add this path to the PATH -environmental variable. To dynamically load NVML, call LoadLibrary with this path. - -On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit -and 64 bit NVML libraries will be installed. 
- -Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html -*/ - -#ifndef __nvml_nvml_h__ -#define __nvml_nvml_h__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * On Windows, set up methods for DLL export - * define NVML_STATIC_IMPORT when using nvml_loader library - */ -#if defined _WINDOWS - #if !defined NVML_STATIC_IMPORT - #if defined NVML_LIB_EXPORT - #define DECLDIR __declspec(dllexport) - #else - #define DECLDIR __declspec(dllimport) - #endif - #else - #define DECLDIR - #endif -#else - #define DECLDIR -#endif - -/** - * NVML API versioning support - */ -#define NVML_API_VERSION 10 -#define NVML_API_VERSION_STR "10" -#define nvmlInit nvmlInit_v2 -#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 -#define nvmlDeviceGetCount nvmlDeviceGetCount_v2 -#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 -#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 -#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 -#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceStructs Device Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Special constant that some fields take when they are not available. - * Used when only part of the struct is not available. - * - * Each structure explicitly states when to check for this value. - */ -#define NVML_VALUE_NOT_AVAILABLE (-1) - -typedef struct nvmlDevice_st* nvmlDevice_t; - -/** - * Buffer size guaranteed to be large enough for pci bus id - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 - -/** - * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 - -/** - * PCI information about a GPU device. 
- */ -typedef struct nvmlPciInfo_st -{ - char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) - unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff - unsigned int bus; //!< The bus on which the device resides, 0 to 0xff - unsigned int device; //!< The device's id on the bus, 0 to 31 - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - - // Added in NVML 2.285 API - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) -} nvmlPciInfo_t; - -/** - * PCI format string for ::busIdLegacy - */ -#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" - -/** - * PCI format string for ::busId - */ -#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" - -/** - * Utility macro for filling the pci bus id format from a nvmlPciInfo_t - */ -#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ - (pciInfo)->bus, \ - (pciInfo)->device - -/** - * Detailed ECC error counts for a device. - * - * @deprecated Different GPU families can have different memory error counters - * See \ref nvmlDeviceGetMemoryErrorCounter - */ -typedef struct nvmlEccErrorCounts_st -{ - unsigned long long l1Cache; //!< L1 cache errors - unsigned long long l2Cache; //!< L2 cache errors - unsigned long long deviceMemory; //!< Device memory errors - unsigned long long registerFile; //!< Register file errors -} nvmlEccErrorCounts_t; - -/** - * Utilization information for a device. - * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
- */ -typedef struct nvmlUtilization_st -{ - unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU - unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written -} nvmlUtilization_t; - -/** - * Memory allocation information for a device. - */ -typedef struct nvmlMemory_st -{ - unsigned long long total; //!< Total installed FB memory (in bytes) - unsigned long long free; //!< Unallocated FB memory (in bytes) - unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping -} nvmlMemory_t; - -/** - * BAR1 Memory allocation Information for a device - */ -typedef struct nvmlBAR1Memory_st -{ - unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) - unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) - unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) -}nvmlBAR1Memory_t; - -/** - * Information about running compute processes on the GPU - */ -typedef struct nvmlProcessInfo_st -{ - unsigned int pid; //!< Process ID - unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. - //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported - //! 
because Windows KMD manages all the memory and not the NVIDIA driver -} nvmlProcessInfo_t; - -/** - * Enum to represent type of bridge chip - */ -typedef enum nvmlBridgeChipType_enum -{ - NVML_BRIDGE_CHIP_PLX = 0, - NVML_BRIDGE_CHIP_BRO4 = 1 -}nvmlBridgeChipType_t; - -/** - * Maximum number of NvLink links supported - */ -#define NVML_NVLINK_MAX_LINKS 6 - -/** - * Enum to represent the NvLink utilization counter packet units - */ -typedef enum nvmlNvLinkUtilizationCountUnits_enum -{ - NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles - NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets - NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes - - // this must be last - NVML_NVLINK_COUNTER_UNIT_COUNT -} nvmlNvLinkUtilizationCountUnits_t; - -/** - * Enum to represent the NvLink utilization counter packet types to count - * ** this is ONLY applicable with the units as packets or bytes - * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t - * ** all packet filter descriptions are target GPU centric - * ** these can be "OR'd" together - */ -typedef enum nvmlNvLinkUtilizationCountPktTypes_enum -{ - NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets - NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets - NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets - NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests - NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data - NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data - NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets -} nvmlNvLinkUtilizationCountPktTypes_t; - -/** - * Struct to define the NVLINK counter controls - */ -typedef struct nvmlNvLinkUtilizationControl_st -{ - nvmlNvLinkUtilizationCountUnits_t units; - nvmlNvLinkUtilizationCountPktTypes_t pktfilter; -} 
nvmlNvLinkUtilizationControl_t; - -/** - * Enum to represent NvLink queryable capabilities - */ -typedef enum nvmlNvLinkCapability_enum -{ - NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported - NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported - NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported - NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported - NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link - NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device - // should be last - NVML_NVLINK_CAP_COUNT -} nvmlNvLinkCapability_t; - -/** - * Enum to represent NvLink queryable error counters - */ -typedef enum nvmlNvLinkErrorCounter_enum -{ - NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter - NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter - NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter - NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter - - // this must be last - NVML_NVLINK_ERROR_COUNT -} nvmlNvLinkErrorCounter_t; - -/** - * Represents level relationships within a system between two GPUs - * The enums are spaced to allow for future relationships - */ -typedef enum nvmlGpuLevel_enum -{ - NVML_TOPOLOGY_INTERNAL = 0, // e.g. 
Tesla K80 - NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch - NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge - NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge - NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges - NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system - - // there is purposefully no COUNT here because of the need for spacing above -} nvmlGpuTopologyLevel_t; - -/* Compatibility for CPU->NODE renaming */ -#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE - -/* P2P Capability Index Status*/ -typedef enum nvmlGpuP2PStatus_enum -{ - NVML_P2P_STATUS_OK = 0, - NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, - NVML_P2P_STATUS_GPU_NOT_SUPPORTED, - NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, - NVML_P2P_STATUS_DISABLED_BY_REGKEY, - NVML_P2P_STATUS_NOT_SUPPORTED, - NVML_P2P_STATUS_UNKNOWN - -} nvmlGpuP2PStatus_t; - -/* P2P Capability Index*/ -typedef enum nvmlGpuP2PCapsIndex_enum -{ - NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN -}nvmlGpuP2PCapsIndex_t; - -/** - * Maximum limit on Physical Bridges per Board - */ -#define NVML_MAX_PHYSICAL_BRIDGE (128) - -/** - * Information about the Bridge Chip Firmware - */ -typedef struct nvmlBridgeChipInfo_st -{ - nvmlBridgeChipType_t type; //!< Type of Bridge Chip - unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable -}nvmlBridgeChipInfo_t; - -/** - * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate - * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
- */ -typedef struct nvmlBridgeChipHierarchy_st -{ - unsigned char bridgeCount; //!< Number of Bridge Chips on the Board - nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board -}nvmlBridgeChipHierarchy_t; - -/** - * Represents Type of Sampling Event - */ -typedef enum nvmlSamplingType_enum -{ - NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU - NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU - NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written - NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy - NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy - NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples - NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples - - // Keep this last - NVML_SAMPLINGTYPE_COUNT -}nvmlSamplingType_t; - -/** - * Represents the queryable PCIe utilization counters - */ -typedef enum nvmlPcieUtilCounter_enum -{ - NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity - NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity - - // Keep this last - NVML_PCIE_UTIL_COUNT -} nvmlPcieUtilCounter_t; - -/** - * Represents the type for sample value returned - */ -typedef enum nvmlValueType_enum -{ - NVML_VALUE_TYPE_DOUBLE = 0, - NVML_VALUE_TYPE_UNSIGNED_INT = 1, - NVML_VALUE_TYPE_UNSIGNED_LONG = 2, - NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, - NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, - - // Keep this last - NVML_VALUE_TYPE_COUNT -}nvmlValueType_t; - - -/** - * Union to represent different types of Value - */ -typedef union nvmlValue_st -{ - double dVal; //!< If the value is double - unsigned int uiVal; //!< If the value is unsigned int - unsigned long ulVal; //!< If the value is unsigned long - unsigned long 
long ullVal; //!< If the value is unsigned long long - signed long long sllVal; //!< If the value is signed long long -}nvmlValue_t; - -/** - * Information for Sample - */ -typedef struct nvmlSample_st -{ - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t sampleValue; //!< Sample Value -}nvmlSample_t; - -/** - * Represents type of perf policy for which violation times can be queried - */ -typedef enum nvmlPerfPolicyType_enum -{ - NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks - NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks - NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks - NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks - NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks - NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks - - NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) - NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks - - // Keep this last - NVML_PERF_POLICY_COUNT -}nvmlPerfPolicyType_t; - -/** - * Struct to hold perf policy violation status data - */ -typedef struct nvmlViolationTime_st -{ - unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds - unsigned long long violationTime; //!< violationTime in Nanoseconds -}nvmlViolationTime_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceEnumvs Device Enums - * @{ - */ 
-/***************************************************************************************************/ - -/** - * Generic enable/disable enum. - */ -typedef enum nvmlEnableState_enum -{ - NVML_FEATURE_DISABLED = 0, //!< Feature disabled - NVML_FEATURE_ENABLED = 1 //!< Feature enabled -} nvmlEnableState_t; - -//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. -#define nvmlFlagDefault 0x00 -//! Generic flag used to force some behavior. See description of particular functions for details. -#define nvmlFlagForce 0x01 - -/** - * * The Brand of the GPU - * */ -typedef enum nvmlBrandType_enum -{ - NVML_BRAND_UNKNOWN = 0, - NVML_BRAND_QUADRO = 1, - NVML_BRAND_TESLA = 2, - NVML_BRAND_NVS = 3, - NVML_BRAND_GRID = 4, - NVML_BRAND_GEFORCE = 5, - NVML_BRAND_TITAN = 6, - - // Keep this last - NVML_BRAND_COUNT -} nvmlBrandType_t; - -/** - * Temperature thresholds. - */ -typedef enum nvmlTemperatureThresholds_enum -{ - NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down - // for HW protection - NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown - NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown - NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock - // Keep this last - NVML_TEMPERATURE_THRESHOLD_COUNT -} nvmlTemperatureThresholds_t; - -/** - * Temperature sensors. - */ -typedef enum nvmlTemperatureSensors_enum -{ - NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die - - // Keep this last - NVML_TEMPERATURE_COUNT -} nvmlTemperatureSensors_t; - -/** - * Compute mode. - * - * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. - * Earlier CUDA versions supported a single exclusive mode, - * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
- */ -typedef enum nvmlComputeMode_enum -{ - NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed - NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - - // Keep this last - NVML_COMPUTEMODE_COUNT -} nvmlComputeMode_t; - -/** - * ECC bit types. - * - * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type - */ -#define nvmlEccBitType_t nvmlMemoryErrorType_t - -/** - * Single bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED - */ -#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED - -/** - * Double bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED - */ -#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED - -/** - * Memory error types - */ -typedef enum nvmlMemoryErrorType_enum -{ - /** - * A memory error that was corrected - * - * For ECC errors, these are single bit errors - * For Texture memory, these are errors fixed by resend - */ - NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, - /** - * A memory error that was not corrected - * - * For ECC errors, these are double bit errors - * For Texture memory, these are errors where the resend fails - */ - NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, - - - // Keep this last - NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types - -} nvmlMemoryErrorType_t; - -/** - * ECC counter types. - * - * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. - * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver - * client active (e.g. X11), then Linux also sees per-boot behavior. 
If not, volatile counts are reset each time a compute app - * is run. - */ -typedef enum nvmlEccCounterType_enum -{ - NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. - NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) - - // Keep this last - NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types -} nvmlEccCounterType_t; - -/** - * Clock types. - * - * All speeds are in Mhz. - */ -typedef enum nvmlClockType_enum -{ - NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain - NVML_CLOCK_SM = 1, //!< SM clock domain - NVML_CLOCK_MEM = 2, //!< Memory clock domain - NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain - - // Keep this last - NVML_CLOCK_COUNT //!< Count of clock types -} nvmlClockType_t; - -/** - * Clock Ids. These are used in combination with nvmlClockType_t - * to specify a single clock value. - */ -typedef enum nvmlClockId_enum -{ - NVML_CLOCK_ID_CURRENT = 0, //!< Current actual clock value - NVML_CLOCK_ID_APP_CLOCK_TARGET = 1, //!< Target application clock - NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2, //!< Default application clock target - NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3, //!< OEM-defined maximum clock rate - - //Keep this last - NVML_CLOCK_ID_COUNT //!< Count of Clock Ids. -} nvmlClockId_t; - -/** - * Driver models. - * - * Windows only. - */ -typedef enum nvmlDriverModel_enum -{ - NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device - NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device -} nvmlDriverModel_t; - -/** - * Allowed PStates. 
- */ -typedef enum nvmlPStates_enum -{ - NVML_PSTATE_0 = 0, //!< Performance state 0 -- Maximum Performance - NVML_PSTATE_1 = 1, //!< Performance state 1 - NVML_PSTATE_2 = 2, //!< Performance state 2 - NVML_PSTATE_3 = 3, //!< Performance state 3 - NVML_PSTATE_4 = 4, //!< Performance state 4 - NVML_PSTATE_5 = 5, //!< Performance state 5 - NVML_PSTATE_6 = 6, //!< Performance state 6 - NVML_PSTATE_7 = 7, //!< Performance state 7 - NVML_PSTATE_8 = 8, //!< Performance state 8 - NVML_PSTATE_9 = 9, //!< Performance state 9 - NVML_PSTATE_10 = 10, //!< Performance state 10 - NVML_PSTATE_11 = 11, //!< Performance state 11 - NVML_PSTATE_12 = 12, //!< Performance state 12 - NVML_PSTATE_13 = 13, //!< Performance state 13 - NVML_PSTATE_14 = 14, //!< Performance state 14 - NVML_PSTATE_15 = 15, //!< Performance state 15 -- Minimum Performance - NVML_PSTATE_UNKNOWN = 32 //!< Unknown performance state -} nvmlPstates_t; - -/** - * GPU Operation Mode - * - * GOM allows to reduce power usage and optimize GPU throughput by disabling GPU features. - * - * Each GOM is designed to meet specific user needs. - */ -typedef enum nvmlGom_enum -{ - NVML_GOM_ALL_ON = 0, //!< Everything is enabled and running at full speed - - NVML_GOM_COMPUTE = 1, //!< Designed for running only compute tasks. Graphics operations - //!< are not allowed - - NVML_GOM_LOW_DP = 2 //!< Designed for running graphics applications that don't require - //!< high bandwidth double precision -} nvmlGpuOperationMode_t; - -/** - * Available infoROM objects. - */ -typedef enum nvmlInforomObject_enum -{ - NVML_INFOROM_OEM = 0, //!< An object defined by OEM - NVML_INFOROM_ECC = 1, //!< The ECC object determining the level of ECC support - NVML_INFOROM_POWER = 2, //!< The power management object - - // Keep this last - NVML_INFOROM_COUNT //!< This counts the number of infoROM objects the driver knows about -} nvmlInforomObject_t; - -/** - * Return values for NVML API calls. 
- */ -typedef enum nvmlReturn_enum -{ - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use - NVML_ERROR_MEMORY = 20, //!< Insufficient memory - NVML_ERROR_NO_DATA = 21, //!usedGpuMemory is not supported - - - unsigned long long time; //!< Amount of time in ms during which the compute context was active. 
The time is reported as 0 if - //!< the process is not terminated - - unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process - - unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) - - unsigned int reserved[5]; //!< Reserved for future use -} nvmlAccountingStats_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuConstants Vgpu Constants - * @{ - */ -/***************************************************************************************************/ - -/** - * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense - */ -#define NVML_GRID_LICENSE_BUFFER_SIZE 128 - -#define NVML_VGPU_NAME_BUFFER_SIZE 64 - -#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 - -/*! - * Macros for pGPU's virtualization capabilities bitfield. - */ -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuEnum Vgpu Enum - * @{ - */ -/***************************************************************************************************/ - -/*! - * Types of VM identifiers - */ -typedef enum nvmlVgpuVmIdType { - NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID - NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID -} nvmlVgpuVmIdType_t; - -/** - * vGPU GUEST info state. 
- */ -typedef enum nvmlVgpuGuestInfoState_enum -{ - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //!< Guest-dependent fields uninitialized - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1, //!< Guest-dependent fields initialized -} nvmlVgpuGuestInfoState_t; - -/** - * GRID license feature code - */ -typedef enum { - NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1, //!< Virtual GPU - NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 //!< Virtual Workstation -} nvmlGridLicenseFeatureCode_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuStructs Vgpu Structs - * @{ - */ -/***************************************************************************************************/ - -typedef unsigned int nvmlVgpuTypeId_t; - -typedef unsigned int nvmlVgpuInstance_t; - -/** - * Structure to store Utilization Value and vgpuInstance - */ -typedef struct nvmlVgpuInstanceUtilizationSample_st -{ - nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t smUtil; //!< SM (3D/Compute) Util Value - nvmlValue_t memUtil; //!< Frame Buffer Memory Util Value - nvmlValue_t encUtil; //!< Encoder Util Value - nvmlValue_t decUtil; //!< Decoder Util Value -} nvmlVgpuInstanceUtilizationSample_t; - -/** - * Structure to store Utilization Value, vgpuInstance and subprocess information - */ -typedef struct nvmlVgpuProcessUtilizationSample_st -{ - nvmlVgpuInstance_t vgpuInstance; //!< vGPU Instance - unsigned int pid; //!< PID of process running within the vGPU VM - char processName[NVML_VGPU_NAME_BUFFER_SIZE]; //!< Name of process running within the vGPU VM - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - unsigned int smUtil; //!< SM (3D/Compute) Util Value - unsigned int memUtil; //!< Frame Buffer Memory Util Value - unsigned int encUtil; //!< Encoder Util Value - unsigned int decUtil; //!< Decoder Util Value 
-} nvmlVgpuProcessUtilizationSample_t; - -/** - * Structure to store utilization value and process Id - */ -typedef struct nvmlProcessUtilizationSample_st -{ - unsigned int pid; //!< PID of process - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - unsigned int smUtil; //!< SM (3D/Compute) Util Value - unsigned int memUtil; //!< Frame Buffer Memory Util Value - unsigned int encUtil; //!< Encoder Util Value - unsigned int decUtil; //!< Decoder Util Value -} nvmlProcessUtilizationSample_t; - -/** - * Structure containing GRID licensable feature information - */ -typedef struct nvmlGridLicensableFeature_st -{ - nvmlGridLicenseFeatureCode_t featureCode; //!< Licensed feature code - unsigned int featureState; //!< Non-zero if feature is currently licensed, otherwise zero - char licenseInfo[NVML_GRID_LICENSE_BUFFER_SIZE]; -} nvmlGridLicensableFeature_t; - -/** - * Structure to store GRID licensable features - */ -typedef struct nvmlGridLicensableFeatures_st -{ - int isGridLicenseSupported; //!< Non-zero if GRID Software Licensing is supported on the system, otherwise zero - unsigned int licensableFeaturesCount; //!< Entries returned in \a gridLicensableFeatures array - nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of GRID licensable features. 
-} nvmlGridLicensableFeatures_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEncoderStructs Encoder Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Represents type of encoder for capacity can be queried - */ -typedef enum nvmlEncoderQueryType_enum -{ - NVML_ENCODER_QUERY_H264 = 0, //!< H264 encoder - NVML_ENCODER_QUERY_HEVC = 1, //!< HEVC encoder -}nvmlEncoderType_t; - -/** - * Structure to hold encoder session data - */ -typedef struct nvmlEncoderSessionInfo_st -{ - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) - nvmlEncoderType_t codecType; //!< Video encoder type - unsigned int hResolution; //!< Current encode horizontal resolution - unsigned int vResolution; //!< Current encode vertical resolution - unsigned int averageFps; //!< Moving average encode frames per second - unsigned int averageLatency; //!< Moving average encode latency in microseconds -}nvmlEncoderSessionInfo_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFBCStructs Frame Buffer Capture Structures -* @{ -*/ -/***************************************************************************************************/ - -/** - * Represents frame buffer capture session type - */ -typedef enum nvmlFBCSessionType_enum -{ - NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknwon - NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys - NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda - NVML_FBC_SESSION_TYPE_VID, //!< Vid - NVML_FBC_SESSION_TYPE_HWENC, //!< HEnc -} nvmlFBCSessionType_t; - -/** - * Structure to hold frame buffer capture sessions stats - */ -typedef struct nvmlFBCStats_st -{ - unsigned int sessionsCount; //!< Total 
no of sessions - unsigned int averageFPS; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} nvmlFBCStats_t; - -#define NVML_NVFBC_SESSION_FLAG_DIFFMAP_ENABLED 0x00000001 //!< Bit specifying differential map state. -#define NVML_NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED 0x00000002 //!< Bit specifying classification map state. -#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT 0x00000004 //!< Bit specifying if capture was requested as non-blocking call. -#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE 0x00000008 //!< Bit specifying if capture was requested as blocking call. -#define NVML_NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT 0x00000010 //!< Bit specifying if capture was requested as blocking call with timeout period. - -/** - * Structure to hold FBC session data - */ -typedef struct nvmlFBCSessionInfo_st -{ - unsigned int sessionId; //!< Unique session ID - unsigned int pid; //!< Owning process ID - nvmlVgpuInstance_t vgpuInstance; //!< Owning vGPU instance ID (only valid on vGPU hosts, otherwise zero) - unsigned int displayOrdinal; //!< Display identifier - nvmlFBCSessionType_t sessionType; //!< Type of frame buffer capture session - unsigned int sessionFlags; //!< Session flags (one or more of NVML_NVFBC_SESSION_FLAG_XXX). 
- unsigned int hMaxResolution; //!< Max horizontal resolution supported by the capture session - unsigned int vMaxResolution; //!< Max vertical resolution supported by the capture session - unsigned int hResolution; //!< Horizontal resolution requested by caller in capture call - unsigned int vResolution; //!< Vertical resolution requested by caller in capture call - unsigned int averageFPS; //!< Moving average new frames captured per second - unsigned int averageLatency; //!< Moving average new frame capture latency in microseconds -} nvmlFBCSessionInfo_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDrainDefs definitions related to the drain state - * @{ - */ -/***************************************************************************************************/ - -/** - * Is the GPU device to be removed from the kernel by nvmlDeviceRemoveGpu() - */ -typedef enum nvmlDetachGpuState_enum -{ - NVML_DETACH_GPU_KEEP = 0, - NVML_DETACH_GPU_REMOVE, -} nvmlDetachGpuState_t; - -/** - * Parent bridge PCIe link state requested by nvmlDeviceRemoveGpu() - */ -typedef enum nvmlPcieLinkState_enum -{ - NVML_PCIE_LINK_KEEP = 0, - NVML_PCIE_LINK_SHUT_DOWN, -} nvmlPcieLinkState_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup - * This chapter describes the methods that handle NVML initialization and cleanup. - * It is the user's responsibility to call \ref nvmlInit() before calling any other methods, and - * nvmlShutdown() once NVML is no longer being used. 
- * @{ - */ -/***************************************************************************************************/ - -#define NVML_INIT_FLAG_NO_GPUS 1 //!< Don't fail nvmlInit() when no GPUs are found -#define NVML_INIT_FLAG_NO_ATTACH 2 //!< Don't attach GPUs - -/** - * Initialize NVML, but don't initialize any GPUs yet. - * - * \note nvmlInit_v3 introduces a "flags" argument, that allows passing boolean values - * modifying the behaviour of nvmlInit(). - * \note In NVML 5.319 new nvmlInit_v2 has replaced nvmlInit"_v1" (default in NVML 4.304 and older) that - * did initialize all GPU devices in the system. - * - * This allows NVML to communicate with a GPU - * when other GPUs in the system are unstable or in a bad state. When using this API, GPUs are - * discovered and initialized in nvmlDeviceGetHandleBy* functions instead. - * - * \note To contrast nvmlInit_v2 with nvmlInit"_v1", NVML 4.304 nvmlInit"_v1" will fail when any detected GPU is in - * a bad or unstable state. - * - * For all products. - * - * This method, should be called once before invoking any other methods in the library. - * A reference count of the number of initializations is maintained. Shutdown only occurs - * when the reference count reaches zero. - * - * @return - * - \ref NVML_SUCCESS if NVML has been properly initialized - * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running - * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlInit(void); - -/** - * nvmlInitWithFlags is a variant of nvmlInit(), that allows passing a set of boolean values - * modifying the behaviour of nvmlInit(). - * Other than the "flags" parameter it is completely similar to \ref nvmlInit. - * - * For all products. 
- * - * @param flags behaviour modifier flags - * - * @return - * - \ref NVML_SUCCESS if NVML has been properly initialized - * - \ref NVML_ERROR_DRIVER_NOT_LOADED if NVIDIA driver is not running - * - \ref NVML_ERROR_NO_PERMISSION if NVML does not have permission to talk to the driver - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlInitWithFlags(unsigned int flags); - -/** - * Shut down NVML by releasing all GPU resources previously allocated with \ref nvmlInit(). - * - * For all products. - * - * This method should be called after NVML work is done, once for each call to \ref nvmlInit() - * A reference count of the number of initializations is maintained. Shutdown only occurs - * when the reference count reaches zero. For backwards compatibility, no error is reported if - * nvmlShutdown() is called more times than nvmlInit(). - * - * @return - * - \ref NVML_SUCCESS if NVML has been properly shut down - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlShutdown(void); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlErrorReporting Error reporting - * This chapter describes helper functions for error reporting routines. - * @{ - */ -/***************************************************************************************************/ - -/** - * Helper method for converting NVML error codes into readable strings. - * - * For all products. - * - * @param result NVML error code to convert - * - * @return String representation of the error. 
- * - */ -const DECLDIR char* nvmlErrorString(nvmlReturn_t result); -/** @} */ - - -/***************************************************************************************************/ -/** @defgroup nvmlConstants Constants - * @{ - */ -/***************************************************************************************************/ - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetInforomVersion and \ref nvmlDeviceGetInforomImageVersion - */ -#define NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE 16 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID - */ -#define NVML_DEVICE_UUID_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetBoardPartNumber - */ -#define NVML_DEVICE_PART_NUMBER_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlSystemGetDriverVersion - */ -#define NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlSystemGetNVMLVersion - */ -#define NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE 80 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName - */ -#define NVML_DEVICE_NAME_BUFFER_SIZE 64 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetSerial - */ -#define NVML_DEVICE_SERIAL_BUFFER_SIZE 30 - -/** - * Buffer size guaranteed to be large enough for \ref nvmlDeviceGetVbiosVersion - */ -#define NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE 32 - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlSystemQueries System Queries - * This chapter describes the queries that NVML can perform against the local system. These queries - * are not device-specific. - * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves the version of the system's graphics driver. - * - * For all products. 
- * - * The version identifier is an alphanumeric string. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * @param version Reference in which to return the version identifier - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetDriverVersion(char *version, unsigned int length); - -/** - * Retrieves the version of the NVML library. - * - * For all products. - * - * The version identifier is an alphanumeric string. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE. - * - * @param version Reference in which to return the version identifier - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetNVMLVersion(char *version, unsigned int length); - -/** - * Retrieves the version of the CUDA driver. - * - * For all products. - * - * The returned CUDA driver version is the same as the CUDA API - * cuDriverGetVersion() would return on the system. 
- * - * @param cudaDriverVersion Reference in which to return the version identifier - * - * @return - * - \ref NVML_SUCCESS if \a cudaDriverVersion has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cudaDriverVersion is NULL - */ -nvmlReturn_t DECLDIR nvmlSystemGetCudaDriverVersion(int *cudaDriverVersion); - -/** - * Gets name of the process with provided process id - * - * For all products. - * - * Returned process name is cropped to provided length. - * name string is encoded in ANSI. - * - * @param pid The identifier of the process - * @param name Reference in which to return the process name - * @param length The maximum allowed length of the string returned in \a name - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a name is NULL or \a length is 0. - * - \ref NVML_ERROR_NOT_FOUND if process doesn't exists - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlSystemGetProcessName(unsigned int pid, char *name, unsigned int length); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitQueries Unit Queries - * This chapter describes that queries that NVML can perform against each unit. For S-class systems only. - * In each case the device is identified with an nvmlUnit_t handle. This handle is obtained by - * calling \ref nvmlUnitGetHandleByIndex(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of units in the system. - * - * For S-class products. 
- * - * @param unitCount Reference in which to return the number of units - * - * @return - * - \ref NVML_SUCCESS if \a unitCount has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unitCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetCount(unsigned int *unitCount); - -/** - * Acquire the handle for a particular unit, based on its index. - * - * For S-class products. - * - * Valid indices are derived from the \a unitCount returned by \ref nvmlUnitGetCount(). - * For example, if \a unitCount is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1. - * - * The order in which NVML enumerates units has no guarantees of consistency between reboots. - * - * @param index The index of the target unit, >= 0 and < \a unitCount - * @param unit Reference in which to return the unit handle - * - * @return - * - \ref NVML_SUCCESS if \a unit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); - -/** - * Retrieves the static information associated with a unit. - * - * For S-class products. - * - * See \ref nvmlUnitInfo_t for details on available unit info. 
- * - * @param unit The identifier of the target unit - * @param info Reference in which to return the unit information - * - * @return - * - \ref NVML_SUCCESS if \a info has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL - */ -nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); - -/** - * Retrieves the LED state associated with this unit. - * - * For S-class products. - * - * See \ref nvmlLedState_t for details on allowed states. - * - * @param unit The identifier of the target unit - * @param state Reference in which to return the current LED state - * - * @return - * - \ref NVML_SUCCESS if \a state has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitSetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); - -/** - * Retrieves the PSU stats for the unit. - * - * For S-class products. - * - * See \ref nvmlPSUInfo_t for details on available PSU info. - * - * @param unit The identifier of the target unit - * @param psu Reference in which to return the PSU information - * - * @return - * - \ref NVML_SUCCESS if \a psu has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); - -/** - * Retrieves the temperature readings for the unit, in degrees C. 
- * - * For S-class products. - * - * Depending on the product, readings may be available for intake (type=0), - * exhaust (type=1) and board (type=2). - * - * @param unit The identifier of the target unit - * @param type The type of reading to take - * @param temp Reference in which to return the intake temperature - * - * @return - * - \ref NVML_SUCCESS if \a temp has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); - -/** - * Retrieves the fan speed readings for the unit. - * - * For S-class products. - * - * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. - * - * @param unit The identifier of the target unit - * @param fanSpeeds Reference in which to return the fan speed information - * - * @return - * - \ref NVML_SUCCESS if \a fanSpeeds has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); - -/** - * Retrieves the set of GPU devices that are attached to the specified unit. - * - * For S-class products. - * - * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
- * - * @param unit The identifier of the target unit - * @param deviceCount Reference in which to provide the \a devices array size, and - * to return the number of attached GPU devices - * @param devices Reference in which to return the references to the attached GPU devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); - -/** - * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * - * For S-class products. - * - * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. - * The HIC must be connected to an S-class system for it to be reported by this function. 
- * - * @param hwbcCount Size of hwbcEntries array - * @param hwbcEntries Array holding information about hwbc - * - * @return - * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceQueries Device Queries - * This chapter describes that queries that NVML can perform against each device. - * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by - * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(), - * \ref nvmlDeviceGetHandleByPciBusId(). or \ref nvmlDeviceGetHandleByUUID(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of compute devices in the system. A compute device is a single GPU. - * - * For all products. - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. 
- * - * @param deviceCount Reference in which to return the number of accessible devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); - -/** - * Acquire the handle for a particular device, based on its index. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or UUID. See - * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. - * - * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. 
- * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't - * need to worry about that. - * - * @param index The index of the target GPU, >= 0 and < \a accessibleDevices - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetIndex - * @see nvmlDeviceGetCount - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its board serial number. - * - * For Fermi &tm; or newer fully supported devices. - * - * This number corresponds to the value printed directly on the board, and to the value returned by - * \ref nvmlDeviceGetSerial(). - * - * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor - * of \ref nvmlDeviceGetHandleByUUID. - * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
- * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @param serial The board serial number of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one - * device has the same serial (dual GPU boards) - * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetSerial - * @see nvmlDeviceGetHandleByUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. - * - * For all products. 
- * - * @param uuid The UUID of the target GPU - * @param device Reference in which to return the device handle - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null - * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its PCI bus id. - * - * For all products. - * - * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo(). - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND - * instead of NVML_ERROR_NO_PERMISSION. 
- * - * @param pciBusId The PCI bus id of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); - -/** - * Retrieves the name of this device. - * - * For all products. - * - * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not - * exceed 64 characters in length (including the NULL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param name Reference in which to return the product name - * @param length The maximum allowed length of the string returned in \a name - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); - -/** - * Retrieves the brand of this device. - * - * For all products. - * - * The type is a member of \ref nvmlBrandType_t defined above. - * - * @param device The identifier of the target device - * @param type Reference in which to return the product brand type - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); - -/** - * Retrieves the NVML index of this device. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * @param device The identifier of the target device - * @param index Reference in which to return the NVML index of the device - * - * @return - * - \ref NVML_SUCCESS if \a index has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetHandleByIndex() - * @see nvmlDeviceGetCount() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); - -/** - * Retrieves the globally unique board serial number associated with this device's board. - * - * For all products with an inforom. - * - * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). - * This number matches the serial number tag that is physically attached to the board. See \ref - * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param serial Reference in which to return the board/module serial number - * @param length The maximum allowed length of the string returned in \a serial - * - * @return - * - \ref NVML_SUCCESS if \a serial has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); - -/** - * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device - * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, - * result[0] = 0x3, result[1] = 0x3 - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. 
- * - * @param device The identifier of the target device - * @param cpuSetSize The size of the cpuSet array that is safe to access - * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per - * unsigned long on 64-bit machines, 32 on 32-bit machines - * - * @return - * - \ref NVML_SUCCESS if \a cpuAffinity has been filled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); - -/** - * Sets the ideal affinity for the calling thread and device using the guidelines - * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. - * Older versions set the affinity for a calling process and all children. - * Currently supports up to 64 processors. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully bound - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); - -/** - * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version - * 8.0 as older versions cleared the affinity for a calling process and all children. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully unbound - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); - -/** - * Retrieve the common ancestor for two devices - * For all products. - * Supported on Linux only. - * - * @param device1 The identifier of the first device - * @param device2 The identifier of the second device - * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type - * - * @return - * - \ref NVML_SUCCESS if \a pathInfo has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); - -/** - * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level - * For all products. - * Supported on Linux only. - * - * @param device The identifier of the first device - * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found at \a level - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the set of GPUs that have a CPU affinity with the given CPU number - * For all products. - * Supported on Linux only. - * - * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the status for a given p2p capability index between a given pair of GPU - * - * @param device1 The first device - * @param device2 The second device - * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 - * @param p2pStatus Reference in which to return the status of the \a p2pIndex - * between \a device1 and \a device2 - * @return - * - \ref NVML_SUCCESS if \a p2pStatus has been populated - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); - -/** - * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, - * that augments the immutable, board serial identifier. - * - * For all products. - * - * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. - * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param uuid Reference in which to return the GPU UUID - * @param length The maximum allowed length of the string returned in \a uuid - * - * @return - * - \ref NVML_SUCCESS if \a uuid has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); - -/** - * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for - * each GPU will have the form /dev/nvidia[minor number]. - * - * For all products. - * Supported only for Linux - * - * @param device The identifier of the target device - * @param minorNumber Reference in which to return the minor number for the device - * @return - * - \ref NVML_SUCCESS if the minor number is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); - -/** - * Retrieves the the device board part number which is programmed into the board's InfoROM - * - * For all products. 
- * - * @param device Identifier of the target device - * @param partNumber Reference to the buffer to return - * @param length Length of the buffer reference - * - * @return - * - \ref NVML_SUCCESS if \a partNumber has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a serial is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); - -/** - * Retrieves the version information for the device's infoROM object. - * - * For all products with an inforom. - * - * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate - * ECC counts. The version of the data structures in this memory may change from time to time. It will not - * exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. - * - * See \ref nvmlInforomObject_t for details on the available infoROM objects. 
- * - * @param device The identifier of the target device - * @param object The target infoROM object - * @param version Reference in which to return the infoROM version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomImageVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); - -/** - * Retrieves the global infoROM image version - * - * For all products with an inforom. - * - * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board - * in contrast to infoROM object version which is only an indicator of supported features. - * Version string will not exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference in which to return the infoROM image version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Retrieves the checksum of the configuration stored in the device's infoROM. - * - * For all products with an inforom. - * - * Can be used to make sure that two GPUs have the exact same configuration. - * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. - * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) - * - * @param device The identifier of the target device - * @param checksum Reference in which to return the infoROM configuration checksum - * - * @return - * - \ref NVML_SUCCESS if \a checksum has been set - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); - -/** - * Reads the infoROM from the flash and verifies the checksums. - * - * For all products with an inforom. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if infoROM is not corrupted - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); - -/** - * Retrieves the display mode for the device. - * - * For all products. - * - * This method indicates whether a physical display (e.g. monitor) is currently connected to - * any of the device's connectors. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param display Reference in which to return the display mode - * - * @return - * - \ref NVML_SUCCESS if \a display has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); - -/** - * Retrieves the display active state for the device. - * - * For all products. - * - * This method indicates whether a display is initialized on the device. - * For example whether X Server is attached to this device and has allocated memory for the screen. - * - * Display can be active even when no monitor is physically attached. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param isActive Reference in which to return the display active state - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); - -/** - * Retrieves the persistence mode associated with this device. - * - * For all products. - * For Linux only. 
- * - * When driver persistence mode is enabled the driver software state is not torn down when the last - * client disconnects. By default this feature is disabled. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current driver persistence mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the PCI attributes of this device. - * - * For all products. - * - * See \ref nvmlPciInfo_t for details on the available PCI info. - * - * @param device The identifier of the target device - * @param pci Reference in which to return the PCI info - * - * @return - * - \ref NVML_SUCCESS if \a pci has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); - -/** - * Retrieves the maximum PCIe link generation possible with this device and system - * - * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will - * report is generation 1. 
- * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkGen Reference in which to return the max PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); - -/** - * Retrieves the maximum PCIe link width possible with this device and system - * - * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report - * a max link width of 8. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkWidth Reference in which to return the max PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); - -/** - * Retrieves the current PCIe link generation - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param currLinkGen Reference in which to return the current PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a currLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); - -/** - * Retrieves the current PCIe link width - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param currLinkWidth Reference in which to return the current PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a currLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); - -/** - * Retrieve PCIe utilization information. - * This function is querying a byte counter over a 20ms interval and thus is the - * PCIe throughput over that interval. - * - * For Maxwell &tm; or newer fully supported devices. - * - * This method is not supported in virtual machines running virtual GPU (vGPU). 
- * - * @param device The identifier of the target device - * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t - * @param value Reference in which to return throughput in KB/s - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); - -/** - * Retrieve the PCIe replay counter. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param value Reference in which to return the counter's value - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); - -/** - * Retrieves the current clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information. 
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the maximum clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information. - * - * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks - * by few MHz. 
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. - * Can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the default applications clock that GPU boots with or - * defaults to after \ref nvmlDeviceResetApplicationsClocks call. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the default clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * \see nvmlDeviceGetApplicationsClock - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value an be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks their default behavior ofautomatically boosting above - * base clocks as thermal limits allow. - * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. 
- * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); - -/** - * Retrieves the clock speed for the clock specified by the clock type and clock ID. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockId Identify which clock in the domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); - -/** - * Retrieves the customer defined maximum boost clock speed specified by the given clock type. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of - * required elements) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedGraphicsClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param memoryClockMHz Memory clock for which to return possible graphics clocks - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clocks in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedMemoryClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. - * - * On Pascal and newer hardware, Auto Aoosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. 
- * - * @param device The identifier of the target device - * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device - * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will - * revert to when no applications are using the GPU - * - * @return - * - \ref NVML_SUCCESS If \a isEnabled has been been set with the Auto Boosted clocks state of \a device - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); - -/** - * Try to set the current state of Auto Boosted clocks on a device. - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * Non-root users may use this API by default but can be restricted by root from using this API by calling - * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. - * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. 
- * - * @param device The identifier of the target device - * @param enabled What state to try to set Auto Boosted clocks of the target device to - * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); - -/** - * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will - * return to when no compute running processes (e.g. CUDA application which have an active context) are running - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. - * - * @param device The identifier of the target device - * @param enabled What state to try to set default Auto Boosted clocks of the target device to - * @param flags Flags that change the default behavior. Currently Unused. 
- * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); - - -/** - * Retrieves the intended operating speed of the device's fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%. - * - * @param device The identifier of the target device - * @param speed Reference in which to return the fan speed percentage - * - * @return - * - \ref NVML_SUCCESS if \a speed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); - -/** - * Retrieves the current temperature readings for the device, in degrees C. - * - * For all products. 
- * - * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. - * - * @param device The identifier of the target device - * @param sensorType Flag that indicates which sensor reading to retrieve - * @param temp Reference in which to return the temperature reading - * - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); - -/** - * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
- * - * @param device The identifier of the target device - * @param thresholdType The type of threshold value queried - * @param temp Reference in which to return the temperature reading - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); - -/** - * Retrieves the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * Retrieves current clocks throttling reasons. - * - * For all fully supported products. - * - * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
- * - * @param device The identifier of the target device - * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle - * reasons - * - * @return - * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetSupportedClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); - -/** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by - * \ref nvmlDeviceGetCurrentClocksThrottleReasons - * - * For all fully supported products. - * - * This method is not supported in virtual machines running virtual GPU (vGPU). 
- * - * @param device The identifier of the target device - * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported - * clocks throttle reasons - * - * @return - * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetCurrentClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); - -/** - * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. - * - * Retrieve the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * This API has been deprecated. - * - * Retrieves the power management mode associated with this device. 
- * - * For products from the Fermi family. - * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. - * - * For from the Kepler or newer families. - * - Does not require \a NVML_INFOROM_POWER object. - * - * This flag indicates whether any power management algorithm is currently active on the device. An - * enabled state does not necessarily mean the device is being actively throttled -- only that - * that the driver will do so if the appropriate conditions are met. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current power management mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the power management limit associated with this device. - * - * For Fermi &tm; or newer fully supported devices. - * - * The power limit defines the upper boundary for the card's power draw. If - * the card's total power draw reaches this limit the power management algorithm kicks in. - * - * This reading is only available if power management mode is supported. - * See \ref nvmlDeviceGetPowerManagementMode. 
- * - * @param device The identifier of the target device - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves information about possible values of power management limits on this device. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param minLimit Reference in which to return the minimum power management limit in milliwatts - * @param maxLimit Reference in which to return the maximum power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPowerManagementLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); - -/** - * Retrieves default power management limit on this device, in milliwatts. 
- * Default power management limit is a power management limit that the device boots with. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param defaultLimit Reference in which to return the default power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a defaultLimit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); - -/** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) - * - * For Fermi &tm; or newer fully supported devices. - * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. - * - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
- * - * @param device The identifier of the target device - * @param power Reference in which to return the power usage information - * - * @return - * - \ref NVML_SUCCESS if \a power has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); - -/** - * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded - * - * For newer than Pascal &tm; fully supported devices. - * - * @param device The identifier of the target device - * @param energy Reference in which to return the energy consumption information - * - * @return - * - \ref NVML_SUCCESS if \a energy has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); - -/** - * Get the effective power limit that the driver enforces after taking into account all limiters - * - * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere - * This includes the out of band power limit interface - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The device to communicate with - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current GOM - * @param pending Reference in which to return the pending GOM - * - * @return - * - \ref NVML_SUCCESS if \a mode has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceSetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); - -/** - * Retrieves the amount of used, free and total memory available on the device, in bytes. - * - * For all products. - * - * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. - * Under WDDM most device memory is allocated and managed on startup by Windows. - * - * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated - * by all active channels on the device. - * - * See \ref nvmlMemory_t for details on available memory info. 
- * - * @param device The identifier of the target device - * @param memory Reference in which to return the memory information - * - * @return - * - \ref NVML_SUCCESS if \a memory has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); - -/** - * Retrieves the current compute mode for the device. - * - * For all products. - * - * See \ref nvmlComputeMode_t for details on allowed compute modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current compute mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); - -/** - * Retrieves the CUDA compute capability of the device. - * - * For all products. - * - * Returns the major and minor compute capability version numbers of the - * device. The major and minor versions are equivalent to the - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be - * returned by CUDA's cuDeviceGetAttribute(). 
- * - * @param device The identifier of the target device - * @param major Reference in which to return the major CUDA compute capability - * @param minor Reference in which to return the minor CUDA compute capability - * - * @return - * - \ref NVML_SUCCESS if \a major and \a minor have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); - -/** - * Retrieves the current and pending ECC modes for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * - * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following - * the next reboot. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current ECC mode - * @param pending Reference in which to return the pending ECC mode - * - * @return - * - \ref NVML_SUCCESS if \a current and \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); - -/** - * Retrieves the device boardId from 0-N. - * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with - * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. - * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across - * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and - * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will - * always return those values but they will always be different from each other). - * - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param boardId Reference in which to return the device's board ID - * - * @return - * - \ref NVML_SUCCESS if \a boardId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); - -/** - * Retrieves whether the device is on a Multi-GPU Board - * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param multiGpuBool Reference in which to return a zero or non-zero value - * to indicate whether the device is on a multi GPU board - * - * @return - * - \ref NVML_SUCCESS if \a multiGpuBool has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); - -/** - * Retrieves the total ECC error counts for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires ECC Mode to be enabled. 
- * - * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of - * errors across the entire device. - * - * See \ref nvmlMemoryErrorType_t for a description of available error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); - -/** - * Retrieves the detailed ECC error counts for the device. - * - * @deprecated This API supports only a fixed set of ECC error locations - * On different GPU architectures different locations are supported - * See \ref nvmlDeviceGetMemoryErrorCounter - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. - * Requires ECC Mode to be enabled. 
- * - * Detailed errors provide separate ECC counts for specific parts of the memory system. - * - * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. - * - * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); - -/** - * Retrieves the requested memory error counter for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. - * - * Only applicable to devices with ECC. - * - * Requires ECC Mode to be enabled. 
- * - * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of error. - * @param counterType Flag that specifies the counter-type of the errors. - * @param locationType Specifies the location of the counter. - * @param count Reference in which to return the ECC counter - * - * @return - * - \ref NVML_SUCCESS if \a count has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a bitTyp,e \a counterType or \a locationType is - * invalid, or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, - nvmlEccCounterType_t counterType, - nvmlMemoryLocation_t locationType, unsigned long long *count); - -/** - * Retrieves the current utilization rates for the device's major subsystems. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlUtilization_t for details on available utilization rates. - * - * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. - * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. 
- * - * @param device The identifier of the target device - * @param utilization Reference in which to return the utilization information - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Encoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for encoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** - * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param encoderQueryType Type of encoder to query - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encodeQueryType - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); - -/** - * Retrieves the current encoder statistics for a given device. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, - * or \a averageLatency is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about active encoder sessions on a target device. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The - * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param sessionCount Reference to caller supplied array size, and returns the number of sessions. 
- * @param sessionInfos Reference in which to return the session information - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfos is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Decoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for decoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** -* Retrieves the active frame buffer capture sessions statistics for a given device. -* -* For Maxwell &tm; or newer fully supported devices. 
-* -* @param device The identifier of the target device -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats -* -* @return -* - \ref NVML_SUCCESS if \a fbcStats is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL -* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats); - -/** -* Retrieves information about active frame buffer capture sessions on a target device. -* -* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The -* array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions -* written to the buffer. -* -* If the supplied buffer is not large enough to accomodate the active session array, the function returns -* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. -* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return -* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. -* -* For Maxwell &tm; or newer fully supported devices. -* -* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may -* be zero if there are no new frames captured since the session started. -* -* @param device The identifier of the target device -* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. 
-* @param sessionInfo Reference in which to return the session information -* -* @return -* - \ref NVML_SUCCESS if \a sessionInfo is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. -* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); - -/** - * Retrieves the current and pending driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. - * - * See \ref nvmlDriverModel_t for details on available driver models. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current driver model - * @param pending Reference in which to return the pending driver model - * - * @return - * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); - -/** - * Get VBIOS version of the device. - * - * For all products. - * - * The VBIOS version may change from time to time. It will not exceed 32 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference to which to return the VBIOS version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Get Bridge Chip Information for all the bridge chips on the board. - * - * For all fully supported products. - * Only applicable to multi-GPU products. - * - * @param device The identifier of the target device - * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy - * - * @return - * - \ref NVML_SUCCESS if bridge chip exists - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); - -/** - * Get information about processes with a compute context on a device - * - * For Fermi &tm; or newer fully supported devices. - * - * This function returns information only about compute running processes (e.g. CUDA application which have - * active context). Any graphics applications (e.g. 
using OpenGL, DirectX) won't be listed by this function. - * - * To query the current number of running compute processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new compute processes are spawned. - * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Get information about processes with a graphics context on a device - * - * For Kepler &tm; or newer fully supported devices. - * - * This function returns information only about graphics based processes - * (eg. 
applications using OpenGL, DirectX) - * - * To query the current number of running graphics processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new graphics processes are spawned. - * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Check if the GPU devices are on the same physical board. - * - * For all fully supported products. - * - * @param device1 The first GPU device - * @param device2 The second GPU device - * @param onSameBoard Reference in which to return the status. - * Non-zero indicates that the GPUs are on the same board. 
- * - * @return - * - \ref NVML_SUCCESS if \a onSameBoard has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a dev1 or \a dev2 are invalid or \a onSameBoard is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the either GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); - -/** - * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. - * - * For all fully supported products. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted Reference in which to return the current restriction - * NVML_FEATURE_ENABLED indicates that the API is root-only - * NVML_FEATURE_DISABLED indicates that the API is accessible to all users - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support - * the feature that is being queried (E.G. 
Enabling/disabling Auto Boosted clocks is - * not supported by the device) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); - -/** - * Gets recent samples for the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by - * the driver. - * - * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. - * - * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. - * The returned samplesCount will provide the number of samples that can be queried. The user needs to - * allocate the buffer with size as samplesCount * sizeof(nvmlSample_t). - * - * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the - * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the date of the previous query - * to get more recent samples. - * - * This method fetches the number of entries which can be accommodated in the provided samples array, and the - * reference samplesCount is updated to indicate how many samples were actually retrieved. The advantage of using this - * method for samples in contrast to polling via existing methods is to get get higher frequency data at lower polling cost. - * - * @param device The identifier for the target device - * @param type Type of sampling event - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t - * @param sampleCount Reference to provide the number of elements which can be queried in samples array - * @param samples Reference in which samples are returned - - * @return - * - \ref NVML_SUCCESS if samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a samplesCount is NULL or - * reference to \a sampleCount is 0 for non null \a samples - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); - -/** - * Gets Total, Available and Used size of BAR1 memory. - * - * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party - * devices (peer-to-peer on the PCIE bus). - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param bar1Memory Reference in which BAR1 memory - * information is returned. 
- * - * @return - * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); - - -/** - * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power - * or thermal constraints. - * - * The method is important to users who are tying to understand if their GPUs throttle at any point during their applications. The - * difference in violation times at two different reference times gives the indication of GPU throttling event. - * - * Violation for thermal capping is not supported at this time. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param perfPolicyType Represents Performance policy which can trigger GPU throttling - * @param violTime Reference to which violation time related information is returned - * - * - * @return - * - \ref NVML_SUCCESS if violation time is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Queries the state of per process accounting mode. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlDeviceGetAccountingStats for more details. - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Queries process's accounting stats. - * - * For Kepler &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. 
- * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. - * - * @param device The identifier of the target device - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL - * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); - -/** - * Queries list of processes that can be queried for accounting stats. 
The list of processes returned - * can be in running or terminated state. - * - * For Kepler &tm; or newer fully supported devices. - * - * To just query the number of processes ready to be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlDeviceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. - * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to - * expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); - -/** - * Returns the number of processes that the circular buffer with accounting pids can hold. - * - * For Kepler &tm; or newer fully supported devices. - * - * This is the maximum number of processes that accounting information will be stored for before information - * about oldest processes will get overwritten by information about new processes. 
- * - * @param device The identifier of the target device - * @param bufferSize Reference in which to provide the size (in number of elements) - * of the circular buffer for accounting stats. - * - * @return - * - \ref NVML_SUCCESS if buffer size was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingStats - * @see nvmlDeviceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); - -/** @} */ - -/** @addtogroup nvmlDeviceQueries - * @{ - */ - -/** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * - * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses); - -/** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's - * retirement. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2 - * - * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps); - -/** - * Check if any pages are pending retirement and need a reboot to fully retire. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param isPending Reference in which to return the pending status - * - * @return - * - \ref NVML_SUCCESS if \a isPending was populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitCommands Unit Commands - * This chapter describes NVML operations that change the state of the unit. For S-class products. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Set the LED state for the unit. The LED can be either green (0) or amber (1). - * - * For S-class products. - * Requires root/admin permissions. - * - * This operation takes effect immediately. - * - * - * Current S-Class products don't provide unique LEDs for each unit. As such, both front - * and back LEDs will be toggled in unison regardless of which unit is specified with this command. - * - * See \ref nvmlLedColor_t for available colors. - * - * @param unit The identifier of the target unit - * @param color The target LED color - * - * @return - * - \ref NVML_SUCCESS if the LED color has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitGetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceCommands Device Commands - * This chapter describes NVML operations that change the state of the device. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the persistence mode for the device. - * - * For all products. - * For Linux only. - * Requires root/admin permissions. 
- * - * The persistence mode determines whether the GPU driver software is torn down after the last client - * exits. - * - * This operation takes effect immediately. It is not persistent across reboots. After each reboot the - * persistence mode is reset to "Disabled". - * - * See \ref nvmlEnableState_t for available modes. - * - * @param device The identifier of the target device - * @param mode The target persistence mode - * - * @return - * - \ref NVML_SUCCESS if the persistence mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Set the compute mode for the device. - * - * For all products. - * Requires root/admin permissions. - * - * The compute mode determines whether a GPU can be used for compute operations and whether it can - * be shared across contexts. - * - * This operation takes effect immediately. Under Linux it is not persistent across reboots and - * always resets to "Default". Under windows it is persistent. - * - * Under windows compute mode may only be set to DEFAULT when running in WDDM - * - * See \ref nvmlComputeMode_t for details on available compute modes. 
- * - * @param device The identifier of the target device - * @param mode The target compute mode - * - * @return - * - \ref NVML_SUCCESS if the compute mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); - -/** - * Set the ECC mode for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires root/admin permissions. - * - * The ECC mode determines whether the GPU enables its ECC support. - * - * This operation takes effect after the next reboot. - * - * See \ref nvmlEnableState_t for details on available modes. 
- * - * @param device The identifier of the target device - * @param ecc The target ECC mode - * - * @return - * - \ref NVML_SUCCESS if the ECC mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); - -/** - * Clear the ECC error and other memory error counts for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. - * Requires root/admin permissions. - * Requires ECC Mode to be enabled. - * - * Sets all of the specified ECC counters to 0, including both detailed and total counts. - * - * This operation takes effect immediately. - * - * See \ref nvmlMemoryErrorType_t for details on available counter types. - * - * @param device The identifier of the target device - * @param counterType Flag that indicates which type of errors should be cleared. 
- * - * @return - * - \ref NVML_SUCCESS if the error counts were cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see - * - nvmlDeviceGetDetailedEccErrors() - * - nvmlDeviceGetTotalEccErrors() - */ -nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); - -/** - * Set the driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * Requires root/admin permissions. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. - * - * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). - * This should only be done if the host is subsequently powered down and the display is detached from the device - * before the next reboot. - * - * This operation takes effect after the next reboot. - * - * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. - * - * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or - * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. - * - * See \ref nvmlDriverModel_t for details on available driver models. 
- * See \ref nvmlFlagDefault and \ref nvmlFlagForce - * - * @param device The identifier of the target device - * @param driverModel The target driver model - * @param flags Flags that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if the driver model has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); - -/** - * Set clocks that device will lock to. - * - * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz. - * Setting this will supercede application clock values and take effect regardless if a cuda app is running. - * See /ref nvmlDeviceSetApplicationsClocks - * - * Can be used as a setting to request constant performance. - * - * Requires root/admin permissions. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetGpuLockedClocks. - * - * For newer than Pascal &tm; fully supported devices. 
- * - * @param device The identifier of the target device - * @param minGpuClockMHz Requested minimum gpu clock in MHz - * @param maxGpuClockMHz Requested maximum gpu clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz); - -/** - * Resets the gpu clock to the default value - * - * This is the gpu clock that will be used after system reboot or driver reload. - * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * @see nvmlDeviceSetGpuLockedClocks - * - * For newer than Pascal &tm; fully supported devices. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); - -/** - * Set clocks that applications will lock to. 
- * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). - * - * Can be used as a setting to request constant performance. - * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. - * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. 
- * - * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); - -/** - * Set new power limit of this device. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. - * - * \note Limit is not persistent across reboots or driver unloads. - * Enable persistent mode to prevent driver from unloading when no application is using the device. 
- * - * @param device The identifier of the target device - * @param limit Power management limit in milliwatts to set - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is out of range - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPowerManagementLimitConstraints - * @see nvmlDeviceGetPowerManagementDefaultLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); - -/** - * Sets new GOM. See \a nvmlGpuOperationMode_t for details. - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. - * Requires root/admin permissions. - * - * Changing GOMs requires a reboot. - * The reboot requirement might be removed in the future. - * - * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when - * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. 
- * - * @param device The identifier of the target device - * @param mode Target GOM - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceGetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); - -/** - * Changes the root/admin restructions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. - * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction - * to query the current restriction settings. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted The target restriction - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support - * the feature that api restrictions are being set for (E.G. 
Enabling/disabling auto - * boosted clocks is not supported by the device) - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Enables or disables per process accounting. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @note This setting is not persistent and will default to disabled after driver unloads. - * Enable persistence mode to be sure the setting doesn't switch off to disabled. - * - * @note Enabling accounting mode has no negative impact on the GPU performance. - * - * @note Disabling accounting clears all accounting pids information. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceClearAccountingPids - * - * @param device The identifier of the target device - * @param mode The target accounting mode - * - * @return - * - \ref NVML_SUCCESS if the new mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Clears accounting information about all processes that have already terminated. 
- * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup NvLink NvLink Methods - * This chapter describes methods that NVML can perform on NVLINK enabled devices. - * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves the state of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that - * the link is active and NVML_FEATURE_DISABLED indicates it - * is inactive - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); - -/** - * Retrieves the version of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param version Requested NvLink version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); - -/** - * Retrieves the requested capability from the device's NvLink for the link specified - * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried - * The return value should be treated as a boolean. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried - * @param capResult A boolean for the queried capability indicating that feature is available - * - * @return - * - \ref NVML_SUCCESS if \a capResult has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); - -/** - * Retrieves the PCI information for the remote node on a NvLink link - * Note: pciSubSystemId is not filled in this function and is indeterminate - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param pci \a nvmlPciInfo_t of the remote node for the specified link - * - * @return - * - \ref NVML_SUCCESS if \a pci has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); - -/** - * Retrieves the specified error counter value - * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the NvLink counter to be queried - * @param counterValue Returned counter value - * - * @return - * - \ref NVML_SUCCESS if \a counter has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, - nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); - -/** - * Resets all error counters to zero - * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * - * @return - * - \ref NVML_SUCCESS if the reset is successful - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); - -/** - * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset - * of the counters if the reset parameter is non-zero. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set - * @param reset Resets the counters on set if non-zero - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control, unsigned int reset); - -/** - * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). 
- * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control); - - -/** - * Retrieve the NVLINK utilization counter based on the current control for a specified counter. - * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl - * before reading the utilization counters as they have no default state - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be read (0 or 1). 
- * @param rxcounter Receive counter return value - * @param txcounter Transmit counter return value - * - * @return - * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, - unsigned long long *rxcounter, unsigned long long *txcounter); - -/** - * Freeze the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be frozen (0 or 1). 
- * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters - * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters - * - * @return - * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, - unsigned int counter, nvmlEnableState_t freeze); - -/** - * Reset the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be reset - * @param counter Specifies the counter that should be reset (0 or 1) - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEvents Event Handling Methods - * This chapter describes methods that NVML can perform against each device to register and wait for - * some event to occur. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Create an empty set of events. - * Event set should be freed by \ref nvmlEventSetFree - * - * For Fermi &tm; or newer fully supported devices. - * @param set Reference in which to return the event handle - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); - -/** - * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t - * - * For Fermi &tm; or newer fully supported devices. - * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) - * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) - * - * For Linux only. - * - * \b IMPORTANT: Operations on \a set are not thread safe - * - * This call starts recording of events on specific device. - * All events that occurred before this call are not recorded. - * Checking if some event occurred can be done with \ref nvmlEventSetWait - * - * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. - * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes - * are registered in that case. 
- * - * @param device The identifier of the target device - * @param eventTypes Bitmask of \ref nvmlEventType to record - * @param set Set to which add new event types - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceGetSupportedEventTypes - * @see nvmlEventSetWait - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); - -/** - * Returns information about events supported on device - * - * For Fermi &tm; or newer fully supported devices. - * - * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. - * - * @param device The identifier of the target device - * @param eventTypes Reference in which to return bitmask of supported events - * - * @return - * - \ref NVML_SUCCESS if the eventTypes has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventType is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); - -/** - * Waits on events and delivers events - * - * For Fermi &tm; or newer fully supported devices. 
- * - * If some events are ready to be delivered at the time of the call, function returns immediately. - * If there are no events ready to be delivered, function sleeps till event arrives - * but not longer than specified timeout. This function in certain conditions can return before - * specified timeout passes (e.g. when interrupt arrives) - * - * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple - * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all - * xid error events. - * - * @param set Reference to set of events to wait on - * @param data Reference in which to return event data - * @param timeoutms Maximum amount of wait time in milliseconds for registered event - * - * @return - * - \ref NVML_SUCCESS if the data has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL - * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived - * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); - -/** - * Releases events in the set - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param set Reference to events to be released - * - * @return - * - \ref NVML_SUCCESS if the event has been successfully released - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlZPI Drain states - * This chapter describes methods that NVML can perform against each device to control their drain state - * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to - * power on/off GPUs, enable robust reset scenarios, etc. - * @{ - */ -/***************************************************************************************************/ - -/** - * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. - * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before - * this call is made. - * Must be called as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. 
- * - * @param pciInfo The PCI address of the GPU drain state to be modified - * @param newState The drain state that should be entered, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a newState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); - -/** - * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining - * state. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU drain state to be queried - * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex or \a currentState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); - -/** - * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver - * as long as no other processes are attached. 
If other processes are attached, this call will return - * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the - * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called - * to initiate the draining state is if that process was using, and is still using, a GPU before the - * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled - * prior to this call. - * - * For long-running NVML processes please note that this will change the enumeration of current GPUs. - * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. - * Also, device handles after the removed GPU will not be valid and must be re-established. - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU to be removed - * @param gpuState Whether the GPU is to be removed, from the OS - * see \ref nvmlDetachGpuState_t - * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed - */ -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); - -/** - * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that - * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. 
- * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes - * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. - * - * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds - * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. - * - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device - * fields are used in this call. - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature - * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFieldValueQueries Field Value Queries - * This chapter describes NVML operations that are associated with retrieving Field Values from NVML - * @{ - */ -/***************************************************************************************************/ - -/** - * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. 
- * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs - * will be populated from a single call rather than making a driver call for each fieldId. - * - * @param device The device handle of the GPU to request field values for - * @param valuesCount Number of entries in values that should be retrieved - * @param values Array of \a valuesCount structures to hold field values. - * Each value's fieldId must be populated prior to this call - * - * @return - * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must - * check the nvmlReturn field of each value for each individual - * status - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridQueries Grid Queries - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to get the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
- * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridCommands Grid Commands - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to set the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? - * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. - * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. 
- */ -nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpu vGPU Management - * @{ - * - * Set of APIs supporting GRID vGPU - */ -/***************************************************************************************************/ - -/** - * Retrieve the supported vGPU types on a physical GPU (device). - * - * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported. 
- * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the currently creatable vGPU types on a physical GPU (device). - * - * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types - * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable - * list will be restricted to whatever vGPU type is already running on the device. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. 
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeClass Pointer to string array to return class in - * @param size Size of string - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); - -/** - * Retrieve the vGPU type name. - * - * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not - * exceed 64 characters in length (including the NUL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeName Pointer to buffer to return name - * @param size Size of buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); - -/** - * Retrieve the device ID of a vGPU type. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value - * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); - -/** - * Retrieve the vGPU framebuffer size in bytes. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param fbSize Pointer to framebuffer size in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); - -/** - * Retrieve count of vGPU's supported display heads. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param numDisplayHeads Pointer to number of display heads - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); - -/** - * Retrieve vGPU display head's maximum supported resolution. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param displayIndex Zero-based index of display head - * @param xdim Pointer to maximum number of pixels in X dimension - * @param ydim Pointer to maximum number of pixels in Y dimension - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex - * is out of range. 
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); - -/** - * Retrieve license requirements for a vGPU type - * - * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form - * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, - * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". - * - * The total length of the returned string will not exceed 128 characters, including the NUL terminator. - * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeLicenseString Pointer to buffer to return license info - * @param size Size of \a vgpuTypeLicenseString buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); - -/** - * Retrieve the static frame rate limit value of the vGPU type - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param frameRateLimit Reference to return the frame rate limit value - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); - -/** - * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCount Pointer to get the max number of vGPU instances - * that can be created on a deicve for given vgpuTypeId - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, - * or \a vgpuInstanceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); - -/** - * Retrieve the active vGPU instances on a device. - * - * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The - * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances - * written to the buffer. 
- * - * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. - * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return - * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer which passes in the array size as well as get - * back the number of types - * @param vgpuInstances Pointer to array in which to return list of vGPU instances - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); - -/** - * Retrieve the VM ID associated with a vGPU instance. - * - * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vmId Pointer to caller-supplied buffer to hold VM ID - * @param size Size of buffer in bytes - * @param vmIdType Pointer to hold VM ID type - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); - -/** - * Retrieve the UUID of a vGPU instance. - * - * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); - -/** - * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. - * - * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version - * string will not exceed 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is - * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the - * NVIDIA driver is loaded and initialized. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param version Caller-supplied buffer to return driver version string - * @param length Size of \a version buffer - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); - -/** - * Retrieve the framebuffer usage in bytes. - * - * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance The identifier of the target instance - * @param fbUsage Pointer to framebuffer usage in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); - -/** - * Retrieve the current licensing state of the vGPU instance. - * - * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param licensed Reference to return the licensing status - * - * @return - * - \ref NVML_SUCCESS if \a licensed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); - -/** - * Retrieve the vGPU type of a vGPU instance. - * - * Returns the vGPU type ID of vgpu assigned to the vGPU instance. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vgpuTypeId Reference to return the vgpuTypeId - * - * @return - * - \ref NVML_SUCCESS if \a vgpuTypeId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); - -/** - * Retrieve the frame rate limit set for the vGPU instance. - * - * Returns the value of the frame rate limit set for the vGPU instance - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param frameRateLimit Reference to return the frame rate limit - * - * @return - * - \ref NVML_SUCCESS if \a frameRateLimit has been set - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); - -/** - * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); - -/** - * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Unsigned int for the encoder capacity value - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0 - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); - -/** - * Retrieves current utilization for vGPUs on a physical GPU (device). - * - * For Kepler &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running - * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer - * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the - * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values - * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to - * indicate the returned value type. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate - * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). 
Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample - * structures that were actually written. This may differ from a previously read value as vGPU instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values - * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is - * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref 
NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, - nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); - -/** - * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on - * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the - * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running - * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which - * the samples were recorded. Individual utilization values are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size - * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample - * structures that were actually written. 
This may differ from a previously read value depending on the number of processes that are active - * in any given sample period. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is - * passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - unsigned int *vgpuProcessSamplesCount, - nvmlVgpuProcessUtilizationSample_t *utilizationSamples); -/** - * Retrieve the GRID licensable features. 
- * - * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. - * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned - * - * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); - -/** - * Retrieves the current encoder statistics of a vGPU Instance - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL - * or \a vgpuInstance is 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about all active encoder sessions on a vGPU Instance. 
- * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The - * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to caller supplied array size, and returns - * the number of sessions. - * @param sessionInfo Reference to caller supplied array in which the list - * of session information us returned. - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfo is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is - returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); - -/** -* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance -* -* For Maxwell &tm; or newer fully supported devices. 
-* -* @param vgpuInstance Identifier of the target vGPU instance -* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats -* -* @return -* - \ref NVML_SUCCESS if \a fbcStats is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL -* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats); - -/** -* Retrieves information about active frame buffer capture sessions on a vGPU Instance. -* -* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The -* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions -* written to the buffer. -* -* If the supplied buffer is not large enough to accomodate the active session array, the function returns -* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount. -* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return -* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount. -* -* For Maxwell &tm; or newer fully supported devices. -* -* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may -* be zero if there are no new frames captured since the session started. -* -* @param vgpuInstance Identifier of the target vGPU instance -* @param sessionCount Reference to caller supplied array size, and returns the number of sessions. 
-* @param sessionInfo Reference in which to return the session information -* -* @return -* - \ref NVML_SUCCESS if \a sessionInfo is fetched -* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized -* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL. -* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system -* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount -* - \ref NVML_ERROR_UNKNOWN on any unexpected error -*/ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo); - -/** - * Retrieves the current utilization and process ID - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values - * are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. 
- * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. - - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); - -/** - * Queries the state of per process accounting mode on vGPU. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance The identifier of the target vGPU VM - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode); - -/** - * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes - * returned can be in running or terminated state. - * - * For Maxwell &tm; or newer fully supported devices. - * - * To just query the maximum number of processes that can be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlVgpuInstanceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. 
- * - * @param vgpuInstance The identifier of the target vGPU VM - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlVgpuInstanceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); - -/** - * Queries process's accounting stats. - * - * For Maxwell &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and - * can be queried during life time of the process or after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. 
Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @param vgpuInstance The identifier of the target vGPU VM - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * or \a stats is not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvml vGPU Migration - * This chapter describes NVML operations that are associated with vGPU Migration. - * @{ - */ -/***************************************************************************************************/ - -/** - * vGPU metadata structure. 
- */ -typedef struct nvmlVgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields - char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host - unsigned int reserved[8]; //!< Reserved for internal use - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuMetadata_t; - -/** - * Physical GPU metadata structure - */ -typedef struct nvmlVgpuPgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version - unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld - unsigned int reserved[7]; //!< Reserved for internal use - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuPgpuMetadata_t; - -/** - * vGPU VM compatibility codes - */ -typedef enum nvmlVgpuVmCompatibility_enum -{ - NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable - NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) - NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) - NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) - NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused (ACPI S0) -} nvmlVgpuVmCompatibility_t; - -/** - * vGPU-pGPU compatibility limit codes - */ -typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum -{ - NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< 
Compatibility is not limited. - NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. - NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor. -} nvmlVgpuPgpuCompatibilityLimitCode_t; - -/** - * vGPU-pGPU compatibility structure - */ -typedef struct nvmlVgpuPgpuCompatibility_st -{ - nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t - nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t -} nvmlVgpuPgpuCompatibility_t; - -/** - * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM - * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section - * containing internal state. - * - * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are - * dependent on information obtained from the guest VM, which may not yet have reached a state where that information - * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field. - * - * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide - * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. - * - * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. 
If the vGPU Metadata structure - * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. - * - * @param vgpuInstance vGPU instance handle - * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written - * @param bufferSize Size of vgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); - -/** - * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about - * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section - * containing internal state. - * - * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata - * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. 
- * - * @param device The identifier of the target device - * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written - * @param bufferSize Pointer to size of \a pgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS GPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); - -/** - * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a - * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the - * physical GPU. - * - * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The - * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility - * with the physical GPU is limited, a limit code indicates the factor limiting compability. - * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). - * - * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to - * boot a given vGPU or associated VM. 
- * - * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure - * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure - * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGpuBlacklistQueries GPU Blacklist Queries - * This chapter describes NVML operations that are associated with blacklisted GPUs. - * @{ - */ -/***************************************************************************************************/ - -/** - * Blacklist GPU device information - **/ -typedef struct nvmlBlacklistDeviceInfo_st -{ - nvmlPciInfo_t pciInfo; //!< The PCI information for the blacklisted GPU - char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the blacklisted GPU -} nvmlBlacklistDeviceInfo_t; - - /** - * Retrieves the number of blacklisted GPU devices in the system. - * - * For all products. - * - * @param deviceCount Reference in which to return the number of blacklisted devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - */ -nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceCount(unsigned int *deviceCount); - -/** - * Acquire the device information for a blacklisted device, based on its index. - * - * For all products. - * - * Valid indices are derived from the \a deviceCount returned by - * \ref nvmlGetBlacklistDeviceCount(). 
For example, if \a deviceCount is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * @param index The index of the target GPU, >= 0 and < \a deviceCount - * @param info Reference in which to return the device information - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL - * - * @see nvmlGetBlacklistDeviceCount - */ -nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceInfoByIndex(unsigned int index, nvmlBlacklistDeviceInfo_t *info); - -/** @} */ - -/** - * NVML API versioning support - */ -#if defined(__NVML_API_VERSION_INTERNAL) -#undef nvmlDeviceRemoveGpu -#undef nvmlDeviceGetNvLinkRemotePciInfo -#undef nvmlDeviceGetPciInfo -#undef nvmlDeviceGetCount -#undef nvmlDeviceGetHandleByIndex -#undef nvmlDeviceGetHandleByPciBusId -#undef nvmlInit -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/include/triton/external/half.hpp b/include/triton/external/half.hpp deleted file mode 100644 index 625cce7cb..000000000 --- a/include/triton/external/half.hpp +++ /dev/null @@ -1,3067 +0,0 @@ -// half - IEEE 754-based half-precision floating point library. -// -// Copyright (c) 2012-2017 Christian Rau -// -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, -// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -// Version 1.12.0 - -/// \file -/// Main header file for half precision functionality. - -#ifndef HALF_HALF_HPP -#define HALF_HALF_HPP - -/// Combined gcc version number. -#define HALF_GNUC_VERSION (__GNUC__*100+__GNUC_MINOR__) - -//check C++11 language features -#if defined(__clang__) //clang - #if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) - #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 - #endif - #if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR) - #define HALF_ENABLE_CPP11_CONSTEXPR 1 - #endif - #if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT) - #define HALF_ENABLE_CPP11_NOEXCEPT 1 - #endif - #if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS) - #define HALF_ENABLE_CPP11_USER_LITERALS 1 - #endif - #if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG) - #define HALF_ENABLE_CPP11_LONG_LONG 1 - #endif -/*#elif defined(__INTEL_COMPILER) //Intel C++ - #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ???????? - #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 - #endif - #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ???????? - #define HALF_ENABLE_CPP11_CONSTEXPR 1 - #endif - #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ???????? - #define HALF_ENABLE_CPP11_NOEXCEPT 1 - #endif - #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ???????? 
- #define HALF_ENABLE_CPP11_LONG_LONG 1 - #endif*/ -#elif defined(__GNUC__) //gcc - #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L - #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) - #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 - #endif - #if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) - #define HALF_ENABLE_CPP11_CONSTEXPR 1 - #endif - #if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) - #define HALF_ENABLE_CPP11_NOEXCEPT 1 - #endif - #if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) - #define HALF_ENABLE_CPP11_USER_LITERALS 1 - #endif - #if !defined(HALF_ENABLE_CPP11_LONG_LONG) - #define HALF_ENABLE_CPP11_LONG_LONG 1 - #endif - #endif -#elif defined(_MSC_VER) //Visual C++ - #if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) - #define HALF_ENABLE_CPP11_CONSTEXPR 1 - #endif - #if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) - #define HALF_ENABLE_CPP11_NOEXCEPT 1 - #endif - #if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS) - #define HALF_ENABLE_CPP11_USER_LITERALS 1 - #endif - #if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) - #define HALF_ENABLE_CPP11_STATIC_ASSERT 1 - #endif - #if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG) - #define HALF_ENABLE_CPP11_LONG_LONG 1 - #endif - #define HALF_POP_WARNINGS 1 - #pragma warning(push) - #pragma warning(disable : 4099 4127 4146) //struct vs class, constant in if, negative unsigned -#endif - -//check C++11 library features -#include -#if defined(_LIBCPP_VERSION) //libc++ - #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 - #ifndef HALF_ENABLE_CPP11_TYPE_TRAITS - #define HALF_ENABLE_CPP11_TYPE_TRAITS 1 - #endif - #ifndef HALF_ENABLE_CPP11_CSTDINT - #define HALF_ENABLE_CPP11_CSTDINT 1 - #endif - #ifndef HALF_ENABLE_CPP11_CMATH - #define HALF_ENABLE_CPP11_CMATH 1 - #endif - #ifndef HALF_ENABLE_CPP11_HASH - #define HALF_ENABLE_CPP11_HASH 1 - 
#endif - #endif -#elif defined(__GLIBCXX__) //libstdc++ - #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103 - #ifdef __clang__ - #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS) - #define HALF_ENABLE_CPP11_TYPE_TRAITS 1 - #endif - #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT) - #define HALF_ENABLE_CPP11_CSTDINT 1 - #endif - #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH) - #define HALF_ENABLE_CPP11_CMATH 1 - #endif - #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH) - #define HALF_ENABLE_CPP11_HASH 1 - #endif - #else - #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT) - #define HALF_ENABLE_CPP11_CSTDINT 1 - #endif - #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH) - #define HALF_ENABLE_CPP11_CMATH 1 - #endif - #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH) - #define HALF_ENABLE_CPP11_HASH 1 - #endif - #endif - #endif -#elif defined(_CPPLIB_VER) //Dinkumware/Visual C++ - #if _CPPLIB_VER >= 520 - #ifndef HALF_ENABLE_CPP11_TYPE_TRAITS - #define HALF_ENABLE_CPP11_TYPE_TRAITS 1 - #endif - #ifndef HALF_ENABLE_CPP11_CSTDINT - #define HALF_ENABLE_CPP11_CSTDINT 1 - #endif - #ifndef HALF_ENABLE_CPP11_HASH - #define HALF_ENABLE_CPP11_HASH 1 - #endif - #endif - #if _CPPLIB_VER >= 610 - #ifndef HALF_ENABLE_CPP11_CMATH - #define HALF_ENABLE_CPP11_CMATH 1 - #endif - #endif -#endif -#undef HALF_GNUC_VERSION - -//support constexpr -#if HALF_ENABLE_CPP11_CONSTEXPR - #define HALF_CONSTEXPR constexpr - #define HALF_CONSTEXPR_CONST constexpr -#else - #define HALF_CONSTEXPR - #define HALF_CONSTEXPR_CONST const -#endif - -//support noexcept -#if HALF_ENABLE_CPP11_NOEXCEPT - #define HALF_NOEXCEPT noexcept - #define HALF_NOTHROW noexcept -#else - #define HALF_NOEXCEPT - #define HALF_NOTHROW throw() -#endif - -#include -#include -#include -#include -#include -#include -#if HALF_ENABLE_CPP11_TYPE_TRAITS - #include -#endif -#if HALF_ENABLE_CPP11_CSTDINT 
- #include -#endif -#if HALF_ENABLE_CPP11_HASH - #include -#endif - - -/// Default rounding mode. -/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as well as -/// for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including half.hpp) to one -/// of the standard rounding modes using their respective constants or the equivalent values of `std::float_round_style`: -/// -/// `std::float_round_style` | value | rounding -/// ---------------------------------|-------|------------------------- -/// `std::round_indeterminate` | -1 | fastest (default) -/// `std::round_toward_zero` | 0 | toward zero -/// `std::round_to_nearest` | 1 | to nearest -/// `std::round_toward_infinity` | 2 | toward positive infinity -/// `std::round_toward_neg_infinity` | 3 | toward negative infinity -/// -/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with overflows -/// set to infinity) and is the fastest rounding mode possible. It can even be set to `std::numeric_limits::round_style` -/// to synchronize the rounding mode with that of the underlying single-precision implementation. -#ifndef HALF_ROUND_STYLE - #define HALF_ROUND_STYLE -1 // = std::round_indeterminate -#endif - -/// Tie-breaking behaviour for round to nearest. -/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value. By default this is -/// defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way cases (and -/// thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more IEEE-conformant -/// behaviour is needed. -#ifndef HALF_ROUND_TIES_TO_EVEN - #define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero -#endif - -/// Value signaling overflow. 
-/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow of an -/// operation, in particular it just evaluates to positive infinity. -#define HUGE_VALH std::numeric_limits::infinity() - -/// Fast half-precision fma function. -/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate -/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all -/// arithmetic operations, this is in fact always the case. -#define FP_FAST_FMAH 1 - -#ifndef FP_ILOGB0 - #define FP_ILOGB0 INT_MIN -#endif -#ifndef FP_ILOGBNAN - #define FP_ILOGBNAN INT_MAX -#endif -#ifndef FP_SUBNORMAL - #define FP_SUBNORMAL 0 -#endif -#ifndef FP_ZERO - #define FP_ZERO 1 -#endif -#ifndef FP_NAN - #define FP_NAN 2 -#endif -#ifndef FP_INFINITE - #define FP_INFINITE 3 -#endif -#ifndef FP_NORMAL - #define FP_NORMAL 4 -#endif - - -/// Main namespace for half precision functionality. -/// This namespace contains all the functionality provided by the library. -namespace half_float -{ - class half; - -#if HALF_ENABLE_CPP11_USER_LITERALS - /// Library-defined half-precision literals. - /// Import this namespace to enable half-precision floating point literals: - /// ~~~~{.cpp} - /// using namespace half_float::literal; - /// half_float::half = 4.2_h; - /// ~~~~ - namespace literal - { - half operator""_h(long double); - } -#endif - - /// \internal - /// \brief Implementation details. - namespace detail - { - #if HALF_ENABLE_CPP11_TYPE_TRAITS - /// Conditional type. - template struct conditional : std::conditional {}; - - /// Helper for tag dispatching. - template struct bool_type : std::integral_constant {}; - using std::true_type; - using std::false_type; - - /// Type traits for floating point types. - template struct is_float : std::is_floating_point {}; - #else - /// Conditional type. 
- template struct conditional { typedef T type; }; - template struct conditional { typedef F type; }; - - /// Helper for tag dispatching. - template struct bool_type {}; - typedef bool_type true_type; - typedef bool_type false_type; - - /// Type traits for floating point types. - template struct is_float : false_type {}; - template struct is_float : is_float {}; - template struct is_float : is_float {}; - template struct is_float : is_float {}; - template<> struct is_float : true_type {}; - template<> struct is_float : true_type {}; - template<> struct is_float : true_type {}; - #endif - - /// Type traits for floating point bits. - template struct bits { typedef unsigned char type; }; - template struct bits : bits {}; - template struct bits : bits {}; - template struct bits : bits {}; - - #if HALF_ENABLE_CPP11_CSTDINT - /// Unsigned integer of (at least) 16 bits width. - typedef std::uint_least16_t uint16; - - /// Unsigned integer of (at least) 32 bits width. - template<> struct bits { typedef std::uint_least32_t type; }; - - /// Unsigned integer of (at least) 64 bits width. - template<> struct bits { typedef std::uint_least64_t type; }; - #else - /// Unsigned integer of (at least) 16 bits width. - typedef unsigned short uint16; - - /// Unsigned integer of (at least) 32 bits width. - template<> struct bits : conditional::digits>=32,unsigned int,unsigned long> {}; - - #if HALF_ENABLE_CPP11_LONG_LONG - /// Unsigned integer of (at least) 64 bits width. - template<> struct bits : conditional::digits>=64,unsigned long,unsigned long long> {}; - #else - /// Unsigned integer of (at least) 64 bits width. - template<> struct bits { typedef unsigned long type; }; - #endif - #endif - - /// Tag type for binary construction. - struct binary_t {}; - - /// Tag for binary construction. - HALF_CONSTEXPR_CONST binary_t binary = binary_t(); - - /// Temporary half-precision expression. 
- /// This class represents a half-precision expression which just stores a single-precision value internally. - struct expr - { - /// Conversion constructor. - /// \param f single-precision value to convert - explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {} - - /// Conversion to single-precision. - /// \return single precision value representing expression value - HALF_CONSTEXPR operator float() const HALF_NOEXCEPT { return value_; } - - private: - /// Internal expression value stored in single-precision. - float value_; - }; - - /// SFINAE helper for generic half-precision functions. - /// This class template has to be specialized for each valid combination of argument types to provide a corresponding - /// `type` member equivalent to \a T. - /// \tparam T type to return - template struct enable {}; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - template struct enable { typedef T type; }; - - /// Return type for specialized generic 2-argument half-precision functions. - /// This class template has to be specialized for each valid combination of argument types to provide a corresponding - /// `type` member denoting the appropriate return type. - /// \tparam T first argument type - /// \tparam U first argument type - template struct result : enable {}; - template<> struct result { typedef half type; }; - - /// \name Classification helpers - /// \{ - - /// Check for infinity. 
- /// \tparam T argument type (builtin floating point type) - /// \param arg value to query - /// \retval true if infinity - /// \retval false else - template bool builtin_isinf(T arg) - { - #if HALF_ENABLE_CPP11_CMATH - return std::isinf(arg); - #elif defined(_MSC_VER) - return !::_finite(static_cast(arg)) && !::_isnan(static_cast(arg)); - #else - return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); - #endif - } - - /// Check for NaN. - /// \tparam T argument type (builtin floating point type) - /// \param arg value to query - /// \retval true if not a number - /// \retval false else - template bool builtin_isnan(T arg) - { - #if HALF_ENABLE_CPP11_CMATH - return std::isnan(arg); - #elif defined(_MSC_VER) - return ::_isnan(static_cast(arg)) != 0; - #else - return arg != arg; - #endif - } - - /// Check sign. - /// \tparam T argument type (builtin floating point type) - /// \param arg value to query - /// \retval true if signbit set - /// \retval false else - template bool builtin_signbit(T arg) - { - #if HALF_ENABLE_CPP11_CMATH - return std::signbit(arg); - #else - return arg < T() || (arg == T() && T(1)/arg < T()); - #endif - } - - /// \} - /// \name Conversion - /// \{ - - /// Convert IEEE single-precision to half-precision. - /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \param value single-precision value - /// \return binary representation of half-precision value - template uint16 float2half_impl(float value, true_type) - { - typedef bits::type uint32; - uint32 bits;// = *reinterpret_cast(&value); //violating strict aliasing! 
- std::memcpy(&bits, &value, sizeof(float)); -/* uint16 hbits = (bits>>16) & 0x8000; - bits &= 0x7FFFFFFF; - int exp = bits >> 23; - if(exp == 255) - return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); - if(exp > 142) - { - if(R == std::round_toward_infinity) - return hbits | 0x7C00 - (hbits>>15); - if(R == std::round_toward_neg_infinity) - return hbits | 0x7BFF + (hbits>>15); - return hbits | 0x7BFF + (R!=std::round_toward_zero); - } - int g, s; - if(exp > 112) - { - g = (bits>>12) & 1; - s = (bits&0xFFF) != 0; - hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); - } - else if(exp > 101) - { - int i = 125 - exp; - bits = (bits&0x7FFFFF) | 0x800000; - g = (bits>>i) & 1; - s = (bits&((1L<> (i+1); - } - else - { - g = 0; - s = bits != 0; - } - if(R == std::round_to_nearest) - #if HALF_ROUND_TIES_TO_EVEN - hbits += g & (s|hbits); - #else - hbits += g; - #endif - else if(R == std::round_toward_infinity) - hbits += ~(hbits>>15) & (s|g); - else if(R == std::round_toward_neg_infinity) - hbits += (hbits>>15) & (g|s); -*/ static const uint16 base_table[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, - 0x0200, 
0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, - 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 
0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, - 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, - 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, - 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; - static const unsigned char shift_table[512] = { - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; - uint16 hbits = base_table[bits>>23] + static_cast((bits&0x7FFFFF)>>shift_table[bits>>23]); - if(R == std::round_to_nearest) - hbits += (((bits&0x7FFFFF)>>(shift_table[bits>>23]-1))|(((bits>>23)&0xFF)==102)) & ((hbits&0x7C00)!=0x7C00) - #if HALF_ROUND_TIES_TO_EVEN - & (((((static_cast(1)<<(shift_table[bits>>23]-1))-1)&bits)!=0)|hbits) - #endif - ; - else if(R == std::round_toward_zero) - hbits -= ((hbits&0x7FFF)==0x7C00) & ~shift_table[bits>>23]; - else if(R == std::round_toward_infinity) - hbits += ((((bits&0x7FFFFF&((static_cast(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=102)& - 
((bits>>23)!=0)))&(hbits<0x7C00)) - ((hbits==0xFC00)&((bits>>23)!=511)); - else if(R == std::round_toward_neg_infinity) - hbits += ((((bits&0x7FFFFF&((static_cast(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=358)& - ((bits>>23)!=256)))&(hbits<0xFC00)&(hbits>>15)) - ((hbits==0x7C00)&((bits>>23)!=255)); - return hbits; - } - - /// Convert IEEE double-precision to half-precision. - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \param value double-precision value - /// \return binary representation of half-precision value - template uint16 float2half_impl(double value, true_type) - { - typedef bits::type uint32; - typedef bits::type uint64; - uint64 bits;// = *reinterpret_cast(&value); //violating strict aliasing! - std::memcpy(&bits, &value, sizeof(double)); - uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF; - uint16 hbits = (hi>>16) & 0x8000; - hi &= 0x7FFFFFFF; - int exp = hi >> 20; - if(exp == 2047) - return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0xFFFFFFFFFFFFF)!=0)); - if(exp > 1038) - { - if(R == std::round_toward_infinity) - return hbits | 0x7C00 - (hbits>>15); - if(R == std::round_toward_neg_infinity) - return hbits | 0x7BFF + (hbits>>15); - return hbits | 0x7BFF + (R!=std::round_toward_zero); - } - int g, s = lo != 0; - if(exp > 1008) - { - g = (hi>>9) & 1; - s |= (hi&0x1FF) != 0; - hbits |= ((exp-1008)<<10) | ((hi>>10)&0x3FF); - } - else if(exp > 997) - { - int i = 1018 - exp; - hi = (hi&0xFFFFF) | 0x100000; - g = (hi>>i) & 1; - s |= (hi&((1L<> (i+1); - } - else - { - g = 0; - s |= hi != 0; - } - if(R == std::round_to_nearest) - #if HALF_ROUND_TIES_TO_EVEN - hbits += g & (s|hbits); - #else - hbits += g; - #endif - else if(R == std::round_toward_infinity) - hbits += ~(hbits>>15) & (s|g); - else if(R == std::round_toward_neg_infinity) - hbits += (hbits>>15) & (g|s); - return hbits; - } - - /// Convert non-IEEE floating point to half-precision. 
- /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam T source type (builtin floating point type) - /// \param value floating point value - /// \return binary representation of half-precision value - template uint16 float2half_impl(T value, ...) - { - uint16 hbits = static_cast(builtin_signbit(value)) << 15; - if(value == T()) - return hbits; - if(builtin_isnan(value)) - return hbits | 0x7FFF; - if(builtin_isinf(value)) - return hbits | 0x7C00; - int exp; - std::frexp(value, &exp); - if(exp > 16) - { - if(R == std::round_toward_infinity) - return hbits | 0x7C00 - (hbits>>15); - else if(R == std::round_toward_neg_infinity) - return hbits | 0x7BFF + (hbits>>15); - return hbits | 0x7BFF + (R!=std::round_toward_zero); - } - if(exp < -13) - value = std::ldexp(value, 24); - else - { - value = std::ldexp(value, 11-exp); - hbits |= ((exp+13)<<10); - } - T ival, frac = std::modf(value, &ival); - hbits += static_cast(std::abs(static_cast(ival))); - if(R == std::round_to_nearest) - { - frac = std::abs(frac); - #if HALF_ROUND_TIES_TO_EVEN - hbits += (frac>T(0.5)) | ((frac==T(0.5))&hbits); - #else - hbits += frac >= T(0.5); - #endif - } - else if(R == std::round_toward_infinity) - hbits += frac > T(); - else if(R == std::round_toward_neg_infinity) - hbits += frac < T(); - return hbits; - } - - /// Convert floating point to half-precision. - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam T source type (builtin floating point type) - /// \param value floating point value - /// \return binary representation of half-precision value - template uint16 float2half(T value) - { - return float2half_impl(value, bool_type::is_iec559&&sizeof(typename bits::type)==sizeof(T)>()); - } - - /// Convert integer to half-precision floating point. 
- /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam S `true` if value negative, `false` else - /// \tparam T type to convert (builtin integer type) - /// \param value non-negative integral value - /// \return binary representation of half-precision value - template uint16 int2half_impl(T value) - { - #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_integral::value, "int to half conversion only supports builtin integer types"); - #endif - if(S) - value = -value; - uint16 bits = S << 15; - if(value > 0xFFFF) - { - if(R == std::round_toward_infinity) - bits |= 0x7C00 - S; - else if(R == std::round_toward_neg_infinity) - bits |= 0x7BFF + S; - else - bits |= 0x7BFF + (R!=std::round_toward_zero); - } - else if(value) - { - unsigned int m = value, exp = 24; - for(; m<0x400; m<<=1,--exp) ; - for(; m>0x7FF; m>>=1,++exp) ; - bits |= (exp<<10) + m; - if(exp > 24) - { - if(R == std::round_to_nearest) - bits += (value>>(exp-25)) & 1 - #if HALF_ROUND_TIES_TO_EVEN - & (((((1<<(exp-25))-1)&value)!=0)|bits) - #endif - ; - else if(R == std::round_toward_infinity) - bits += ((value&((1<<(exp-24))-1))!=0) & !S; - else if(R == std::round_toward_neg_infinity) - bits += ((value&((1<<(exp-24))-1))!=0) & S; - } - } - return bits; - } - - /// Convert integer to half-precision floating point. - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam T type to convert (builtin integer type) - /// \param value integral value - /// \return binary representation of half-precision value - template uint16 int2half(T value) - { - return (value<0) ? int2half_impl(value) : int2half_impl(value); - } - - /// Convert half-precision to IEEE single-precision. - /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). 
- /// \param value binary representation of half-precision value - /// \return single-precision value - inline float half2float_impl(uint16 value, float, true_type) - { - typedef bits::type uint32; -/* uint32 bits = static_cast(value&0x8000) << 16; - int abs = value & 0x7FFF; - if(abs) - { - bits |= 0x38000000 << static_cast(abs>=0x7C00); - for(; abs<0x400; abs<<=1,bits-=0x800000) ; - bits += static_cast(abs) << 13; - } -*/ static const uint32 mantissa_table[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, - 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, - 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, - 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, - 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, - 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, - 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, - 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 
0x36FA0000, 0x36FC0000, 0x36FE0000, - 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, - 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, - 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, - 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, - 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, - 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, - 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, - 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, - 0x37900000, 
0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, - 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, - 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, - 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, - 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, - 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, - 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, - 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, - 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, - 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, - 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 
0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, - 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, - 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, - 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, - 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, - 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, - 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, - 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, - 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, - 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, - 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 
0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, - 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, - 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, - 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, - 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, - 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, - 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, - 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, - 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, - 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, - 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, 
- 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, - 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, - 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, - 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, - 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, - 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, - 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, - 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, - 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, - 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, - 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 
0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, - 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, - 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, - 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, - 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, - 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, - 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, - 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, - 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, - 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 
0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, - 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, - 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, - 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, - 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, - 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, - 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, - 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, - 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 
0x381FC000, 0x381FE000, - 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, - 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, - 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, - 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, - 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, - 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, - 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, - 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, - 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, - 0x38340000, 0x38342000, 
0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, - 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, - 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, - 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, - 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, - 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, - 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, - 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, - 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, - 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 
0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, - 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, - 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, - 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, - 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, - 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, - 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, - 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, - 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 
0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, - 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, - 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, - 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, - 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, - 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, - 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, - 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, - 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, - 
0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, - 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, - 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, - 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, - 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, - 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; - static const uint32 exponent_table[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, - 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, - 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 
0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; - static const unsigned short offset_table[64] = { - 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, - 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; - uint32 bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; -// return *reinterpret_cast(&bits); //violating strict aliasing! - float out; - std::memcpy(&out, &bits, sizeof(float)); - return out; - } - - /// Convert half-precision to IEEE double-precision. - /// \param value binary representation of half-precision value - /// \return double-precision value - inline double half2float_impl(uint16 value, double, true_type) - { - typedef bits::type uint32; - typedef bits::type uint64; - uint32 hi = static_cast(value&0x8000) << 16; - int abs = value & 0x7FFF; - if(abs) - { - hi |= 0x3F000000 << static_cast(abs>=0x7C00); - for(; abs<0x400; abs<<=1,hi-=0x100000) ; - hi += static_cast(abs) << 10; - } - uint64 bits = static_cast(hi) << 32; -// return *reinterpret_cast(&bits); //violating strict aliasing! - double out; - std::memcpy(&out, &bits, sizeof(double)); - return out; - } - - /// Convert half-precision to non-IEEE floating point. - /// \tparam T type to convert to (builtin integer type) - /// \param value binary representation of half-precision value - /// \return floating point value - template T half2float_impl(uint16 value, T, ...) - { - T out; - int abs = value & 0x7FFF; - if(abs > 0x7C00) - out = std::numeric_limits::has_quiet_NaN ? 
std::numeric_limits::quiet_NaN() : T(); - else if(abs == 0x7C00) - out = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : std::numeric_limits::max(); - else if(abs > 0x3FF) - out = std::ldexp(static_cast((abs&0x3FF)|0x400), (abs>>10)-25); - else - out = std::ldexp(static_cast(abs), -24); - return (value&0x8000) ? -out : out; - } - - /// Convert half-precision to floating point. - /// \tparam T type to convert to (builtin integer type) - /// \param value binary representation of half-precision value - /// \return floating point value - template T half2float(uint16 value) - { - return half2float_impl(value, T(), bool_type::is_iec559&&sizeof(typename bits::type)==sizeof(T)>()); - } - - /// Convert half-precision floating point to integer. - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam E `true` for round to even, `false` for round away from zero - /// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits) - /// \param value binary representation of half-precision value - /// \return integral value - template T half2int_impl(uint16 value) - { - #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_integral::value, "half to int conversion only supports builtin integer types"); - #endif - unsigned int e = value & 0x7FFF; - if(e >= 0x7C00) - return (value&0x8000) ? 
std::numeric_limits::min() : std::numeric_limits::max(); - if(e < 0x3800) - { - if(R == std::round_toward_infinity) - return T(~(value>>15)&(e!=0)); - else if(R == std::round_toward_neg_infinity) - return -T(value>0x8000); - return T(); - } - unsigned int m = (value&0x3FF) | 0x400; - e >>= 10; - if(e < 25) - { - if(R == std::round_to_nearest) - m += (1<<(24-e)) - (~(m>>(25-e))&E); - else if(R == std::round_toward_infinity) - m += ((value>>15)-1) & ((1<<(25-e))-1U); - else if(R == std::round_toward_neg_infinity) - m += -(value>>15) & ((1<<(25-e))-1U); - m >>= 25 - e; - } - else - m <<= e - 25; - return (value&0x8000) ? -static_cast(m) : static_cast(m); - } - - /// Convert half-precision floating point to integer. - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits) - /// \param value binary representation of half-precision value - /// \return integral value - template T half2int(uint16 value) { return half2int_impl(value); } - - /// Convert half-precision floating point to integer using round-to-nearest-away-from-zero. - /// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits) - /// \param value binary representation of half-precision value - /// \return integral value - template T half2int_up(uint16 value) { return half2int_impl(value); } - - /// Round half-precision number to nearest integer value. 
- /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \tparam E `true` for round to even, `false` for round away from zero - /// \param value binary representation of half-precision value - /// \return half-precision bits for nearest integral value - template uint16 round_half_impl(uint16 value) - { - unsigned int e = value & 0x7FFF; - uint16 result = value; - if(e < 0x3C00) - { - result &= 0x8000; - if(R == std::round_to_nearest) - result |= 0x3C00U & -(e>=(0x3800+E)); - else if(R == std::round_toward_infinity) - result |= 0x3C00U & -(~(value>>15)&(e!=0)); - else if(R == std::round_toward_neg_infinity) - result |= 0x3C00U & -(value>0x8000); - } - else if(e < 0x6400) - { - e = 25 - (e>>10); - unsigned int mask = (1<>e)&E); - else if(R == std::round_toward_infinity) - result += mask & ((value>>15)-1); - else if(R == std::round_toward_neg_infinity) - result += mask & -(value>>15); - result &= ~mask; - } - return result; - } - - /// Round half-precision number to nearest integer value. - /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding - /// \param value binary representation of half-precision value - /// \return half-precision bits for nearest integral value - template uint16 round_half(uint16 value) { return round_half_impl(value); } - - /// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero. - /// \param value binary representation of half-precision value - /// \return half-precision bits for nearest integral value - inline uint16 round_half_up(uint16 value) { return round_half_impl(value); } - /// \} - - struct functions; - template struct unary_specialized; - template struct binary_specialized; - template struct half_caster; - } - - /// Half-precision floating point type. - /// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and - /// conversions. 
It is implicitly convertible to single-precision floating point, which makes artihmetic expressions and - /// functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic operations - /// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to - /// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic - /// expressions are kept in single-precision as long as possible (while of course still maintaining a strong half-precision type). - /// - /// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and - /// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which - /// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the - /// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not neccessarily have to be of - /// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most - /// probably not ivolve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit - /// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if - /// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on - /// nearly any reasonable platform. - /// - /// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable - /// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation. 
- class half - { - friend struct detail::functions; - friend struct detail::unary_specialized; - friend struct detail::binary_specialized; - template friend struct detail::half_caster; - friend class std::numeric_limits; - #if HALF_ENABLE_CPP11_HASH - friend struct std::hash; - #endif - #if HALF_ENABLE_CPP11_USER_LITERALS - friend half literal::operator""_h(long double); - #endif - - public: - /// Default constructor. - /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics - /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics. - HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {} - - /// Copy constructor. - /// \tparam T type of concrete half expression - /// \param rhs half expression to copy from - half(detail::expr rhs) : data_(detail::float2half(static_cast(rhs))) {} - - /// Conversion constructor. - /// \param rhs float to convert - explicit half(float rhs) : data_(detail::float2half(rhs)) {} - - /// Conversion to single-precision. - /// \return single precision value representing expression value - operator float() const { return detail::half2float(data_); } - - /// Assignment operator. - /// \tparam T type of concrete half expression - /// \param rhs half expression to copy from - /// \return reference to this half - half& operator=(detail::expr rhs) { return *this = static_cast(rhs); } - - /// Arithmetic assignment. - /// \tparam T type of concrete half expression - /// \param rhs half expression to add - /// \return reference to this half - template typename detail::enable::type operator+=(T rhs) { return *this += static_cast(rhs); } - - /// Arithmetic assignment. - /// \tparam T type of concrete half expression - /// \param rhs half expression to subtract - /// \return reference to this half - template typename detail::enable::type operator-=(T rhs) { return *this -= static_cast(rhs); } - - /// Arithmetic assignment. 
- /// \tparam T type of concrete half expression - /// \param rhs half expression to multiply with - /// \return reference to this half - template typename detail::enable::type operator*=(T rhs) { return *this *= static_cast(rhs); } - - /// Arithmetic assignment. - /// \tparam T type of concrete half expression - /// \param rhs half expression to divide by - /// \return reference to this half - template typename detail::enable::type operator/=(T rhs) { return *this /= static_cast(rhs); } - - /// Assignment operator. - /// \param rhs single-precision value to copy from - /// \return reference to this half - half& operator=(float rhs) { data_ = detail::float2half(rhs); return *this; } - - /// Arithmetic assignment. - /// \param rhs single-precision value to add - /// \return reference to this half - half& operator+=(float rhs) { data_ = detail::float2half(detail::half2float(data_)+rhs); return *this; } - - /// Arithmetic assignment. - /// \param rhs single-precision value to subtract - /// \return reference to this half - half& operator-=(float rhs) { data_ = detail::float2half(detail::half2float(data_)-rhs); return *this; } - - /// Arithmetic assignment. - /// \param rhs single-precision value to multiply with - /// \return reference to this half - half& operator*=(float rhs) { data_ = detail::float2half(detail::half2float(data_)*rhs); return *this; } - - /// Arithmetic assignment. - /// \param rhs single-precision value to divide by - /// \return reference to this half - half& operator/=(float rhs) { data_ = detail::float2half(detail::half2float(data_)/rhs); return *this; } - - /// Prefix increment. - /// \return incremented half value - half& operator++() { return *this += 1.0f; } - - /// Prefix decrement. - /// \return decremented half value - half& operator--() { return *this -= 1.0f; } - - /// Postfix increment. - /// \return non-incremented half value - half operator++(int) { half out(*this); ++*this; return out; } - - /// Postfix decrement. 
- /// \return non-decremented half value - half operator--(int) { half out(*this); --*this; return out; } - - private: - /// Rounding mode to use - static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); - - /// Constructor. - /// \param bits binary representation to set half to - HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {} - - /// Internal binary representation - detail::uint16 data_; - }; - -#if HALF_ENABLE_CPP11_USER_LITERALS - namespace literal - { - /// Half literal. - /// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due - /// to rather involved conversions. - /// \param value literal value - /// \return half with given value (if representable) - inline half operator""_h(long double value) { return half(detail::binary, detail::float2half(value)); } - } -#endif - - namespace detail - { - /// Wrapper implementing unspecialized half-precision functions. - struct functions - { - /// Addition implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision sum stored in single-precision - static expr plus(float x, float y) { return expr(x+y); } - - /// Subtraction implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision difference stored in single-precision - static expr minus(float x, float y) { return expr(x-y); } - - /// Multiplication implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision product stored in single-precision - static expr multiplies(float x, float y) { return expr(x*y); } - - /// Division implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision quotient stored in single-precision - static expr divides(float x, float y) { return expr(x/y); } - - /// Output implementation. 
- /// \param out stream to write to - /// \param arg value to write - /// \return reference to stream - template static std::basic_ostream& write(std::basic_ostream &out, float arg) { return out << arg; } - - /// Input implementation. - /// \param in stream to read from - /// \param arg half to read into - /// \return reference to stream - template static std::basic_istream& read(std::basic_istream &in, half &arg) - { - float f; - if(in >> f) - arg = f; - return in; - } - - /// Modulo implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision division remainder stored in single-precision - static expr fmod(float x, float y) { return expr(std::fmod(x, y)); } - - /// Remainder implementation. - /// \param x first operand - /// \param y second operand - /// \return Half-precision division remainder stored in single-precision - static expr remainder(float x, float y) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::remainder(x, y)); - #else - if(builtin_isnan(x) || builtin_isnan(y)) - return expr(std::numeric_limits::quiet_NaN()); - float ax = std::fabs(x), ay = std::fabs(y); - if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) - return expr(std::numeric_limits::quiet_NaN()); - if(ay >= 65536.0f) - return expr(x); - if(ax == ay) - return expr(builtin_signbit(x) ? -0.0f : 0.0f); - ax = std::fmod(ax, ay+ay); - float y2 = 0.5f * ay; - if(ax > y2) - { - ax -= ay; - if(ax >= y2) - ax -= ay; - } - return expr(builtin_signbit(x) ? -ax : ax); - #endif - } - - /// Remainder implementation. 
- /// \param x first operand - /// \param y second operand - /// \param quo address to store quotient bits at - /// \return Half-precision division remainder stored in single-precision - static expr remquo(float x, float y, int *quo) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::remquo(x, y, quo)); - #else - if(builtin_isnan(x) || builtin_isnan(y)) - return expr(std::numeric_limits::quiet_NaN()); - bool sign = builtin_signbit(x), qsign = static_cast(sign^builtin_signbit(y)); - float ax = std::fabs(x), ay = std::fabs(y); - if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) - return expr(std::numeric_limits::quiet_NaN()); - if(ay >= 65536.0f) - return expr(x); - if(ax == ay) - return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); - ax = std::fmod(ax, 8.0f*ay); - int cquo = 0; - if(ax >= 4.0f * ay) - { - ax -= 4.0f * ay; - cquo += 4; - } - if(ax >= 2.0f * ay) - { - ax -= 2.0f * ay; - cquo += 2; - } - float y2 = 0.5f * ay; - if(ax > y2) - { - ax -= ay; - ++cquo; - if(ax >= y2) - { - ax -= ay; - ++cquo; - } - } - return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); - #endif - } - - /// Positive difference implementation. - /// \param x first operand - /// \param y second operand - /// \return Positive difference stored in single-precision - static expr fdim(float x, float y) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::fdim(x, y)); - #else - return expr((x<=y) ? 0.0f : (x-y)); - #endif - } - - /// Fused multiply-add implementation. - /// \param x first operand - /// \param y second operand - /// \param z third operand - /// \return \a x * \a y + \a z stored in single-precision - static expr fma(float x, float y, float z) - { - #if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) - return expr(std::fma(x, y, z)); - #else - return expr(x*y+z); - #endif - } - - /// Get NaN. - /// \return Half-precision quiet NaN - static half nanh() { return half(binary, 0x7FFF); } - - /// Exponential implementation. 
- /// \param arg function argument - /// \return function value stored in single-preicision - static expr exp(float arg) { return expr(std::exp(arg)); } - - /// Exponential implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr expm1(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::expm1(arg)); - #else - return expr(static_cast(std::exp(static_cast(arg))-1.0)); - #endif - } - - /// Binary exponential implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr exp2(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::exp2(arg)); - #else - return expr(static_cast(std::exp(arg*0.69314718055994530941723212145818))); - #endif - } - - /// Logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log(float arg) { return expr(std::log(arg)); } - - /// Common logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log10(float arg) { return expr(std::log10(arg)); } - - /// Logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log1p(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::log1p(arg)); - #else - return expr(static_cast(std::log(1.0+arg))); - #endif - } - - /// Binary logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr log2(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::log2(arg)); - #else - return expr(static_cast(std::log(static_cast(arg))*1.4426950408889634073599246810019)); - #endif - } - - /// Square root implementation. 
- /// \param arg function argument - /// \return function value stored in single-preicision - static expr sqrt(float arg) { return expr(std::sqrt(arg)); } - - /// Cubic root implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr cbrt(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::cbrt(arg)); - #else - if(builtin_isnan(arg) || builtin_isinf(arg)) - return expr(arg); - return expr(builtin_signbit(arg) ? -static_cast(std::pow(-static_cast(arg), 1.0/3.0)) : - static_cast(std::pow(static_cast(arg), 1.0/3.0))); - #endif - } - - /// Hypotenuse implementation. - /// \param x first argument - /// \param y second argument - /// \return function value stored in single-preicision - static expr hypot(float x, float y) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::hypot(x, y)); - #else - return expr((builtin_isinf(x) || builtin_isinf(y)) ? std::numeric_limits::infinity() : - static_cast(std::sqrt(static_cast(x)*x+static_cast(y)*y))); - #endif - } - - /// Power implementation. - /// \param base value to exponentiate - /// \param exp power to expontiate to - /// \return function value stored in single-preicision - static expr pow(float base, float exp) { return expr(std::pow(base, exp)); } - - /// Sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr sin(float arg) { return expr(std::sin(arg)); } - - /// Cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr cos(float arg) { return expr(std::cos(arg)); } - - /// Tan implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr tan(float arg) { return expr(std::tan(arg)); } - - /// Arc sine implementation. 
- /// \param arg function argument - /// \return function value stored in single-preicision - static expr asin(float arg) { return expr(std::asin(arg)); } - - /// Arc cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr acos(float arg) { return expr(std::acos(arg)); } - - /// Arc tangent implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr atan(float arg) { return expr(std::atan(arg)); } - - /// Arc tangent implementation. - /// \param x first argument - /// \param y second argument - /// \return function value stored in single-preicision - static expr atan2(float x, float y) { return expr(std::atan2(x, y)); } - - /// Hyperbolic sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr sinh(float arg) { return expr(std::sinh(arg)); } - - /// Hyperbolic cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr cosh(float arg) { return expr(std::cosh(arg)); } - - /// Hyperbolic tangent implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr tanh(float arg) { return expr(std::tanh(arg)); } - - /// Hyperbolic area sine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr asinh(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::asinh(arg)); - #else - return expr((arg==-std::numeric_limits::infinity()) ? arg : static_cast(std::log(arg+std::sqrt(arg*arg+1.0)))); - #endif - } - - /// Hyperbolic area cosine implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr acosh(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::acosh(arg)); - #else - return expr((arg<-1.0f) ? 
std::numeric_limits::quiet_NaN() : static_cast(std::log(arg+std::sqrt(arg*arg-1.0)))); - #endif - } - - /// Hyperbolic area tangent implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr atanh(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::atanh(arg)); - #else - return expr(static_cast(0.5*std::log((1.0+arg)/(1.0-arg)))); - #endif - } - - /// Error function implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr erf(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::erf(arg)); - #else - return expr(static_cast(erf(static_cast(arg)))); - #endif - } - - /// Complementary implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr erfc(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::erfc(arg)); - #else - return expr(static_cast(1.0-erf(static_cast(arg)))); - #endif - } - - /// Gamma logarithm implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr lgamma(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::lgamma(arg)); - #else - if(builtin_isinf(arg)) - return expr(std::numeric_limits::infinity()); - if(arg < 0.0f) - { - float i, f = std::modf(-arg, &i); - if(f == 0.0f) - return expr(std::numeric_limits::infinity()); - return expr(static_cast(1.1447298858494001741434273513531- - std::log(std::abs(std::sin(3.1415926535897932384626433832795*f)))-lgamma(1.0-arg))); - } - return expr(static_cast(lgamma(static_cast(arg)))); - #endif - } - - /// Gamma implementation. - /// \param arg function argument - /// \return function value stored in single-preicision - static expr tgamma(float arg) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::tgamma(arg)); - #else - if(arg == 0.0f) - return builtin_signbit(arg) ? 
expr(-std::numeric_limits::infinity()) : expr(std::numeric_limits::infinity()); - if(arg < 0.0f) - { - float i, f = std::modf(-arg, &i); - if(f == 0.0f) - return expr(std::numeric_limits::quiet_NaN()); - double value = 3.1415926535897932384626433832795 / (std::sin(3.1415926535897932384626433832795*f)*std::exp(lgamma(1.0-arg))); - return expr(static_cast((std::fmod(i, 2.0f)==0.0f) ? -value : value)); - } - if(builtin_isinf(arg)) - return expr(arg); - return expr(static_cast(std::exp(lgamma(static_cast(arg))))); - #endif - } - - /// Floor implementation. - /// \param arg value to round - /// \return rounded value - static half floor(half arg) { return half(binary, round_half(arg.data_)); } - - /// Ceiling implementation. - /// \param arg value to round - /// \return rounded value - static half ceil(half arg) { return half(binary, round_half(arg.data_)); } - - /// Truncation implementation. - /// \param arg value to round - /// \return rounded value - static half trunc(half arg) { return half(binary, round_half(arg.data_)); } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static half round(half arg) { return half(binary, round_half_up(arg.data_)); } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long lround(half arg) { return detail::half2int_up(arg.data_); } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static half rint(half arg) { return half(binary, round_half(arg.data_)); } - - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long lrint(half arg) { return detail::half2int(arg.data_); } - - #if HALF_ENABLE_CPP11_LONG_LONG - /// Nearest integer implementation. - /// \param arg value to round - /// \return rounded value - static long long llround(half arg) { return detail::half2int_up(arg.data_); } - - /// Nearest integer implementation. 
- /// \param arg value to round - /// \return rounded value - static long long llrint(half arg) { return detail::half2int(arg.data_); } - #endif - - /// Decompression implementation. - /// \param arg number to decompress - /// \param exp address to store exponent at - /// \return normalized significant - static half frexp(half arg, int *exp) - { - int m = arg.data_ & 0x7FFF, e = -14; - if(m >= 0x7C00 || !m) - return *exp = 0, arg; - for(; m<0x400; m<<=1,--e) ; - return *exp = e+(m>>10), half(binary, (arg.data_&0x8000)|0x3800|(m&0x3FF)); - } - - /// Decompression implementation. - /// \param arg number to decompress - /// \param iptr address to store integer part at - /// \return fractional part - static half modf(half arg, half *iptr) - { - unsigned int e = arg.data_ & 0x7FFF; - if(e >= 0x6400) - return *iptr = arg, half(binary, arg.data_&(0x8000U|-(e>0x7C00))); - if(e < 0x3C00) - return iptr->data_ = arg.data_ & 0x8000, arg; - e >>= 10; - unsigned int mask = (1<<(25-e)) - 1, m = arg.data_ & mask; - iptr->data_ = arg.data_ & ~mask; - if(!m) - return half(binary, arg.data_&0x8000); - for(; m<0x400; m<<=1,--e) ; - return half(binary, static_cast((arg.data_&0x8000)|(e<<10)|(m&0x3FF))); - } - - /// Scaling implementation. 
- /// \param arg number to scale - /// \param exp power of two to scale by - /// \return scaled number - static half scalbln(half arg, long exp) - { - unsigned int m = arg.data_ & 0x7FFF; - if(m >= 0x7C00 || !m) - return arg; - for(; m<0x400; m<<=1,--exp) ; - exp += m >> 10; - uint16 value = arg.data_ & 0x8000; - if(exp > 30) - { - if(half::round_style == std::round_toward_zero) - value |= 0x7BFF; - else if(half::round_style == std::round_toward_infinity) - value |= 0x7C00 - (value>>15); - else if(half::round_style == std::round_toward_neg_infinity) - value |= 0x7BFF + (value>>15); - else - value |= 0x7C00; - } - else if(exp > 0) - value |= (exp<<10) | (m&0x3FF); - else if(exp > -11) - { - m = (m&0x3FF) | 0x400; - if(half::round_style == std::round_to_nearest) - { - m += 1 << -exp; - #if HALF_ROUND_TIES_TO_EVEN - m -= (m>>(1-exp)) & 1; - #endif - } - else if(half::round_style == std::round_toward_infinity) - m += ((value>>15)-1) & ((1<<(1-exp))-1U); - else if(half::round_style == std::round_toward_neg_infinity) - m += -(value>>15) & ((1<<(1-exp))-1U); - value |= m >> (1-exp); - } - else if(half::round_style == std::round_toward_infinity) - value -= (value>>15) - 1; - else if(half::round_style == std::round_toward_neg_infinity) - value += value >> 15; - return half(binary, value); - } - - /// Exponent implementation. - /// \param arg number to query - /// \return floating point exponent - static int ilogb(half arg) - { - int abs = arg.data_ & 0x7FFF; - if(!abs) - return FP_ILOGB0; - if(abs < 0x7C00) - { - int exp = (abs>>10) - 15; - if(abs < 0x400) - for(; abs<0x200; abs<<=1,--exp) ; - return exp; - } - if(abs > 0x7C00) - return FP_ILOGBNAN; - return INT_MAX; - } - - /// Exponent implementation. 
- /// \param arg number to query - /// \return floating point exponent - static half logb(half arg) - { - int abs = arg.data_ & 0x7FFF; - if(!abs) - return half(binary, 0xFC00); - if(abs < 0x7C00) - { - int exp = (abs>>10) - 15; - if(abs < 0x400) - for(; abs<0x200; abs<<=1,--exp) ; - uint16 bits = (exp<0) << 15; - if(exp) - { - unsigned int m = std::abs(exp) << 6, e = 18; - for(; m<0x400; m<<=1,--e) ; - bits |= (e<<10) + m; - } - return half(binary, bits); - } - if(abs > 0x7C00) - return arg; - return half(binary, 0x7C00); - } - - /// Enumeration implementation. - /// \param from number to increase/decrease - /// \param to direction to enumerate into - /// \return next representable number - static half nextafter(half from, half to) - { - uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; - if(fabs > 0x7C00) - return from; - if(tabs > 0x7C00 || from.data_ == to.data_ || !(fabs|tabs)) - return to; - if(!fabs) - return half(binary, (to.data_&0x8000)+1); - bool lt = ((fabs==from.data_) ? static_cast(fabs) : -static_cast(fabs)) < - ((tabs==to.data_) ? static_cast(tabs) : -static_cast(tabs)); - return half(binary, from.data_+(((from.data_>>15)^static_cast(lt))<<1)-1); - } - - /// Enumeration implementation. - /// \param from number to increase/decrease - /// \param to direction to enumerate into - /// \return next representable number - static half nexttoward(half from, long double to) - { - if(isnan(from)) - return from; - long double lfrom = static_cast(from); - if(builtin_isnan(to) || lfrom == to) - return half(static_cast(to)); - if(!(from.data_&0x7FFF)) - return half(binary, (static_cast(builtin_signbit(to))<<15)+1); - return half(binary, from.data_+(((from.data_>>15)^static_cast(lfrom0x3FF) ? ((abs>=0x7C00) ? ((abs>0x7C00) ? FP_NAN : FP_INFINITE) : FP_NORMAL) :FP_SUBNORMAL) : FP_ZERO; - } - - /// Classification implementation. 
- /// \param arg value to classify - /// \retval true if finite number - /// \retval false else - static bool isfinite(half arg) { return (arg.data_&0x7C00) != 0x7C00; } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if infinite number - /// \retval false else - static bool isinf(half arg) { return (arg.data_&0x7FFF) == 0x7C00; } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if not a number - /// \retval false else - static bool isnan(half arg) { return (arg.data_&0x7FFF) > 0x7C00; } - - /// Classification implementation. - /// \param arg value to classify - /// \retval true if normal number - /// \retval false else - static bool isnormal(half arg) { return ((arg.data_&0x7C00)!=0) & ((arg.data_&0x7C00)!=0x7C00); } - - /// Sign bit implementation. - /// \param arg value to check - /// \retval true if signed - /// \retval false if unsigned - static bool signbit(half arg) { return (arg.data_&0x8000) != 0; } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if operands equal - /// \retval false else - static bool isequal(half x, half y) { return (x.data_==y.data_ || !((x.data_|y.data_)&0x7FFF)) && !isnan(x); } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if operands not equal - /// \retval false else - static bool isnotequal(half x, half y) { return (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)) || isnan(x); } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x > \a y - /// \retval false else - static bool isgreater(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) > ((yabs==y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. 
- /// \param x first operand - /// \param y second operand - /// \retval true if \a x >= \a y - /// \retval false else - static bool isgreaterequal(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) >= ((yabs==y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x < \a y - /// \retval false else - static bool isless(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) < ((yabs==y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x <= \a y - /// \retval false else - static bool islessequal(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) <= ((yabs==y.data_) ? yabs : -yabs)); - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if either \a x > \a y nor \a x < \a y - /// \retval false else - static bool islessgreater(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - if(xabs > 0x7C00 || yabs > 0x7C00) - return false; - int a = (xabs==x.data_) ? xabs : -xabs, b = (yabs==y.data_) ? yabs : -yabs; - return a < b || a > b; - } - - /// Comparison implementation. - /// \param x first operand - /// \param y second operand - /// \retval true if operand unordered - /// \retval false else - static bool isunordered(half x, half y) { return isnan(x) || isnan(y); } - - private: - static double erf(double arg) - { - if(builtin_isinf(arg)) - return (arg<0.0) ? 
-1.0 : 1.0; - double x2 = arg * arg, ax2 = 0.147 * x2, value = std::sqrt(1.0-std::exp(-x2*(1.2732395447351626861510701069801+ax2)/(1.0+ax2))); - return builtin_signbit(arg) ? -value : value; - } - - static double lgamma(double arg) - { - double v = 1.0; - for(; arg<8.0; ++arg) v *= arg; - double w = 1.0 / (arg*arg); - return (((((((-0.02955065359477124183006535947712*w+0.00641025641025641025641025641026)*w+ - -0.00191752691752691752691752691753)*w+8.4175084175084175084175084175084e-4)*w+ - -5.952380952380952380952380952381e-4)*w+7.9365079365079365079365079365079e-4)*w+ - -0.00277777777777777777777777777778)*w+0.08333333333333333333333333333333)/arg + - 0.91893853320467274178032973640562 - std::log(v) - arg + (arg-0.5) * std::log(arg); - } - }; - - /// Wrapper for unary half-precision functions needing specialization for individual argument types. - /// \tparam T argument type - template struct unary_specialized - { - /// Negation implementation. - /// \param arg value to negate - /// \return negated value - static HALF_CONSTEXPR half negate(half arg) { return half(binary, arg.data_^0x8000); } - - /// Absolute value implementation. - /// \param arg function argument - /// \return absolute value - static half fabs(half arg) { return half(binary, arg.data_&0x7FFF); } - }; - template<> struct unary_specialized - { - static HALF_CONSTEXPR expr negate(float arg) { return expr(-arg); } - static expr fabs(float arg) { return expr(std::fabs(arg)); } - }; - - /// Wrapper for binary half-precision functions needing specialization for individual argument types. - /// \tparam T first argument type - /// \tparam U first argument type - template struct binary_specialized - { - /// Minimum implementation. 
- /// \param x first operand - /// \param y second operand - /// \return minimum value - static expr fmin(float x, float y) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::fmin(x, y)); - #else - if(builtin_isnan(x)) - return expr(y); - if(builtin_isnan(y)) - return expr(x); - return expr(std::min(x, y)); - #endif - } - - /// Maximum implementation. - /// \param x first operand - /// \param y second operand - /// \return maximum value - static expr fmax(float x, float y) - { - #if HALF_ENABLE_CPP11_CMATH - return expr(std::fmax(x, y)); - #else - if(builtin_isnan(x)) - return expr(y); - if(builtin_isnan(y)) - return expr(x); - return expr(std::max(x, y)); - #endif - } - }; - template<> struct binary_specialized - { - static half fmin(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - if(xabs > 0x7C00) - return y; - if(yabs > 0x7C00) - return x; - return (((xabs==x.data_) ? xabs : -xabs) > ((yabs==y.data_) ? yabs : -yabs)) ? y : x; - } - static half fmax(half x, half y) - { - int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; - if(xabs > 0x7C00) - return y; - if(yabs > 0x7C00) - return x; - return (((xabs==x.data_) ? xabs : -xabs) < ((yabs==y.data_) ? yabs : -yabs)) ? y : x; - } - }; - - /// Helper class for half casts. - /// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member - /// function and a corresponding `type` member denoting its return type. 
- /// \tparam T destination type - /// \tparam U source type - /// \tparam R rounding mode to use - template struct half_caster {}; - template struct half_caster - { - #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); - #endif - - static half cast(U arg) { return cast_impl(arg, is_float()); }; - - private: - static half cast_impl(U arg, true_type) { return half(binary, float2half(arg)); } - static half cast_impl(U arg, false_type) { return half(binary, int2half(arg)); } - }; - template struct half_caster - { - #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); - #endif - - static T cast(half arg) { return cast_impl(arg, is_float()); } - - private: - static T cast_impl(half arg, true_type) { return half2float(arg.data_); } - static T cast_impl(half arg, false_type) { return half2int(arg.data_); } - }; - template struct half_caster - { - #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS - static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); - #endif - - static T cast(expr arg) { return cast_impl(arg, is_float()); } - - private: - static T cast_impl(float arg, true_type) { return static_cast(arg); } - static T cast_impl(half arg, false_type) { return half2int(arg.data_); } - }; - template struct half_caster - { - static half cast(half arg) { return arg; } - }; - template struct half_caster : half_caster {}; - - /// \name Comparison operators - /// \{ - - /// Comparison for equality. - /// \param x first operand - /// \param y second operand - /// \retval true if operands equal - /// \retval false else - template typename enable::type operator==(T x, U y) { return functions::isequal(x, y); } - - /// Comparison for inequality. 
- /// \param x first operand - /// \param y second operand - /// \retval true if operands not equal - /// \retval false else - template typename enable::type operator!=(T x, U y) { return functions::isnotequal(x, y); } - - /// Comparison for less than. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x less than \a y - /// \retval false else - template typename enable::type operator<(T x, U y) { return functions::isless(x, y); } - - /// Comparison for greater than. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x greater than \a y - /// \retval false else - template typename enable::type operator>(T x, U y) { return functions::isgreater(x, y); } - - /// Comparison for less equal. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x less equal \a y - /// \retval false else - template typename enable::type operator<=(T x, U y) { return functions::islessequal(x, y); } - - /// Comparison for greater equal. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x greater equal \a y - /// \retval false else - template typename enable::type operator>=(T x, U y) { return functions::isgreaterequal(x, y); } - - /// \} - /// \name Arithmetic operators - /// \{ - - /// Add halfs. - /// \param x left operand - /// \param y right operand - /// \return sum of half expressions - template typename enable::type operator+(T x, U y) { return functions::plus(x, y); } - - /// Subtract halfs. - /// \param x left operand - /// \param y right operand - /// \return difference of half expressions - template typename enable::type operator-(T x, U y) { return functions::minus(x, y); } - - /// Multiply halfs. - /// \param x left operand - /// \param y right operand - /// \return product of half expressions - template typename enable::type operator*(T x, U y) { return functions::multiplies(x, y); } - - /// Divide halfs. 
- /// \param x left operand - /// \param y right operand - /// \return quotient of half expressions - template typename enable::type operator/(T x, U y) { return functions::divides(x, y); } - - /// Identity. - /// \param arg operand - /// \return uncahnged operand - template HALF_CONSTEXPR typename enable::type operator+(T arg) { return arg; } - - /// Negation. - /// \param arg operand - /// \return negated operand - template HALF_CONSTEXPR typename enable::type operator-(T arg) { return unary_specialized::negate(arg); } - - /// \} - /// \name Input and output - /// \{ - - /// Output operator. - /// \param out output stream to write into - /// \param arg half expression to write - /// \return reference to output stream - template typename enable&,T>::type - operator<<(std::basic_ostream &out, T arg) { return functions::write(out, arg); } - - /// Input operator. - /// \param in input stream to read from - /// \param arg half to read into - /// \return reference to input stream - template std::basic_istream& - operator>>(std::basic_istream &in, half &arg) { return functions::read(in, arg); } - - /// \} - /// \name Basic mathematical operations - /// \{ - - /// Absolute value. - /// \param arg operand - /// \return absolute value of \a arg -// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } - inline half abs(half arg) { return unary_specialized::fabs(arg); } - inline expr abs(expr arg) { return unary_specialized::fabs(arg); } - - /// Absolute value. - /// \param arg operand - /// \return absolute value of \a arg -// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } - inline half fabs(half arg) { return unary_specialized::fabs(arg); } - inline expr fabs(expr arg) { return unary_specialized::fabs(arg); } - - /// Remainder of division. - /// \param x first operand - /// \param y second operand - /// \return remainder of floating point division. 
-// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } - inline expr fmod(half x, half y) { return functions::fmod(x, y); } - inline expr fmod(half x, expr y) { return functions::fmod(x, y); } - inline expr fmod(expr x, half y) { return functions::fmod(x, y); } - inline expr fmod(expr x, expr y) { return functions::fmod(x, y); } - - /// Remainder of division. - /// \param x first operand - /// \param y second operand - /// \return remainder of floating point division. -// template typename enable::type remainder(T x, U y) { return functions::remainder(x, y); } - inline expr remainder(half x, half y) { return functions::remainder(x, y); } - inline expr remainder(half x, expr y) { return functions::remainder(x, y); } - inline expr remainder(expr x, half y) { return functions::remainder(x, y); } - inline expr remainder(expr x, expr y) { return functions::remainder(x, y); } - - /// Remainder of division. - /// \param x first operand - /// \param y second operand - /// \param quo address to store some bits of quotient at - /// \return remainder of floating point division. -// template typename enable::type remquo(T x, U y, int *quo) { return functions::remquo(x, y, quo); } - inline expr remquo(half x, half y, int *quo) { return functions::remquo(x, y, quo); } - inline expr remquo(half x, expr y, int *quo) { return functions::remquo(x, y, quo); } - inline expr remquo(expr x, half y, int *quo) { return functions::remquo(x, y, quo); } - inline expr remquo(expr x, expr y, int *quo) { return functions::remquo(x, y, quo); } - - /// Fused multiply add. - /// \param x first operand - /// \param y second operand - /// \param z third operand - /// \return ( \a x * \a y ) + \a z rounded as one operation. 
-// template typename enable::type fma(T x, U y, V z) { return functions::fma(x, y, z); } - inline expr fma(half x, half y, half z) { return functions::fma(x, y, z); } - inline expr fma(half x, half y, expr z) { return functions::fma(x, y, z); } - inline expr fma(half x, expr y, half z) { return functions::fma(x, y, z); } - inline expr fma(half x, expr y, expr z) { return functions::fma(x, y, z); } - inline expr fma(expr x, half y, half z) { return functions::fma(x, y, z); } - inline expr fma(expr x, half y, expr z) { return functions::fma(x, y, z); } - inline expr fma(expr x, expr y, half z) { return functions::fma(x, y, z); } - inline expr fma(expr x, expr y, expr z) { return functions::fma(x, y, z); } - - /// Maximum of half expressions. - /// \param x first operand - /// \param y second operand - /// \return maximum of operands -// template typename result::type fmax(T x, U y) { return binary_specialized::fmax(x, y); } - inline half fmax(half x, half y) { return binary_specialized::fmax(x, y); } - inline expr fmax(half x, expr y) { return binary_specialized::fmax(x, y); } - inline expr fmax(expr x, half y) { return binary_specialized::fmax(x, y); } - inline expr fmax(expr x, expr y) { return binary_specialized::fmax(x, y); } - - /// Minimum of half expressions. - /// \param x first operand - /// \param y second operand - /// \return minimum of operands -// template typename result::type fmin(T x, U y) { return binary_specialized::fmin(x, y); } - inline half fmin(half x, half y) { return binary_specialized::fmin(x, y); } - inline expr fmin(half x, expr y) { return binary_specialized::fmin(x, y); } - inline expr fmin(expr x, half y) { return binary_specialized::fmin(x, y); } - inline expr fmin(expr x, expr y) { return binary_specialized::fmin(x, y); } - - /// Positive difference. 
- /// \param x first operand - /// \param y second operand - /// \return \a x - \a y or 0 if difference negative -// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } - inline expr fdim(half x, half y) { return functions::fdim(x, y); } - inline expr fdim(half x, expr y) { return functions::fdim(x, y); } - inline expr fdim(expr x, half y) { return functions::fdim(x, y); } - inline expr fdim(expr x, expr y) { return functions::fdim(x, y); } - - /// Get NaN value. - /// \return quiet NaN - inline half nanh(const char*) { return functions::nanh(); } - - /// \} - /// \name Exponential functions - /// \{ - - /// Exponential function. - /// \param arg function argument - /// \return e raised to \a arg -// template typename enable::type exp(T arg) { return functions::exp(arg); } - inline expr exp(half arg) { return functions::exp(arg); } - inline expr exp(expr arg) { return functions::exp(arg); } - - /// Exponential minus one. - /// \param arg function argument - /// \return e raised to \a arg subtracted by 1 -// template typename enable::type expm1(T arg) { return functions::expm1(arg); } - inline expr expm1(half arg) { return functions::expm1(arg); } - inline expr expm1(expr arg) { return functions::expm1(arg); } - - /// Binary exponential. - /// \param arg function argument - /// \return 2 raised to \a arg -// template typename enable::type exp2(T arg) { return functions::exp2(arg); } - inline expr exp2(half arg) { return functions::exp2(arg); } - inline expr exp2(expr arg) { return functions::exp2(arg); } - - /// Natural logorithm. - /// \param arg function argument - /// \return logarithm of \a arg to base e -// template typename enable::type log(T arg) { return functions::log(arg); } - inline expr log(half arg) { return functions::log(arg); } - inline expr log(expr arg) { return functions::log(arg); } - - /// Common logorithm. 
- /// \param arg function argument - /// \return logarithm of \a arg to base 10 -// template typename enable::type log10(T arg) { return functions::log10(arg); } - inline expr log10(half arg) { return functions::log10(arg); } - inline expr log10(expr arg) { return functions::log10(arg); } - - /// Natural logorithm. - /// \param arg function argument - /// \return logarithm of \a arg plus 1 to base e -// template typename enable::type log1p(T arg) { return functions::log1p(arg); } - inline expr log1p(half arg) { return functions::log1p(arg); } - inline expr log1p(expr arg) { return functions::log1p(arg); } - - /// Binary logorithm. - /// \param arg function argument - /// \return logarithm of \a arg to base 2 -// template typename enable::type log2(T arg) { return functions::log2(arg); } - inline expr log2(half arg) { return functions::log2(arg); } - inline expr log2(expr arg) { return functions::log2(arg); } - - /// \} - /// \name Power functions - /// \{ - - /// Square root. - /// \param arg function argument - /// \return square root of \a arg -// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); } - inline expr sqrt(half arg) { return functions::sqrt(arg); } - inline expr sqrt(expr arg) { return functions::sqrt(arg); } - - /// Cubic root. - /// \param arg function argument - /// \return cubic root of \a arg -// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); } - inline expr cbrt(half arg) { return functions::cbrt(arg); } - inline expr cbrt(expr arg) { return functions::cbrt(arg); } - - /// Hypotenuse function. 
- /// \param x first argument - /// \param y second argument - /// \return square root of sum of squares without internal over- or underflows -// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); } - inline expr hypot(half x, half y) { return functions::hypot(x, y); } - inline expr hypot(half x, expr y) { return functions::hypot(x, y); } - inline expr hypot(expr x, half y) { return functions::hypot(x, y); } - inline expr hypot(expr x, expr y) { return functions::hypot(x, y); } - - /// Power function. - /// \param base first argument - /// \param exp second argument - /// \return \a base raised to \a exp -// template typename enable::type pow(T base, U exp) { return functions::pow(base, exp); } - inline expr pow(half base, half exp) { return functions::pow(base, exp); } - inline expr pow(half base, expr exp) { return functions::pow(base, exp); } - inline expr pow(expr base, half exp) { return functions::pow(base, exp); } - inline expr pow(expr base, expr exp) { return functions::pow(base, exp); } - - /// \} - /// \name Trigonometric functions - /// \{ - - /// Sine function. - /// \param arg function argument - /// \return sine value of \a arg -// template typename enable::type sin(T arg) { return functions::sin(arg); } - inline expr sin(half arg) { return functions::sin(arg); } - inline expr sin(expr arg) { return functions::sin(arg); } - - /// Cosine function. - /// \param arg function argument - /// \return cosine value of \a arg -// template typename enable::type cos(T arg) { return functions::cos(arg); } - inline expr cos(half arg) { return functions::cos(arg); } - inline expr cos(expr arg) { return functions::cos(arg); } - - /// Tangent function. - /// \param arg function argument - /// \return tangent value of \a arg -// template typename enable::type tan(T arg) { return functions::tan(arg); } - inline expr tan(half arg) { return functions::tan(arg); } - inline expr tan(expr arg) { return functions::tan(arg); } - - /// Arc sine. 
- /// \param arg function argument - /// \return arc sine value of \a arg -// template typename enable::type asin(T arg) { return functions::asin(arg); } - inline expr asin(half arg) { return functions::asin(arg); } - inline expr asin(expr arg) { return functions::asin(arg); } - - /// Arc cosine function. - /// \param arg function argument - /// \return arc cosine value of \a arg -// template typename enable::type acos(T arg) { return functions::acos(arg); } - inline expr acos(half arg) { return functions::acos(arg); } - inline expr acos(expr arg) { return functions::acos(arg); } - - /// Arc tangent function. - /// \param arg function argument - /// \return arc tangent value of \a arg -// template typename enable::type atan(T arg) { return functions::atan(arg); } - inline expr atan(half arg) { return functions::atan(arg); } - inline expr atan(expr arg) { return functions::atan(arg); } - - /// Arc tangent function. - /// \param x first argument - /// \param y second argument - /// \return arc tangent value -// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); } - inline expr atan2(half x, half y) { return functions::atan2(x, y); } - inline expr atan2(half x, expr y) { return functions::atan2(x, y); } - inline expr atan2(expr x, half y) { return functions::atan2(x, y); } - inline expr atan2(expr x, expr y) { return functions::atan2(x, y); } - - /// \} - /// \name Hyperbolic functions - /// \{ - - /// Hyperbolic sine. - /// \param arg function argument - /// \return hyperbolic sine value of \a arg -// template typename enable::type sinh(T arg) { return functions::sinh(arg); } - inline expr sinh(half arg) { return functions::sinh(arg); } - inline expr sinh(expr arg) { return functions::sinh(arg); } - - /// Hyperbolic cosine. 
- /// \param arg function argument - /// \return hyperbolic cosine value of \a arg -// template typename enable::type cosh(T arg) { return functions::cosh(arg); } - inline expr cosh(half arg) { return functions::cosh(arg); } - inline expr cosh(expr arg) { return functions::cosh(arg); } - - /// Hyperbolic tangent. - /// \param arg function argument - /// \return hyperbolic tangent value of \a arg -// template typename enable::type tanh(T arg) { return functions::tanh(arg); } - inline expr tanh(half arg) { return functions::tanh(arg); } - inline expr tanh(expr arg) { return functions::tanh(arg); } - - /// Hyperbolic area sine. - /// \param arg function argument - /// \return area sine value of \a arg -// template typename enable::type asinh(T arg) { return functions::asinh(arg); } - inline expr asinh(half arg) { return functions::asinh(arg); } - inline expr asinh(expr arg) { return functions::asinh(arg); } - - /// Hyperbolic area cosine. - /// \param arg function argument - /// \return area cosine value of \a arg -// template typename enable::type acosh(T arg) { return functions::acosh(arg); } - inline expr acosh(half arg) { return functions::acosh(arg); } - inline expr acosh(expr arg) { return functions::acosh(arg); } - - /// Hyperbolic area tangent. - /// \param arg function argument - /// \return area tangent value of \a arg -// template typename enable::type atanh(T arg) { return functions::atanh(arg); } - inline expr atanh(half arg) { return functions::atanh(arg); } - inline expr atanh(expr arg) { return functions::atanh(arg); } - - /// \} - /// \name Error and gamma functions - /// \{ - - /// Error function. - /// \param arg function argument - /// \return error function value of \a arg -// template typename enable::type erf(T arg) { return functions::erf(arg); } - inline expr erf(half arg) { return functions::erf(arg); } - inline expr erf(expr arg) { return functions::erf(arg); } - - /// Complementary error function. 
- /// \param arg function argument - /// \return 1 minus error function value of \a arg -// template typename enable::type erfc(T arg) { return functions::erfc(arg); } - inline expr erfc(half arg) { return functions::erfc(arg); } - inline expr erfc(expr arg) { return functions::erfc(arg); } - - /// Natural logarithm of gamma function. - /// \param arg function argument - /// \return natural logarith of gamma function for \a arg -// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); } - inline expr lgamma(half arg) { return functions::lgamma(arg); } - inline expr lgamma(expr arg) { return functions::lgamma(arg); } - - /// Gamma function. - /// \param arg function argument - /// \return gamma function value of \a arg -// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); } - inline expr tgamma(half arg) { return functions::tgamma(arg); } - inline expr tgamma(expr arg) { return functions::tgamma(arg); } - - /// \} - /// \name Rounding - /// \{ - - /// Nearest integer not less than half value. - /// \param arg half to round - /// \return nearest integer not less than \a arg -// template typename enable::type ceil(T arg) { return functions::ceil(arg); } - inline half ceil(half arg) { return functions::ceil(arg); } - inline half ceil(expr arg) { return functions::ceil(arg); } - - /// Nearest integer not greater than half value. - /// \param arg half to round - /// \return nearest integer not greater than \a arg -// template typename enable::type floor(T arg) { return functions::floor(arg); } - inline half floor(half arg) { return functions::floor(arg); } - inline half floor(expr arg) { return functions::floor(arg); } - - /// Nearest integer not greater in magnitude than half value. 
- /// \param arg half to round - /// \return nearest integer not greater in magnitude than \a arg -// template typename enable::type trunc(T arg) { return functions::trunc(arg); } - inline half trunc(half arg) { return functions::trunc(arg); } - inline half trunc(expr arg) { return functions::trunc(arg); } - - /// Nearest integer. - /// \param arg half to round - /// \return nearest integer, rounded away from zero in half-way cases -// template typename enable::type round(T arg) { return functions::round(arg); } - inline half round(half arg) { return functions::round(arg); } - inline half round(expr arg) { return functions::round(arg); } - - /// Nearest integer. - /// \param arg half to round - /// \return nearest integer, rounded away from zero in half-way cases -// template typename enable::type lround(T arg) { return functions::lround(arg); } - inline long lround(half arg) { return functions::lround(arg); } - inline long lround(expr arg) { return functions::lround(arg); } - - /// Nearest integer using half's internal rounding mode. - /// \param arg half expression to round - /// \return nearest integer using default rounding mode -// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } - inline half nearbyint(half arg) { return functions::rint(arg); } - inline half nearbyint(expr arg) { return functions::rint(arg); } - - /// Nearest integer using half's internal rounding mode. - /// \param arg half expression to round - /// \return nearest integer using default rounding mode -// template typename enable::type rint(T arg) { return functions::rint(arg); } - inline half rint(half arg) { return functions::rint(arg); } - inline half rint(expr arg) { return functions::rint(arg); } - - /// Nearest integer using half's internal rounding mode. 
- /// \param arg half expression to round - /// \return nearest integer using default rounding mode -// template typename enable::type lrint(T arg) { return functions::lrint(arg); } - inline long lrint(half arg) { return functions::lrint(arg); } - inline long lrint(expr arg) { return functions::lrint(arg); } - #if HALF_ENABLE_CPP11_LONG_LONG - /// Nearest integer. - /// \param arg half to round - /// \return nearest integer, rounded away from zero in half-way cases -// template typename enable::type llround(T arg) { return functions::llround(arg); } - inline long long llround(half arg) { return functions::llround(arg); } - inline long long llround(expr arg) { return functions::llround(arg); } - - /// Nearest integer using half's internal rounding mode. - /// \param arg half expression to round - /// \return nearest integer using default rounding mode -// template typename enable::type llrint(T arg) { return functions::llrint(arg); } - inline long long llrint(half arg) { return functions::llrint(arg); } - inline long long llrint(expr arg) { return functions::llrint(arg); } - #endif - - /// \} - /// \name Floating point manipulation - /// \{ - - /// Decompress floating point number. - /// \param arg number to decompress - /// \param exp address to store exponent at - /// \return significant in range [0.5, 1) -// template typename enable::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); } - inline half frexp(half arg, int *exp) { return functions::frexp(arg, exp); } - inline half frexp(expr arg, int *exp) { return functions::frexp(arg, exp); } - - /// Multiply by power of two. 
- /// \param arg number to modify - /// \param exp power of two to multiply with - /// \return \a arg multplied by 2 raised to \a exp -// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); } - inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); } - inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); } - - /// Extract integer and fractional parts. - /// \param arg number to decompress - /// \param iptr address to store integer part at - /// \return fractional part -// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); } - inline half modf(half arg, half *iptr) { return functions::modf(arg, iptr); } - inline half modf(expr arg, half *iptr) { return functions::modf(arg, iptr); } - - /// Multiply by power of two. - /// \param arg number to modify - /// \param exp power of two to multiply with - /// \return \a arg multplied by 2 raised to \a exp -// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); } - inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); } - inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); } - - /// Multiply by power of two. - /// \param arg number to modify - /// \param exp power of two to multiply with - /// \return \a arg multplied by 2 raised to \a exp -// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); } - inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); } - inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); } - - /// Extract exponent. 
- /// \param arg number to query - /// \return floating point exponent - /// \retval FP_ILOGB0 for zero - /// \retval FP_ILOGBNAN for NaN - /// \retval MAX_INT for infinity -// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); } - inline int ilogb(half arg) { return functions::ilogb(arg); } - inline int ilogb(expr arg) { return functions::ilogb(arg); } - - /// Extract exponent. - /// \param arg number to query - /// \return floating point exponent -// template typename enable::type logb(T arg) { return functions::logb(arg); } - inline half logb(half arg) { return functions::logb(arg); } - inline half logb(expr arg) { return functions::logb(arg); } - - /// Next representable value. - /// \param from value to compute next representable value for - /// \param to direction towards which to compute next value - /// \return next representable value after \a from in direction towards \a to -// template typename enable::type nextafter(T from, U to) { return functions::nextafter(from, to); } - inline half nextafter(half from, half to) { return functions::nextafter(from, to); } - inline half nextafter(half from, expr to) { return functions::nextafter(from, to); } - inline half nextafter(expr from, half to) { return functions::nextafter(from, to); } - inline half nextafter(expr from, expr to) { return functions::nextafter(from, to); } - - /// Next representable value. - /// \param from value to compute next representable value for - /// \param to direction towards which to compute next value - /// \return next representable value after \a from in direction towards \a to -// template typename enable::type nexttoward(T from, long double to) { return functions::nexttoward(from, to); } - inline half nexttoward(half from, long double to) { return functions::nexttoward(from, to); } - inline half nexttoward(expr from, long double to) { return functions::nexttoward(from, to); } - - /// Take sign. 
- /// \param x value to change sign for - /// \param y value to take sign from - /// \return value equal to \a x in magnitude and to \a y in sign -// template typename enable::type copysign(T x, U y) { return functions::copysign(x, y); } - inline half copysign(half x, half y) { return functions::copysign(x, y); } - inline half copysign(half x, expr y) { return functions::copysign(x, y); } - inline half copysign(expr x, half y) { return functions::copysign(x, y); } - inline half copysign(expr x, expr y) { return functions::copysign(x, y); } - - /// \} - /// \name Floating point classification - /// \{ - - - /// Classify floating point value. - /// \param arg number to classify - /// \retval FP_ZERO for positive and negative zero - /// \retval FP_SUBNORMAL for subnormal numbers - /// \retval FP_INFINITY for positive and negative infinity - /// \retval FP_NAN for NaNs - /// \retval FP_NORMAL for all other (normal) values -// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); } - inline int fpclassify(half arg) { return functions::fpclassify(arg); } - inline int fpclassify(expr arg) { return functions::fpclassify(arg); } - - /// Check if finite number. - /// \param arg number to check - /// \retval true if neither infinity nor NaN - /// \retval false else -// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); } - inline bool isfinite(half arg) { return functions::isfinite(arg); } - inline bool isfinite(expr arg) { return functions::isfinite(arg); } - - /// Check for infinity. - /// \param arg number to check - /// \retval true for positive or negative infinity - /// \retval false else -// template typename enable::type isinf(T arg) { return functions::isinf(arg); } - inline bool isinf(half arg) { return functions::isinf(arg); } - inline bool isinf(expr arg) { return functions::isinf(arg); } - - /// Check for NaN. 
- /// \param arg number to check - /// \retval true for NaNs - /// \retval false else -// template typename enable::type isnan(T arg) { return functions::isnan(arg); } - inline bool isnan(half arg) { return functions::isnan(arg); } - inline bool isnan(expr arg) { return functions::isnan(arg); } - - /// Check if normal number. - /// \param arg number to check - /// \retval true if normal number - /// \retval false if either subnormal, zero, infinity or NaN -// template typename enable::type isnormal(T arg) { return functions::isnormal(arg); } - inline bool isnormal(half arg) { return functions::isnormal(arg); } - inline bool isnormal(expr arg) { return functions::isnormal(arg); } - - /// Check sign. - /// \param arg number to check - /// \retval true for negative number - /// \retval false for positive number -// template typename enable::type signbit(T arg) { return functions::signbit(arg); } - inline bool signbit(half arg) { return functions::signbit(arg); } - inline bool signbit(expr arg) { return functions::signbit(arg); } - - /// \} - /// \name Comparison - /// \{ - - /// Comparison for greater than. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x greater than \a y - /// \retval false else -// template typename enable::type isgreater(T x, U y) { return functions::isgreater(x, y); } - inline bool isgreater(half x, half y) { return functions::isgreater(x, y); } - inline bool isgreater(half x, expr y) { return functions::isgreater(x, y); } - inline bool isgreater(expr x, half y) { return functions::isgreater(x, y); } - inline bool isgreater(expr x, expr y) { return functions::isgreater(x, y); } - - /// Comparison for greater equal. 
- /// \param x first operand - /// \param y second operand - /// \retval true if \a x greater equal \a y - /// \retval false else -// template typename enable::type isgreaterequal(T x, U y) { return functions::isgreaterequal(x, y); } - inline bool isgreaterequal(half x, half y) { return functions::isgreaterequal(x, y); } - inline bool isgreaterequal(half x, expr y) { return functions::isgreaterequal(x, y); } - inline bool isgreaterequal(expr x, half y) { return functions::isgreaterequal(x, y); } - inline bool isgreaterequal(expr x, expr y) { return functions::isgreaterequal(x, y); } - - /// Comparison for less than. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x less than \a y - /// \retval false else -// template typename enable::type isless(T x, U y) { return functions::isless(x, y); } - inline bool isless(half x, half y) { return functions::isless(x, y); } - inline bool isless(half x, expr y) { return functions::isless(x, y); } - inline bool isless(expr x, half y) { return functions::isless(x, y); } - inline bool isless(expr x, expr y) { return functions::isless(x, y); } - - /// Comparison for less equal. - /// \param x first operand - /// \param y second operand - /// \retval true if \a x less equal \a y - /// \retval false else -// template typename enable::type islessequal(T x, U y) { return functions::islessequal(x, y); } - inline bool islessequal(half x, half y) { return functions::islessequal(x, y); } - inline bool islessequal(half x, expr y) { return functions::islessequal(x, y); } - inline bool islessequal(expr x, half y) { return functions::islessequal(x, y); } - inline bool islessequal(expr x, expr y) { return functions::islessequal(x, y); } - - /// Comarison for less or greater. 
- /// \param x first operand - /// \param y second operand - /// \retval true if either less or greater - /// \retval false else -// template typename enable::type islessgreater(T x, U y) { return functions::islessgreater(x, y); } - inline bool islessgreater(half x, half y) { return functions::islessgreater(x, y); } - inline bool islessgreater(half x, expr y) { return functions::islessgreater(x, y); } - inline bool islessgreater(expr x, half y) { return functions::islessgreater(x, y); } - inline bool islessgreater(expr x, expr y) { return functions::islessgreater(x, y); } - - /// Check if unordered. - /// \param x first operand - /// \param y second operand - /// \retval true if unordered (one or two NaN operands) - /// \retval false else -// template typename enable::type isunordered(T x, U y) { return functions::isunordered(x, y); } - inline bool isunordered(half x, half y) { return functions::isunordered(x, y); } - inline bool isunordered(half x, expr y) { return functions::isunordered(x, y); } - inline bool isunordered(expr x, half y) { return functions::isunordered(x, y); } - inline bool isunordered(expr x, expr y) { return functions::isunordered(x, y); } - - /// \name Casting - /// \{ - - /// Cast to or from half-precision floating point number. - /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted - /// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. - /// It uses the default rounding mode. - /// - /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types - /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler - /// error and casting between [half](\ref half_float::half)s is just a no-op. 
- /// \tparam T destination type (half or built-in arithmetic type) - /// \tparam U source type (half or built-in arithmetic type) - /// \param arg value to cast - /// \return \a arg converted to destination type - template T half_cast(U arg) { return half_caster::cast(arg); } - - /// Cast to or from half-precision floating point number. - /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted - /// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. - /// - /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types - /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler - /// error and casting between [half](\ref half_float::half)s is just a no-op. - /// \tparam T destination type (half or built-in arithmetic type) - /// \tparam R rounding mode to use. 
- /// \tparam U source type (half or built-in arithmetic type) - /// \param arg value to cast - /// \return \a arg converted to destination type - template T half_cast(U arg) { return half_caster::cast(arg); } - /// \} - } - - using detail::operator==; - using detail::operator!=; - using detail::operator<; - using detail::operator>; - using detail::operator<=; - using detail::operator>=; - using detail::operator+; - using detail::operator-; - using detail::operator*; - using detail::operator/; - using detail::operator<<; - using detail::operator>>; - - using detail::abs; - using detail::fabs; - using detail::fmod; - using detail::remainder; - using detail::remquo; - using detail::fma; - using detail::fmax; - using detail::fmin; - using detail::fdim; - using detail::nanh; - using detail::exp; - using detail::expm1; - using detail::exp2; - using detail::log; - using detail::log10; - using detail::log1p; - using detail::log2; - using detail::sqrt; - using detail::cbrt; - using detail::hypot; - using detail::pow; - using detail::sin; - using detail::cos; - using detail::tan; - using detail::asin; - using detail::acos; - using detail::atan; - using detail::atan2; - using detail::sinh; - using detail::cosh; - using detail::tanh; - using detail::asinh; - using detail::acosh; - using detail::atanh; - using detail::erf; - using detail::erfc; - using detail::lgamma; - using detail::tgamma; - using detail::ceil; - using detail::floor; - using detail::trunc; - using detail::round; - using detail::lround; - using detail::nearbyint; - using detail::rint; - using detail::lrint; -#if HALF_ENABLE_CPP11_LONG_LONG - using detail::llround; - using detail::llrint; -#endif - using detail::frexp; - using detail::ldexp; - using detail::modf; - using detail::scalbn; - using detail::scalbln; - using detail::ilogb; - using detail::logb; - using detail::nextafter; - using detail::nexttoward; - using detail::copysign; - using detail::fpclassify; - using detail::isfinite; - using detail::isinf; 
- using detail::isnan; - using detail::isnormal; - using detail::signbit; - using detail::isgreater; - using detail::isgreaterequal; - using detail::isless; - using detail::islessequal; - using detail::islessgreater; - using detail::isunordered; - - using detail::half_cast; -} - - -/// Extensions to the C++ standard library. -namespace std -{ - /// Numeric limits for half-precision floats. - /// Because of the underlying single-precision implementation of many operations, it inherits some properties from - /// `std::numeric_limits`. - template<> class numeric_limits : public numeric_limits - { - public: - /// Supports signed values. - static HALF_CONSTEXPR_CONST bool is_signed = true; - - /// Is not exact. - static HALF_CONSTEXPR_CONST bool is_exact = false; - - /// Doesn't provide modulo arithmetic. - static HALF_CONSTEXPR_CONST bool is_modulo = false; - - /// IEEE conformant. - static HALF_CONSTEXPR_CONST bool is_iec559 = true; - - /// Supports infinity. - static HALF_CONSTEXPR_CONST bool has_infinity = true; - - /// Supports quiet NaNs. - static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true; - - /// Supports subnormal values. - static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present; - - /// Rounding mode. - /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying - /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding - /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the - /// single-precision rounding mode. - static HALF_CONSTEXPR_CONST float_round_style round_style = (std::numeric_limits::round_style== - half_float::half::round_style) ? half_float::half::round_style : round_indeterminate; - - /// Significant digits. - static HALF_CONSTEXPR_CONST int digits = 11; - - /// Significant decimal digits. 
- static HALF_CONSTEXPR_CONST int digits10 = 3; - - /// Required decimal digits to represent all possible values. - static HALF_CONSTEXPR_CONST int max_digits10 = 5; - - /// Number base. - static HALF_CONSTEXPR_CONST int radix = 2; - - /// One more than smallest exponent. - static HALF_CONSTEXPR_CONST int min_exponent = -13; - - /// Smallest normalized representable power of 10. - static HALF_CONSTEXPR_CONST int min_exponent10 = -4; - - /// One more than largest exponent - static HALF_CONSTEXPR_CONST int max_exponent = 16; - - /// Largest finitely representable power of 10. - static HALF_CONSTEXPR_CONST int max_exponent10 = 4; - - /// Smallest positive normal value. - static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); } - - /// Smallest finite value. - static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); } - - /// Largest finite value. - static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); } - - /// Difference between one and next representable value. - static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); } - - /// Maximum rounding error. - static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW - { return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); } - - /// Positive infinity. - static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); } - - /// Quiet NaN. - static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); } - - /// Signalling NaN. 
- static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); } - - /// Smallest positive subnormal value. - static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); } - }; - -#if HALF_ENABLE_CPP11_HASH - /// Hash function for half-precision floats. - /// This is only defined if C++11 `std::hash` is supported and enabled. - template<> struct hash //: unary_function - { - /// Type of function argument. - typedef half_float::half argument_type; - - /// Function return type. - typedef size_t result_type; - - /// Compute hash function. - /// \param arg half to hash - /// \return hash value - result_type operator()(argument_type arg) const - { return hash()(static_cast(arg.data_)&-(arg.data_!=0x8000)); } - }; -#endif -} - - -#undef HALF_CONSTEXPR -#undef HALF_CONSTEXPR_CONST -#undef HALF_NOEXCEPT -#undef HALF_NOTHROW -#ifdef HALF_POP_WARNINGS - #pragma warning(pop) - #undef HALF_POP_WARNINGS -#endif - -#endif diff --git a/include/triton/external/hip.h b/include/triton/external/hip.h deleted file mode 100644 index a295eed68..000000000 --- a/include/triton/external/hip.h +++ /dev/null @@ -1,293 +0,0 @@ -#ifndef __external_hip_h__ -#define __external_hip_h__ - -/* - * @brief hipError_t - * @enum - * @ingroup Enumerations - */ -// Developer note - when updating these, update the hipErrorName and hipErrorString functions in -// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path. - -// Ignoring error-code return values from hip APIs is discouraged. On C++17, -// we can make that yield a warning - -/* - * @brief hipError_t - * @enum - * @ingroup Enumerations - */ -// Developer note - when updating these, update the hipErrorName and hipErrorString functions in -// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path. 
- -#include - -typedef enum hipError_t { - hipSuccess = 0, ///< Successful completion. - hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL - ///< or not in an acceptable range. - hipErrorOutOfMemory = 2, - // Deprecated - hipErrorMemoryAllocation = 2, ///< Memory allocation error. - hipErrorNotInitialized = 3, - // Deprecated - hipErrorInitializationError = 3, - hipErrorDeinitialized = 4, - hipErrorProfilerDisabled = 5, - hipErrorProfilerNotInitialized = 6, - hipErrorProfilerAlreadyStarted = 7, - hipErrorProfilerAlreadyStopped = 8, - hipErrorInvalidConfiguration = 9, - hipErrorInvalidPitchValue = 12, - hipErrorInvalidSymbol = 13, - hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer - hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction - hipErrorInsufficientDriver = 35, - hipErrorMissingConfiguration = 52, - hipErrorPriorLaunchFailure = 53, - hipErrorInvalidDeviceFunction = 98, - hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices - hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices. - hipErrorInvalidImage = 200, - hipErrorInvalidContext = 201, ///< Produced when input context is invalid. - hipErrorContextAlreadyCurrent = 202, - hipErrorMapFailed = 205, - // Deprecated - hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr. 
- hipErrorUnmapFailed = 206, - hipErrorArrayIsMapped = 207, - hipErrorAlreadyMapped = 208, - hipErrorNoBinaryForGpu = 209, - hipErrorAlreadyAcquired = 210, - hipErrorNotMapped = 211, - hipErrorNotMappedAsArray = 212, - hipErrorNotMappedAsPointer = 213, - hipErrorECCNotCorrectable = 214, - hipErrorUnsupportedLimit = 215, - hipErrorContextAlreadyInUse = 216, - hipErrorPeerAccessUnsupported = 217, - hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX - hipErrorInvalidGraphicsContext = 219, - hipErrorInvalidSource = 300, - hipErrorFileNotFound = 301, - hipErrorSharedObjectSymbolNotFound = 302, - hipErrorSharedObjectInitFailed = 303, - hipErrorOperatingSystem = 304, - hipErrorInvalidHandle = 400, - // Deprecated - hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid. - hipErrorNotFound = 500, - hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not - ///< ready. This is not actually an error, but is used to distinguish - ///< from hipSuccess (which indicates completion). APIs that return - ///< this error include hipEventQuery and hipStreamQuery. - hipErrorIllegalAddress = 700, - hipErrorLaunchOutOfResources = 701, ///< Out of resources error. - hipErrorLaunchTimeOut = 702, - hipErrorPeerAccessAlreadyEnabled = - 704, ///< Peer access was already enabled from the current device. - hipErrorPeerAccessNotEnabled = - 705, ///< Peer access was never enabled from the current device. - hipErrorSetOnActiveProcess = 708, - hipErrorAssert = 710, ///< Produced when the kernel calls assert. - hipErrorHostMemoryAlreadyRegistered = - 712, ///< Produced when trying to lock a page-locked memory. - hipErrorHostMemoryNotRegistered = - 713, ///< Produced when trying to unlock a non-page-locked memory. - hipErrorLaunchFailure = - 719, ///< An exception occurred on the device while executing a kernel. 
- hipErrorCooperativeLaunchTooLarge = - 720, ///< This error indicates that the number of blocks launched per grid for a kernel - ///< that was launched via cooperative launch APIs exceeds the maximum number of - ///< allowed blocks for the current device - hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented - hipErrorUnknown = 999, //< Unknown error. - // HSA Runtime Error Codes start here. - hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen - ///< in production systems. - hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically - ///< not seen in production systems. - hipErrorTbd ///< Marker that more error codes are needed. -} hipError_t; - - -typedef struct ihipCtx_t* hipCtx_t; - -// Note many APIs also use integer deviceIds as an alternative to the device pointer: -typedef int hipDevice_t; - -typedef enum hipDeviceP2PAttr { - hipDevP2PAttrPerformanceRank = 0, - hipDevP2PAttrAccessSupported, - hipDevP2PAttrNativeAtomicSupported, - hipDevP2PAttrHipArrayAccessSupported -} hipDeviceP2PAttr; - -typedef struct ihipStream_t* hipStream_t; - -#define hipIpcMemLazyEnablePeerAccess 0 - -#define HIP_IPC_HANDLE_SIZE 64 - -typedef struct hipIpcMemHandle_st { - char reserved[HIP_IPC_HANDLE_SIZE]; -} hipIpcMemHandle_t; - -typedef struct hipIpcEventHandle_st { - char reserved[HIP_IPC_HANDLE_SIZE]; -} hipIpcEventHandle_t; - -typedef struct ihipModule_t* hipModule_t; - -typedef struct ihipModuleSymbol_t* hipFunction_t; - -typedef struct hipFuncAttributes { - int binaryVersion; - int cacheModeCA; - size_t constSizeBytes; - size_t localSizeBytes; - int maxDynamicSharedSizeBytes; - int maxThreadsPerBlock; - int numRegs; - int preferredShmemCarveout; - int ptxVersion; - size_t sharedSizeBytes; -} hipFuncAttributes; - -typedef struct ihipEvent_t* hipEvent_t; - -/* - * @brief hipDeviceAttribute_t - * @enum - * @ingroup Enumerations - */ -typedef enum 
hipDeviceAttribute_t { - hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. - hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block. - hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block. - hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block. - hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid. - hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid. - hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid. - hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in - ///< bytes. - hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. - hipDeviceAttributeWarpSize, ///< Warp size in threads. - hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a - ///< thread block. This number is shared by all thread - ///< blocks simultaneously resident on a - ///< multiprocessor. - hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. - hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. - hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. - hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. - hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. - hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 - ///< cache. - hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per - ///< multiprocessor. - hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. - hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. - hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels - ///< concurrently. - hipDeviceAttributePciBusId, ///< PCI Bus ID. - hipDeviceAttributePciDeviceId, ///< PCI Device ID. 
- hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per - ///< Multiprocessor. - hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. - hipDeviceAttributeIntegrated, ///< iGPU - hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch - hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices - hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images - hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements - hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements - hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D images in image elements - hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimensions height of 3D images in image elements - hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimensions depth of 3D images in image elements - - hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register - hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register - - hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies - hipDeviceAttributeTextureAlignment, /// -#include -#include - -namespace triton { -namespace tools { - -class timer { - typedef std::chrono::high_resolution_clock high_resolution_clock; - typedef std::chrono::nanoseconds nanoseconds; - -public: - explicit timer(bool run = false) { - if (run) - start(); - } - - void start() { _start = high_resolution_clock::now(); } - - nanoseconds get() const { - return std::chrono::duration_cast( - high_resolution_clock::now() - _start); - } - -private: - high_resolution_clock::time_point _start; -}; - -inline double bench(std::function const &op, driver::stream *stream, - size_t warmup = 10, size_t repeat = 200) { - timer tmr; - std::vector times; - double total_time = 0; - for (size_t i = 0; i < 
warmup; i++) - op(); - stream->synchronize(); - tmr.start(); - for (size_t i = 0; i < repeat; i++) { - op(); - } - stream->synchronize(); - return (float)tmr.get().count() / repeat; - - // return *std::min_element(times.begin(), times.end()); -} - -} // namespace tools -} // namespace triton - -#endif diff --git a/include/triton/tools/graph.h b/include/triton/tools/graph.h deleted file mode 100644 index 3725eb091..000000000 --- a/include/triton/tools/graph.h +++ /dev/null @@ -1,68 +0,0 @@ -#pragma once - -#ifndef _TRITON_TOOLS_THREAD_GRAPH_H_ -#define _TRITON_TOOLS_THREAD_GRAPH_H_ - -#include -#include -#include -#include - -namespace triton { -namespace tools { - -template class graph { - typedef std::map> edges_t; - -public: - typedef std::map> cmap_t; - typedef std::map nmap_t; - -private: - void connected_components_impl(node_t x, std::set &nodes, - nmap_t *nmap, cmap_t *cmap, int id) const { - if (nmap) - (*nmap)[x] = id; - if (cmap) - (*cmap)[id].push_back(x); - if (nodes.find(x) != nodes.end()) { - nodes.erase(x); - for (const node_t &y : edges_.at(x)) - connected_components_impl(y, nodes, nmap, cmap, id); - } - } - -public: - void connected_components(cmap_t *cmap, nmap_t *nmap) const { - if (cmap) - cmap->clear(); - if (nmap) - nmap->clear(); - std::set nodes = nodes_; - unsigned id = 0; - while (!nodes.empty()) { - connected_components_impl(*nodes.begin(), nodes, nmap, cmap, id++); - } - } - - void add_edge(node_t x, node_t y) { - nodes_.insert(x); - nodes_.insert(y); - edges_[x].insert(y); - edges_[y].insert(x); - } - - void clear() { - nodes_.clear(); - edges_.clear(); - } - -private: - std::set nodes_; - edges_t edges_; -}; - -} // namespace tools -} // namespace triton - -#endif diff --git a/include/triton/tools/sha1.hpp b/include/triton/tools/sha1.hpp deleted file mode 100644 index 1e71034de..000000000 --- a/include/triton/tools/sha1.hpp +++ /dev/null @@ -1,172 +0,0 @@ -/* - Copyright (c) 2011, Micael Hildenborg - All rights reserved. 
- Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of Micael Hildenborg nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. - THIS SOFTWARE IS PROVIDED BY Micael Hildenborg ''AS IS'' AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL Micael Hildenborg BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Contributors: - Gustav - Several members in the gamedev.se forum. - Gregory Petrosyan - */ - -#ifndef _TRITON_TOOLS_SHA1_HPP_ -#define _TRITON_TOOLS_SHA1_HPP_ - -namespace sha1 { -namespace // local -{ -// Rotate an integer value to left. -inline unsigned int rol(const unsigned int value, const unsigned int steps) { - return ((value << steps) | (value >> (32 - steps))); -} - -// Sets the first 16 integers in the buffert to zero. -// Used for clearing the W buffert. 
-inline void clearWBuffert(unsigned int *buffert) { - for (int pos = 16; --pos >= 0;) { - buffert[pos] = 0; - } -} - -inline void innerHash(unsigned int *result, unsigned int *w) { - unsigned int a = result[0]; - unsigned int b = result[1]; - unsigned int c = result[2]; - unsigned int d = result[3]; - unsigned int e = result[4]; - - int round = 0; - -#define sha1macro(func, val) \ - { \ - const unsigned int t = rol(a, 5) + (func) + e + val + w[round]; \ - e = d; \ - d = c; \ - c = rol(b, 30); \ - b = a; \ - a = t; \ - } - - while (round < 16) { - sha1macro((b & c) | (~b & d), 0x5a827999)++ round; - } - while (round < 20) { - w[round] = - rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1); - sha1macro((b & c) | (~b & d), 0x5a827999)++ round; - } - while (round < 40) { - w[round] = - rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1); - sha1macro(b ^ c ^ d, 0x6ed9eba1)++ round; - } - while (round < 60) { - w[round] = - rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1); - sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)++ round; - } - while (round < 80) { - w[round] = - rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1); - sha1macro(b ^ c ^ d, 0xca62c1d6)++ round; - } - -#undef sha1macro - - result[0] += a; - result[1] += b; - result[2] += c; - result[3] += d; - result[4] += e; -} -} // namespace - -inline void calc(const void *src, const int bytelength, unsigned char *hash) { - // Init the result array. - unsigned int result[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, - 0xc3d2e1f0}; - - // Cast the void src pointer to be the byte array we can work with. - const unsigned char *sarray = (const unsigned char *)src; - - // The reusable round buffer - unsigned int w[80]; - - // Loop through all complete 64byte blocks. 
- const int endOfFullBlocks = bytelength - 64; - int endCurrentBlock; - int currentBlock = 0; - - while (currentBlock <= endOfFullBlocks) { - endCurrentBlock = currentBlock + 64; - - // Init the round buffer with the 64 byte block data. - for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4) { - // This line will swap endian on big endian and keep endian on little - // endian. - w[roundPos++] = (unsigned int)sarray[currentBlock + 3] | - (((unsigned int)sarray[currentBlock + 2]) << 8) | - (((unsigned int)sarray[currentBlock + 1]) << 16) | - (((unsigned int)sarray[currentBlock]) << 24); - } - innerHash(result, w); - } - - // Handle the last and not full 64 byte block if existing. - endCurrentBlock = bytelength - currentBlock; - clearWBuffert(w); - int lastBlockBytes = 0; - for (; lastBlockBytes < endCurrentBlock; ++lastBlockBytes) { - w[lastBlockBytes >> 2] |= - (unsigned int)sarray[lastBlockBytes + currentBlock] - << ((3 - (lastBlockBytes & 3)) << 3); - } - w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3); - if (endCurrentBlock >= 56) { - innerHash(result, w); - clearWBuffert(w); - } - w[15] = bytelength << 3; - innerHash(result, w); - - // Store hash in result pointer, and make sure we get in in the correct order - // on both endian models. 
- for (int hashByte = 20; --hashByte >= 0;) { - hash[hashByte] = - (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff; - } -} - -inline void toHexString(const unsigned char *hash, char *hexstring) { - const char hexDigits[] = {"0123456789abcdef"}; - - for (int hashByte = 20; --hashByte >= 0;) { - hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf]; - hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf]; - } - hexstring[40] = 0; -} -} // namespace sha1 - -#endif diff --git a/include/triton/tools/sys/exec.hpp b/include/triton/tools/sys/exec.hpp deleted file mode 100644 index e96a04314..000000000 --- a/include/triton/tools/sys/exec.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef TRITON_TOOLS_SYS_EXEC_HPP -#define TRITON_TOOLS_SYS_EXEC_HPP - -#include -#include -#include -#include -#include - -namespace triton { -namespace tools { - -#ifdef _WIN32 -#define popen _popen -#define pclose _pclose -#endif - -#ifndef WEXITSTATUS -#define WEXITSTATUS(stat_val) ((unsigned)(stat_val)&255) -#endif - -int exec(const std::string &cmd, std::string &result) { - char buffer[128]; - FILE *pipe = popen(cmd.c_str(), "r"); - if (!pipe) - return 0; - result.clear(); - try { - while (fgets(buffer, sizeof buffer, pipe) != NULL) - result += buffer; - } catch (...) { - pclose(pipe); - return 0; - } - int status = pclose(pipe); - return WEXITSTATUS(status); -} - -} // namespace tools -} // namespace triton - -#endif diff --git a/include/triton/tools/sys/mkdir.hpp b/include/triton/tools/sys/mkdir.hpp deleted file mode 100644 index 10cb0da6a..000000000 --- a/include/triton/tools/sys/mkdir.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved. - * - * This file is part of ISAAC. 
- * - * ISAAC is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, - * MA 02110-1301 USA - */ - -#ifndef TDL_TOOLS_SYS_MKDIR_HPP -#define TDL_TOOLS_SYS_MKDIR_HPP - -#include -#include -#include -#include -#include -#if defined(_WIN32) -#include -#endif - -namespace triton { - -namespace tools { - -inline int mkdir(std::string const &path) { -#if defined(_WIN32) - return _mkdir(path.c_str()); -#else - return ::mkdir(path.c_str(), 0777); -#endif -} - -inline int mkpath(std::string const &path) { - int status = 0; - size_t pp = 0; - size_t sp; - while ((sp = path.find('/', pp)) != std::string::npos) { - if (sp != pp) { - status = mkdir(path.substr(0, sp)); - } - pp = sp + 1; - } - return (status == 0 || errno == EEXIST) ? 
0 : -1; -} - -inline int mtime(std::string const &path) { - struct stat st; - if (stat(path.c_str(), &st) != 0) - return 0; - return st.st_mtime; -} - -} // namespace tools - -} // namespace triton - -#endif diff --git a/include/triton/tools/thread_pool.h b/include/triton/tools/thread_pool.h deleted file mode 100644 index 045e983f8..000000000 --- a/include/triton/tools/thread_pool.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#ifndef _TRITON_TOOLS_THREAD_POOL_H_ -#define _TRITON_TOOLS_THREAD_POOL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class ThreadPool { -public: - ThreadPool(size_t threads) : stop(false) { - for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this] { - for (;;) { - std::function task; - { - std::unique_lock lock(this->queue_mutex); - this->condition.wait( - lock, [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) - return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - } - task(); - } - }); - } - - template - auto enqueue(F &&f, Args &&...args) - -> std::future::type> { - using return_type = typename std::result_of::type; - - auto task = std::make_shared>( - std::bind(std::forward(f), std::forward(args)...)); - - std::future res = task->get_future(); - { - std::unique_lock lock(queue_mutex); - - // don't allow enqueueing after stopping the pool - if (stop) - throw std::runtime_error("enqueue on stopped ThreadPool"); - - tasks.emplace([task]() { (*task)(); }); - } - condition.notify_one(); - return res; - } - - ~ThreadPool() { - { - std::unique_lock lock(queue_mutex); - stop = true; - } - condition.notify_all(); - for (std::thread &worker : workers) - worker.join(); - } - -private: - // need to keep track of threads so we can join them - std::vector workers; - // the task queue - std::queue> tasks; - - // synchronization - std::mutex queue_mutex; - std::condition_variable condition; - bool stop; -}; - -#endif diff 
--git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 5a6ba8951..ab1d31a76 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,5 +1,4 @@ # add_subdirectory(codegen) -add_subdirectory(driver) add_subdirectory(Analysis) add_subdirectory(Conversion) add_subdirectory(Dialect) diff --git a/lib/Target/LLVMIR/LLVMIRTranslation.cpp b/lib/Target/LLVMIR/LLVMIRTranslation.cpp index 179c9391a..5837b0973 100644 --- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp +++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp @@ -13,7 +13,6 @@ #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" #include "mlir/Transforms/Passes.h" #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h" -#include "triton/driver/llvm.h" #include "triton/tools/sys/getenv.hpp" #include "llvm/IR/Constants.h" @@ -99,7 +98,6 @@ translateLLVMToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module) { } // Initialize LLVM targets. - ::triton::driver::init_llvm(); mlir::ExecutionEngine::setupTargetTriple(llvmModule.get()); auto optPipeline = mlir::makeOptimizingTransformer( diff --git a/lib/Target/PTX/PTXTranslation.cpp b/lib/Target/PTX/PTXTranslation.cpp index b286e612a..631af81cc 100644 --- a/lib/Target/PTX/PTXTranslation.cpp +++ b/lib/Target/PTX/PTXTranslation.cpp @@ -11,31 +11,129 @@ #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" #include "triton/Target/LLVMIR/LLVMIRTranslation.h" -#include "triton/driver/dispatch.h" -#include "triton/driver/llvm.h" + +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" 
+#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include namespace triton { -void getCuCCAndVersionFromDevice(uint64_t device, int *cc, int *version, - std::string *ptxasPath) { - CUdevice dev = (CUdevice)device; - size_t major = cuGetInfo(dev); - size_t minor = cuGetInfo(dev); - *cc = major * 10 + minor; - *ptxasPath = driver::path_to_ptxas(*version); // assign version +extern "C" { +int set_curterm(char *nterm) { return 0; } +int del_curterm(char *nterm) { return 0; } +int tigetnum(char *capname) { return 0; } +int setupterm(char *term, int fildes, int *errret) { return 0; } } -std::tuple -translateTritonGPUToPTX(mlir::ModuleOp module, uint64_t device) { - int cc; - int version; - std::string ptxasPath; - getCuCCAndVersionFromDevice(device, &cc, &version, &ptxasPath); +static void init_llvm() { + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); +} - llvm::LLVMContext ctx; - auto llModule = mlir::triton::translateTritonGPUToLLVMIR(&ctx, module); - auto ptxCode = driver::llir_to_ptx(llModule.get(), cc, version); - return std::make_tuple(ptxCode, cc, version, ptxasPath); +static bool find_and_replace(std::string &str, const std::string &begin, + const std::string &end, + const std::string &target) { + size_t start_replace = str.find(begin); + if (start_replace == std::string::npos) + return false; + size_t end_replace = str.find(end, start_replace); + if (end_replace == std::string::npos) + return false; + str.replace(start_replace, end_replace + 1 - start_replace, target); + return true; +} + +static std::string llir_to_ptx(llvm::Module *module, int capability, int ptx) { + // LLVM version in use may not officially support target hardware + int max_nvvm_cc = 75; + int max_nvvm_ptx = 74; + // options + auto options = llvm::cl::getRegisteredOptions(); + auto *short_ptr = + 
static_cast *>(options["nvptx-short-ptr"]); + assert(short_ptr); + short_ptr->setValue(true); + // compute capability + std::string sm = "sm_" + std::to_string(capability); + // max PTX version + int ptx_major = ptx / 10; + int ptx_minor = ptx % 10; + // create + llvm::SmallVector buffer; + std::string triple = "nvptx64-nvidia-cuda"; + std::string proc = "sm_" + std::to_string(std::min(capability, max_nvvm_cc)); + std::string layout = ""; + std::string features = ""; + // std::string features = "+ptx" + std::to_string(std::min(ptx, + // max_nvvm_ptx)); + init_llvm(); + // verify and store llvm + llvm::legacy::PassManager pm; + pm.add(llvm::createVerifierPass()); + pm.run(*module); + // module->print(llvm::outs(), nullptr); + + // create machine + module->setTargetTriple(triple); + std::string error; + auto target = + llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + llvm::TargetOptions opt; + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; + llvm::TargetMachine *machine = target->createTargetMachine( + module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, + llvm::None, llvm::CodeGenOpt::Aggressive); + // set data layout + if (layout.empty()) + module->setDataLayout(machine->createDataLayout()); + else + module->setDataLayout(layout); + // emit machine code + for (llvm::Function &f : module->functions()) + f.addFnAttr(llvm::Attribute::AlwaysInline); + llvm::legacy::PassManager pass; + llvm::raw_svector_ostream stream(buffer); + // emit + machine->addPassesToEmitFile(pass, stream, nullptr, + llvm::CodeGenFileType::CGFT_AssemblyFile); + pass.run(*module); + + // post-process + std::string result(buffer.begin(), buffer.end()); + find_and_replace(result, ".version", "\n", + ".version " + std::to_string(ptx_major) + "." 
+ + std::to_string(ptx_minor) + "\n"); + find_and_replace(result, ".target", "\n", ".target " + sm + "\n"); + while (find_and_replace(result, "\t// begin inline asm", "\n", "")) + ; + while (find_and_replace(result, "\t// end inline asm", "\n", "")) + ; + return result; +} + +std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) { + auto ptxCode = llir_to_ptx(&module, cc, version); + return ptxCode; } } // namespace triton diff --git a/lib/driver/CMakeLists.txt b/lib/driver/CMakeLists.txt deleted file mode 100644 index d08c5a107..000000000 --- a/lib/driver/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(TritonDriver - dispatch.cc - error.cc - llvm.cc -) diff --git a/lib/driver/dispatch.cc b/lib/driver/dispatch.cc deleted file mode 100644 index 427453b38..000000000 --- a/lib/driver/dispatch.cc +++ /dev/null @@ -1,395 +0,0 @@ -/* Copyright 2015-2017 Philippe Tillet - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files - * (the "Software"), to deal in the Software without restriction, - * including without limitation the rights to use, copy, modify, merge, - * publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, - * subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "triton/driver/dispatch.h" - -namespace triton { -namespace driver { - -// Helpers for function definition -#define DEFINE0(init, hlib, ret, fname) \ - ret dispatch::fname() { \ - return f_impl(hlib, fname, fname##_, #fname); \ - } \ - void *dispatch::fname##_; - -#define DEFINE1(init, hlib, ret, fname, t1) \ - ret dispatch::fname(t1 a) { \ - return f_impl(hlib, fname, fname##_, #fname, a); \ - } \ - void *dispatch::fname##_; - -#define DEFINE2(init, hlib, ret, fname, t1, t2) \ - ret dispatch::fname(t1 a, t2 b) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b); \ - } \ - void *dispatch::fname##_; - -#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) \ - ret dispatch::fname(t1 a, t2 b, t3 c) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c); \ - } \ - void *dispatch::fname##_; - -#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d); \ - } \ - void *dispatch::fname##_; - -#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e); \ - } \ - void *dispatch::fname##_; - -#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f); \ - } \ - void *dispatch::fname##_; - -#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g); \ - } \ - void 
*dispatch::fname##_; - -#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g, h); \ - } \ - void *dispatch::fname##_; - -#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g, h, i); \ - } \ - void *dispatch::fname##_; - -#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, \ - t10) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, \ - t10 j) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g, h, i, j); \ - } \ - void *dispatch::fname##_; - -#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, \ - t10, t11) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, \ - t10 j, t11 k) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g, h, i, j, k); \ - } \ - void *dispatch::fname##_; - -#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, \ - t10, t11, t12, t13) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, \ - t10 j, t11 k, t12 l, t13 m) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g, h, i, j, k, l, m); \ - } \ - void *dispatch::fname##_; - -#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, \ - t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) \ - ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, \ - t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, \ - t18 r, t19 s) { \ - return f_impl(hlib, fname, fname##_, #fname, a, b, c, d, \ - e, f, g, h, i, j, k, l, m, n, o, p, q, r, \ - s); \ - } \ - void *dispatch::fname##_; - -/* ------------------- * - * CUDA 
- * ------------------- */ - -bool dispatch::cuinit() { - if (cuda_ == nullptr) { -#ifdef _WIN32 - cuda_ = dlopen("cudart64_110.dll", RTLD_LAZY); -#else - cuda_ = dlopen("libcuda.so", RTLD_LAZY); - if (!cuda_) - cuda_ = dlopen("libcuda.so.1", RTLD_LAZY); -#endif - if (!cuda_) - throw std::runtime_error("Could not find `libcuda.so`. Make sure it is " - "in your LD_LIBRARY_PATH."); - } - if (cuda_ == nullptr) - return false; - CUresult (*fptr)(unsigned int); - cuInit_ = dlsym(cuda_, "cuInit"); - *reinterpret_cast(&fptr) = cuInit_; - CUresult res = (*fptr)(0); - check(res); - return true; -} - -#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1) -#define CUDA_DEFINE2(ret, fname, t1, t2) \ - DEFINE2(cuinit, cuda_, ret, fname, t1, t2) -#define CUDA_DEFINE3(ret, fname, t1, t2, t3) \ - DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3) -#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) \ - DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4) -#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) \ - DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5) -#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) \ - DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6) -#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) \ - DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7) -#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) \ - DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) -#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) \ - DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) -#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) \ - DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) -#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \ - t11) \ - DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \ - t11) - -// context management -CUDA_DEFINE1(CUresult, 
cuCtxDestroy_v2, CUcontext) -CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice) -CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice *) -CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int) -CUDA_DEFINE1(CUresult, cuInit, unsigned int) -CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *) -// device management -CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int) -CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice) -CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice) -CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, - CUdevice) -CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *) - -// link management -CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void *, - size_t, const char *, unsigned int, CUjit_option *, void **); -CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option *, void **, - CUlinkState *); -CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState); -CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void **, size_t *); -// module management -CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr *, size_t *, CUmodule, - const char *) -CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *) -CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule) -CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *) -CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, - unsigned int, CUjit_option *, void **) -CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, - const char *) -// stream management -CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int) -CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream) -CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream) -CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext *) -CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, - unsigned int, unsigned int, unsigned int, unsigned int, - unsigned int, CUstream, void **, void **) 
-// function management -CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int *, CUfunction_attribute, - CUfunction) -CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, - int) -CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache) -// memory management -CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t) -CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr) -CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, - CUstream) -CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, - CUstream) -CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t) -CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr *, size_t) -CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void *, CUpointer_attribute, - CUdeviceptr) -CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, - CUstream) -// event management -CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int) -CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent) -CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream) -CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent) - -/* ------------------- * - * NVML - * ------------------- */ -bool dispatch::nvmlinit() { -#ifdef _WIN32 - if (nvml_ == nullptr) - nvml_ = dlopen("nvml.dll", RTLD_LAZY); -#else - if (nvml_ == nullptr) - nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY); -#endif - nvmlReturn_t (*fptr)(); - nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2"); - *reinterpret_cast(&fptr) = nvmlInit_v2_; - nvmlReturn_t res = (*fptr)(); - check(res); - return res; -} - -#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname) -#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1) -#define NVML_DEFINE2(ret, fname, t1, t2) \ - DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2) -#define NVML_DEFINE3(ret, fname, t1, t2, t3) \ - DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3) - -NVML_DEFINE2(nvmlReturn_t, 
nvmlDeviceGetHandleByPciBusId_v2, const char *, - nvmlDevice_t *) -NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, - nvmlClockType_t, unsigned int *) -NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, - nvmlClockType_t, unsigned int *) -NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, - unsigned int, unsigned int) - -/* ------------------- * - * HIP - * ------------------- */ -bool dispatch::hipinit() { - if (hip_ == nullptr) - hip_ = dlopen("libamdhip64.so", RTLD_LAZY); - if (hip_ == nullptr) - return false; - hipError_t (*fptr)(); - hipInit_ = dlsym(hip_, "hipInit"); - *reinterpret_cast(&fptr) = hipInit_; - hipError_t res = (*fptr)(); - check(res); - return res; -} - -#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1) -#define HIP_DEFINE2(ret, fname, t1, t2) \ - DEFINE2(hipinit, hip_, ret, fname, t1, t2) -#define HIP_DEFINE3(ret, fname, t1, t2, t3) \ - DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3) -#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) \ - DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4) -#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) \ - DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5) -#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) \ - DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6) -#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) \ - DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7) -#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) \ - DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) -#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) \ - DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) -#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) \ - DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) -#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \ - DEFINE11(hipinit, 
hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \ - t11) - -// context management -HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t) -HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t) -HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t *) -HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t) -HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t *) -HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int) -HIP_DEFINE1(hipError_t, hipInit, unsigned int) -HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *) -// device management -HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int) -HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t) -HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t) -HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, - hipDevice_t) -HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *) -// module management -HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t *, size_t *, - hipModule_t, const char *) -HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *) -HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t) -HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *) -HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, - unsigned int, hipJitOption *, void **) -HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, - const char *) -// stream management -HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int) -HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t) -HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t) -HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, - unsigned int, unsigned int, unsigned int, unsigned int, - unsigned int, unsigned int, hipStream_t, void **, void **) -// function management -HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes *, void *) -HIP_DEFINE2(hipError_t, 
hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t) -// memory management -HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t) -HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t) -HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, - hipStream_t) -HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, - size_t, hipStream_t) -HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t) -HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t *, size_t) -HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void *, CUpointer_attribute, - hipDeviceptr_t) -HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, - hipStream_t) -// event management -HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int) -HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t) -HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t) -HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t) - -/* ------------------- * - * COMMON - * ------------------- */ - -// Release -void dispatch::release() { - if (cuda_) { - dlclose(cuda_); - cuda_ = nullptr; - } -} - -void *dispatch::cuda_; -void *dispatch::nvml_; -void *dispatch::nvmlInit_v2_; -void *dispatch::hip_; - -} // namespace driver -} // namespace triton diff --git a/lib/driver/error.cc b/lib/driver/error.cc deleted file mode 100644 index 4b366746e..000000000 --- a/lib/driver/error.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright 2015-2017 Philippe Tillet - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files - * (the "Software"), to deal in the Software without restriction, - * including without limitation the rights to use, copy, modify, merge, - * publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, - * subject to the following conditions: 
- * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "triton/driver/error.h" - -namespace triton { -namespace driver { - -void check(CUresult err) { - using namespace exception::cuda; - switch (err) { - case CUDA_SUCCESS: - break; - case CUDA_ERROR_INVALID_VALUE: - throw invalid_value(); - case CUDA_ERROR_OUT_OF_MEMORY: - throw out_of_memory(); - case CUDA_ERROR_NOT_INITIALIZED: - throw not_initialized(); - case CUDA_ERROR_DEINITIALIZED: - throw deinitialized(); - case CUDA_ERROR_PROFILER_DISABLED: - throw profiler_disabled(); - case CUDA_ERROR_PROFILER_NOT_INITIALIZED: - throw profiler_not_initialized(); - case CUDA_ERROR_PROFILER_ALREADY_STARTED: - throw profiler_already_started(); - case CUDA_ERROR_PROFILER_ALREADY_STOPPED: - throw profiler_already_stopped(); - case CUDA_ERROR_NO_DEVICE: - throw no_device(); - case CUDA_ERROR_INVALID_DEVICE: - throw invalid_device(); - case CUDA_ERROR_INVALID_IMAGE: - throw invalid_image(); - case CUDA_ERROR_INVALID_CONTEXT: - throw invalid_context(); - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - throw context_already_current(); - case CUDA_ERROR_MAP_FAILED: - throw map_failed(); - case CUDA_ERROR_UNMAP_FAILED: - throw unmap_failed(); - case CUDA_ERROR_ARRAY_IS_MAPPED: - throw array_is_mapped(); - case CUDA_ERROR_ALREADY_MAPPED: - throw already_mapped(); - case CUDA_ERROR_NO_BINARY_FOR_GPU: - throw no_binary_for_gpu(); - case 
CUDA_ERROR_ALREADY_ACQUIRED: - throw already_acquired(); - case CUDA_ERROR_NOT_MAPPED: - throw not_mapped(); - case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: - throw not_mapped_as_array(); - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: - throw not_mapped_as_pointer(); - case CUDA_ERROR_ECC_UNCORRECTABLE: - throw ecc_uncorrectable(); - case CUDA_ERROR_UNSUPPORTED_LIMIT: - throw unsupported_limit(); - case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: - throw context_already_in_use(); - case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: - throw peer_access_unsupported(); - case CUDA_ERROR_INVALID_PTX: - throw invalid_ptx(); - case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: - throw invalid_graphics_context(); - case CUDA_ERROR_INVALID_SOURCE: - throw invalid_source(); - case CUDA_ERROR_FILE_NOT_FOUND: - throw file_not_found(); - case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: - throw shared_object_symbol_not_found(); - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: - throw shared_object_init_failed(); - case CUDA_ERROR_OPERATING_SYSTEM: - throw operating_system(); - case CUDA_ERROR_INVALID_HANDLE: - throw invalid_handle(); - case CUDA_ERROR_NOT_FOUND: - throw not_found(); - case CUDA_ERROR_NOT_READY: - throw not_ready(); - case CUDA_ERROR_ILLEGAL_ADDRESS: - throw illegal_address(); - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - throw launch_out_of_resources(); - case CUDA_ERROR_LAUNCH_TIMEOUT: - throw launch_timeout(); - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: - throw launch_incompatible_texturing(); - case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: - throw peer_access_already_enabled(); - case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: - throw peer_access_not_enabled(); - case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: - throw primary_context_active(); - case CUDA_ERROR_CONTEXT_IS_DESTROYED: - throw context_is_destroyed(); - case CUDA_ERROR_ASSERT: - throw assert_error(); - case CUDA_ERROR_TOO_MANY_PEERS: - throw too_many_peers(); - case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: - throw host_memory_already_registered(); - 
case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: - throw host_memory_not_registered(); - case CUDA_ERROR_HARDWARE_STACK_ERROR: - throw hardware_stack_error(); - case CUDA_ERROR_ILLEGAL_INSTRUCTION: - throw illegal_instruction(); - case CUDA_ERROR_MISALIGNED_ADDRESS: - throw misaligned_address(); - case CUDA_ERROR_INVALID_ADDRESS_SPACE: - throw invalid_address_space(); - case CUDA_ERROR_INVALID_PC: - throw invalid_pc(); - case CUDA_ERROR_LAUNCH_FAILED: - throw launch_failed(); - case CUDA_ERROR_NOT_PERMITTED: - throw not_permitted(); - case CUDA_ERROR_NOT_SUPPORTED: - throw not_supported(); - case CUDA_ERROR_UNKNOWN: - throw unknown(); - default: - throw unknown(); - } -} - -void check(hipError_t error) { - using namespace exception::hip; - switch (error) { - case hipSuccess: - break; - case hipErrorInvalidValue: - throw invalid_value(); - case hipErrorMemoryAllocation: - throw out_of_memory(); - case hipErrorNotInitialized: - throw not_initialized(); - case hipErrorDeinitialized: - throw deinitialized(); - case hipErrorProfilerDisabled: - throw profiler_disabled(); - case hipErrorProfilerNotInitialized: - throw profiler_not_initialized(); - case hipErrorProfilerAlreadyStarted: - throw profiler_already_started(); - case hipErrorProfilerAlreadyStopped: - throw profiler_already_stopped(); - case hipErrorNoDevice: - throw no_device(); - case hipErrorInvalidSymbol: - throw invalid_symbol(); - case hipErrorInvalidDevice: - throw invalid_device(); - case hipErrorInvalidImage: - throw invalid_image(); - case hipErrorInvalidContext: - throw invalid_context(); - case hipErrorContextAlreadyCurrent: - throw context_already_current(); - case hipErrorMapFailed: - throw map_failed(); - case hipErrorUnmapFailed: - throw unmap_failed(); - case hipErrorArrayIsMapped: - throw array_is_mapped(); - case hipErrorAlreadyMapped: - throw already_mapped(); - case hipErrorNoBinaryForGpu: - throw no_binary_for_gpu(); - case hipErrorAlreadyAcquired: - throw already_acquired(); - case 
hipErrorNotMapped: - throw not_mapped(); - case hipErrorNotMappedAsArray: - throw not_mapped_as_array(); - case hipErrorNotMappedAsPointer: - throw not_mapped_as_pointer(); - case hipErrorECCNotCorrectable: - throw ecc_uncorrectable(); - case hipErrorUnsupportedLimit: - throw unsupported_limit(); - case hipErrorContextAlreadyInUse: - throw context_already_in_use(); - case hipErrorPeerAccessUnsupported: - throw peer_access_unsupported(); - case hipErrorInvalidKernelFile: - throw invalid_ptx(); - case hipErrorInvalidGraphicsContext: - throw invalid_graphics_context(); - case hipErrorInvalidSource: - throw invalid_source(); - case hipErrorFileNotFound: - throw file_not_found(); - case hipErrorSharedObjectSymbolNotFound: - throw shared_object_symbol_not_found(); - case hipErrorSharedObjectInitFailed: - throw shared_object_init_failed(); - case hipErrorOperatingSystem: - throw operating_system(); - case hipErrorInvalidResourceHandle: - throw invalid_handle(); - case hipErrorNotFound: - throw not_found(); - case hipErrorNotReady: - throw not_ready(); - case hipErrorIllegalAddress: - throw illegal_address(); - case hipErrorLaunchOutOfResources: - throw launch_out_of_resources(); - case hipErrorLaunchTimeOut: - throw launch_timeout(); - // case hipErrorLaunchIncompatibleTexturing : throw - // launch_incompatible_texturing(); - case hipErrorPeerAccessAlreadyEnabled: - throw peer_access_already_enabled(); - case hipErrorPeerAccessNotEnabled: - throw peer_access_not_enabled(); - // case hipErrorPrimaryContextActive : throw primary_context_active(); - // case hipErrorContextIsDestroyed : throw context_is_destroyed(); - case hipErrorAssert: - throw assert_error(); - // case hipErrorTooManyPeers : throw too_many_peers(); - case hipErrorHostMemoryAlreadyRegistered: - throw host_memory_already_registered(); - case hipErrorHostMemoryNotRegistered: - throw host_memory_not_registered(); - // case hipErrorHardwareStackError : throw hardware_stack_error(); - // case 
hipErrorIllegalInstruction : throw illegal_instruction(); - // case hipErrorMisalignedAddress : throw misaligned_address(); - // case hipErrorInvalidAddressSpace : throw invalid_address_space(); - // case hipErrorInvalidPc : throw invalid_pc(); - case hipErrorLaunchFailure: - throw launch_failed(); - // case hipErrorNotPermitted : throw not_permitted(); - case hipErrorNotSupported: - throw not_supported(); - case hipErrorUnknown: - throw unknown(); - default: - throw unknown(); - } -} - -} // namespace driver -} // namespace triton diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc deleted file mode 100644 index 140eff6cd..000000000 --- a/lib/driver/llvm.cc +++ /dev/null @@ -1,392 +0,0 @@ -/* Copyright 2015-2017 Philippe Tillet - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files - * (the "Software"), to deal in the Software without restriction, - * including without limitation the rights to use, copy, modify, merge, - * publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, - * subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ -#include - -#if defined __has_include -#if __has_include() -#include -#endif -#endif - -#include "triton/driver/dispatch.h" -#include "triton/driver/error.h" -#include "triton/driver/llvm.h" -#include "triton/tools/sha1.hpp" -#include "triton/tools/sys/exec.hpp" -#include "triton/tools/sys/getenv.hpp" -#include "triton/tools/sys/mkdir.hpp" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include -#include - -// begin AMD stuff -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/FormattedStream.h" -#include "llvm/Support/Program.h" -#include "llvm/Support/ToolOutputFile.h" -// end AMD stuff - -extern "C" { -int set_curterm(char *nterm) { return 0; } -int del_curterm(char *nterm) { return 0; } -int tigetnum(char *capname) { return 0; } -int setupterm(char *term, int fildes, int *errret) { return 0; } -} - -namespace triton { -namespace driver { - -void init_llvm() { - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUAsmPrinter(); -} - -/* ------------------------ */ -// CUDA // -/* ------------------------ */ -static bool find_and_replace(std::string &str, const 
std::string &begin, - const std::string &end, - const std::string &target) { - size_t start_replace = str.find(begin); - if (start_replace == std::string::npos) - return false; - size_t end_replace = str.find(end, start_replace); - if (end_replace == std::string::npos) - return false; - str.replace(start_replace, end_replace + 1 - start_replace, target); - return true; -} - -std::string path_to_ptxas(int &version) { - std::vector rets; - std::string ret; - // search paths for ptxas - std::vector ptxas_prefixes = {"", "/usr/local/cuda/bin/"}; - std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH"); - if (!triton_ptxas.empty()) - ptxas_prefixes.insert(ptxas_prefixes.begin(), triton_ptxas); - // see what path for ptxas are valid - std::vector working_ptxas; - for (const std::string &prefix : ptxas_prefixes) { - std::string ptxas = prefix + "ptxas"; - bool works = tools::exec(ptxas + " --version 2>&1", ret) == 0; - if (works) { - working_ptxas.push_back(ptxas); - rets.push_back(ret); - } - } - // error if no working ptxas was found - if (working_ptxas.empty()) - throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, " - "/usr/local/cuda/bin/ or PATH" - " but a working version could not be found."); - std::string ptxas = working_ptxas.front(); - // parse version - std::regex version_regex("release (\\d+)\\.(\\d+)"); - std::smatch match; - bool found = false; - // currently choosing the first ptxas. 
Other logics can be implemented in - // future - size_t i = 0; - while (i < rets.size()) { - if (std::regex_search(rets[i], match, version_regex)) { - int major = std::stoi(match[1]); - int minor = std::stoi(match[2]); - version = major * 1000 + minor * 10; - found = true; - break; - } - ++i; - } - if (not found) { - throw std::runtime_error("Error in parsing version"); - } - return working_ptxas[i]; -} - -int vptx(int version) { - if (version >= 11040) - return 74; - if (version >= 11030) - return 73; - if (version >= 11020) - return 72; - if (version >= 11010) - return 71; - if (version >= 11000) - return 70; - if (version >= 10020) - return 65; - if (version >= 10010) - return 64; - if (version >= 10000) - return 63; - throw std::runtime_error("Triton requires CUDA 10+"); -} - -std::string llir_to_ptx(llvm::Module *module, int cc, int version) { - // LLVM version in use may not officially support target hardware - int max_nvvm_cc = 75; - int max_nvvm_ptx = 74; - // options - auto options = llvm::cl::getRegisteredOptions(); - auto *short_ptr = - static_cast *>(options["nvptx-short-ptr"]); - assert(short_ptr); - short_ptr->setValue(true); - // compute capability - std::string sm = "sm_" + std::to_string(cc); - // max PTX version - int ptx = vptx(version); - int ptx_major = ptx / 10; - int ptx_minor = ptx % 10; - // create - llvm::SmallVector buffer; - std::string triple = "nvptx64-nvidia-cuda"; - std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc)); - std::string layout = ""; - std::string features = ""; - // std::string features = "+ptx" + std::to_string(std::min(ptx, - // max_nvvm_ptx)); - init_llvm(); - // verify and store llvm - llvm::legacy::PassManager pm; - pm.add(llvm::createVerifierPass()); - pm.run(*module); - // module->print(llvm::outs(), nullptr); - - // create machine - module->setTargetTriple(triple); - std::string error; - auto target = - llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetOptions 
opt; - opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; - opt.UnsafeFPMath = false; - opt.NoInfsFPMath = false; - opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine( - module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, - llvm::None, llvm::CodeGenOpt::Aggressive); - // set data layout - if (layout.empty()) - module->setDataLayout(machine->createDataLayout()); - else - module->setDataLayout(layout); - // emit machine code - for (llvm::Function &f : module->functions()) - f.addFnAttr(llvm::Attribute::AlwaysInline); - llvm::legacy::PassManager pass; - llvm::raw_svector_ostream stream(buffer); - // emit - machine->addPassesToEmitFile(pass, stream, nullptr, - llvm::CodeGenFileType::CGFT_AssemblyFile); - pass.run(*module); - - // post-process - std::string result(buffer.begin(), buffer.end()); - find_and_replace(result, ".version", "\n", - ".version " + std::to_string(ptx_major) + "." + - std::to_string(ptx_minor) + "\n"); - find_and_replace(result, ".target", "\n", ".target " + sm + "\n"); - while (find_and_replace(result, "\t// begin inline asm", "\n", "")) - ; - while (find_and_replace(result, "\t// end inline asm", "\n", "")) - ; - return result; -} - -std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas, - int cc) { - // compile ptx with ptxas - char _fsrc[L_tmpnam]; - char _flog[L_tmpnam]; - std::tmpnam(_fsrc); - std::tmpnam(_flog); - std::string fsrc = _fsrc; - std::string flog = _flog; - std::string fbin = fsrc + ".o"; - const char *_fbin = fbin.c_str(); - std::ofstream ofs(fsrc); - ofs << ptx << std::endl; - ofs.close(); - std::string cmd; - int err; - cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + - " -o " + fsrc + ".o 2> " + flog; - err = system(cmd.c_str()); - if (err != 0) { - std::ifstream _log(_flog); - std::string log(std::istreambuf_iterator(_log), {}); - unlink(_fsrc); - unlink(_flog); - throw std::runtime_error("Internal Triton PTX codegen error: \n" + log); - } 
- CUmodule ret; - std::ifstream _cubin(_fbin, std::ios::binary); - std::string cubin(std::istreambuf_iterator(_cubin), {}); - _cubin.close(); - unlink(_fsrc); - unlink(_flog); - unlink(_fbin); - dispatch::cuModuleLoadData(&ret, cubin.c_str()); - return cubin; -} - -/* ------------------------ */ -// HIP // -/* ------------------------ */ - -std::string llir_to_amdgpu(llvm::Module *module, const std::string &_proc) { - init_llvm(); - - // proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo)); - // features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo)); - - // create - llvm::SmallVector buffer; - std::string triple = "amdgcn-amd-amdhsa"; - std::string layout = ""; - std::string features; - std::string proc = "gfx908"; - // verify and store llvm - llvm::legacy::PassManager pm; - pm.add(llvm::createVerifierPass()); - pm.run(*module); - // create machine - module->setTargetTriple(triple); - std::string error; - auto target = - llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetOptions opt; - opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; - opt.UnsafeFPMath = false; - opt.NoInfsFPMath = false; - opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine( - module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, - llvm::None, llvm::CodeGenOpt::Aggressive); - // set data layout - if (layout.empty()) - module->setDataLayout(machine->createDataLayout()); - else - module->setDataLayout(layout); - // emit machine code - for (llvm::Function &f : module->functions()) - f.addFnAttr(llvm::Attribute::AlwaysInline); - llvm::legacy::PassManager pass; - llvm::raw_svector_ostream stream(buffer); - - // create dump files - std::string module_name = module->getModuleIdentifier(); - std::error_code ec; - - // Save GCN ISA binary. 
- std::string isabin_path = - std::string("/tmp/") + module_name + std::string(".o"); - std::unique_ptr isabin_fs( - new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text)); - if (ec) { - std::cout << isabin_path << " was not created. error code: " << ec - << std::endl; - } - - // emit - machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, - llvm::CGFT_ObjectFile); - pass.run(*module); - // Save GCN ISA. - std::string amdgcn_path = - std::string("/tmp/") + module_name + std::string(".gcn"); - std::string result(buffer.begin(), buffer.end()); - std::ofstream amdgcn(amdgcn_path); - amdgcn << result; - amdgcn.close(); - - // generate HASCO file - std::string hsaco_path = - std::string("/tmp/") + module_name + std::string(".hsaco"); - std::string error_message; - int lld_result = - llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld", - {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", - "-shared", "-o", hsaco_path, isabin_path}, - llvm::None, {}, 0, 0, &error_message); - if (lld_result) { - std::cout << "ld.lld execute fail: " << std::endl; - std::cout << error_message << std::endl; - std::cout << lld_result << std::endl; - } - - return hsaco_path; -} - -hipModule_t amdgpu_to_hipmodule(const std::string &path) { - // Read HSACO. 
- std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate); - std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg(); - - std::vector hsaco(hsaco_file_size); - hsaco_file.seekg(0, std::ios::beg); - hsaco_file.read(reinterpret_cast(&hsaco[0]), hsaco_file_size); - hsaco_file.close(); - hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose}; - const unsigned int errbufsize = 8192; - const unsigned int logbufsize = 8192; - char _err[errbufsize]; - char _log[logbufsize]; - void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err, - (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1}; - hipModule_t ret; - dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval); - return ret; -} - -} // namespace driver -} // namespace triton diff --git a/python/src/triton.cc b/python/src/triton.cc index 52dffd1ae..424c2a28e 100644 --- a/python/src/triton.cc +++ b/python/src/triton.cc @@ -1,7 +1,4 @@ -#include "triton/driver/error.h" -#include "triton/driver/llvm.h" - -#include "mlir/IR/Builders.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" @@ -10,6 +7,9 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" +#include "mlir/Parser.h" +#include "mlir/Support/FileUtilities.h" + #include "triton/Analysis/Allocation.h" #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h" #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h" @@ -24,10 +24,14 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SourceMgr.h" + #include #include +#include #include #include #include @@ -40,10 +44,6 @@ #include namespace py = pybind11; -// namespace ir = triton::ir; -namespace drv = 
triton::driver; - -using triton::cuGetInfo; enum backend_t { HOST, @@ -51,306 +51,6 @@ enum backend_t { ROCM, }; -void cu_enable_peer_access(uint64_t peer_ptr) { - CUcontext context; - drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, - peer_ptr); - try { - drv::dispatch::cuCtxEnablePeerAccess(context, 0); - } catch (drv::exception::cuda::peer_access_already_enabled) { - } -} - -void host_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0, - uint64_t grid_1, uint64_t grid_2, uint64_t block_0, - uint64_t block_1, uint64_t block_2, void *args_ptr, - size_t args_size, int64_t shared_mem) { - throw std::runtime_error("unsupported"); - // auto hst = kernel->module()->hst(); - // hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]); - // char* params = new char[args_size]; - // std::memcpy((void*)params, (void*)args, args_size); - // for(size_t i = 0; i < grid[0]; i++) - // for(size_t j = 0; j < grid[1]; j++) - // for(size_t k = 0; k < grid[2]; k++) - // hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, - // (char**)params, int32_t(i), int32_t(j), int32_t(k))); -} - -void cu_enqueue(uint64_t stream, uint64_t kernel, uint64_t grid_0, - uint64_t grid_1, uint64_t grid_2, uint64_t block_0, - uint64_t block_1, uint64_t block_2, void *args_ptr, - size_t args_size, int64_t shared_mem) { - void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, (void *)args_ptr, - CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, - CU_LAUNCH_PARAM_END}; - drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, - block_0, block_1, block_2, shared_mem, - (CUstream)stream, nullptr, config); -} - -long pow2_divisor(long N) { - if (N % 16 == 0) - return 16; - if (N % 8 == 0) - return 8; - if (N % 4 == 0) - return 4; - if (N % 2 == 0) - return 2; - return 1; -} - -// Returns something like "int16", whether dtype is a torch.dtype or -// triton.language.dtype. 
-std::string dtype_cache_key_part(const py::object &dtype) { - if (py::hasattr(dtype, "cache_key_part")) { - // Presumed to be a triton.language.dtype. - return std::string(py::str(py::getattr(dtype, "cache_key_part"))); - } else { - // Remove 'torch.' prefix from repr of torch.dtype. - py::object repr = py::repr(dtype); - size_t repr_len = PyUnicode_GET_LENGTH(repr.ptr()); - const char *repr_ptr = (const char *)PyUnicode_1BYTE_DATA(repr.ptr()); - if (repr_len <= 6 || strncmp(repr_ptr, "torch.", 6)) { - throw std::logic_error("invalid dtype: " + - std::string(repr_ptr, repr_len)); - } - return std::string(repr_ptr + 6, repr_len - 6); - } -} - -size_t get_pointer_range_size(uint64_t addr) { - if (addr == 0) - return 0; - size_t size; - drv::dispatch::cuPointerGetAttribute(&size, CU_POINTER_ATTRIBUTE_RANGE_SIZE, - (CUdeviceptr)addr); - return size; -} - -// Launch -void parse_args(py::list &args, py::list do_not_specialize, - const std::string &func_key, py::list &arg_names, - std::string &cache_key, std::string ¶ms, - size_t ¶ms_size, py::dict constants, int num_warps, - int num_stages) { - size_t len = PyList_Size(args.ptr()); - params.reserve(8 * len); // 8 max bytes by argument - char *params_ptr = ¶ms[0]; - cache_key = func_key; - cache_key += "-" + std::to_string(num_warps); - cache_key += "-" + std::to_string(num_stages); - cache_key += "-"; - for (int i = 0; i < len; i++) { - cache_key += "_"; - py::int_ py_i = py::int_(i); - bool specialize = !do_not_specialize.contains(py_i); - py::object arg = args[i]; - auto arg_ptr = arg.ptr(); - - // argument is `long` - if (PyLong_Check(arg_ptr)) { - int overflow; - long long value = PyLong_AsLongLongAndOverflow(arg_ptr, &overflow); - // values equal to 1 are specialized - if (specialize && (value == 1)) { - cache_key += "1"; - continue; - } - // int32, uint32, int64, and uint64 have different kernels - if (!overflow && -0x8000'0000LL <= value && value <= 0x7FFF'FFFFLL) { - cache_key += "int32"; - params_ptr = (char 
*)(((uintptr_t)params_ptr + 3) & (-4)); - std::memcpy(params_ptr, &value, 4); - params_ptr += 4; - } else if (!overflow && 0x8000'0000LL <= value && - value <= 0xFFFF'FFFFLL) { - cache_key += "uint32"; - params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4)); - std::memcpy(params_ptr, &value, 4); - params_ptr += 4; - } else if (!overflow) { - cache_key += "int64"; - params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8)); - std::memcpy(params_ptr, &value, 8); - params_ptr += 8; - } else { - if (PyErr_Occurred()) { - throw std::logic_error("An error occurred?"); - } - unsigned long long unsigned_value = PyLong_AsUnsignedLongLong(arg_ptr); - if (PyErr_Occurred()) { - throw std::runtime_error("integer overflow in argument: " + - std::string(py::str(arg))); - } - cache_key += "uint64"; - params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8)); - std::memcpy(params_ptr, &unsigned_value, 8); - params_ptr += 8; - } - if (!specialize) - continue; - // values divisible by small powers of 2 are specialized - cache_key += "[multipleof("; - cache_key += std::to_string(pow2_divisor(value)); - cache_key += ")]"; - continue; - } - // argument is `float` - if (PyFloat_Check(arg_ptr)) { - cache_key += "float32"; - float value = PyFloat_AsDouble(arg_ptr); - params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4)); - std::memcpy(params_ptr, &value, 4); - params_ptr += 4; - continue; - } - // argument is `bool` - if (PyBool_Check(arg_ptr)) { - cache_key += "bool"; - bool value = arg_ptr == Py_True ? 
true : false; - std::memcpy(params_ptr, &value, 1); - params_ptr += 1; - continue; - } - // argument is tensor - if (py::hasattr(arg, "data_ptr")) { - py::object data_ptr = arg.attr("data_ptr")(); - long value = data_ptr.cast(); - params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8)); - // copy param - std::memcpy(params_ptr, &value, 8); - params_ptr += 8; - // update cache key - cache_key += dtype_cache_key_part(arg.attr("dtype")); - cache_key += "*"; - cache_key += "[multipleof("; - size_t range_size = get_pointer_range_size(value); - cache_key += std::to_string( - std::min(pow2_divisor(value), pow2_divisor(range_size))); - cache_key += ")]"; - continue; - } - // argument is `constexpr` - if (py::hasattr(arg, "value")) { - py::object value = arg.attr("value"); - py::object name = arg_names[i]; - constants[name] = value; - py::object repr = py::repr(value); - const char *start = (const char *)PyUnicode_1BYTE_DATA(repr.ptr()); - size_t len = PyUnicode_GET_LENGTH(repr.ptr()); - cache_key += std::string(start, len); - continue; - } - std::string ty_str = - arg.attr("__class__").attr("__name__").cast(); - if (ty_str == "NoneType") { - cache_key += "None"; - continue; - } - std::string err_msg = "Received type '" + ty_str + "' for argument " + - std::to_string(i) + "." 
+ - " Only int, float, bool, torch.Tensor, and " - "triton.language.constexpr are supported."; - throw std::runtime_error(err_msg); - } - params_size = (std::ptrdiff_t)(params_ptr - ¶ms[0]); -} - -void parse_args(py::list &args, py::list &arg_names, std::string ¶ms, - size_t ¶ms_size, py::dict constants) { - size_t len = PyList_Size(args.ptr()); - params.reserve(8 * len); // 8 max bytes by argument - char *params_ptr = params.data(); - for (int i = 0; i < len; i++) { - py::object arg = args[i]; - auto arg_ptr = arg.ptr(); - - if (PyLong_Check(arg_ptr)) { - int overflow{}; - long long value = PyLong_AsLongLongAndOverflow(arg_ptr, &overflow); - - if (!overflow && -0x8000'0000LL <= value && value <= 0x7FFF'FFFFLL) { - params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4)); - std::memcpy(params_ptr, &value, 4); - params_ptr += 4; - } else if (!overflow && 0x8000'0000LL <= value && - value <= 0xFFFF'FFFFLL) { - params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4)); - std::memcpy(params_ptr, &value, 4); - params_ptr += 4; - } else if (!overflow) { - params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8)); - std::memcpy(params_ptr, &value, 8); - params_ptr += 8; - } else { - if (PyErr_Occurred()) { - throw std::logic_error("An error occurred?"); - } - unsigned long long unsigned_value = PyLong_AsUnsignedLongLong(arg_ptr); - if (PyErr_Occurred()) { - throw std::runtime_error("integer overflow in argument: " + - std::string(py::str(arg))); - } - params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8)); - std::memcpy(params_ptr, &unsigned_value, 8); - params_ptr += 8; - } - continue; - } - - if (PyFloat_Check(arg_ptr)) { - float value = PyFloat_AsDouble(arg_ptr); - params_ptr = (char *)(((uintptr_t)params_ptr + 3) & (-4)); - std::memcpy(params_ptr, &value, 4); - params_ptr += 4; - continue; - } - - // argument is `bool` - if (PyBool_Check(arg_ptr)) { - bool value = arg_ptr == Py_True ? 
true : false; - std::memcpy(params_ptr, &value, 1); - params_ptr += 1; - continue; - } - // argument is torch.tensor, get data_ptr as memory address - if (py::hasattr(arg, "data_ptr")) { - py::object data_ptr = arg.attr("data_ptr")(); - long value = data_ptr.cast(); - params_ptr = (char *)(((uintptr_t)params_ptr + 7) & (-8)); - // copy param - std::memcpy(params_ptr, &value, 8); - params_ptr += 8; - // update cache key - continue; - } - // argument is `constexpr` - if (py::hasattr(arg, "value")) { - py::object value = arg.attr("value"); - py::object name = arg_names[i]; - constants[name] = value; - continue; - } - // argument is `LoadedBinary` - if (py::hasattr(arg, "get_sass")) { - // Do nothing, just a placeholder here to indicate validity. - continue; - } - - std::string ty_str = - arg.attr("__class__").attr("__name__").cast(); - std::string err_msg = "Received type '" + ty_str + "' for argument " + - std::to_string(i) + "." + - " Only int, float, bool, torch.Tensor, and " - "triton.language.constexpr are supported."; - throw std::runtime_error(err_msg); - } - - params_size = (std::ptrdiff_t)(params_ptr - ¶ms[0]); -} - void init_triton_runtime(py::module &&m) { // wrap backend_t py::enum_(m, "backend") @@ -358,192 +58,8 @@ void init_triton_runtime(py::module &&m) { .value("CUDA", CUDA) // .value("ROCM", ROCM) .export_values(); - - // enable peer-to-peer - m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) { - if (backend != CUDA) - throw std::runtime_error("P2P only supported on CUDA devices!"); - cu_enable_peer_access(peer_ptr); - }); - - // get range size for the given pointer - m.def("get_pointer_range_size", &get_pointer_range_size); - - // cache key - m.def("launch", [](py::list args, py::list do_not_specialize, - const std::string &func_key, py::list &arg_names, - py::object device, py::int_ stream, py::dict bin_cache, - py::int_ num_warps, py::int_ num_stages, - py::function add_to_cache, py::object grid) { - // parse arguments to 
compute cache key, compile-time constants and packed - // kernel arguments - long _num_warps = PyLong_AsLong(num_warps.ptr()); - long _num_stages = PyLong_AsLong(num_stages.ptr()); - std::string cache_key; - std::string params; - size_t params_size; - py::dict constants; - parse_args(args, do_not_specialize, func_key, arg_names, cache_key, params, - params_size, constants, _num_warps, _num_stages); - - // get cached binary - py::str key(cache_key); - py::bool_ noop = false; - if (!bin_cache.contains(key)) { - noop = add_to_cache(key, args, device, num_warps, num_stages); - } - if (noop) - return (py::object)py::none(); - py::object bin = bin_cache[key]; - - // get grid - py::sequence seq; - if (!PySequence_Check(grid.ptr())) - seq = grid(constants); - else - seq = grid; - int size = seq.size(); - int grid_0 = py::cast(seq[0]); - int grid_1 = size < 2 ? 1 : py::cast(seq[1]); - int grid_2 = size < 3 ? 1 : py::cast(seq[2]); - - // enqueue - uint64_t kernel = py::cast(bin.attr("kernel")); - uint64_t shared_mem = py::cast(bin.attr("shared_mem")); - - // actually launch - void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, params.data(), - CU_LAUNCH_PARAM_BUFFER_SIZE, ¶ms_size, - CU_LAUNCH_PARAM_END}; - uint64_t _stream = PyLong_AsLong(stream.ptr()); - if (grid_0 * grid_1 * grid_2 > 0) { - // release the gil in case the enqueue blocks - // cuda will block if too many ops are enqueued - py::gil_scoped_release allow_threads; - drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, - _num_warps * 32, 1, 1, shared_mem, - (CUstream)_stream, nullptr, config); - } - return bin; - }); - - m.def("cc", [](backend_t backend, uint64_t device) -> int { - if (backend == CUDA) { - CUdevice dev = (CUdevice)device; - int major = cuGetInfo(dev); - int minor = cuGetInfo(dev); - return major * 10 + minor; - } - return -1; - }); - - m.def("launch_binary", [](py::object binary, py::list args, - py::list do_not_specialize, py::list arg_names, - py::int_ stream, py::int_ 
num_warps, - py::int_ num_stages, py::object grid) { - long _num_warps = PyLong_AsLong(num_warps.ptr()); - long _num_stages = PyLong_AsLong(num_stages.ptr()); - - // get grid - py::sequence seq; - py::dict constants; - std::string params; - size_t params_size{}; - parse_args(args, arg_names, params, params_size, constants); - if (!PySequence_Check(grid.ptr())) - seq = grid(constants); - else - seq = grid; - - int size = seq.size(); - int grid_0 = py::cast(seq[0]); - int grid_1 = size < 2 ? 1 : py::cast(seq[1]); - int grid_2 = size < 3 ? 1 : py::cast(seq[2]); - - uint64_t kernel = py::cast(binary.attr("kernel")); - uint64_t shared_mem = py::cast(binary.attr("shared_mem")); - - // actually launch - void *config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, params.data(), - CU_LAUNCH_PARAM_BUFFER_SIZE, ¶ms_size, - CU_LAUNCH_PARAM_END}; - uint64_t _stream = PyLong_AsLong(stream.ptr()); - const int numGrids = grid_0 * grid_1 * grid_2; - if (numGrids) { - // release the gil in case the enqueue blocks - // cuda will block if too many ops are enqueued - py::gil_scoped_release allow_threads; - drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2, - _num_warps * 32, 1, 1, shared_mem, - (CUstream)_stream, nullptr, config); - } - return binary; - }); - - // query maximum shared memory - m.def("max_shared_memory", [](backend_t backend, uint64_t device) { - if (backend == HOST) - return 0; - if (backend == CUDA) - return cuGetInfo( - device); - return -1; - }); - - // query DRAM & L2 cache - m.def("memory_clock_rate", [](backend_t backend, uint64_t device) { - if (backend == CUDA) - return cuGetInfo(device); - return -1; - }); - m.def("global_memory_bus_width", [](backend_t backend, uint64_t device) { - if (backend == CUDA) - return cuGetInfo(device); - return -1; - }); - m.def("l2_cache_size", [](backend_t backend, uint64_t device) { - if (backend == CUDA) - return cuGetInfo(device); - return -1; - }); - - // query clock rate (in kilohertz) - m.def("clock_rate", 
[](backend_t backend, uint64_t device) { - if (backend == CUDA) - return cuGetInfo(device); - return -1; - }); - - m.def("num_sm", [](backend_t backend, uint64_t device) { - if (backend == CUDA) - return cuGetInfo(device); - return -1; - }); - - // enqueue - m.def("enqueue", - [](backend_t backend, uint64_t stream, uint64_t kernel, uint64_t grid_0, - uint64_t grid_1, uint64_t grid_2, uint64_t block_0, uint64_t block_1, - uint64_t block_2, const std::string &args, int64_t shared_mem) { - void *args_ptr = (void *)args.data(); - size_t args_size = args.size(); - // release the gil in case the enqueue blocks - // cuda will block if too many ops are enqueued - py::gil_scoped_release allow_threads; - if (backend == HOST) - host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, - block_1, block_2, args_ptr, args_size, shared_mem); - if (backend == CUDA) - cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, - block_2, args_ptr, args_size, shared_mem); - }); } -/*****************************************************************************/ -/* Python bindings for triton::codegen */ -/*****************************************************************************/ -typedef std::map asm_map_t; - /*****************************************************************************/ /* Python bindings for triton::ir */ /*****************************************************************************/ @@ -783,6 +299,38 @@ void init_triton_ir(py::module &&m) { return self.lookupSymbol(funcName); }); + m.def( + "parse_mlir_module", + [](const std::string &inputFilename, mlir::MLIRContext &context) { + // open file + std::string errorMessage; + auto input = mlir::openInputFile(inputFilename, &errorMessage); + if (!input) + throw std::runtime_error(errorMessage); + + // initialize registry + mlir::DialectRegistry registry; + registry.insert(); + + context.appendDialectRegistry(registry); + context.loadAllAvailableDialects(); + context.allowUnregisteredDialects(); + + // 
parse module + llvm::SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(std::move(input), llvm::SMLoc()); + mlir::OwningOpRef module( + mlir::parseSourceFile(sourceMgr, &context)); + if (!module) + throw std::runtime_error("Parse MLIR file failed."); + + return module->clone(); + }, + ret::take_ownership); + py::class_(m, "function") // .def_property_readonly("attrs", &ir::function::attrs) // .def("add_attr", &ir::function::add_attr); @@ -1643,84 +1191,86 @@ void init_triton_ir(py::module &&m) { } void init_triton_translation(py::module &m) { - m.def("translate_triton_gpu_to_llvmir", [](mlir::ModuleOp op) -> std::string { - llvm::LLVMContext llvmContext; - auto llvmModule = - ::mlir::triton::translateTritonGPUToLLVMIR(&llvmContext, op); - std::string str; - llvm::raw_string_ostream os(str); - llvmModule->print(os, nullptr); - os.flush(); - return str; + using ret = py::return_value_policy; + + m.def("get_shared_memory_size", [](mlir::ModuleOp module) { + auto pass = std::make_unique(module); + return pass->getSharedMemorySize(); }); - m.def("translate_triton_gpu_to_ptx", - [](mlir::ModuleOp module, uint64_t device) - -> std::tuple { - auto [ptxCode, cc, version, ptxasPath] = - triton::translateTritonGPUToPTX(module, device); + m.def( + "translate_triton_gpu_to_llvmir", + [](mlir::ModuleOp op) { + llvm::LLVMContext llvmContext; + auto llvmModule = + ::mlir::triton::translateTritonGPUToLLVMIR(&llvmContext, op); - mlir::PassManager pm(module->getContext()); - auto pass = std::make_unique(module); - size_t size = pass->getSharedMemorySize(); + std::string str; + llvm::raw_string_ostream os(str); + llvmModule->print(os, nullptr); + os.flush(); + return str; + }, + ret::take_ownership); - return std::make_tuple(ptxCode, size); - }); + m.def( + "translate_llvmir_to_ptx", + [](const std::string llvmIR, int capability, int version) -> std::string { + // create LLVM module from C++ + llvm::LLVMContext context; + std::unique_ptr buffer = + 
llvm::MemoryBuffer::getMemBuffer(llvmIR.c_str()); + llvm::SMDiagnostic error; + std::unique_ptr module = + llvm::parseIR(buffer->getMemBufferRef(), error, context); + // translate module to PTX + auto ptxCode = + triton::translateLLVMIRToPTX(*module, capability, version); + return ptxCode; + }, + ret::take_ownership); m.def("compile_ptx_to_cubin", - [](const std::string &ptxCode, uint64_t device) -> py::object { + [](const std::string &ptxCode, const std::string &ptxasPath, + int capability) -> py::object { py::gil_scoped_release allow_threads; - int version; - int cc; - std::string ptxasPath; - triton::getCuCCAndVersionFromDevice(device, &cc, &version, - &ptxasPath); - std::string cubin = drv::ptx_to_cubin(ptxCode, ptxasPath, cc); + // compile ptx with ptxas + char _fsrc[L_tmpnam]; + char _flog[L_tmpnam]; + std::tmpnam(_fsrc); + std::tmpnam(_flog); + std::string fsrc = _fsrc; + std::string flog = _flog; + std::string fbin = fsrc + ".o"; + const char *_fbin = fbin.c_str(); + std::ofstream ofs(fsrc); + ofs << ptxCode << std::endl; + ofs.close(); + std::string cmd; + int err; + cmd = ptxasPath + " -v --gpu-name=sm_" + std::to_string(capability) + + " " + fsrc + " -o " + fsrc + ".o 2> " + flog; + err = system(cmd.c_str()); + if (err != 0) { + std::ifstream _log(_flog); + std::string log(std::istreambuf_iterator(_log), {}); + unlink(_fsrc); + unlink(_flog); + throw std::runtime_error("Internal Triton PTX codegen error: \n" + + log); + } + std::ifstream _cubin(_fbin, std::ios::binary); + std::string cubin(std::istreambuf_iterator(_cubin), {}); + _cubin.close(); + unlink(_fsrc); + unlink(_flog); + unlink(_fbin); + py::bytes bytes(cubin); return bytes; }); - - m.def( - "load_binary", - [](const std::string &name, const std::string &data, - size_t n_shared_bytes, uint64_t device) { - py::gil_scoped_release allow_threads; - // create driver handles - CUfunction fun; - CUmodule mod; - drv::dispatch::cuModuleLoadData(&mod, data.c_str()); - 
drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str()); - // get allocated registers and spilled registers from the function - int n_regs = 0; - int n_spills = 0; - drv::dispatch::cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, - fun); - drv::dispatch::cuFuncGetAttribute( - &n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun); - n_spills /= 4; - // set dynamic shared memory if necessary - int shared_optin; - drv::dispatch::cuDeviceGetAttribute( - &shared_optin, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device); - if (n_shared_bytes > 49152 && shared_optin > 49152) { - drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED); - int shared_total, shared_static; - drv::dispatch::cuDeviceGetAttribute( - &shared_total, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device); - drv::dispatch::cuFuncGetAttribute( - &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun); - drv::dispatch::cuFuncSetAttribute( - fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - shared_optin - shared_static); - } - return std::make_tuple((uint64_t)mod, (uint64_t)fun, (uint64_t)n_regs, - (uint64_t)n_spills); - }, - py::return_value_policy::take_ownership); } void init_triton(py::module &m) { diff --git a/python/triton/compiler.py b/python/triton/compiler.py index a97252f75..e7d0b1318 100644 --- a/python/triton/compiler.py +++ b/python/triton/compiler.py @@ -7,6 +7,7 @@ import hashlib import io import json import os +import re import shutil import subprocess import sys @@ -843,7 +844,11 @@ def optimize_tritongpu_ir(mod, num_stages): return mod -def make_ptx(mod: Any, device: int) -> Tuple[str, int]: +def make_llvm_ir(mod): + return _triton.translate_triton_gpu_to_llvmir(mod) + + +def make_ptx(mod: Any, compute_capability: int, ptx_version: int) -> Tuple[str, int]: ''' Translate TritonGPU module to PTX code. 
:param mod: a TritonGPU dialect module @@ -851,17 +856,17 @@ def make_ptx(mod: Any, device: int) -> Tuple[str, int]: - PTX code - shared memory alloaction size ''' - return _triton.translate_triton_gpu_to_ptx(mod, device) + return _triton.translate_llvmir_to_ptx(mod, compute_capability, ptx_version) -def make_cubin(ptx, device): +def make_cubin(ptx: str, ptxas: str, compute_capability: int): ''' Compile TritonGPU module to cubin. :param ptx: ptx code :param device: CUDA device :return: str ''' - return _triton.compile_ptx_to_cubin(ptx, device) + return _triton.compile_ptx_to_cubin(ptx, ptxas, compute_capability) def ptx_get_kernel_name(ptx: str) -> str: @@ -877,6 +882,46 @@ def ptx_get_kernel_name(ptx: str) -> str: return line.split()[-1] +@functools.lru_cache +def ptx_get_version(cuda_version) -> int: + ''' + Get the highest PTX version supported by the current CUDA driver. + ''' + assert isinstance(cuda_version, str) + major, minor = map(int, cuda_version.split('.')) + version = major * 1000 + minor * 10 + if version >= 11040: + return 74 + if version >= 11030: + return 73 + if version >= 11020: + return 72 + if version >= 11010: + return 71 + if version >= 11000: + return 70 + if version >= 10020: + return 65 + if version >= 10010: + return 64 + if version >= 10000: + return 63 + raise RuntimeError("Triton only support CUDA 10.0 or higher") + + +def path_to_ptxas(): + prefixes = [os.environ.get("TRITON_PTXAS_PATH", ""), "", "/usr/local/cuda/"] + for prefix in prefixes: + ptxas = os.path.join(prefix, "bin", "ptxas") + if os.path.exists(ptxas): + result = subprocess.check_output([ptxas, "--version"], stderr=subprocess.STDOUT) + if result is not None: + version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE) + if version is not None: + return ptxas, version.group(1) + raise RuntimeError("Cannot find ptxas") + + instance_descriptor = namedtuple("instance_descriptor", ["divisible_by_16", "equal_to_1"], defaults=[set(), set()]) @@ 
-895,17 +940,24 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(), specializat # tritongpu-ir module = make_tritongpu_ir(module, num_warps) module = optimize_tritongpu_ir(module, num_stages) - if output == "ttgir": return module.str() + # llvm-ir + llvm_ir = make_llvm_ir(module) + assert device >= 0, "device should be provided." - ptx, shem_size = make_ptx(module, device) + ptxas, cuda_version = path_to_ptxas() + compute_capability = torch.cuda.get_device_capability(device) + compute_capability = compute_capability[0] * 10 + compute_capability[1] + ptx_version = ptx_get_version(cuda_version) + ptx = make_ptx(llvm_ir, compute_capability, ptx_version) + shem_size = _triton.get_shared_memory_size(module) kernel_name = ptx_get_kernel_name(ptx) if output == "ptx": return ptx, shem_size, kernel_name - cubin = make_cubin(ptx, device) + cubin = make_cubin(ptx, ptxas, compute_capability) if output == "cubin": return cubin, ptx, shem_size, kernel_name @@ -980,6 +1032,7 @@ def generate_launcher(identifier, constants, signature): src = f""" #include \"cuda.h\" #include + static inline void gpuAssert(CUresult code, const char *file, int line) {{ if (code != CUDA_SUCCESS) @@ -993,13 +1046,16 @@ static inline void gpuAssert(CUresult code, const char *file, int line) PyErr_SetString(PyExc_RuntimeError, err); }} }} + #define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }} + void _launch(int gridX, int gridY, int gridZ, int num_warps, int shared_memory, CUstream stream, CUfunction function, {arg_decls}) {{ void *params[] = {{ {', '.join(f"&arg{i}" for i in signature.keys() if i not in constants)} }}; if(gridX*gridY*gridZ > 0){{ CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0)); }} }} + static inline CUdeviceptr getPointer(PyObject *obj, int idx) {{ if (PyLong_Check(obj)) {{ return (CUdeviceptr)PyLong_AsUnsignedLongLong(obj); @@ -1021,6 +1077,7 @@ static inline CUdeviceptr 
getPointer(PyObject *obj, int idx) {{ PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method"); return (CUdeviceptr)0; }} + static PyObject* launch(PyObject* self, PyObject* args) {{ int gridX, gridY, gridZ; uint64_t _stream; @@ -1039,10 +1096,12 @@ static PyObject* launch(PyObject* self, PyObject* args) {{ Py_INCREF(Py_None); return Py_None; }} + static PyMethodDef ModuleMethods[] = {{ {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}}, {{NULL, NULL, 0, NULL}} // sentinel }}; + static struct PyModuleDef ModuleDef = {{ PyModuleDef_HEAD_INIT, \"launcher\", @@ -1050,6 +1109,7 @@ static struct PyModuleDef ModuleDef = {{ -1, //size ModuleMethods }}; + PyMODINIT_FUNC PyInit_launcher(void) {{ PyObject *m = PyModule_Create(&ModuleDef); if(m == NULL) {{ @@ -1251,7 +1311,10 @@ class CompiledKernel: self.asm["ptx"] = f.read() device = torch.cuda.current_device() - mod, func, n_regs, n_spills = _triton.load_binary(metadata["name"], self.asm["cubin"], self.shared, device) + global cuda_utils + if cuda_utils is None: + cuda_utils = CudaUtils() + mod, func, n_regs, n_spills = cuda_utils.load_binary(metadata["name"], self.asm["cubin"], self.shared, device) self.cu_module = mod self.cu_function = func @@ -1261,3 +1324,118 @@ class CompiledKernel: stream = torch.cuda.current_stream().cuda_stream self.c_wrapper(grid[0], grid[1], grid[2], self.num_warps, self.shared, stream, self.cu_function, *args) return + + +class CudaUtils(object): + + def __new__(cls): + if not hasattr(cls, 'instance'): + cls.instance = super(CudaUtils, cls).__new__(cls) + return cls.instance + + def _generate_src(self): + return """ + #include + + #include \"cuda.h\" + #include + + static inline void gpuAssert(CUresult code, const char *file, int line) + { + if (code != CUDA_SUCCESS) + { + const char* prefix = "Triton Error [CUDA]: "; + const char* str; + cuGetErrorString(code, &str); + char err[1024] = {0}; + strcat(err, prefix); + 
strcat(err, str); + PyErr_SetString(PyExc_RuntimeError, err); + } + } + + #define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); } + + static PyObject* loadBinary(PyObject* self, PyObject* args) { + const char* name; + const char* data; + Py_ssize_t data_size; + int shared; + int device; + if(!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared, &device)) { + return NULL; + } + CUfunction fun; + CUmodule mod; + int32_t n_regs = 0; + int32_t n_spills = 0; + Py_BEGIN_ALLOW_THREADS; + // create driver handles + CUDA_CHECK(cuModuleLoadData(&mod, data)); + CUDA_CHECK(cuModuleGetFunction(&fun, mod, name)); + // get allocated registers and spilled registers from the function + CUDA_CHECK(cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun)); + CUDA_CHECK(cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun)); + n_spills /= 4; + // set dynamic shared memory if necessary + int shared_optin; + CUDA_CHECK(cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); + if (shared > 49152 && shared_optin > 49152) { + CUDA_CHECK(cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED)); + int shared_total, shared_static; + CUDA_CHECK(cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, device)); + CUDA_CHECK(cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun)); + CUDA_CHECK(cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static)); + } + Py_END_ALLOW_THREADS; + + if(PyErr_Occurred()) { + return NULL; + } + return Py_BuildValue("(KKii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills); + } + + static PyMethodDef ModuleMethods[] = { + {"load_binary", loadBinary, METH_VARARGS, "Load provided cubin into CUDA driver"}, + {NULL, NULL, 0, NULL} // sentinel + }; + + static struct PyModuleDef ModuleDef = { + PyModuleDef_HEAD_INIT, + \"cuda_utils\", + NULL, //documentation + -1, //size + 
ModuleMethods + }; + + PyMODINIT_FUNC PyInit_cuda_utils(void) { + PyObject *m = PyModule_Create(&ModuleDef); + if(m == NULL) { + return NULL; + } + PyModule_AddFunctions(m, ModuleMethods); + return m; + } + """ + + def __init__(self): + src = self._generate_src() + key = hashlib.md5(src.encode("utf-8")).hexdigest() + cache = CacheManager(key) + fname = "cuda_utils.so" + if not cache.has_file(fname): + with tempfile.TemporaryDirectory() as tmpdir: + src_path = os.path.join(tmpdir, "main.c") + with open(src_path, "w") as f: + f.write(src) + so = _build("cuda_utils", src_path, tmpdir) + with open(so, "rb") as f: + cache.put(f.read(), fname, binary=True) + import importlib.util + spec = importlib.util.spec_from_file_location("cuda_utils", cache._make_path(fname)) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + self.load_binary = mod.load_binary + + +cuda_utils = None diff --git a/python/triton/tools/aot.py b/python/triton/tools/aot.py new file mode 100644 index 000000000..c1b6010df --- /dev/null +++ b/python/triton/tools/aot.py @@ -0,0 +1,61 @@ +import argparse + +import triton +import triton._C.libtriton.triton as libtriton + +if __name__ == '__main__': + + # valid source and target formats + VALID_FORMATS = ['llvm-ir', 'ptx', 'triton-ir', 'triton-gpu-ir'] + + # set up the argument parser + # TODO: conditional requirements + parser = argparse.ArgumentParser() + parser.add_argument('src', help="Source file to compile") + parser.add_argument('--target', required=True, + help="Target format, one of: " + ', '.join(VALID_FORMATS)) + parser.add_argument('--sm', type=int, help="Compute capability to compile for") + parser.add_argument('--ptx-version', type=int, help="PTX version to compile for") + + # parse the args + args = parser.parse_args() + + # TODO: clean-up and re-use triton.compiler primitive functions + # check for validity of format arguments + if args.target not in VALID_FORMATS: + print("Invalid target format: " + args.target) + 
exit(0) + + # parse source file to MLIR module + context = libtriton.ir.context() + module = libtriton.ir.parse_mlir_module(args.src, context) + module.context = context + + # optimizer triton-ir + module = triton.compiler.optimize_triton_ir(module) + if args.target == 'triton-ir': + print(module.str()) + exit(0) + + # triton-ir -> triton-gpu-ir + module = triton.compiler.make_tritongpu_ir(module, num_warps=4) + module = triton.compiler.optimize_tritongpu_ir(module, num_stages=3) + if args.target == 'triton-gpu-ir': + print(module.str()) + exit(0) + + # triton-gpu-ir -> llvm-ir + module = triton.compiler.make_llvm_ir(module) + if args.target == 'llvm-ir': + print(module) + exit(0) + + if not args.sm: + raise argparse.ArgumentError(None, "Must specify --sm for PTX compilation") + if not args.ptx_version: + raise argparse.ArgumentError(None, "Must specify --ptx-version for PTX compilation") + + # llvm-ir -> ptx + module = triton.compiler.make_ptx(module, compute_capability=args.sm, ptx_version=args.ptx_version) + assert args.target == 'ptx' + print(module) diff --git a/test/Target/tritongpu_to_llvmir.mlir b/test/Target/tritongpu_to_llvmir.mlir index 0f03323e9..7e203b1f9 100644 --- a/test/Target/tritongpu_to_llvmir.mlir +++ b/test/Target/tritongpu_to_llvmir.mlir @@ -1,4 +1,4 @@ -// RUN: triton-translate %s --target=llvmir | FileCheck %s +// RUN: python3 -m triton.tools.aot %s --target=llvm-ir | FileCheck %s // == LLVM IR check begin == // CHECK-LABEL: ; ModuleID = 'LLVMDialectModule' diff --git a/test/Target/tritongpu_to_ptx.mlir b/test/Target/tritongpu_to_ptx.mlir index 1fa6d85bc..c652e1b08 100644 --- a/test/Target/tritongpu_to_ptx.mlir +++ b/test/Target/tritongpu_to_ptx.mlir @@ -1,5 +1,4 @@ -// RUN: triton-translate %s --target=ptx --sm=80 --ptx-version=10000 | FileCheck %s - +// RUN: python3 -m triton.tools.aot %s --target=ptx --sm=80 --ptx-version=63 | FileCheck %s // CHECK-LABEL: // Generated by LLVM NVPTX Back-End // CHECK: .version 6.3 // CHECK: .target sm_80