
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* The Licensed Deliverables contained herein are PROPRIETARY and
* CONFIDENTIAL to NVIDIA and are being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R. 12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__
/**
* \latexonly
* \page sync_async API synchronization behavior
*
* \section memcpy_sync_async_behavior Memcpy
* The API provides memcpy/memset functions in both synchronous and asynchronous forms,
* the latter having an \e "Async" suffix. This is a misnomer as each function
* may exhibit synchronous or asynchronous behavior depending on the arguments
* passed to the function. In the reference documentation, each memcpy function is
* categorized as \e synchronous or \e asynchronous, corresponding to the definitions
* below.
*
* \subsection MemcpySynchronousBehavior Synchronous
*
* <ol>
* <li> For transfers from pageable host memory to device memory, a stream sync is performed
* before the copy is initiated. The function will return once the pageable
* buffer has been copied to the staging memory for DMA transfer to device memory,
* but the DMA to final destination may not have completed.
*
* <li> For transfers from pinned host memory to device memory, the function is synchronous
* with respect to the host.
*
* <li> For transfers from device to either pageable or pinned host memory, the function returns
* only once the copy has completed.
*
* <li> For transfers from device memory to device memory, no host-side synchronization is
* performed.
*
* <li> For transfers from any host memory to any host memory, the function is fully
* synchronous with respect to the host.
* </ol>
*
* \subsection MemcpyAsynchronousBehavior Asynchronous
*
* <ol>
* <li> For transfers from device memory to pageable host memory, the function
* will return only once the copy has completed.
*
* <li> For transfers from any host memory to any host memory, the function is fully
* synchronous with respect to the host.
*
* <li> For all other transfers, the function is fully asynchronous. If pageable
* memory must first be staged to pinned memory, this will be handled
* asynchronously with a worker thread.
* </ol>
*
* \section memset_sync_async_behavior Memset
* The cudaMemset functions are asynchronous with respect to the host
* except when the target memory is pinned host memory. The \e Async
* versions are always asynchronous with respect to the host.
*
* \section kernel_launch_details Kernel Launches
* Kernel launches are asynchronous with respect to the host. Details of
* concurrent kernel execution and data transfers can be found in the CUDA
* Programming Guide.
*
* \endlatexonly
*/
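/*
 * Editorial usage sketch (not part of the original header), illustrating the
 * distinction documented above: a copy from pinned host memory on a
 * non-default stream is asynchronous with respect to the host, so host work
 * can overlap with the transfer. `hostPinned`, `devPtr`, `n`, and `stream`
 * are hypothetical names used only for illustration.
 *
 *   float *hostPinned, *devPtr;
 *   size_t n = 1 << 20;
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   cudaMallocHost((void**)&hostPinned, n * sizeof(float)); // pinned host memory
 *   cudaMalloc((void**)&devPtr, n * sizeof(float));
 *   // Asynchronous with respect to the host: pinned source, non-default stream.
 *   cudaMemcpyAsync(devPtr, hostPinned, n * sizeof(float),
 *                   cudaMemcpyHostToDevice, stream);
 *   // ... host work can proceed here while the copy is in flight ...
 *   cudaStreamSynchronize(stream); // wait for the copy to complete
 */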
/**
* There are two levels for the runtime API.
*
* The C API (<i>cuda_runtime_api.h</i>) is
* a C-style interface that does not require compiling with \p nvcc.
*
* The \ref CUDART_HIGHLEVEL "C++ API" (<i>cuda_runtime.h</i>) is a
* C++-style interface built on top of the C API. It wraps some of the
* C API routines, using overloading, references and default arguments.
* These wrappers can be used from C++ code and can be compiled with any C++
* compiler. The C++ API also has some CUDA-specific wrappers that wrap
* C API routines that deal with symbols, textures, and device functions.
* These wrappers require the use of \p nvcc because they depend on code being
* generated by the compiler. For example, the execution configuration syntax
* to invoke kernels is only available in source code compiled with \p nvcc.
*/
/** CUDA Runtime API Version */
#define CUDART_VERSION 7050
#include "host_defines.h"
#include "builtin_types.h"
#if !defined(__CUDACC_INTEGRATED__)
#include "cuda_device_runtime_api.h"
#endif /* !defined(__CUDACC_INTEGRATED__) */
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL)
#define __CUDART_API_PER_THREAD_DEFAULT_STREAM
#define __CUDART_API_PTDS(api) api ## _ptds
#define __CUDART_API_PTSZ(api) api ## _ptsz
#else
#define __CUDART_API_PTDS(api) api
#define __CUDART_API_PTSZ(api) api
#endif
#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
#define cudaMemcpy __CUDART_API_PTDS(cudaMemcpy)
#define cudaMemcpyToSymbol __CUDART_API_PTDS(cudaMemcpyToSymbol)
#define cudaMemcpyFromSymbol __CUDART_API_PTDS(cudaMemcpyFromSymbol)
#define cudaMemcpy2D __CUDART_API_PTDS(cudaMemcpy2D)
#define cudaMemcpyToArray __CUDART_API_PTDS(cudaMemcpyToArray)
#define cudaMemcpy2DToArray __CUDART_API_PTDS(cudaMemcpy2DToArray)
#define cudaMemcpyFromArray __CUDART_API_PTDS(cudaMemcpyFromArray)
#define cudaMemcpy2DFromArray __CUDART_API_PTDS(cudaMemcpy2DFromArray)
#define cudaMemcpyArrayToArray __CUDART_API_PTDS(cudaMemcpyArrayToArray)
#define cudaMemcpy2DArrayToArray __CUDART_API_PTDS(cudaMemcpy2DArrayToArray)
#define cudaMemcpy3D __CUDART_API_PTDS(cudaMemcpy3D)
#define cudaMemcpy3DPeer __CUDART_API_PTDS(cudaMemcpy3DPeer)
#define cudaMemset __CUDART_API_PTDS(cudaMemset)
#define cudaMemset2D __CUDART_API_PTDS(cudaMemset2D)
#define cudaMemset3D __CUDART_API_PTDS(cudaMemset3D)
#define cudaMemcpyAsync __CUDART_API_PTSZ(cudaMemcpyAsync)
#define cudaMemcpyToSymbolAsync __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync)
#define cudaMemcpyFromSymbolAsync __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync)
#define cudaMemcpy2DAsync __CUDART_API_PTSZ(cudaMemcpy2DAsync)
#define cudaMemcpyToArrayAsync __CUDART_API_PTSZ(cudaMemcpyToArrayAsync)
#define cudaMemcpy2DToArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync)
#define cudaMemcpyFromArrayAsync __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync)
#define cudaMemcpy2DFromArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync)
#define cudaMemcpy3DAsync __CUDART_API_PTSZ(cudaMemcpy3DAsync)
#define cudaMemcpy3DPeerAsync __CUDART_API_PTSZ(cudaMemcpy3DPeerAsync)
#define cudaMemsetAsync __CUDART_API_PTSZ(cudaMemsetAsync)
#define cudaMemset2DAsync __CUDART_API_PTSZ(cudaMemset2DAsync)
#define cudaMemset3DAsync __CUDART_API_PTSZ(cudaMemset3DAsync)
#define cudaStreamQuery __CUDART_API_PTSZ(cudaStreamQuery)
#define cudaStreamGetFlags __CUDART_API_PTSZ(cudaStreamGetFlags)
#define cudaStreamGetPriority __CUDART_API_PTSZ(cudaStreamGetPriority)
#define cudaEventRecord __CUDART_API_PTSZ(cudaEventRecord)
#define cudaStreamWaitEvent __CUDART_API_PTSZ(cudaStreamWaitEvent)
#define cudaStreamAddCallback __CUDART_API_PTSZ(cudaStreamAddCallback)
#define cudaStreamAttachMemAsync __CUDART_API_PTSZ(cudaStreamAttachMemAsync)
#define cudaStreamSynchronize __CUDART_API_PTSZ(cudaStreamSynchronize)
#define cudaLaunch __CUDART_API_PTSZ(cudaLaunch)
#define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel)
#endif
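/*
 * Editorial note (not part of the original header): building with
 * `nvcc --default-stream per-thread` defines CUDA_API_PER_THREAD_DEFAULT_STREAM
 * before this header is seen, so the definitions above redirect each listed
 * call to its _ptds/_ptsz variant, e.g.
 *
 *   cudaMemcpy(dst, src, n, cudaMemcpyDefault);  // becomes cudaMemcpy_ptds(...)
 *
 * and the "default stream" those variants use is then per host thread rather
 * than the single legacy NULL stream shared by all threads.
 */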
/** \cond impl_private */
#if !defined(__dv)
#if defined(__cplusplus)
#define __dv(v) \
= v
#else /* __cplusplus */
#define __dv(v)
#endif /* __cplusplus */
#endif /* !__dv */
/** \endcond impl_private */
#if !defined(__CUDACC_INTEGRATED__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) /** Visible to SM>=3.5 and "__host__ __device__" only **/
#define CUDART_DEVICE __device__
#else
#define CUDART_DEVICE
#endif /** CUDART_DEVICE */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/**
* \defgroup CUDART_DEVICE Device Management
*
* ___MANBRIEF___ device management functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the device management functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Destroy all allocations and reset all state on the current device
* in the current process.
*
* Explicitly destroys and cleans up all resources associated with the current
* device in the current process. Any subsequent API call to this device will
* reinitialize the device.
*
* Note that this function will reset the device immediately. It is the caller's
* responsibility to ensure that the device is not being accessed by any
* other host threads from the process when this function is called.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaDeviceSynchronize
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void);
/**
* \brief Wait for compute device to finish
*
* Blocks until the device has completed all preceding requested tasks.
* ::cudaDeviceSynchronize() returns an error if one of the preceding tasks
* has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for
* this device, the host thread will block until the device has finished
* its work.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaDeviceReset
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
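/*
 * Editorial usage sketch (not part of the original header): waiting for the
 * device and surfacing errors from earlier asynchronous work. `myKernel`,
 * `blocks`, and `threads` are hypothetical names used only for illustration;
 * assumes <stdio.h> is included.
 *
 *   myKernel<<<blocks, threads>>>();
 *   cudaError_t err = cudaDeviceSynchronize(); // blocks until the device is idle
 *   if (err != cudaSuccess)
 *       fprintf(stderr, "device work failed: %s\n", cudaGetErrorString(err));
 */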
/**
* \brief Set resource limits
*
* Setting \p limit to \p value is a request by the application to update
* the current limit maintained by the device. The driver is free to
* modify the requested value to meet h/w requirements (this could be
* clamping to minimum or maximum values, rounding up to nearest element
* size, etc). The application can use ::cudaDeviceGetLimit() to find out
* exactly what the limit has been set to.
*
* Setting each ::cudaLimit has its own specific restrictions, so each is
* discussed here.
*
* - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread.
*
* - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO
* used by the ::printf() and ::fprintf() device system calls. Setting
* ::cudaLimitPrintfFifoSize must be performed before launching any kernel
* that uses the ::printf() or ::fprintf() device system calls; otherwise
* ::cudaErrorInvalidValue will be returned.
*
* - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by
* the ::malloc() and ::free() device system calls. Setting
* ::cudaLimitMallocHeapSize must be performed before launching any kernel
* that uses the ::malloc() or ::free() device system calls; otherwise
* ::cudaErrorInvalidValue will be returned.
*
* - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a
* grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
* this limit must be performed before any launch of a kernel that uses the
* device runtime and calls ::cudaDeviceSynchronize() above the default sync
* depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
* with error code ::cudaErrorSyncDepthExceeded if the limitation is
* violated. This limit can be set smaller than the default or up to the maximum
* launch depth of 24. When setting this limit, keep in mind that additional
* levels of sync depth require the runtime to reserve large amounts of
* device memory which can no longer be used for user allocations. If these
* reservations of device memory fail, ::cudaDeviceSetLimit will return
* ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
* This limit is only applicable to devices of compute capability 3.5 and
* higher. Attempting to set this limit on devices of compute capability less
* than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
* returned.
*
* - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of
* outstanding device runtime launches that can be made from the current
* device. A grid is outstanding from the point of launch up until the grid
* is known to have been completed. Device runtime launches which violate
* this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
* ::cudaGetLastError() is called after launch. If more pending launches than
* the default (2048 launches) are needed for a module using the device
* runtime, this limit can be increased. Keep in mind that being able to
* sustain additional pending launches will require the runtime to reserve
* larger amounts of device memory upfront which can no longer be used for
* allocations. If these reservations fail, ::cudaDeviceSetLimit will return
* ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
* This limit is only applicable to devices of compute capability 3.5 and
* higher. Attempting to set this limit on devices of compute capability less
* than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
* returned.
*
* \param limit - Limit to set
* \param value - Size of limit
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnsupportedLimit,
* ::cudaErrorInvalidValue,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaDeviceGetLimit
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
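/*
 * Editorial usage sketch (not part of the original header): requesting a
 * larger device malloc() heap before any kernel that uses it is launched,
 * then reading back the value the driver actually granted.
 *
 *   size_t granted = 0;
 *   cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024); // request 64 MiB
 *   cudaDeviceGetLimit(&granted, cudaLimitMallocHeapSize); // may be rounded/clamped
 */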
/**
* \brief Returns resource limits
*
* Returns in \p *pValue the current size of \p limit. The supported
* ::cudaLimit values are:
* - ::cudaLimitStackSize: stack size in bytes of each GPU thread;
* - ::cudaLimitPrintfFifoSize: size in bytes of the shared FIFO used by the
* ::printf() and ::fprintf() device system calls.
* - ::cudaLimitMallocHeapSize: size in bytes of the heap used by the
* ::malloc() and ::free() device system calls;
* - ::cudaLimitDevRuntimeSyncDepth: maximum grid depth at which a
* thread can issue the device runtime call ::cudaDeviceSynchronize()
* to wait on child grid launches to complete.
* - ::cudaLimitDevRuntimePendingLaunchCount: maximum number of outstanding
* device runtime launches.
*
* \param limit - Limit to query
* \param pValue - Returned size of the limit
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnsupportedLimit,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaDeviceSetLimit
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
/**
* \brief Returns the preferred cache configuration for the current device.
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this returns through \p pCacheConfig the preferred cache
* configuration for the current device. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free to
* choose a different configuration if required to execute functions.
*
* This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
* where the sizes of the L1 cache and shared memory are fixed.
*
* The supported cache configurations are:
* - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
* - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
* - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
* - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
*
* \param pCacheConfig - Returned cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa cudaDeviceSetCacheConfig,
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
/**
* \brief Returns numerical values that correspond to the least and
* greatest stream priorities.
*
* Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
* to the least and greatest stream priorities respectively. Stream priorities
* follow a convention where lower numbers imply greater priorities. The range of
* meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
* If the user attempts to create a stream with a priority value that is
* outside the meaningful range as specified by this API, the priority is
* automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
* respectively. See ::cudaStreamCreateWithPriority for details on creating a
* priority stream.
* NULL may be passed in for \p leastPriority or \p greatestPriority if the value
* is not desired.
*
* This function will return '0' in both \p *leastPriority and \p *greatestPriority if
* the current context's device does not support stream priorities
* (see ::cudaDeviceGetAttribute).
*
* \param leastPriority - Pointer to an int in which the numerical value for least
* stream priority is returned
* \param greatestPriority - Pointer to an int in which the numerical value for greatest
* stream priority is returned
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaStreamCreateWithPriority,
* ::cudaStreamGetPriority
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
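/*
 * Editorial usage sketch (not part of the original header): creating a stream
 * at the greatest priority the device supports. Recall that lower numbers
 * mean greater priority, so `greatest` is the numerically smallest value.
 *
 *   int least = 0, greatest = 0;
 *   cudaStream_t highPrio;
 *   cudaDeviceGetStreamPriorityRange(&least, &greatest);
 *   cudaStreamCreateWithPriority(&highPrio, cudaStreamNonBlocking, greatest);
 */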
/**
* \brief Sets the preferred cache configuration for the current device.
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this sets through \p cacheConfig the preferred cache
* configuration for the current device. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free to
* choose a different configuration if required to execute the function. Any
* function preference set via
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
* or
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
* will be preferred over this device-wide setting. Setting the device-wide
* cache configuration to ::cudaFuncCachePreferNone will cause subsequent
* kernel launches to prefer to not change the cache configuration unless
* required to launch the kernel.
*
* This setting does nothing on devices where the sizes of the L1 cache and
* shared memory are fixed.
*
* Launching a kernel with a different preference than the most recent
* preference setting may insert a device-side synchronization point.
*
* The supported cache configurations are:
* - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
* - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
* - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
* - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
*
* \param cacheConfig - Requested cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaDeviceGetCacheConfig,
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);
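/*
 * Editorial usage sketch (not part of the original header): a device-wide
 * preference for larger shared memory; a per-function setting, shown with a
 * hypothetical kernel `myKernel`, takes precedence over it.
 *
 *   cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
 *   cudaFuncSetCacheConfig((const void*)myKernel, cudaFuncCachePreferL1);
 */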
/**
* \brief Returns the shared memory configuration for the current device.
*
* This function will return in \p pConfig the current size of shared memory banks
* on the current device. On devices with configurable shared memory banks,
* ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all
* subsequent kernel launches will by default use the new bank size. When
* ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared
* memory, it will return the fixed bank size of the hardware.
*
* The returned bank configurations can be either:
* - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes.
* - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes.
*
* \param pConfig - Returned cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaDeviceSetCacheConfig,
* ::cudaDeviceGetCacheConfig,
* ::cudaDeviceSetSharedMemConfig,
* ::cudaFuncSetCacheConfig
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
/**
* \brief Sets the shared memory configuration for the current device.
*
* On devices with configurable shared memory banks, this function will set
* the shared memory bank size which is used for all subsequent kernel launches.
* Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig
* will override the device wide setting.
*
* Changing the shared memory configuration between launches may introduce
* a device side synchronization point.
*
* Changing the shared memory bank size will not increase shared memory usage
* or affect occupancy of kernels, but may have major effects on performance.
* Larger bank sizes will allow for greater potential bandwidth to shared memory,
* but will change what kinds of accesses to shared memory will result in bank
* conflicts.
*
* This function will do nothing on devices with fixed shared memory bank size.
*
* The supported bank configurations are:
* - ::cudaSharedMemBankSizeDefault: set bank width to the device default (currently,
* four bytes)
* - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes
* natively.
* - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight
* bytes natively.
*
* \param config - Requested cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaDeviceSetCacheConfig,
* ::cudaDeviceGetCacheConfig,
* ::cudaDeviceGetSharedMemConfig,
* ::cudaFuncSetCacheConfig
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config);
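/*
 * Editorial usage sketch (not part of the original header): switching to
 * eight-byte banks around kernels dominated by 8-byte shared memory accesses
 * (e.g. double precision), then restoring the default.
 *
 *   cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
 *   // ... launch kernels with 8-byte-wide shared memory access patterns ...
 *   cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeDefault);
 */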
/**
* \brief Returns a handle to a compute device
*
* Returns in \p *device a device ordinal given a PCI bus ID string.
*
* \param device - Returned device ordinal
*
* \param pciBusId - String in one of the following forms:
* [domain]:[bus]:[device].[function]
* [domain]:[bus]:[device]
* [bus]:[device].[function]
* where \p domain, \p bus, \p device, and \p function are all hexadecimal values
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
*
* \sa ::cudaDeviceGetPCIBusId
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);
/**
* \brief Returns a PCI Bus Id string for the device
*
* Returns an ASCII string identifying the device \p device in the NULL-terminated
* string pointed to by \p pciBusId. \p len specifies the maximum length of the
* string that may be returned.
*
* \param pciBusId - Returned identifier string for the device in the following format
* [domain]:[bus]:[device].[function]
* where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
* pciBusId should be large enough to store 13 characters including the NULL-terminator.
*
* \param len - Maximum length of string to store in \p pciBusId
*
* \param device - Device to get identifier string for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
*
* \sa ::cudaDeviceGetByPCIBusId
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);
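/*
 * Editorial usage sketch (not part of the original header): round-tripping
 * device 0 through its PCI bus ID string. 13 bytes is sufficient for the
 * [domain]:[bus]:[device].[function] form plus the NULL terminator.
 *
 *   char busId[13];
 *   int found = -1;
 *   cudaDeviceGetPCIBusId(busId, sizeof(busId), 0);
 *   cudaDeviceGetByPCIBusId(&found, busId); // found == 0 on success
 */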
/**
* \brief Gets an interprocess handle for a previously allocated event
*
* Takes as input a previously allocated event. This event must have been
* created with the ::cudaEventInterprocess and ::cudaEventDisableTiming
* flags set. This opaque handle may be copied into other processes and
* opened with ::cudaIpcOpenEventHandle to allow efficient hardware
* synchronization between GPU work in different processes.
*
* After the event has been opened in the importing process,
* ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and
* ::cudaEventQuery may be used in either process. Performing operations
* on the imported event after the exported event has been freed
* with ::cudaEventDestroy will result in undefined behavior.
*
* IPC functionality is restricted to devices with support for unified
* addressing on Linux operating systems.
*
* \param handle - Pointer to a user allocated cudaIpcEventHandle
* in which to return the opaque event handle
* \param event - Event allocated with ::cudaEventInterprocess and
* ::cudaEventDisableTiming flags.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorMemoryAllocation,
* ::cudaErrorMapBufferObjectFailed
*
* \sa
* ::cudaEventCreate,
* ::cudaEventDestroy,
* ::cudaEventSynchronize,
* ::cudaEventQuery,
* ::cudaStreamWaitEvent,
* ::cudaIpcOpenEventHandle,
* ::cudaIpcGetMemHandle,
* ::cudaIpcOpenMemHandle,
* ::cudaIpcCloseMemHandle
*/
extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
/**
* \brief Opens an interprocess event handle for use in the current process
*
* Opens an interprocess event handle exported from another process with
* ::cudaIpcGetEventHandle. This function returns a ::cudaEvent_t that behaves like
* a locally created event with the ::cudaEventDisableTiming flag specified.
* This event must be freed with ::cudaEventDestroy.
*
* Performing operations on the imported event after the exported event has
* been freed with ::cudaEventDestroy will result in undefined behavior.
*
* IPC functionality is restricted to devices with support for unified
* addressing on Linux operating systems.
*
* \param event - Returns the imported event
* \param handle - Interprocess handle to open
*
* \returns
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed,
* ::cudaErrorInvalidResourceHandle
*
* \sa
* ::cudaEventCreate,
* ::cudaEventDestroy,
* ::cudaEventSynchronize,
* ::cudaEventQuery,
* ::cudaStreamWaitEvent,
* ::cudaIpcGetEventHandle,
* ::cudaIpcGetMemHandle,
* ::cudaIpcOpenMemHandle,
* ::cudaIpcCloseMemHandle
*/
extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle);
/**
* \brief Gets an interprocess memory handle for an existing device memory
* allocation
*
* Takes a pointer to the base of an existing device memory allocation created
* with ::cudaMalloc and exports it for use in another process. This is a
* lightweight operation and may be called multiple times on an allocation
* without adverse effects.
*
* If a region of memory is freed with ::cudaFree and a subsequent call
* to ::cudaMalloc returns memory with the same device address,
* ::cudaIpcGetMemHandle will return a unique handle for the
* new memory.
*
* IPC functionality is restricted to devices with support for unified
* addressing on Linux operating systems.
*
* \param handle - Pointer to user allocated ::cudaIpcMemHandle to return
* the handle in.
* \param devPtr - Base pointer to previously allocated device memory
*
* \returns
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorMemoryAllocation,
* ::cudaErrorMapBufferObjectFailed
*
* \sa
* ::cudaMalloc,
* ::cudaFree,
* ::cudaIpcGetEventHandle,
* ::cudaIpcOpenEventHandle,
* ::cudaIpcOpenMemHandle,
* ::cudaIpcCloseMemHandle
*/
extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr);
/**
* \brief Opens an interprocess memory handle exported from another process
* and returns a device pointer usable in the local process.
*
* Maps memory exported from another process with ::cudaIpcGetMemHandle into
* the current device address space. For contexts on different devices
* ::cudaIpcOpenMemHandle can attempt to enable peer access between the
* devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is
* controlled by the ::cudaIpcMemLazyEnablePeerAccess flag.
* ::cudaDeviceCanAccessPeer can determine if a mapping is possible.
*
* Contexts that may open ::cudaIpcMemHandles are restricted in the following way.
* ::cudaIpcMemHandles from each device in a given process may only be opened
* by one context per device per other process.
*
* Memory returned from ::cudaIpcOpenMemHandle must be freed with
* ::cudaIpcCloseMemHandle.
*
* Calling ::cudaFree on an exported memory region before calling
* ::cudaIpcCloseMemHandle in the importing context will result in undefined
* behavior.
*
* IPC functionality is restricted to devices with support for unified
* addressing on Linux operating systems.
*
* \param devPtr - Returned device pointer
* \param handle - ::cudaIpcMemHandle to open
* \param flags - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess
*
* \returns
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorTooManyPeers
*
* \note No guarantees are made about the address returned in \p *devPtr.
* In particular, multiple processes may not receive the same address for the same \p handle.
*
* \sa
* ::cudaMalloc,
* ::cudaFree,
* ::cudaIpcGetEventHandle,
* ::cudaIpcOpenEventHandle,
* ::cudaIpcGetMemHandle,
* ::cudaIpcCloseMemHandle,
* ::cudaDeviceEnablePeerAccess,
* ::cudaDeviceCanAccessPeer
*/
extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
/**
* \brief Close memory mapped with cudaIpcOpenMemHandle
*
* Unmaps memory returned by ::cudaIpcOpenMemHandle. The original allocation
* in the exporting process as well as imported mappings in other processes
* will be unaffected.
*
* Any resources used to enable peer access will be freed if this is the
* last mapping using them.
*
* IPC functionality is restricted to devices with support for unified
* addressing on Linux operating systems.
*
* \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle
*
* \returns
* ::cudaSuccess,
* ::cudaErrorMapBufferObjectFailed,
* ::cudaErrorInvalidResourceHandle
*
* \sa
* ::cudaMalloc,
* ::cudaFree,
* ::cudaIpcGetEventHandle,
* ::cudaIpcOpenEventHandle,
* ::cudaIpcGetMemHandle,
* ::cudaIpcOpenMemHandle
*/
extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr);
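/*
 * Editorial sketch (not part of the original header) of the IPC memory flow
 * described above. How the handle travels between processes (pipe, socket,
 * shared file, ...) is outside the API and is assumed here; `size` is a
 * hypothetical allocation size.
 *
 *   // Exporting process:
 *   void *devPtr;
 *   cudaIpcMemHandle_t handle;
 *   cudaMalloc(&devPtr, size);
 *   cudaIpcGetMemHandle(&handle, devPtr);
 *   // ... transmit `handle` to the importing process ...
 *
 *   // Importing process:
 *   void *mapped;
 *   cudaIpcOpenMemHandle(&mapped, handle, cudaIpcMemLazyEnablePeerAccess);
 *   // ... use `mapped` in kernels, then unmap before the exporter frees ...
 *   cudaIpcCloseMemHandle(mapped);
 */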
/** @} */ /* END CUDART_DEVICE */
/**
* \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED]
*
* ___MANBRIEF___ deprecated thread management functions of the CUDA runtime
* API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes deprecated thread management functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Exit and clean up from CUDA launches
*
* \deprecated
*
* Note that this function is deprecated because its name does not
* reflect its behavior. Its functionality is identical to the
* non-deprecated function ::cudaDeviceReset(), which should be used
* instead.
*
* Explicitly destroys and cleans up all resources associated with the current
* device in the current process. Any subsequent API call to this device will
* reinitialize the device.
*
* Note that this function will reset the device immediately. It is the caller's
* responsibility to ensure that the device is not being accessed by any
* other host threads from the process when this function is called.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaDeviceReset
*/
extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void);
/**
* \brief Wait for compute device to finish
*
* \deprecated
*
* Note that this function is deprecated because its name does not
* reflect its behavior. Its functionality is similar to the
* non-deprecated function ::cudaDeviceSynchronize(), which should be used
* instead.
*
* Blocks until the device has completed all preceding requested tasks.
* ::cudaThreadSynchronize() returns an error if one of the preceding tasks
* has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for
* this device, the host thread will block until the device has finished
* its work.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaDeviceSynchronize
*/
extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void);
/**
* \brief Set resource limits
*
* \deprecated
*
* Note that this function is deprecated because its name does not
* reflect its behavior. Its functionality is identical to the
* non-deprecated function ::cudaDeviceSetLimit(), which should be used
* instead.
*
* Setting \p limit to \p value is a request by the application to update
* the current limit maintained by the device. The driver is free to
* modify the requested value to meet h/w requirements (this could be
* clamping to minimum or maximum values, rounding up to nearest element
* size, etc). The application can use ::cudaThreadGetLimit() to find out
* exactly what the limit has been set to.
*
* Setting each ::cudaLimit has its own specific restrictions, so each is
* discussed here.
*
* - ::cudaLimitStackSize controls the stack size of each GPU thread.
*
* - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO
* used by the ::printf() and ::fprintf() device system calls.
* Setting ::cudaLimitPrintfFifoSize must be performed before
* launching any kernel that uses the ::printf() or ::fprintf() device
* system calls, otherwise ::cudaErrorInvalidValue will be returned.
*
* - ::cudaLimitMallocHeapSize controls the size of the heap used
* by the ::malloc() and ::free() device system calls. Setting
* ::cudaLimitMallocHeapSize must be performed before launching
* any kernel that uses the ::malloc() or ::free() device system calls,
* otherwise ::cudaErrorInvalidValue will be returned.
*
* \param limit - Limit to set
* \param value - Size in bytes of limit
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnsupportedLimit,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaDeviceSetLimit
*/
extern __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value);
/**
* \brief Returns resource limits
*
* \deprecated
*
* Note that this function is deprecated because its name does not
* reflect its behavior. Its functionality is identical to the
* non-deprecated function ::cudaDeviceGetLimit(), which should be used
* instead.
*
* Returns in \p *pValue the current size of \p limit. The supported
* ::cudaLimit values are:
* - ::cudaLimitStackSize: stack size of each GPU thread;
* - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the
* ::printf() and ::fprintf() device system calls.
* - ::cudaLimitMallocHeapSize: size of the heap used by the
* ::malloc() and ::free() device system calls;
*
* \param limit - Limit to query
* \param pValue - Returned size in bytes of limit
*
* \return
* ::cudaSuccess,
* ::cudaErrorUnsupportedLimit,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaDeviceGetLimit
*/
extern __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit);
/**
* \brief Returns the preferred cache configuration for the current device.
*
* \deprecated
*
* Note that this function is deprecated because its name does not
* reflect its behavior. Its functionality is identical to the
* non-deprecated function ::cudaDeviceGetCacheConfig(), which should be
* used instead.
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this returns through \p pCacheConfig the preferred cache
* configuration for the current device. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free to
* choose a different configuration if required to execute functions.
*
* This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
* where the sizes of the L1 cache and shared memory are fixed.
*
* The supported cache configurations are:
* - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
* - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
* - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
*
* \param pCacheConfig - Returned cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa cudaDeviceGetCacheConfig
*/
extern __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig);
/**
* \brief Sets the preferred cache configuration for the current device.
*
* \deprecated
*
* Note that this function is deprecated because its name does not
* reflect its behavior. Its functionality is identical to the
* non-deprecated function ::cudaDeviceSetCacheConfig(), which should be
* used instead.
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this sets through \p cacheConfig the preferred cache
* configuration for the current device. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free to
* choose a different configuration if required to execute the function. Any
* function preference set via
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
* or
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
* will be preferred over this device-wide setting. Setting the device-wide
* cache configuration to ::cudaFuncCachePreferNone will cause subsequent
* kernel launches to prefer to not change the cache configuration unless
* required to launch the kernel.
*
* This setting does nothing on devices where the sizes of the L1 cache and
* shared memory are fixed.
*
* Launching a kernel with a different preference than the most recent
* preference setting may insert a device-side synchronization point.
*
* The supported cache configurations are:
* - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
* - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
* - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
*
* \param cacheConfig - Requested cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaDeviceSetCacheConfig
*/
extern __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig);
/** @} */ /* END CUDART_THREAD_DEPRECATED */
/**
* \defgroup CUDART_ERROR Error Handling
*
* ___MANBRIEF___ error handling functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the error handling functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Returns the last error from a runtime call
*
* Returns the last error that has been produced by any of the runtime calls
* in the same host thread and resets it to ::cudaSuccess.
*
* \return
* ::cudaSuccess,
* ::cudaErrorMissingConfiguration,
* ::cudaErrorMemoryAllocation,
* ::cudaErrorInitializationError,
* ::cudaErrorLaunchFailure,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources,
* ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidSymbol,
* ::cudaErrorUnmapBufferObjectFailed,
* ::cudaErrorInvalidHostPointer,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidTexture,
* ::cudaErrorInvalidTextureBinding,
* ::cudaErrorInvalidChannelDescriptor,
* ::cudaErrorInvalidMemcpyDirection,
* ::cudaErrorInvalidFilterSetting,
* ::cudaErrorInvalidNormSetting,
* ::cudaErrorUnknown,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorInsufficientDriver,
* ::cudaErrorSetOnActiveProcess,
* ::cudaErrorStartupFailure
* \notefnerr
*
* \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
/**
* \brief Returns the last error from a runtime call
*
* Returns the last error that has been produced by any of the runtime calls
* in the same host thread. Note that this call does not reset the error to
* ::cudaSuccess like ::cudaGetLastError().
*
* \return
* ::cudaSuccess,
* ::cudaErrorMissingConfiguration,
* ::cudaErrorMemoryAllocation,
* ::cudaErrorInitializationError,
* ::cudaErrorLaunchFailure,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources,
* ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidSymbol,
* ::cudaErrorUnmapBufferObjectFailed,
* ::cudaErrorInvalidHostPointer,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidTexture,
* ::cudaErrorInvalidTextureBinding,
* ::cudaErrorInvalidChannelDescriptor,
* ::cudaErrorInvalidMemcpyDirection,
* ::cudaErrorInvalidFilterSetting,
* ::cudaErrorInvalidNormSetting,
* ::cudaErrorUnknown,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorInsufficientDriver,
* ::cudaErrorSetOnActiveProcess,
* ::cudaErrorStartupFailure
* \notefnerr
*
* \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
/**
* \brief Returns the string representation of an error code enum name
*
* Returns a string containing the name of an error code in the enum. If the error
* code is not recognized, "unrecognized error code" is returned.
*
* \param error - Error code to convert to string
*
* \return
* \p char* pointer to a NULL-terminated string
*
* \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError
*/
extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
/**
* \brief Returns the description string for an error code
*
* Returns the description string for an error code. If the error
* code is not recognized, "unrecognized error code" is returned.
*
* \param error - Error code to convert to string
*
* \return
* \p char* pointer to a NULL-terminated string
*
* \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError
*/
extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
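/*
 * Editorial sketch (not part of the original header): the common checking
 * pattern built from the functions above. The CHECK macro name is
 * illustrative; assumes <stdio.h> is included.
 *
 *   #define CHECK(call)                                              \
 *       do {                                                         \
 *           cudaError_t e_ = (call);                                 \
 *           if (e_ != cudaSuccess)                                   \
 *               fprintf(stderr, "%s: %s\n", cudaGetErrorName(e_),    \
 *                       cudaGetErrorString(e_));                     \
 *       } while (0)
 *
 *   CHECK(cudaDeviceSynchronize());
 */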
/** @} */ /* END CUDART_ERROR */
/**
* \addtogroup CUDART_DEVICE
*
* @{
*/
/**
* \brief Returns the number of compute-capable devices
*
* Returns in \p *count the number of devices with compute capability greater
* than or equal to 2.0 that are available for execution. If there is no such
* device then ::cudaGetDeviceCount() will return ::cudaErrorNoDevice.
* If no driver can be loaded to determine if any such devices exist then
* ::cudaGetDeviceCount() will return ::cudaErrorInsufficientDriver.
*
* \param count - Returns the number of devices with compute capability
* greater than or equal to 2.0
*
* \return
* ::cudaSuccess,
* ::cudaErrorNoDevice,
* ::cudaErrorInsufficientDriver
* \notefnerr
*
* \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties,
* ::cudaChooseDevice
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
/**
* \brief Returns information about the compute-device
*
* Returns in \p *prop the properties of device \p device. The ::cudaDeviceProp
* structure is defined as:
* \code
struct cudaDeviceProp {
char name[256];
size_t totalGlobalMem;
size_t sharedMemPerBlock;
int regsPerBlock;
int warpSize;
size_t memPitch;
int maxThreadsPerBlock;
int maxThreadsDim[3];
int maxGridSize[3];
int clockRate;
size_t totalConstMem;
int major;
int minor;
size_t textureAlignment;
size_t texturePitchAlignment;
int deviceOverlap;
int multiProcessorCount;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int computeMode;
int maxTexture1D;
int maxTexture1DMipmap;
int maxTexture1DLinear;
int maxTexture2D[2];
int maxTexture2DMipmap[2];
int maxTexture2DLinear[3];
int maxTexture2DGather[2];
int maxTexture3D[3];
int maxTexture3DAlt[3];
int maxTextureCubemap;
int maxTexture1DLayered[2];
int maxTexture2DLayered[3];
int maxTextureCubemapLayered[2];
int maxSurface1D;
int maxSurface2D[2];
int maxSurface3D[3];
int maxSurface1DLayered[2];
int maxSurface2DLayered[3];
int maxSurfaceCubemap;
int maxSurfaceCubemapLayered[2];
size_t surfaceAlignment;
int concurrentKernels;
int ECCEnabled;
int pciBusID;
int pciDeviceID;
int pciDomainID;
int tccDriver;
int asyncEngineCount;
int unifiedAddressing;
int memoryClockRate;
int memoryBusWidth;
int l2CacheSize;
int maxThreadsPerMultiProcessor;
int streamPrioritiesSupported;
int globalL1CacheSupported;
int localL1CacheSupported;
size_t sharedMemPerMultiprocessor;
int regsPerMultiprocessor;
int managedMemory;
int isMultiGpuBoard;
int multiGpuBoardGroupID;
};
\endcode
* where:
* - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying
* the device;
* - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total
* amount of global memory available on the device in bytes;
* - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the
* maximum amount of shared memory available to a thread block in bytes;
* - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number
* of 32-bit registers available to a thread block;
* - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads;
* - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in
* bytes allowed by the memory copy functions that involve memory regions
* allocated through ::cudaMallocPitch();
* - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the
* maximum number of threads per block;
* - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the
* maximum size of each dimension of a block;
* - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the
* maximum size of each dimension of a grid;
* - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in
* kilohertz;
* - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount
* of constant memory available on the device in bytes;
* - \ref ::cudaDeviceProp::major "major",
* \ref ::cudaDeviceProp::minor "minor" are the major and minor revision
* numbers defining the device's compute capability;
* - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the
* alignment requirement; texture base addresses that are aligned to
* \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not
* need an offset applied to texture fetches;
* - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the
* pitch alignment requirement for 2D texture references that are bound to
* pitched memory;
* - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device
* can concurrently copy memory between host and device while executing a
* kernel, or 0 if not. This field is deprecated; use asyncEngineCount instead.
* - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the
* number of multiprocessors on the device;
* - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
* is 1 if there is a run time limit for kernels executed on the device, or
* 0 if not.
* - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an
* integrated (motherboard) GPU and 0 if it is a discrete (card) component.
* - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the
* device can map host memory into the CUDA address space for use with
* ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not;
* - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode
* that the device is currently in. Available modes are as follows:
* - cudaComputeModeDefault: Default mode - Device is not restricted and
* multiple threads can use ::cudaSetDevice() with this device.
* - cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will
* be able to use ::cudaSetDevice() with this device.
* - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
* ::cudaSetDevice() with this device.
* - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many
* threads in one process will be able to use ::cudaSetDevice() with this device.
* <br> If ::cudaSetDevice() is called on an already occupied \p device with
* computeMode ::cudaComputeModeExclusive, ::cudaErrorDeviceAlreadyInUse
* will be immediately returned indicating the device cannot be used.
* When an occupied exclusive mode device is chosen with ::cudaSetDevice,
* all subsequent non-device management runtime functions will return
* ::cudaErrorDevicesUnavailable.
* - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D
* texture size.
* - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum
* 1D mipmapped texture size.
* - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum
* 1D texture size for textures bound to linear memory.
* - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum
* 2D texture dimensions.
* - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the
* maximum 2D mipmapped texture dimensions.
* - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the
* maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
* - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the
* maximum 2D texture dimensions if texture gather operations have to be performed.
* - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum
* 3D texture dimensions.
* - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]"
* contains the maximum alternate 3D texture dimensions.
* - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the
* maximum cubemap texture width or height.
* - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains
* the maximum 1D layered texture dimensions.
* - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains
* the maximum 2D layered texture dimensions.
* - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]"
* contains the maximum cubemap layered texture dimensions.
* - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D
* surface size.
* - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum
* 2D surface dimensions.
* - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum
* 3D surface dimensions.
* - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains
* the maximum 1D layered surface dimensions.
* - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains
* the maximum 2D layered surface dimensions.
* - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum
* cubemap surface width or height.
* - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]"
* contains the maximum cubemap layered surface dimensions.
* - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the
* alignment requirements for surfaces.
* - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the
* device supports executing multiple kernels within the same context
* simultaneously, or 0 if not. It is not guaranteed that multiple kernels
* will be resident on the device concurrently so this feature should not be
* relied upon for correctness;
* - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC
* support turned on, or 0 if not.
* - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of
* the device.
* - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device
* (sometimes called slot) identifier of the device.
* - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier
* of the device.
* - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a
* TCC driver or 0 if not.
* - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the
* device can concurrently copy memory between host and device while executing
* a kernel. It is 2 when the device can concurrently copy memory between host
* and device in both directions and execute a kernel at the same time. It is
* 0 if neither of these is supported.
* - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device
* shares a unified address space with the host and 0 otherwise.
* - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory
* clock frequency in kilohertz.
* - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width
* in bits.
* - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes.
* - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor"
* is the maximum number of resident threads per multiprocessor.
* - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported"
* is 1 if the device supports stream priorities, or 0 if it is not supported.
* - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported"
* is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
* - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported"
* is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
* - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the
* maximum amount of shared memory available to a multiprocessor in bytes; this amount is
* shared by all thread blocks simultaneously resident on a multiprocessor;
* - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number
* of 32-bit registers available to a multiprocessor; this number is shared
* by all thread blocks simultaneously resident on a multiprocessor;
* - \ref ::cudaDeviceProp::managedMemory "managedMemory"
* is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
* - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard"
* is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not;
* - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier
* for a group of devices associated with the same board.
* Devices on the same multi-GPU board will share the same identifier;
*
* \param prop - Properties for the specified device
* \param device - Device number to get properties for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice
*
* \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
* ::cudaDeviceGetAttribute
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
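/*
 * Editorial usage sketch (not part of the original header): enumerating the
 * devices and printing a few of the fields documented above; assumes
 * <stdio.h> is included.
 *
 *   int count = 0;
 *   cudaGetDeviceCount(&count);
 *   for (int d = 0; d < count; ++d) {
 *       struct cudaDeviceProp prop;
 *       cudaGetDeviceProperties(&prop, d);
 *       printf("%d: %s (CC %d.%d, %zu bytes global memory)\n",
 *              d, prop.name, prop.major, prop.minor, prop.totalGlobalMem);
 *   }
 */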
/**
* \brief Returns information about the device
*
* Returns in \p *value the integer value of the attribute \p attr on device
* \p device. The supported attributes are:
* - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block;
* - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block;
* - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block;
* - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block;
* - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid;
* - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid;
* - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid;
* - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory
* available to a thread block in bytes;
* - ::cudaDevAttrTotalConstantMemory: Memory available on device for
* __constant__ variables in a CUDA C kernel in bytes;
* - ::cudaDevAttrWarpSize: Warp size in threads;
* - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy
* functions that involve memory regions allocated through ::cudaMallocPitch();
* - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width;
* - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound
* to linear memory;
* - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width;
* - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width;
* - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height;
* - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture
* bound to linear memory;
* - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture
* bound to linear memory;
* - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D
* texture bound to linear memory;
* - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture
* width;
* - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture
* height;
* - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width;
* - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height;
* - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth;
* - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width,
* 0 if no alternate maximum 3D texture size is supported;
* - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,
* 0 if no alternate maximum 3D texture size is supported;
* - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,
* 0 if no alternate maximum 3D texture size is supported;
* - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or
* height;
* - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width;
* - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered
* texture;
* - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width;
* - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height;
* - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered
* texture;
* - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered
* texture width or height;
* - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap
* layered texture;
* - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width;
* - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width;
* - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height;
* - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width;
* - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height;
* - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth;
* - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width;
* - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered
* surface;
* - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width;
* - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height;
* - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered
* surface;
* - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width;
* - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered
* surface width;
* - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap
* layered surface;
* - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers
* available to a thread block;
* - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz;
* - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base
* addresses aligned to ::textureAlign bytes do not need an offset applied
* to texture fetches;
* - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D
* texture references bound to pitched memory;
* - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory
* between host and device while executing a kernel, or 0 if not;
* - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device;
* - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels
* executed on the device, or 0 if not;
* - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory
* subsystem, or 0 if not;
* - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into
* the CUDA address space, or 0 if not;
 * - ::cudaDevAttrComputeMode: The compute mode that the device is currently
 *   in. Available modes are as follows:
* - ::cudaComputeModeDefault: Default mode - Device is not restricted and
* multiple threads can use ::cudaSetDevice() with this device.
* - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will
* be able to use ::cudaSetDevice() with this device.
* - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
* ::cudaSetDevice() with this device.
* - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many
* threads in one process will be able to use ::cudaSetDevice() with this
* device.
* - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing
* multiple kernels within the same context simultaneously, or 0 if
* not. It is not guaranteed that multiple kernels will be resident on the
* device concurrently so this feature should not be relied upon for
* correctness;
* - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,
* 0 if error correction is disabled or not supported by the device;
* - ::cudaDevAttrPciBusId: PCI bus identifier of the device;
* - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of
* the device;
* - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only
* available on Tesla hardware running Windows Vista or later;
* - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz;
* - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits;
* - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device
* doesn't have L2 cache;
* - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per
* multiprocessor;
* - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address
* space with the host, or 0 if not;
* - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version
* number;
* - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version
* number;
* - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream
* priorities, or 0 if not;
* - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals
* in L1 cache, 0 if not;
 * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals
* in L1 cache, 0 if not;
* - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory
* available to a multiprocessor in bytes; this amount is shared by all
* thread blocks simultaneously resident on a multiprocessor;
* - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers
* available to a multiprocessor; this number is shared by all thread blocks
* simultaneously resident on a multiprocessor;
* - ::cudaDevAttrManagedMemSupported: 1 if device supports allocating
* managed memory, 0 if not;
* - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not;
* - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the
* same multi-GPU board;
*
* \param value - Returned device attribute value
* \param attr - Device attribute to query
* \param device - Device number to query
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
* ::cudaGetDeviceProperties
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
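/*
 * Example (illustrative sketch): querying a single attribute avoids
 * filling an entire ::cudaDeviceProp structure, so it can be preferable
 * when only a few values are needed. Device 0 is assumed to exist.
 *
 *   int smCount = 0, maxThreadsPerSM = 0;
 *   cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0);
 *   cudaDeviceGetAttribute(&maxThreadsPerSM,
 *                          cudaDevAttrMaxThreadsPerMultiProcessor, 0);
 *   printf("device 0: %d SMs, at most %d resident threads per SM\n",
 *          smCount, maxThreadsPerSM);
 */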
/**
* \brief Select compute-device which best matches criteria
*
* Returns in \p *device the device which has properties that best match
* \p *prop.
*
* \param device - Device with best match
* \param prop - Desired device properties
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
* ::cudaGetDeviceProperties
*/
extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
/**
* \brief Set device to be used for GPU executions
*
* Sets \p device as the current device for the calling host thread.
 * Valid device IDs are 0 to (::cudaGetDeviceCount() - 1).
*
* Any device memory subsequently allocated from this host thread
* using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray()
* will be physically resident on \p device. Any host memory allocated
* from this host thread using ::cudaMallocHost() or ::cudaHostAlloc()
* or ::cudaHostRegister() will have its lifetime associated with
* \p device. Any streams or events created from this host thread will
* be associated with \p device. Any kernels launched from this host
* thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed
* on \p device.
*
* This call may be made from any host thread, to any device, and at
* any time. This function will do no synchronization with the previous
* or new device, and should be considered a very low overhead call.
*
* \param device - Device on which the active host thread should execute the
* device code.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorDeviceAlreadyInUse
* \notefnerr
*
* \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
* ::cudaChooseDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device);
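/*
 * Example (illustrative sketch): make device 1 current for the calling
 * host thread; subsequent allocations and launches then target device 1,
 * as described above. Assumes at least two devices are present.
 *
 *   if (cudaSetDevice(1) == cudaSuccess) {
 *       float *buf = NULL;
 *       cudaMalloc((void **)&buf, 1024 * sizeof(float)); // resident on device 1
 *       cudaFree(buf);
 *   }
 */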
/**
* \brief Returns which device is currently being used
*
* Returns in \p *device the current device for the calling host thread.
*
* \param device - Returns the device on which the active host thread
* executes the device code.
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
* ::cudaChooseDevice
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
/**
* \brief Set a list of devices that can be used for CUDA
*
* Sets a list of devices for CUDA execution in priority order using
* \p device_arr. The parameter \p len specifies the number of elements in the
* list. CUDA will try devices from the list sequentially until it finds one
* that works. If this function is not called, or if it is called with a \p len
* of 0, then CUDA will go back to its default behavior of trying devices
* sequentially from a default list containing all of the available CUDA
* devices in the system. If a specified device ID in the list does not exist,
* this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and
* \p device_arr is NULL or if \p len exceeds the number of devices in
* the system, then ::cudaErrorInvalidValue is returned.
*
* \param device_arr - List of devices to try
* \param len - Number of devices in specified list
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
*
* \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
* ::cudaSetDeviceFlags,
* ::cudaChooseDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len);
/**
* \brief Sets flags to be used for device executions
*
* Records \p flags as the flags to use when initializing the current
* device. If no device has been made current to the calling thread,
* then \p flags will be applied to the initialization of any device
* initialized by the calling host thread, unless that device has had
* its initialization flags set explicitly by this or any host thread.
*
* If the current device has been set and that device has already been
* initialized then this call will fail with the error
* ::cudaErrorSetOnActiveProcess. In this case it is necessary
* to reset \p device using ::cudaDeviceReset() before the device's
* initialization flags may be set.
*
* The two LSBs of the \p flags parameter can be used to control how the CPU
* thread interacts with the OS scheduler when waiting for results from the
* device.
*
* - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is
* zero, uses a heuristic based on the number of active CUDA contexts in the
* process \p C and the number of logical processors in the system \p P. If
* \p C \> \p P, then CUDA will yield to other OS threads when waiting for the
* device, otherwise CUDA will not yield while waiting for results and
* actively spin on the processor.
* - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for
* results from the device. This can decrease latency when waiting for the
* device, but may lower the performance of CPU threads if they are performing
* work in parallel with the CUDA thread.
* - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting
* for results from the device. This can increase latency when waiting for the
* device, but can increase the performance of CPU threads performing work in
* parallel with the device.
* - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread
* on a synchronization primitive when waiting for the device to finish work.
* - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a
* synchronization primitive when waiting for the device to finish work. <br>
* \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and
* replaced with ::cudaDeviceScheduleBlockingSync.
* - ::cudaDeviceMapHost: This flag enables allocating pinned
* host memory that is accessible to the device. It is implicit for the
* runtime but may be absent if a context is created using the driver API.
* If this flag is not set, ::cudaHostGetDevicePointer() will always return
* a failure code.
* - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory
* after resizing local memory for a kernel. This can prevent thrashing by
* local memory allocations when launching many kernels with high local
* memory usage at the cost of potentially increased memory usage.
*
* \param flags - Parameters for device operation
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorSetOnActiveProcess
*
* \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
* ::cudaSetDevice, ::cudaSetValidDevices,
* ::cudaChooseDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags );
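/*
 * Example (illustrative sketch): flags must be recorded before the device
 * is initialized. If the device is already active, reset it first, as the
 * ::cudaErrorSetOnActiveProcess discussion above describes.
 *
 *   unsigned int f = cudaDeviceScheduleBlockingSync | cudaDeviceMapHost;
 *   cudaSetDevice(0);
 *   if (cudaSetDeviceFlags(f) == cudaErrorSetOnActiveProcess) {
 *       cudaDeviceReset();      // device 0 was already initialized
 *       cudaSetDeviceFlags(f);  // retry on the uninitialized device
 *   }
 */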
/**
* \brief Gets the flags for the current device
*
* Returns in \p flags the flags for the current device. If there is a
* current device for the calling thread, and the device has been initialized
* or flags have been set on that device specifically, the flags for the
* device are returned. If there is no current device, but flags have been
* set for the thread with ::cudaSetDeviceFlags, the thread flags are returned.
* Finally, if there is no current device and no thread flags, the flags for
* the first device are returned, which may be the default flags. Compare
* to the behavior of ::cudaSetDeviceFlags.
*
* Typically, the flags returned should match the behavior that will be seen
* if the calling thread uses a device after this call, without any change to
 * the flags or current device in between by this or another thread. Note that
* if the device is not initialized, it is possible for another thread to
* change the flags for the current device before it is initialized.
* Additionally, when using exclusive mode, if this thread has not requested a
* specific device, it may use a device other than the first device, contrary
* to the assumption made by this function.
*
* If a context has been created via the driver API and is current to the
* calling thread, the flags for that context are always returned.
*
* Flags returned by this function may specifically include ::cudaDeviceMapHost
* even though it is not accepted by ::cudaSetDeviceFlags because it is
* implicit in runtime API flags. The reason for this is that the current
* context may have been created via the driver API in which case the flag is
* not implicit and may be unset.
*
* \param flags - Pointer to store the device flags
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice
*
* \sa ::cudaGetDevice, ::cudaGetDeviceProperties,
* ::cudaSetDevice, ::cudaSetDeviceFlags
*/
extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags );
/** @} */ /* END CUDART_DEVICE */
/**
* \defgroup CUDART_STREAM Stream Management
*
* ___MANBRIEF___ stream management functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the stream management functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Create an asynchronous stream
*
* Creates a new asynchronous stream.
*
* \param pStream - Pointer to new stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaStreamCreateWithPriority,
* ::cudaStreamCreateWithFlags,
* ::cudaStreamGetPriority,
* ::cudaStreamGetFlags,
* ::cudaStreamQuery,
* ::cudaStreamSynchronize,
* ::cudaStreamWaitEvent,
* ::cudaStreamAddCallback,
* ::cudaStreamDestroy
*/
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
/**
* \brief Create an asynchronous stream
*
* Creates a new asynchronous stream. The \p flags argument determines the
* behaviors of the stream. Valid values for \p flags are
* - ::cudaStreamDefault: Default stream creation flag.
* - ::cudaStreamNonBlocking: Specifies that work running in the created
* stream may run concurrently with work in stream 0 (the NULL stream), and that
* the created stream should perform no implicit synchronization with stream 0.
*
* \param pStream - Pointer to new stream identifier
* \param flags - Parameters for stream creation
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaStreamCreate,
* ::cudaStreamCreateWithPriority,
* ::cudaStreamGetFlags,
* ::cudaStreamQuery,
* ::cudaStreamSynchronize,
* ::cudaStreamWaitEvent,
* ::cudaStreamAddCallback,
* ::cudaStreamDestroy
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
/**
* \brief Create an asynchronous stream with the specified priority
*
* Creates a stream with the specified priority and returns a handle in \p pStream.
 * This API alters the scheduler priority of work in the stream. Work in a higher
 * priority stream may preempt work already executing in a lower priority stream.
*
* \p priority follows a convention where lower numbers represent higher priorities.
* '0' represents default priority. The range of meaningful numerical priorities can
* be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is
* outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange,
* it will automatically be clamped to the lowest or the highest number in the range.
*
* \param pStream - Pointer to new stream identifier
* \param flags - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed
* \param priority - Priority of the stream. Lower numbers represent higher priorities.
* See ::cudaDeviceGetStreamPriorityRange for more information about
* the meaningful stream priorities that can be passed.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \note Stream priorities are supported only on GPUs
* with compute capability 3.5 or higher.
*
* \note In the current implementation, only compute kernels launched in
* priority streams are affected by the stream's priority. Stream priorities have
* no effect on host-to-device and device-to-host memory operations.
*
* \sa ::cudaStreamCreate,
* ::cudaStreamCreateWithFlags,
* ::cudaDeviceGetStreamPriorityRange,
* ::cudaStreamGetPriority,
* ::cudaStreamQuery,
* ::cudaStreamWaitEvent,
* ::cudaStreamAddCallback,
* ::cudaStreamSynchronize,
* ::cudaStreamDestroy
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);
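/*
 * Example (illustrative sketch): query the meaningful priority range and
 * create one stream at each extreme. Lower numbers mean higher priority,
 * as documented above.
 *
 *   int leastPri = 0, greatestPri = 0;
 *   cudaDeviceGetStreamPriorityRange(&leastPri, &greatestPri);
 *   cudaStream_t low, high;
 *   cudaStreamCreateWithPriority(&low, cudaStreamNonBlocking, leastPri);
 *   cudaStreamCreateWithPriority(&high, cudaStreamNonBlocking, greatestPri);
 *   // ... submit work, then release both streams ...
 *   cudaStreamDestroy(low);
 *   cudaStreamDestroy(high);
 */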
/**
* \brief Query the priority of a stream
*
 * Query the priority of a stream. The priority is returned in \p priority.
* Note that if the stream was created with a priority outside the meaningful
* numerical range returned by ::cudaDeviceGetStreamPriorityRange,
* this function returns the clamped priority.
* See ::cudaStreamCreateWithPriority for details about priority clamping.
*
* \param hStream - Handle to the stream to be queried
* \param priority - Pointer to a signed integer in which the stream's priority is returned
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle
* \notefnerr
*
* \sa ::cudaStreamCreateWithPriority,
* ::cudaDeviceGetStreamPriorityRange,
* ::cudaStreamGetFlags
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
/**
* \brief Query the flags of a stream
*
* Query the flags of a stream. The flags are returned in \p flags.
* See ::cudaStreamCreateWithFlags for a list of valid flags.
*
* \param hStream - Handle to the stream to be queried
* \param flags - Pointer to an unsigned integer in which the stream's flags are returned
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle
* \notefnerr
*
* \sa ::cudaStreamCreateWithPriority,
* ::cudaStreamCreateWithFlags,
* ::cudaStreamGetPriority
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
/**
* \brief Destroys and cleans up an asynchronous stream
*
* Destroys and cleans up the asynchronous stream specified by \p stream.
*
* In case the device is still doing work in the stream \p stream
* when ::cudaStreamDestroy() is called, the function will return immediately
* and the resources associated with \p stream will be released automatically
* once the device has completed all work in \p stream.
*
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle
* \notefnerr
*
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
/**
* \brief Make a compute stream wait on an event
*
* Makes all future work submitted to \p stream wait until \p event reports
* completion before beginning execution. This synchronization will be
* performed efficiently on the device. The event \p event may
* be from a different context than \p stream, in which case this function
* will perform cross-device synchronization.
*
* The stream \p stream will wait only for the completion of the most recent
* host call to ::cudaEventRecord() on \p event. Once this call has returned,
* any functions (including ::cudaEventRecord() and ::cudaEventDestroy()) may be
* called on \p event again, and the subsequent calls will not have any effect
* on \p stream.
*
* If ::cudaEventRecord() has not been called on \p event, this call acts as if
* the record has already completed, and so is a functional no-op.
*
* \param stream - Stream to wait
* \param event - Event to wait on
* \param flags - Parameters for the operation (must be 0)
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle
* \note_null_stream
* \notefnerr
*
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
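/*
 * Example (illustrative sketch): order work across two streams without
 * blocking the host. The kernels and streams named here are hypothetical.
 *
 *   cudaEvent_t ev;
 *   cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
 *   producer<<<grid, block, 0, stream1>>>();
 *   cudaEventRecord(ev, stream1);
 *   cudaStreamWaitEvent(stream2, ev, 0);   // stream2 waits for 'producer'
 *   consumer<<<grid, block, 0, stream2>>>();
 *   cudaEventDestroy(ev);                  // legal once the wait is enqueued
 */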
#ifdef _WIN32
#define CUDART_CB __stdcall
#else
#define CUDART_CB
#endif
/**
* Type of stream callback functions.
* \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL.
* \param status ::cudaSuccess or any persistent error on the stream.
* \param userData User parameter provided at registration.
*/
typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData);
/**
* \brief Add a callback to a compute stream
*
* Adds a callback to be called on the host after all currently enqueued
* items in the stream have completed. For each
 * ::cudaStreamAddCallback call, a callback will be executed exactly once.
* The callback will block later work in the stream until it is finished.
*
* The callback may be passed ::cudaSuccess or an error code. In the event
* of a device error, all subsequently executed callbacks will receive an
* appropriate ::cudaError_t.
*
* Callbacks must not make any CUDA API calls. Attempting to use CUDA APIs
* will result in ::cudaErrorNotPermitted. Callbacks must not perform any
* synchronization that may depend on outstanding device work or other callbacks
* that are not mandated to run earlier. Callbacks without a mandated order
* (in independent streams) execute in undefined order and may be serialized.
*
* For the purposes of Unified Memory, callback execution makes a number of
* guarantees:
* <ul>
* <li>The callback stream is considered idle for the duration of the
* callback. Thus, for example, a callback may always use memory attached
* to the callback stream.</li>
* <li>The start of execution of a callback has the same effect as
* synchronizing an event recorded in the same stream immediately prior to
* the callback. It thus synchronizes streams which have been "joined"
* prior to the callback.</li>
* <li>Adding device work to any stream does not have the effect of making
* the stream active until all preceding callbacks have executed. Thus, for
* example, a callback might use global attached memory even if work has
* been added to another stream, if it has been properly ordered with an
* event.</li>
* <li>Completion of a callback does not cause a stream to become
* active except as described above. The callback stream will remain idle
* if no device work follows the callback, and will remain idle across
* consecutive callbacks without device work in between. Thus, for example,
* stream synchronization can be done by signaling from a callback at the
* end of the stream.</li>
* </ul>
*
* \param stream - Stream to add callback to
* \param callback - The function to call once preceding stream operations are complete
* \param userData - User specified data to be passed to the callback function
* \param flags - Reserved for future use, must be 0
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorNotSupported
* \note_null_stream
* \notefnerr
*
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream,
cudaStreamCallback_t callback, void *userData, unsigned int flags);
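/*
 * Example (illustrative sketch): notify the host once all prior work in a
 * stream has finished. The callback makes no CUDA API calls, as required
 * above; 'stream' and the kernel are hypothetical.
 *
 *   static void CUDART_CB onDone(cudaStream_t s, cudaError_t status, void *ud)
 *   {
 *       *(volatile int *)ud = (status == cudaSuccess) ? 1 : -1;
 *   }
 *
 *   volatile int finished = 0;
 *   work<<<grid, block, 0, stream>>>();
 *   cudaStreamAddCallback(stream, onDone, (void *)&finished, 0);
 */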
/**
* \brief Waits for stream tasks to complete
*
* Blocks until \p stream has completed all operations. If the
* ::cudaDeviceScheduleBlockingSync flag was set for this device,
* the host thread will block until the stream is finished with
* all of its tasks.
*
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle
* \notefnerr
*
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy
*/
extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
/**
* \brief Queries an asynchronous stream for completion status
*
* Returns ::cudaSuccess if all operations in \p stream have
* completed, or ::cudaErrorNotReady if not.
*
* For the purposes of Unified Memory, a return value of ::cudaSuccess
* is equivalent to having called ::cudaStreamSynchronize().
*
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorNotReady,
* ::cudaErrorInvalidResourceHandle
* \notefnerr
*
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy
*/
extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
/**
* \brief Attach memory to a stream asynchronously
*
* Enqueues an operation in \p stream to specify stream association of
* \p length bytes of memory starting from \p devPtr. This function is a
 * stream-ordered operation, meaning that it is dependent on, and will
 * only take effect when, previous work in \p stream has completed. Any
* previous association is automatically replaced.
*
* \p devPtr must point to an address within managed memory space declared
* using the __managed__ keyword or allocated with ::cudaMallocManaged.
*
* \p length must be zero, to indicate that the entire allocation's
* stream association is being changed. Currently, it's not possible
* to change stream association for a portion of an allocation.
*
* The stream association is specified using \p flags which must be
* one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
* If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
* by any stream on any device.
* If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
* that it won't access the memory on the device from any stream.
* If the ::cudaMemAttachSingle flag is specified, the program makes a guarantee
* that it will only access the memory on the device from \p stream. It is illegal
* to attach singly to the NULL stream, because the NULL stream is a virtual global
* stream and not a specific stream. An error will be returned in this case.
*
* When memory is associated with a single stream, the Unified Memory system will
* allow CPU access to this memory region so long as all operations in \p stream
* have completed, regardless of whether other streams are active. In effect,
* this constrains exclusive ownership of the managed memory region by
* an active GPU to per-stream activity instead of whole-GPU activity.
*
* Accessing memory on the device from streams that are not associated with
* it will produce undefined results. No error checking is performed by the
* Unified Memory system to ensure that kernels launched into other streams
* do not access this region.
*
* It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
* via events, synchronization or other means to ensure legal access to memory
* at all times. Data visibility and coherency will be changed appropriately
* for all kernels which follow a stream-association change.
*
* If \p stream is destroyed while data is associated with it, the association is
* removed and the association reverts to the default visibility of the allocation
* as specified at ::cudaMallocManaged. For __managed__ variables, the default
* association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
* asynchronous operation, and as a result, the change to default association won't
* happen until all work in the stream has completed.
*
* \param stream - Stream in which to enqueue the attach operation
* \param devPtr - Pointer to memory (must be a pointer to managed memory)
* \param length - Length of memory (must be zero)
* \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle
*
* \return
* ::cudaSuccess,
* ::cudaErrorNotReady,
 * ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle
* \notefnerr
*
* \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags);
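/*
 * Example (illustrative sketch): attach a managed allocation to a single
 * stream so the CPU can access it once that stream is idle, even while
 * other streams keep the GPU busy. 'stream' and N are hypothetical.
 *
 *   float *data = NULL;
 *   cudaMallocManaged((void **)&data, N * sizeof(float), cudaMemAttachGlobal);
 *   cudaStreamAttachMemAsync(stream, data, 0, cudaMemAttachSingle);
 *   cudaStreamSynchronize(stream);  // attach takes effect in stream order
 *   data[0] = 1.0f;                 // CPU access is now legal while streams
 *                                   // other than 'stream' remain active
 */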
/** @} */ /* END CUDART_STREAM */
/**
* \defgroup CUDART_EVENT Event Management
*
* ___MANBRIEF___ event management functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the event management functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Creates an event object
*
* Creates an event object using ::cudaEventDefault.
*
* \param event - Newly created event
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorLaunchFailure,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)",
* ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
* ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
* ::cudaStreamWaitEvent
*/
extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
/**
* \brief Creates an event object with the specified flags
*
* Creates an event object with the specified flags. Valid flags include:
* - ::cudaEventDefault: Default event creation flag.
* - ::cudaEventBlockingSync: Specifies that event should use blocking
* synchronization. A host thread that uses ::cudaEventSynchronize() to wait
* on an event created with this flag will block until the event actually
* completes.
* - ::cudaEventDisableTiming: Specifies that the created event does not need
* to record timing data. Events created with this flag specified and
* the ::cudaEventBlockingSync flag not specified will provide the best
* performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
* - ::cudaEventInterprocess: Specifies that the created event may be used as an
* interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must
* be specified along with ::cudaEventDisableTiming.
*
* \param event - Newly created event
* \param flags - Flags for new event
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorLaunchFailure,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
* ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
* ::cudaStreamWaitEvent
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
/**
* \brief Records an event
*
 * Records an event. See note about NULL stream behavior. Since this operation
 * is asynchronous, ::cudaEventQuery() or ::cudaEventSynchronize() must
* be used to determine when the event has actually been recorded.
*
* If ::cudaEventRecord() has previously been called on \p event, then this
* call will overwrite any existing state in \p event. Any subsequent calls
* which examine the status of \p event will only examine the completion of
* this most recent call to ::cudaEventRecord().
*
* \param event - Event to record
* \param stream - Stream in which to record event
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorLaunchFailure
* \note_null_stream
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
* ::cudaEventCreateWithFlags, ::cudaEventQuery,
* ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
* ::cudaStreamWaitEvent
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
/**
* \brief Queries an event's status
*
* Query the status of all device work preceding the most recent call to
* ::cudaEventRecord() (in the appropriate compute streams, as specified by the
* arguments to ::cudaEventRecord()).
*
* If this work has successfully been completed by the device, or if
* ::cudaEventRecord() has not been called on \p event, then ::cudaSuccess is
* returned. If this work has not yet been completed by the device then
* ::cudaErrorNotReady is returned.
*
* For the purposes of Unified Memory, a return value of ::cudaSuccess
* is equivalent to having called ::cudaEventSynchronize().
*
* \param event - Event to query
*
* \return
* ::cudaSuccess,
* ::cudaErrorNotReady,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorLaunchFailure
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
* ::cudaEventCreateWithFlags, ::cudaEventRecord,
* ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime
*/
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
/**
* \brief Waits for an event to complete
*
* Wait until the completion of all device work preceding the most recent
* call to ::cudaEventRecord() (in the appropriate compute streams, as specified
* by the arguments to ::cudaEventRecord()).
*
* If ::cudaEventRecord() has not been called on \p event, ::cudaSuccess is
* returned immediately.
*
* Waiting for an event that was created with the ::cudaEventBlockingSync
* flag will cause the calling CPU thread to block until the event has
* been completed by the device. If the ::cudaEventBlockingSync flag has
* not been set, then the CPU thread will busy-wait until the event has
* been completed by the device.
*
* \param event - Event to wait for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorLaunchFailure
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
* ::cudaEventCreateWithFlags, ::cudaEventRecord,
* ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime
*/
extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event);
/**
* \brief Destroys an event object
*
* Destroys the event specified by \p event.
*
* In case \p event has been recorded but has not yet been completed
* when ::cudaEventDestroy() is called, the function will return immediately and
* the resources associated with \p event will be released automatically once
* the device has completed \p event.
*
* \param event - Event to destroy
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorLaunchFailure
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
* ::cudaEventCreateWithFlags, ::cudaEventQuery,
* ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
/**
* \brief Computes the elapsed time between events
*
* Computes the elapsed time between two events (in milliseconds with a
* resolution of around 0.5 microseconds).
*
* If either event was last recorded in a non-NULL stream, the resulting time
* may be greater than expected (even if both used the same stream handle). This
* happens because the ::cudaEventRecord() operation takes place asynchronously
* and there is no guarantee that the measured latency is actually just between
* the two events. Any number of other different stream operations could execute
* in between the two measured events, thus altering the timing in a significant
* way.
*
* If ::cudaEventRecord() has not been called on either event, then
* ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
* called on both events but one or both of them has not yet been completed
* (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
* of the events), ::cudaErrorNotReady is returned. If either event was created
* with the ::cudaEventDisableTiming flag, then this function will return
* ::cudaErrorInvalidResourceHandle.
*
* \param ms - Time between \p start and \p end in ms
* \param start - Starting event
* \param end - Ending event
*
* \return
* ::cudaSuccess,
* ::cudaErrorNotReady,
* ::cudaErrorInvalidValue,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorLaunchFailure
* \notefnerr
*
* \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
* ::cudaEventCreateWithFlags, ::cudaEventQuery,
* ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord
*/
extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
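/*
 * Example (illustrative sketch): time a kernel with a pair of events
 * recorded in the same stream. The kernel and 'stream' are hypothetical.
 *
 *   cudaEvent_t start, stop;
 *   cudaEventCreate(&start);
 *   cudaEventCreate(&stop);
 *   cudaEventRecord(start, stream);
 *   work<<<grid, block, 0, stream>>>();
 *   cudaEventRecord(stop, stream);
 *   cudaEventSynchronize(stop);        // wait for 'stop' to complete
 *   float ms = 0.0f;
 *   cudaEventElapsedTime(&ms, start, stop);
 *   cudaEventDestroy(start);
 *   cudaEventDestroy(stop);
 */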
/** @} */ /* END CUDART_EVENT */
/**
* \defgroup CUDART_EXECUTION Execution Control
*
* ___MANBRIEF___ execution control functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the execution control functions of the CUDA runtime
* application programming interface.
*
* Some functions have overloaded C++ API template versions documented separately in the
* \ref CUDART_HIGHLEVEL "C++ API Routines" module.
*
* @{
*/
#if CUDART_VERSION >= 7000
/**
* \brief Launches a device function
*
 * The function invokes the kernel \p func on a \p gridDim (\p gridDim.x × \p gridDim.y
 * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
 * \p blockDim.y × \p blockDim.z) threads.
*
 * If the kernel has N parameters, then \p args should point to an array of N
 * pointers. Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points
 * to the region of memory from which the actual parameter will be copied.
*
* For templated functions, pass the function symbol as follows:
* func_name<template_arg_0,...,template_arg_N>
*
* \p sharedMem sets the amount of dynamic shared memory that will be available to
* each thread block.
*
 * \p stream specifies the stream with which the invocation is associated.
*
* \param func - Device function symbol
 * \param gridDim - Grid dimensions
 * \param blockDim - Block dimensions
* \param args - Arguments
* \param sharedMem - Shared memory
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration,
* ::cudaErrorLaunchFailure,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources,
* ::cudaErrorSharedObjectInitFailed
* \note_null_stream
* \notefnerr
*
* \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
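/*
 * Example (illustrative sketch): launch a hypothetical kernel taking
 * (float *, int) through ::cudaLaunchKernel. Each args[i] points at the
 * storage holding the i-th parameter, per the description above.
 *
 *   __global__ void scale(float *v, int n);   // hypothetical kernel
 *
 *   float *d_v;   // assumed to be a valid device allocation
 *   int n = 1024;
 *   void *args[] = { &d_v, &n };
 *   dim3 grid((n + 255) / 256), block(256);
 *   cudaLaunchKernel((const void *)scale, grid, block, args, 0, 0);
 */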
#endif /* CUDART_VERSION >= 7000 */
/**
* \brief Sets the preferred cache configuration for a device function
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this sets through \p cacheConfig the preferred cache configuration
* for the function specified via \p func. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free to
* choose a different configuration if required to execute \p func.
*
* \p func is a device function symbol and must be declared as a
* \c __global__ function. If the specified function does not exist,
* then ::cudaErrorInvalidDeviceFunction is returned. For templated functions,
* pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
*
* This setting does nothing on devices where the size of the L1 cache and
* shared memory are fixed.
*
* Launching a kernel with a different preference than the most recent
* preference setting may insert a device-side synchronization point.
*
* The supported cache configurations are:
* - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
* - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
* - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
* - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
*
* \param func - Device function symbol
* \param cacheConfig - Requested cache configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction
* \notefnerr
* \note_string_api_deprecation2
*
* \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
* \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
* ::cudaThreadGetCacheConfig,
* ::cudaThreadSetCacheConfig
*/
extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
/**
* \brief Sets the shared memory configuration for a device function
*
* On devices with configurable shared memory banks, this function will
* force all subsequent launches of the specified device function to have
* the given shared memory bank size configuration. On any given launch of the
* function, the shared memory configuration of the device will be temporarily
* changed if needed to suit the function's preferred configuration. Changes in
 * shared memory configuration between subsequent launches of functions
 * may introduce a device-side synchronization point.
*
* Any per-function setting of shared memory bank size set via
* ::cudaFuncSetSharedMemConfig will override the device wide setting set by
* ::cudaDeviceSetSharedMemConfig.
*
* Changing the shared memory bank size will not increase shared memory usage
* or affect occupancy of kernels, but may have major effects on performance.
* Larger bank sizes will allow for greater potential bandwidth to shared memory,
* but will change what kinds of accesses to shared memory will result in bank
* conflicts.
*
* This function will do nothing on devices with fixed shared memory bank size.
*
* For templated functions, pass the function symbol as follows:
* func_name<template_arg_0,...,template_arg_N>
*
* The supported bank configurations are:
* - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration
* when launching this function.
* - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be
* four bytes natively when launching this function.
* - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight
* bytes natively when launching this function.
*
* \param func - Device function symbol
* \param config - Requested shared memory configuration
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidValue
* \notefnerr
* \note_string_api_deprecation2
*
* \sa ::cudaConfigureCall,
* ::cudaDeviceSetSharedMemConfig,
* ::cudaDeviceGetSharedMemConfig,
* ::cudaDeviceSetCacheConfig,
* ::cudaDeviceGetCacheConfig,
* ::cudaFuncSetCacheConfig
*/
extern __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);
/**
* \brief Find out attributes for a given function
*
* This function obtains the attributes of a function specified via \p func.
* \p func is a device function symbol and must be declared as a
* \c __global__ function. The fetched attributes are placed in \p attr.
* If the specified function does not exist, then
* ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass
* the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
*
* Note that some function attributes such as
* \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
* may vary based on the device that is currently being used.
*
* \param attr - Return pointer to function's attributes
* \param func - Device function symbol
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction
* \notefnerr
* \note_string_api_deprecation2
*
* \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
* \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
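/*
 * Example (illustrative sketch): inspect a kernel's resource usage before
 * choosing a launch configuration. 'scale' is the hypothetical kernel
 * from the ::cudaLaunchKernel example above.
 *
 *   struct cudaFuncAttributes attr;
 *   if (cudaFuncGetAttributes(&attr, (const void *)scale) == cudaSuccess)
 *       printf("regs/thread=%d staticSmem=%zu maxThreads/block=%d\n",
 *              attr.numRegs, attr.sharedSizeBytes, attr.maxThreadsPerBlock);
 */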
/**
* \brief Converts a double argument to be executed on a device
*
 * \deprecated This function is deprecated as of CUDA 7.5
 *
 * Converts the double value of \p d to an internal float representation if
 * the device does not support double arithmetic. If the device does natively
 * support doubles, then this function does nothing.
 *
 * \param d - Double to convert
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d);
/**
* \brief Converts a double argument after execution on a device
*
* \deprecated This function is deprecated as of CUDA 7.5
*
* Converts the double value of \p d from a potentially internal float
* representation if the device does not support double arithmetic. If the
* device does natively support doubles, then this function does nothing.
*
* \param d - Double to convert
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
* ::cudaSetDoubleForDevice,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
/** @} */ /* END CUDART_EXECUTION */
#if CUDART_VERSION >= 6050
/**
* \defgroup CUDART_OCCUPANCY Occupancy
*
* ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the occupancy calculation functions of the CUDA runtime
* application programming interface.
*
* Besides the occupancy calculator functions
* (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags),
* there are also C++ only occupancy-based launch configuration functions documented in
* \ref CUDART_HIGHLEVEL "C++ API Routines" module.
*
* See
* \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
*
* @{
*/
/**
* \brief Returns occupancy for a device function
*
* Returns in \p *numBlocks the maximum number of active blocks per
* streaming multiprocessor for the device function.
*
* \param numBlocks - Returned occupancy
* \param func - Kernel function for which occupancy is calculated
* \param blockSize - Block size the kernel is intended to be launched with
* \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
*
* \return
* ::cudaSuccess,
* ::cudaErrorCudartUnloading,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidValue,
 * ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
* \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)"
* \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)"
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
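/*
 * Example (illustrative sketch): estimate theoretical occupancy for a
 * candidate block size. 'scale' is the hypothetical kernel used in
 * earlier examples; device 0 is assumed.
 *
 *   int numBlocks = 0, blockSize = 256;
 *   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
 *       (const void *)scale, blockSize, 0);   // no dynamic shared memory
 *   int maxThreadsPerSM = 0;
 *   cudaDeviceGetAttribute(&maxThreadsPerSM,
 *       cudaDevAttrMaxThreadsPerMultiProcessor, 0);
 *   double occupancy = (double)(numBlocks * blockSize) / maxThreadsPerSM;
 */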
#if CUDART_VERSION >= 7000
/**
* \brief Returns occupancy for a device function with the specified flags
*
* Returns in \p *numBlocks the maximum number of active blocks per
* streaming multiprocessor for the device function.
*
* The \p flags parameter controls how special cases are handled. Valid flags include:
*
* - ::cudaOccupancyDefault: keeps the default behavior as
* ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
*
 * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
 *   on platforms where global caching affects occupancy. On such platforms, if caching
 *   is enabled, but per-block SM resource usage would result in zero occupancy, the
 *   occupancy calculator will calculate the occupancy as if caching is disabled.
 *   Setting this flag makes the occupancy calculator return 0 in such cases.
* More information can be found about this feature in the "Unified L1/Texture Cache"
* section of the Maxwell tuning guide.
*
* \param numBlocks - Returned occupancy
* \param func - Kernel function for which occupancy is calculated
* \param blockSize - Block size the kernel is intended to be launched with
* \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
* \param flags - Requested behavior for the occupancy calculator
*
* \return
* ::cudaSuccess,
* ::cudaErrorCudartUnloading,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidValue,
 * ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor,
* \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
* \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)"
* \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)"
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
/** @} */ /* END CUDART_OCCUPANCY */
#endif /* CUDART_VERSION >= 7000 */
#endif /* CUDART_VERSION >= 6050 */
/**
* \defgroup CUDART_EXECUTION_DEPRECATED Execution Control [DEPRECATED]
*
* ___MANBRIEF___ deprecated execution control functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the deprecated execution control functions of the CUDA runtime
* application programming interface.
*
* Some functions have overloaded C++ API template versions documented separately in the
* \ref CUDART_HIGHLEVEL "C++ API Routines" module.
*
* @{
*/
/**
* \brief Configure a device-launch
*
* \deprecated This function is deprecated as of CUDA 7.0
*
* Specifies the grid and block dimensions for the device call to be executed
* similar to the execution configuration syntax. ::cudaConfigureCall() is
* stack based. Each call pushes data on top of an execution stack. This data
* contains the dimension for the grid and thread blocks, together with any
* arguments for the call.
*
* \param gridDim - Grid dimensions
* \param blockDim - Block dimensions
* \param sharedMem - Shared memory
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidConfiguration
* \note_null_stream
* \notefnerr
*
* \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
* \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
*/
extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
/**
* \brief Configure a device launch
*
* \deprecated This function is deprecated as of CUDA 7.0
*
* Pushes \p size bytes of the argument pointed to by \p arg at \p offset
* bytes from the start of the parameter passing area, which starts at
* offset 0. The arguments are stored in the top of the execution stack.
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument()"
* must be preceded by a call to ::cudaConfigureCall().
*
* \param arg - Argument to push for a kernel launch
* \param size - Size of argument
* \param offset - Offset in argument stack to push new arg
*
* \return
* ::cudaSuccess
* \notefnerr
*
 * \sa \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
* \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)",
*/
extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset);
/**
* \brief Launches a device function
*
* \deprecated This function is deprecated as of CUDA 7.0
*
* Launches the function \p func on the device. The parameter \p func must
* be a device function symbol. The parameter specified by \p func must be
* declared as a \p __global__ function. For templated functions, pass the
* function symbol as follows: func_name<template_arg_0,...,template_arg_N>
* \ref ::cudaLaunch(const void*) "cudaLaunch()" must be preceded by a call to
* ::cudaConfigureCall() since it pops the data that was pushed by
* ::cudaConfigureCall() from the execution stack.
*
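 * As an illustrative sketch, the deprecated launch sequence implemented by
 * ::cudaConfigureCall(), ::cudaSetupArgument() and ::cudaLaunch(), assuming a
 * hypothetical __global__ function \p MyKernel taking one \p int (new code
 * should use ::cudaLaunchKernel() instead):
 * \code
   cudaConfigureCall(dim3(16, 1, 1), dim3(256, 1, 1), 0, 0); // push config
   int arg = 42;
   cudaSetupArgument(&arg, sizeof(arg), 0);                  // push argument
   cudaLaunch((const void*)MyKernel);                        // pop and launch
 \endcode
 *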
* \param func - Device function symbol
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration,
* ::cudaErrorLaunchFailure,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources,
* ::cudaErrorSharedObjectInitFailed
* \notefnerr
* \note_string_api_deprecation_50
*
 * \sa \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
* \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
* ::cudaThreadGetCacheConfig,
* ::cudaThreadSetCacheConfig
*/
extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func);
/** @} */ /* END CUDART_EXECUTION_DEPRECATED */
/**
* \defgroup CUDART_MEMORY Memory Management
*
* ___MANBRIEF___ memory management functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the memory management functions of the CUDA runtime
* application programming interface.
*
* Some functions have overloaded C++ API template versions documented separately in the
* \ref CUDART_HIGHLEVEL "C++ API Routines" module.
*
* @{
*/
/**
* \brief Allocates memory that will be automatically managed by the Unified Memory system
*
* Allocates \p size bytes of managed memory on the device and returns in
* \p *devPtr a pointer to the allocated memory. If the device doesn't support
* allocating managed memory, ::cudaErrorNotSupported is returned. Support
* for managed memory can be queried using the device attribute
* ::cudaDevAttrManagedMemory. The allocated memory is suitably
* aligned for any kind of variable. The memory is not cleared. If \p size
* is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
* is valid on the CPU and on all GPUs in the system that support managed memory.
* All accesses to this pointer must obey the Unified Memory programming model.
*
* \p flags specifies the default stream association for this allocation.
* \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost.
* If ::cudaMemAttachGlobal is specified, then this memory is accessible from
* any stream on any device. If ::cudaMemAttachHost is specified, then the
* allocation is created with initial visibility restricted to host access only;
* an explicit call to ::cudaStreamAttachMemAsync will be required to enable access
* on the device.
*
* If the association is later changed via ::cudaStreamAttachMemAsync to
 * a single stream, the default association, as specified during ::cudaMallocManaged,
* is restored when that stream is destroyed. For __managed__ variables, the
* default association is always ::cudaMemAttachGlobal. Note that destroying a
* stream is an asynchronous operation, and as a result, the change to default
* association won't happen until all work in the stream has completed.
*
* Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
*
* On a multi-GPU system with peer-to-peer support, where multiple GPUs support
* managed memory, the physical storage is created on the GPU which is active
* at the time ::cudaMallocManaged is called. All other GPUs will reference the
* data at reduced bandwidth via peer mappings over the PCIe bus. The Unified
* Memory management system does not migrate memory between GPUs.
*
* On a multi-GPU system where multiple GPUs support managed memory, but not
* all pairs of such GPUs have peer-to-peer support between them, the physical
* storage is created in 'zero-copy' or system memory. All GPUs will reference
* the data at reduced bandwidth over the PCIe bus. In these circumstances,
* use of the environment variable, CUDA_VISIBLE_DEVICES, is recommended to
* restrict CUDA to only use those GPUs that have peer-to-peer support.
* Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
* value to force the driver to always use device memory for physical storage.
* When this environment variable is set to a non-zero value, all devices used in
* that process that support managed memory have to be peer-to-peer compatible
* with each other. The error ::cudaErrorInvalidDevice will be returned if a device
* that supports managed memory is used and it is not peer-to-peer compatible with
* any of the other managed memory supporting devices that were previously used in
* that process, even if ::cudaDeviceReset has been called on those devices. These
* environment variables are described in the CUDA programming guide under the
* "CUDA environment variables" section.
*
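 * As an illustrative sketch, a managed allocation accessed directly from the
 * host (error checking omitted):
 * \code
   int *data;
   cudaMallocManaged((void**)&data, 256 * sizeof(int), cudaMemAttachGlobal);
   for (int i = 0; i < 256; i++)   // direct host access, no explicit copies
       data[i] = i;
   // ... launch kernels that read or write data ...
   cudaDeviceSynchronize();        // drain device work before reusing on host
   cudaFree(data);
 \endcode
 *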
* \param devPtr - Pointer to allocated device memory
* \param size - Requested allocation size in bytes
* \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost
*
* \return
* ::cudaSuccess,
 * ::cudaErrorMemoryAllocation,
 * ::cudaErrorNotSupported,
 * ::cudaErrorInvalidValue
*
* \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
* ::cudaMalloc3D, ::cudaMalloc3DArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
/**
* \brief Allocate memory on the device
*
* Allocates \p size bytes of linear memory on the device and returns in
* \p *devPtr a pointer to the allocated memory. The allocated memory is
* suitably aligned for any kind of variable. The memory is not cleared.
* ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure.
*
* The device version of ::cudaFree cannot be used with a \p *devPtr
* allocated using the host API, and vice versa.
*
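 * A minimal allocation sketch (error checking omitted):
 * \code
   float *devPtr;
   cudaMalloc((void**)&devPtr, 1024 * sizeof(float));
   // ... use devPtr in kernels or memory copies ...
   cudaFree(devPtr);
 \endcode
 *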
* \param devPtr - Pointer to allocated device memory
* \param size - Requested allocation size in bytes
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
*
* \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
* ::cudaMalloc3D, ::cudaMalloc3DArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
/**
* \brief Allocates page-locked memory on the host
*
* Allocates \p size bytes of host memory that is page-locked and accessible
* to the device. The driver tracks the virtual memory ranges allocated with
* this function and automatically accelerates calls to functions such as
* ::cudaMemcpy*(). Since the memory can be accessed directly by the device,
* it can be read or written with much higher bandwidth than pageable memory
* obtained with functions such as ::malloc(). Allocating excessive amounts of
* memory with ::cudaMallocHost() may degrade system performance, since it
* reduces the amount of memory available to the system for paging. As a
* result, this function is best used sparingly to allocate staging areas for
* data exchange between host and device.
*
* \param ptr - Pointer to allocated host memory
* \param size - Requested allocation size in bytes
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D,
* ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)",
* ::cudaFreeHost, ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size);
/**
* \brief Allocates pitched memory on the device
*
* Allocates at least \p width (in bytes) * \p height bytes of linear memory
* on the device and returns in \p *devPtr a pointer to the allocated memory.
* The function may pad the allocation to ensure that corresponding pointers
* in any given row will continue to meet the alignment requirements for
* coalescing as the address is updated from row to row. The pitch returned in
* \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation.
* The intended usage of \p pitch is as a separate parameter of the allocation,
* used to compute addresses within the 2D array. Given the row and column of
* an array element of type \p T, the address is computed as:
* \code
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
\endcode
*
* For allocations of 2D arrays, it is recommended that programmers consider
* performing pitch allocations using ::cudaMallocPitch(). Due to pitch
* alignment restrictions in the hardware, this is especially true if the
* application will be performing 2D memory copies between different regions
* of device memory (whether linear memory or CUDA arrays).
*
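 * For example, a sketch that allocates a 64x64 region of \p float and forms
 * the address of one element with the formula above (error checking omitted):
 * \code
   float *devPtr;
   size_t pitch;
   cudaMallocPitch((void**)&devPtr, &pitch, 64 * sizeof(float), 64);
   // device address of element (row 5, column 7); dereference in device code
   float *pElement = (float*)((char*)devPtr + 5 * pitch) + 7;
 \endcode
 *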
* \param devPtr - Pointer to allocated pitched device memory
* \param pitch - Pitch for allocation
* \param width - Requested pitched allocation width (in bytes)
* \param height - Requested pitched allocation height
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
* ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
/**
* \brief Allocate an array on the device
*
* Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
* \p desc and returns a handle to the new CUDA array in \p *array.
*
* The ::cudaChannelFormatDesc is defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
\endcode
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* The \p flags parameter enables different options to be specified that affect
* the allocation, as follows.
* - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
* - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference
* - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array.
*
* \p width and \p height must meet certain size requirements. See ::cudaMalloc3DArray() for more details.
*
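 * A sketch allocating a 128x128 array of \p float that can also be written
 * through a surface reference (error checking omitted):
 * \code
   struct cudaChannelFormatDesc desc =
       cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
   cudaArray_t array;
   cudaMallocArray(&array, &desc, 128, 128, cudaArraySurfaceLoadStore);
   // ... copy data in, bind to a texture or surface reference ...
   cudaFreeArray(array);
 \endcode
 *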
* \param array - Pointer to allocated array in device memory
* \param desc - Requested channel format
* \param width - Requested array allocation width
* \param height - Requested array allocation height
* \param flags - Requested properties of allocated array
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
* ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
/**
* \brief Frees memory on the device
*
* Frees the memory space pointed to by \p devPtr, which must have been
* returned by a previous call to ::cudaMalloc() or ::cudaMallocPitch().
* Otherwise, or if ::cudaFree(\p devPtr) has already been called before,
* an error is returned. If \p devPtr is 0, no operation is performed.
* ::cudaFree() returns ::cudaErrorInvalidDevicePointer in case of failure.
*
* The device version of ::cudaFree cannot be used with a \p *devPtr
* allocated using the host API, and vice versa.
*
* \param devPtr - Device pointer to memory to free
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
* ::cudaHostAlloc
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
/**
* \brief Frees page-locked memory
*
* Frees the memory space pointed to by \p hostPtr, which must have been
* returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc().
*
* \param ptr - Pointer to memory to free
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
* ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
/**
* \brief Frees an array on the device
*
 * Frees the CUDA array \p array, which must have been returned by a
 * previous call to ::cudaMallocArray(). If ::cudaFreeArray(\p array) has
 * already been called before, ::cudaErrorInvalidValue is returned. If
 * \p array is 0, no operation is performed.
*
* \param array - Pointer to array to free
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array);
/**
* \brief Frees a mipmapped array on the device
*
* Frees the CUDA mipmapped array \p mipmappedArray, which must have been
* returned by a previous call to ::cudaMallocMipmappedArray().
* If ::cudaFreeMipmappedArray(\p mipmappedArray) has already been called before,
* ::cudaErrorInvalidValue is returned.
*
* \param mipmappedArray - Pointer to mipmapped array to free
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInitializationError
* \notefnerr
*
* \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray);
/**
* \brief Allocates page-locked memory on the host
*
* Allocates \p size bytes of host memory that is page-locked and accessible
* to the device. The driver tracks the virtual memory ranges allocated with
* this function and automatically accelerates calls to functions such as
* ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
* can be read or written with much higher bandwidth than pageable memory
* obtained with functions such as ::malloc(). Allocating excessive amounts of
* pinned memory may degrade system performance, since it reduces the amount
* of memory available to the system for paging. As a result, this function is
* best used sparingly to allocate staging areas for data exchange between host
* and device.
*
* The \p flags parameter enables different options to be specified that affect
* the allocation, as follows.
* - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes
* ::cudaHostAlloc() to emulate ::cudaMallocHost().
* - ::cudaHostAllocPortable: The memory returned by this call will be
* considered as pinned memory by all CUDA contexts, not just the one that
* performed the allocation.
* - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
* The device pointer to the memory may be obtained by calling
* ::cudaHostGetDevicePointer().
* - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
* WC memory can be transferred across the PCI Express bus more quickly on some
* system configurations, but cannot be read efficiently by most CPUs. WC
* memory is a good option for buffers that will be written by the CPU and read
* by the device via mapped pinned memory or host->device transfers.
*
* All of these flags are orthogonal to one another: a developer may allocate
* memory that is portable, mapped and/or write-combined with no restrictions.
*
* ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
* flag in order for the ::cudaHostAllocMapped flag to have any effect.
*
* The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
* that do not support mapped pinned memory. The failure is deferred to
* ::cudaHostGetDevicePointer() because the memory may be mapped into other
* CUDA contexts via the ::cudaHostAllocPortable flag.
*
* Memory allocated by this function must be freed with ::cudaFreeHost().
*
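 * A sketch allocating mapped, pinned memory and retrieving the matching
 * device pointer (assumes the device supports mapped pinned memory; error
 * checking omitted):
 * \code
   float *hostPtr, *devPtr;
   cudaSetDeviceFlags(cudaDeviceMapHost);   // before the context is created
   cudaHostAlloc((void**)&hostPtr, 1024 * sizeof(float), cudaHostAllocMapped);
   cudaHostGetDevicePointer((void**)&devPtr, hostPtr, 0);
   // kernels may dereference devPtr; the host uses hostPtr
   cudaFreeHost(hostPtr);
 \endcode
 *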
 * \param pHost - Host pointer to allocated memory
* \param size - Requested allocation size in bytes
* \param flags - Requested properties of allocated memory
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaSetDeviceFlags,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost
*/
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
/**
* \brief Registers an existing host memory range for use by CUDA
*
* Page-locks the memory range specified by \p ptr and \p size and maps it
* for the device(s) as specified by \p flags. This memory range also is added
* to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate
* calls to functions such as ::cudaMemcpy(). Since the memory can be accessed
* directly by the device, it can be read or written with much higher bandwidth
* than pageable memory that has not been registered. Page-locking excessive
* amounts of memory may degrade system performance, since it reduces the amount
* of memory available to the system for paging. As a result, this function is
* best used sparingly to register staging areas for data exchange between
* host and device.
*
* The \p flags parameter enables different options to be specified that
* affect the allocation, as follows.
*
* - ::cudaHostRegisterDefault: On a system with unified virtual addressing,
* the memory will be both mapped and portable. On a system with no unified
* virtual addressing, the memory will be neither mapped nor portable.
*
* - ::cudaHostRegisterPortable: The memory returned by this call will be
* considered as pinned memory by all CUDA contexts, not just the one that
* performed the allocation.
*
* - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address
* space. The device pointer to the memory may be obtained by calling
* ::cudaHostGetDevicePointer().
*
 * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as
 *   pointing to some memory-mapped I/O space, e.g. belonging to a
 *   third-party PCIe device, and it will be marked as non-cache-coherent and
 *   contiguous.
*
* All of these flags are orthogonal to one another: a developer may page-lock
* memory that is portable or mapped with no restrictions.
*
 * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
 * flag in order for the ::cudaHostRegisterMapped flag to have any effect.
*
* The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for
* devices that do not support mapped pinned memory. The failure is deferred
* to ::cudaHostGetDevicePointer() because the memory may be mapped into
* other CUDA contexts via the ::cudaHostRegisterPortable flag.
*
* The memory page-locked by this function must be unregistered with ::cudaHostUnregister().
*
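 * A sketch that page-locks an existing heap buffer so transfers take the
 * pinned fast path (error checking omitted):
 * \code
   size_t bytes = 1 << 20;
   void *buf = malloc(bytes);
   cudaHostRegister(buf, bytes, cudaHostRegisterDefault);
   // ... cudaMemcpy() to/from buf is now accelerated ...
   cudaHostUnregister(buf);
   free(buf);
 \endcode
 *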
* \param ptr - Host pointer to memory to page-lock
 * \param size - Size in bytes of the address range to page-lock
* \param flags - Flags for allocation request
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorMemoryAllocation,
* ::cudaErrorHostMemoryAlreadyRegistered
* \notefnerr
*
* \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer
*/
extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
/**
* \brief Unregisters a memory range that was registered with cudaHostRegister
*
* Unmaps the memory range whose base address is specified by \p ptr, and makes
* it pageable again.
*
* The base address must be the same one specified to ::cudaHostRegister().
*
* \param ptr - Host pointer to memory to unregister
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaHostUnregister
*/
extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);
/**
* \brief Passes back device pointer of mapped host memory allocated by
* cudaHostAlloc or registered by cudaHostRegister
*
* Passes back the device pointer corresponding to the mapped, pinned host
* buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister().
*
* ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was
* not specified before deferred context creation occurred, or if called on a
* device that does not support mapped, pinned memory.
*
 * \p flags is provided for future releases. For now, it must be set to 0.
*
* \param pDevice - Returned device pointer for mapped memory
* \param pHost - Requested host pointer mapping
* \param flags - Flags for extensions (must be 0 for now)
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaSetDeviceFlags, ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
/**
* \brief Passes back flags used to allocate pinned host memory allocated by
* cudaHostAlloc
*
* ::cudaHostGetFlags() will fail if the input pointer does not
* reside in an address range allocated by ::cudaHostAlloc().
*
* \param pFlags - Returned flags word
* \param pHost - Host pointer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaHostAlloc
*/
extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost);
/**
* \brief Allocates logical 1D, 2D, or 3D memory objects on the device
*
* Allocates at least \p width * \p height * \p depth bytes of linear memory
* on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer
* to the allocated memory. The function may pad the allocation to ensure
* hardware alignment requirements are met. The pitch returned in the \p pitch
* field of \p pitchedDevPtr is the width in bytes of the allocation.
*
* The returned ::cudaPitchedPtr contains additional fields \p xsize and
* \p ysize, the logical width and height of the allocation, which are
* equivalent to the \p width and \p height \p extent parameters provided by
* the programmer during allocation.
*
* For allocations of 2D and 3D objects, it is highly recommended that
* programmers perform allocations using ::cudaMalloc3D() or
* ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is
* especially true if the application will be performing memory copies
* involving 2D or 3D objects (whether linear memory or CUDA arrays).
*
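 * A sketch allocating a 64x64x64 volume of \p float; note that the \p width
 * field of the extent is given in bytes (error checking omitted):
 * \code
   struct cudaExtent extent = make_cudaExtent(64 * sizeof(float), 64, 64);
   struct cudaPitchedPtr devPitchedPtr;
   cudaMalloc3D(&devPitchedPtr, extent);
   // devPitchedPtr.pitch holds the padded row width in bytes
   cudaFree(devPitchedPtr.ptr);
 \endcode
 *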
* \param pitchedDevPtr - Pointer to allocated pitched device memory
* \param extent - Requested allocation size (\p width field in bytes)
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D,
* ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent
*/
extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
/**
* \brief Allocate an array on the device
*
* Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
* \p desc and returns a handle to the new CUDA array in \p *array.
*
* The ::cudaChannelFormatDesc is defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
\endcode
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* ::cudaMalloc3DArray() can allocate the following:
*
* - A 1D array is allocated if the height and depth extents are both zero.
* - A 2D array is allocated if only the depth extent is zero.
* - A 3D array is allocated if all three extents are non-zero.
* - A 1D layered CUDA array is allocated if only the height extent is zero and
* the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is
* determined by the depth extent.
* - A 2D layered CUDA array is allocated if all three extents are non-zero and
* the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is
* determined by the depth extent.
* - A cubemap CUDA array is allocated if all three extents are non-zero and the
* cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is
* a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube.
* The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
* - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both,
* cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be
* a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists
* of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form
* the second cubemap, and so on.
*
*
* The \p flags parameter enables different options to be specified that affect
* the allocation, as follows.
* - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
* - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers
* - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six.
* If the cudaArrayLayered flag is also set, depth must be a multiple of six.
* - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface
* reference.
* - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA
* array. Texture gather can only be performed on 2D CUDA arrays.
*
* The width, height and depth extents must meet certain size requirements as listed in the following table.
* All values are specified in elements.
*
* Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that
* case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0).
*
* \xmlonly
* <table outputclass="xmlonly">
* <tgroup cols="3" colsep="1" rowsep="1">
* <colspec colname="c1" colwidth="1.0*"/>
* <colspec colname="c2" colwidth="3.0*"/>
* <colspec colname="c3" colwidth="3.0*"/>
* <thead>
* <row>
* <entry>CUDA array type</entry>
* <entry>Valid extents that must always be met {(width range in elements),
* (height range), (depth range)}</entry>
* <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
* elements), (height range), (depth range)}</entry>
* </row>
* </thead>
* <tbody>
* <row>
* <entry>1D</entry>
* <entry>{ (1,maxTexture1D), 0, 0 }</entry>
* <entry>{ (1,maxSurface1D), 0, 0 }</entry>
* </row>
* <row>
* <entry>2D</entry>
* <entry>{ (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 }</entry>
* <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
* </row>
* <row>
* <entry>3D</entry>
* <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
* OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
* (1,maxTexture3DAlt[2]) }</entry>
* <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
* </row>
* <row>
* <entry>1D Layered</entry>
* <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
* <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
* </row>
* <row>
* <entry>2D Layered</entry>
* <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
* (1,maxTexture2DLayered[2]) }</entry>
* <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
* (1,maxSurface2DLayered[2]) }</entry>
* </row>
* <row>
* <entry>Cubemap</entry>
* <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
* <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
* </row>
* <row>
* <entry>Cubemap Layered</entry>
* <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
* (1,maxTextureCubemapLayered[1]) }</entry>
* <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
* (1,maxSurfaceCubemapLayered[1]) }</entry>
* </row>
* </tbody>
* </tgroup>
* </table>
* \endxmlonly
*
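 * For example, a sketch allocating a 2D layered array with four layers; the
 * extents are in elements and the depth extent selects the layer count
 * (error checking omitted):
 * \code
   struct cudaChannelFormatDesc desc =
       cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
   struct cudaExtent extent = make_cudaExtent(256, 256, 4); // 4 layers
   cudaArray_t layeredArray;
   cudaMalloc3DArray(&layeredArray, &desc, extent, cudaArrayLayered);
   cudaFreeArray(layeredArray);
 \endcode
 *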
* \param array - Pointer to allocated array in device memory
* \param desc - Requested channel format
* \param extent - Requested allocation size (\p width field in elements)
* \param flags - Flags for extensions
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
* ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc,
* ::make_cudaExtent
*/
extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0));
/**
* \brief Allocate a mipmapped array on the device
*
* Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure
* \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray.
* \p numLevels specifies the number of mipmap levels to be allocated. This value is
* clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
*
* The ::cudaChannelFormatDesc is defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
\endcode
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* ::cudaMallocMipmappedArray() can allocate the following:
*
* - A 1D mipmapped array is allocated if the height and depth extents are both zero.
* - A 2D mipmapped array is allocated if only the depth extent is zero.
* - A 3D mipmapped array is allocated if all three extents are non-zero.
* - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and
* the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is
* determined by the depth extent.
* - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
* the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is
* determined by the depth extent.
* - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
* cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six.
* The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
* - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both,
* cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be
* a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped
* array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the
* first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on.
*
*
* The \p flags parameter enables different options to be specified that affect
* the allocation, as follows.
* - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation
* - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers
* - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six.
* If the cudaArrayLayered flag is also set, depth must be a multiple of six.
* - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array
* will be read from or written to using a surface reference.
* - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA
* array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are
* performed only on the most detailed mipmap level.
*
* The width, height and depth extents must meet certain size requirements as listed in the following table.
* All values are specified in elements.
*
* \xmlonly
* <table outputclass="xmlonly">
* <tgroup cols="2" colsep="1" rowsep="1">
* <colspec colname="c1" colwidth="1.0*"/>
* <colspec colname="c2" colwidth="3.0*"/>
* <thead>
* <row>
* <entry>CUDA array type</entry>
* <entry>Valid extents {(width range in elements), (height range), (depth
* range)}</entry>
* </row>
* </thead>
* <tbody>
* <row>
* <entry>1D</entry>
* <entry>{ (1,maxTexture1DMipmap), 0, 0 }</entry>
* </row>
* <row>
* <entry>2D</entry>
* <entry>{ (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 }</entry>
* </row>
* <row>
* <entry>3D</entry>
* <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }</entry>
* </row>
* <row>
* <entry>1D Layered</entry>
* <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
* </row>
* <row>
* <entry>2D Layered</entry>
* <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
* (1,maxTexture2DLayered[2]) }</entry>
* </row>
* <row>
* <entry>Cubemap</entry>
* <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
* </row>
* <row>
* <entry>Cubemap Layered</entry>
* <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
* (1,maxTextureCubemapLayered[1]) }</entry>
* </row>
* </tbody>
* </tgroup>
* </table>
* \endxmlonly
*
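 * A sketch allocating a full mipmap chain for a 256x256 2D array and
 * fetching its most detailed level (error checking omitted):
 * \code
   struct cudaChannelFormatDesc desc =
       cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
   struct cudaExtent extent = make_cudaExtent(256, 256, 0);
   cudaMipmappedArray_t mipArray;
   cudaMallocMipmappedArray(&mipArray, &desc, extent, 9, 0); // 1+log2(256)
   cudaArray_t level0;
   cudaGetMipmappedArrayLevel(&level0, mipArray, 0);
   cudaFreeMipmappedArray(mipArray);
 \endcode
 *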
* \param mipmappedArray - Pointer to allocated mipmapped array in device memory
* \param desc - Requested channel format
* \param extent - Requested allocation size (\p width field in elements)
* \param numLevels - Number of mipmap levels to allocate
* \param flags - Flags for extensions
*
* \return
* ::cudaSuccess,
* ::cudaErrorMemoryAllocation
* \notefnerr
*
* \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
* ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc,
* ::make_cudaExtent
*/
extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));
/**
* \brief Gets a mipmap level of a CUDA mipmapped array
*
* Returns in \p *levelArray a CUDA array that represents a single mipmap level
* of the CUDA mipmapped array \p mipmappedArray.
*
* If \p level is greater than the maximum number of levels in this mipmapped array,
* ::cudaErrorInvalidValue is returned.
*
* \param levelArray - Returned mipmap level CUDA array
* \param mipmappedArray - CUDA mipmapped array
* \param level - Mipmap level
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
* ::cudaFreeArray,
* \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
* ::cudaFreeHost, ::cudaHostAlloc,
* ::make_cudaExtent
*/
extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
/**
* \brief Copies data between 3D objects
*
\code
struct cudaExtent {
size_t width;
size_t height;
size_t depth;
};
struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
struct cudaPos {
size_t x;
size_t y;
size_t z;
};
struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
struct cudaMemcpy3DParms {
cudaArray_t srcArray;
struct cudaPos srcPos;
struct cudaPitchedPtr srcPtr;
cudaArray_t dstArray;
struct cudaPos dstPos;
struct cudaPitchedPtr dstPtr;
struct cudaExtent extent;
enum cudaMemcpyKind kind;
};
\endcode
*
 * ::cudaMemcpy3D() copies data between two 3D objects. The source and
 * destination objects may be in either host memory, device memory, or a CUDA
 * array. The source, destination, extent, and kind of copy performed are
 * specified by the ::cudaMemcpy3DParms struct which should be initialized to
 * zero before use:
\code
cudaMemcpy3DParms myParms = {0};
\endcode
*
* The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or
* \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
* non-zero source or destination will cause ::cudaMemcpy3D() to return an
* error.
*
* The \p srcPos and \p dstPos fields are optional offsets into the source and
* destination objects and are defined in units of each object's elements. The
* element for a host or device pointer is assumed to be <b>unsigned char</b>.
* For CUDA arrays, positions must be in the range [0, 2048) for any
* dimension.
*
* The \p extent field defines the dimensions of the transferred area in
* elements. If a CUDA array is participating in the copy, the extent is
* defined in terms of that array's elements. If no CUDA array is
* participating in the copy then the extents are defined in elements of
* <b>unsigned char</b>.
*
* The \p kind field defines the direction of the copy. It must be one of
* ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
* or ::cudaMemcpyDeviceToDevice.
*
* If the source and destination are both arrays, ::cudaMemcpy3D() will return
* an error if they do not have the same element size.
*
 * The source and destination objects may not overlap. If overlapping source
 * and destination objects are specified, undefined behavior will result.
*
* The source object must lie entirely within the region defined by \p srcPos
* and \p extent. The destination object must lie entirely within the region
* defined by \p dstPos and \p extent.
*
* ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr
* exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated
* with ::cudaMalloc3D() will always be valid.
*
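 * For example, a sketch copying a 64x64x64 volume of \p float from a
 * hypothetical host buffer \p hostData into a hypothetical CUDA array
 * \p dstArray (extents in array elements; error checking omitted):
 * \code
   cudaMemcpy3DParms p = {0};
   p.srcPtr = make_cudaPitchedPtr(hostData, 64 * sizeof(float), 64, 64);
   p.dstArray = dstArray;
   p.extent = make_cudaExtent(64, 64, 64);
   p.kind = cudaMemcpyHostToDevice;
   cudaMemcpy3D(&p);
 \endcode
 *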
* \param p - 3D memory copy parameters
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
*
* \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync,
* ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
* ::make_cudaExtent, ::make_cudaPos
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
/**
* \brief Copies memory between devices
*
* Perform a 3D memory copy according to the parameters specified in
* \p p. See the definition of the ::cudaMemcpy3DPeerParms structure
* for documentation of its parameters.
*
* Note that this function is synchronous with respect to the host only if
* the source or destination of the transfer is host memory. Note also
* that this copy is serialized with respect to all pending and future
 * asynchronous work in the current device, the copy's source device,
* and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid
* this synchronization).
*
* \param p - Parameters for the memory copy
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
* ::cudaMemcpy3DPeerAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
/**
* \brief Copies data between 3D objects
*
\code
struct cudaExtent {
size_t width;
size_t height;
size_t depth;
};
struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
struct cudaPos {
size_t x;
size_t y;
size_t z;
};
struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
struct cudaMemcpy3DParms {
cudaArray_t srcArray;
struct cudaPos srcPos;
struct cudaPitchedPtr srcPtr;
cudaArray_t dstArray;
struct cudaPos dstPos;
struct cudaPitchedPtr dstPtr;
struct cudaExtent extent;
enum cudaMemcpyKind kind;
};
\endcode
*
 * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
 * destination objects may be in either host memory, device memory, or a CUDA
 * array. The source, destination, extent, and kind of copy performed are
 * specified by the ::cudaMemcpy3DParms struct which should be initialized to
 * zero before use:
\code
cudaMemcpy3DParms myParms = {0};
\endcode
*
* The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray
* or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
* non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an
* error.
*
* The \p srcPos and \p dstPos fields are optional offsets into the source and
* destination objects and are defined in units of each object's elements. The
* element for a host or device pointer is assumed to be <b>unsigned char</b>.
* For CUDA arrays, positions must be in the range [0, 2048) for any
* dimension.
*
* The \p extent field defines the dimensions of the transferred area in
* elements. If a CUDA array is participating in the copy, the extent is
* defined in terms of that array's elements. If no CUDA array is
* participating in the copy then the extents are defined in elements of
* <b>unsigned char</b>.
*
* The \p kind field defines the direction of the copy. It must be one of
* ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
* or ::cudaMemcpyDeviceToDevice.
*
* If the source and destination are both arrays, ::cudaMemcpy3DAsync() will
* return an error if they do not have the same element size.
*
 * The source and destination objects may not overlap. If overlapping source
 * and destination objects are specified, undefined behavior will result.
*
* The source object must lie entirely within the region defined by \p srcPos
* and \p extent. The destination object must lie entirely within the region
* defined by \p dstPos and \p extent.
*
* ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or
* \p dstPtr exceeds the maximum allowed. The pitch of a
* ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid.
*
* ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally
* be associated to a stream by passing a non-zero \p stream argument. If
* \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
* is non-zero, the copy may overlap with operations in other streams.
*
* The device version of this function only handles device to device copies and
* cannot be given local or shared pointers.
*
* \param p - 3D memory copy parameters
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D,
* ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
* ::make_cudaExtent, ::make_cudaPos
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
/**
* \brief Copies memory between devices asynchronously.
*
* Perform a 3D memory copy according to the parameters specified in
* \p p. See the definition of the ::cudaMemcpy3DPeerParms structure
* for documentation of its parameters.
*
* \param p - Parameters for the memory copy
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
* ::cudaMemcpy3DPeerAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
/**
* \brief Gets free and total device memory
*
* Returns in \p *free and \p *total respectively, the free and total amount of
* memory available for allocation by the device in bytes.
*
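 * A minimal query sketch:
 * \code
   size_t freeBytes, totalBytes;
   cudaMemGetInfo(&freeBytes, &totalBytes);
   // freeBytes of totalBytes are currently available for allocation
 \endcode
 *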
* \param free - Returned free memory in bytes
* \param total - Returned total memory in bytes
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidValue,
* ::cudaErrorLaunchFailure
* \notefnerr
*
*/
extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total);
/**
* \brief Gets info about the specified cudaArray
*
* Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape
* and flags of \p array.
*
* Any of \p *desc, \p *extent and \p *flags may be specified as NULL.
*
* \param desc - Returned array type
* \param extent - Returned array shape. 2D arrays will have depth of zero
* \param flags - Returned array flags
* \param array - The ::cudaArray to get info for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
*/
extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the memory area pointed to by \p src to the
* memory area pointed to by \p dst, where \p kind is one of
* ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
* or ::cudaMemcpyDeviceToDevice, and specifies the direction of the copy. The
* memory areas may not overlap. Calling ::cudaMemcpy() with \p dst and \p src
* pointers that do not match the direction of the copy results in an
* undefined behavior.
*
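 * A sketch round-tripping a buffer through device memory (error checking
 * omitted):
 * \code
   float host[256], result[256];
   float *dev;
   cudaMalloc((void**)&dev, sizeof(host));
   cudaMemcpy(dev, host, sizeof(host), cudaMemcpyHostToDevice);
   // ... launch kernels that operate on dev ...
   cudaMemcpy(result, dev, sizeof(host), cudaMemcpyDeviceToHost);
   cudaFree(dev);
 \endcode
 *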
* \param dst - Destination memory address
* \param src - Source memory address
* \param count - Size in bytes to copy
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
*
* \note_sync
*
* \sa ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
/**
* \brief Copies memory between two devices
*
* Copies memory from one device to memory on another device. \p dst is the
* base device pointer of the destination memory and \p dstDevice is the
* destination device. \p src is the base device pointer of the source memory
* and \p srcDevice is the source device. \p count specifies the number of bytes
* to copy.
*
* Note that this function is asynchronous with respect to the host, but
 * serialized with respect to all pending and future asynchronous work in the
 * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync
* to avoid this synchronization).
*
* \param dst - Destination device pointer
* \param dstDevice - Destination device
* \param src - Source device pointer
* \param srcDevice - Source device
* \param count - Size of memory copy in bytes
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
* ::cudaMemcpy3DPeerAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the memory area pointed to by \p src to the
* CUDA array \p dst starting at the upper left corner
* (\p wOffset, \p hOffset), where \p kind is one of ::cudaMemcpyHostToHost,
* ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, or
* ::cudaMemcpyDeviceToDevice, and specifies the direction of the copy.
*
* \param dst - Destination memory address
* \param wOffset - Destination starting X offset
* \param hOffset - Destination starting Y offset
* \param src - Source memory address
* \param count - Size in bytes to copy
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the CUDA array \p src starting at the upper
 * left corner (\p wOffset, \p hOffset) to the memory area pointed to by \p dst,
* where \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy.
*
* \param dst - Destination memory address
* \param src - Source memory address
* \param wOffset - Source starting X offset
* \param hOffset - Source starting Y offset
* \param count - Size in bytes to copy
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the CUDA array \p src starting at the upper
* left corner (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst
* starting at the upper left corner (\p wOffsetDst, \p hOffsetDst) where
* \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy.
*
* \param dst - Destination memory address
* \param wOffsetDst - Destination starting X offset
* \param hOffsetDst - Destination starting Y offset
* \param src - Source memory address
* \param wOffsetSrc - Source starting X offset
* \param hOffsetSrc - Source starting Y offset
* \param count - Size in bytes to copy
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the memory
* area pointed to by \p src to the memory area pointed to by \p dst, where
* \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy. \p dpitch and \p spitch are the widths in memory in
* bytes of the 2D arrays pointed to by \p dst and \p src, including any
* padding added to the end of each row. The memory areas may not overlap.
* \p width must not exceed either \p dpitch or \p spitch.
* Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do not match
* the direction of the copy results in an undefined behavior.
* ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds
* the maximum allowed.
*
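 * For example, a sketch copying a densely packed host matrix into a pitched
 * device allocation (error checking omitted):
 * \code
   float hostMat[64][64];
   float *devMat;
   size_t dpitch;
   cudaMallocPitch((void**)&devMat, &dpitch, 64 * sizeof(float), 64);
   cudaMemcpy2D(devMat, dpitch, hostMat, 64 * sizeof(float),
                64 * sizeof(float), 64, cudaMemcpyHostToDevice);
 \endcode
 *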
* \param dst - Destination memory address
* \param dpitch - Pitch of destination memory
* \param src - Source memory address
* \param spitch - Pitch of source memory
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
*
* \sa ::cudaMemcpy, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
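/*
 * Illustrative usage sketch (not part of the original header; all names are
 * hypothetical): copying a densely packed host matrix into a pitched device
 * allocation with ::cudaMemcpy2D().
 *
 * \code
    float h_img[64][128];                        // packed host matrix
    float *d_img;
    size_t dpitch;
    cudaMallocPitch((void**)&d_img, &dpitch, 128 * sizeof(float), 64);
    cudaMemcpy2D(d_img, dpitch,                  // pitched destination
                 h_img, 128 * sizeof(float),     // packed source rows
                 128 * sizeof(float), 64,        // width in bytes, height in rows
                 cudaMemcpyHostToDevice);
 * \endcode
 */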
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the memory
* area pointed to by \p src to the CUDA array \p dst starting at the
 * upper left corner (\p wOffset, \p hOffset), where \p kind is one of
* ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
* or ::cudaMemcpyDeviceToDevice, and specifies the direction of the copy.
* \p spitch is the width in memory in bytes of the 2D array pointed to by
* \p src, including any padding added to the end of each row. \p wOffset +
* \p width must not exceed the width of the CUDA array \p dst. \p width must
* not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch
* exceeds the maximum allowed.
*
* \param dst - Destination memory address
* \param wOffset - Destination starting X offset
* \param hOffset - Destination starting Y offset
* \param src - Source memory address
* \param spitch - Pitch of source memory
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
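/*
 * Illustrative usage sketch (not part of the original header; names are
 * hypothetical): staging a packed host image into a CUDA array with
 * ::cudaMemcpy2DToArray(), e.g. before binding the array to a texture.
 *
 * \code
    float h_img[64][128];
    struct cudaChannelFormatDesc desc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t arr;
    cudaMallocArray(&arr, &desc, 128, 64);       // 128x64 single-float array
    cudaMemcpy2DToArray(arr, 0, 0,               // no offset into the array
                        h_img, 128 * sizeof(float),
                        128 * sizeof(float), 64,
                        cudaMemcpyHostToDevice);
 * \endcode
 */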
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the CUDA
 * array \p src starting at the upper left corner
* (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where
* \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy. \p dpitch is the width in memory in bytes of the 2D
* array pointed to by \p dst, including any padding added to the end of each
* row. \p wOffset + \p width must not exceed the width of the CUDA array
* \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArray()
* returns an error if \p dpitch exceeds the maximum allowed.
*
* \param dst - Destination memory address
* \param dpitch - Pitch of destination memory
* \param src - Source memory address
* \param wOffset - Source starting X offset
* \param hOffset - Source starting Y offset
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the CUDA
 * array \p src starting at the upper left corner
* (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst starting at
* the upper left corner (\p wOffsetDst, \p hOffsetDst), where \p kind is one
* of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy. \p wOffsetDst + \p width must not exceed the width
* of the CUDA array \p dst. \p wOffsetSrc + \p width must not exceed the width
* of the CUDA array \p src.
*
* \param dst - Destination memory address
* \param wOffsetDst - Destination starting X offset
* \param hOffsetDst - Destination starting Y offset
* \param src - Source memory address
* \param wOffsetSrc - Source starting X offset
* \param hOffsetSrc - Source starting Y offset
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
/**
* \brief Copies data to the given symbol on the device
*
* Copies \p count bytes from the memory area pointed to by \p src
 * to the memory area \p offset bytes from the start of symbol
* \p symbol. The memory areas may not overlap. \p symbol is a variable that
* resides in global or constant memory space. \p kind can be either
* ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
*
* \param symbol - Device symbol address
* \param src - Source memory address
* \param count - Size in bytes to copy
* \param offset - Offset from start of symbol in bytes
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSymbol,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
* \note_string_api_deprecation
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
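/*
 * Illustrative usage sketch (not part of the original header; the symbol and
 * buffer names are hypothetical): uploading coefficients to a __constant__
 * symbol. \p offset defaults to 0 and \p kind to ::cudaMemcpyHostToDevice.
 *
 * \code
    __constant__ float c_coeffs[16];

    void upload_coeffs(const float *h_coeffs)
    {
        cudaMemcpyToSymbol(c_coeffs, h_coeffs, 16 * sizeof(float));
    }
 * \endcode
 */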
/**
* \brief Copies data from the given symbol on the device
*
 * Copies \p count bytes from the memory area \p offset bytes from the start
 * of symbol \p symbol to the memory area pointed to by \p dst.
* The memory areas may not overlap. \p symbol is a variable that
* resides in global or constant memory space. \p kind can be either
* ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
*
* \param dst - Destination memory address
* \param symbol - Device symbol address
* \param count - Size in bytes to copy
* \param offset - Offset from start of symbol in bytes
* \param kind - Type of transfer
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSymbol,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_sync
* \note_string_api_deprecation
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the memory area pointed to by \p src to the
* memory area pointed to by \p dst, where \p kind is one of
* ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
* or ::cudaMemcpyDeviceToDevice, and specifies the direction of the copy. The
 * memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and
 * \p src pointers that do not match the direction of the copy results in
 * undefined behavior.
*
* ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call
* may return before the copy is complete. The copy can optionally be
* associated to a stream by passing a non-zero \p stream argument. If \p kind
* is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is
* non-zero, the copy may overlap with operations in other streams.
*
* The device version of this function only handles device to device copies and
* cannot be given local or shared pointers.
*
* \param dst - Destination memory address
* \param src - Source memory address
* \param count - Size in bytes to copy
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
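/*
 * Illustrative usage sketch (not part of the original header; names are
 * hypothetical): for the copy to overlap with work in other streams, the
 * host buffer must be pinned and a non-default stream is used.
 *
 * \code
    const size_t N = 1 << 20;
    float *h_buf, *d_buf;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMallocHost((void**)&h_buf, N * sizeof(float));  // pinned host memory
    cudaMalloc((void**)&d_buf, N * sizeof(float));
    cudaMemcpyAsync(d_buf, h_buf, N * sizeof(float),
                    cudaMemcpyHostToDevice, stream);
    // ... enqueue independent work in other streams here ...
    cudaStreamSynchronize(stream);                      // wait for the copy
 * \endcode
 */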
/**
* \brief Copies memory between two devices asynchronously.
*
* Copies memory from one device to memory on another device. \p dst is the
* base device pointer of the destination memory and \p dstDevice is the
* destination device. \p src is the base device pointer of the source memory
* and \p srcDevice is the source device. \p count specifies the number of bytes
* to copy.
*
* Note that this function is asynchronous with respect to the host and all work
* on other devices.
*
* \param dst - Destination device pointer
* \param dstDevice - Destination device
* \param src - Source device pointer
* \param srcDevice - Source device
* \param count - Size of memory copy in bytes
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevice
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
* ::cudaMemcpy3DPeerAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0));
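/*
 * Illustrative usage sketch (not part of the original header; sizes and
 * names are hypothetical): copying a buffer from device 0 to device 1.
 *
 * \code
    const size_t bytes = 1 << 20;
    void *d0, *d1;
    cudaSetDevice(0);
    cudaMalloc(&d0, bytes);
    cudaSetDevice(1);
    cudaMalloc(&d1, bytes);
    cudaMemcpyPeerAsync(d1, 1, d0, 0, bytes, 0);  // default stream of device 1
    cudaDeviceSynchronize();                      // wait for the transfer
 * \endcode
 */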
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the memory area pointed to by \p src to the
* CUDA array \p dst starting at the upper left corner
* (\p wOffset, \p hOffset), where \p kind is one of ::cudaMemcpyHostToHost,
* ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, or
* ::cudaMemcpyDeviceToDevice, and specifies the direction of the copy.
*
* ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally
* be associated to a stream by passing a non-zero \p stream argument. If \p
* kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
* is non-zero, the copy may overlap with operations in other streams.
*
* \param dst - Destination memory address
* \param wOffset - Destination starting X offset
* \param hOffset - Destination starting Y offset
* \param src - Source memory address
* \param count - Size in bytes to copy
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Copies data between host and device
*
* Copies \p count bytes from the CUDA array \p src starting at the upper
 * left corner (\p wOffset, \p hOffset) to the memory area pointed to by \p dst,
* where \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy.
*
* ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally
* be associated to a stream by passing a non-zero \p stream argument. If \p
* kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
* is non-zero, the copy may overlap with operations in other streams.
*
* \param dst - Destination memory address
* \param src - Source memory address
* \param wOffset - Source starting X offset
* \param hOffset - Source starting Y offset
* \param count - Size in bytes to copy
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the memory
* area pointed to by \p src to the memory area pointed to by \p dst, where
* \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy. \p dpitch and \p spitch are the widths in memory in
* bytes of the 2D arrays pointed to by \p dst and \p src, including any
* padding added to the end of each row. The memory areas may not overlap.
* \p width must not exceed either \p dpitch or \p spitch.
 * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not
 * match the direction of the copy results in undefined behavior.
* ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater
* than the maximum allowed.
*
* ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally
* be associated to a stream by passing a non-zero \p stream argument. If
* \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
* \p stream is non-zero, the copy may overlap with operations in other
* streams.
*
* The device version of this function only handles device to device copies and
* cannot be given local or shared pointers.
*
* \param dst - Destination memory address
* \param dpitch - Pitch of destination memory
* \param src - Source memory address
* \param spitch - Pitch of source memory
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the memory
* area pointed to by \p src to the CUDA array \p dst starting at the
 * upper left corner (\p wOffset, \p hOffset), where \p kind is one of
* ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
* or ::cudaMemcpyDeviceToDevice, and specifies the direction of the copy.
* \p spitch is the width in memory in bytes of the 2D array pointed to by
* \p src, including any padding added to the end of each row. \p wOffset +
* \p width must not exceed the width of the CUDA array \p dst. \p width must
* not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if
* \p spitch exceeds the maximum allowed.
*
* ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally
* be associated to a stream by passing a non-zero \p stream argument. If
* \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
* \p stream is non-zero, the copy may overlap with operations in other
* streams.
*
* \param dst - Destination memory address
* \param wOffset - Destination starting X offset
* \param hOffset - Destination starting Y offset
* \param src - Source memory address
* \param spitch - Pitch of source memory
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Copies data between host and device
*
* Copies a matrix (\p height rows of \p width bytes each) from the CUDA
 * array \p src starting at the upper left corner
* (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where
* \p kind is one of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice,
* ::cudaMemcpyDeviceToHost, or ::cudaMemcpyDeviceToDevice, and specifies the
* direction of the copy. \p dpitch is the width in memory in bytes of the 2D
* array pointed to by \p dst, including any padding added to the end of each
* row. \p wOffset + \p width must not exceed the width of the CUDA array
* \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync()
* returns an error if \p dpitch exceeds the maximum allowed.
*
* ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally be
* associated to a stream by passing a non-zero \p stream argument. If \p kind
* is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is
* non-zero, the copy may overlap with operations in other streams.
*
* \param dst - Destination memory address
* \param dpitch - Pitch of destination memory
* \param src - Source memory address
* \param wOffset - Source starting X offset
* \param hOffset - Source starting Y offset
* \param width - Width of matrix transfer (columns in bytes)
* \param height - Height of matrix transfer (rows)
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidPitchValue,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync,
* ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Copies data to the given symbol on the device
*
* Copies \p count bytes from the memory area pointed to by \p src
 * to the memory area \p offset bytes from the start of symbol
* \p symbol. The memory areas may not overlap. \p symbol is a variable that
* resides in global or constant memory space. \p kind can be either
* ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
*
* ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally
* be associated to a stream by passing a non-zero \p stream argument. If
* \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
* may overlap with operations in other streams.
*
* \param symbol - Device symbol address
* \param src - Source memory address
* \param count - Size in bytes to copy
* \param offset - Offset from start of symbol in bytes
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSymbol,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
* \note_string_api_deprecation
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyFromSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Copies data from the given symbol on the device
*
 * Copies \p count bytes from the memory area \p offset bytes from the start
 * of symbol \p symbol to the memory area pointed to by \p dst.
* The memory areas may not overlap. \p symbol is a variable that resides in
* global or constant memory space. \p kind can be either
* ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
*
* ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
* the call may return before the copy is complete. The copy can optionally be
* associated to a stream by passing a non-zero \p stream argument. If \p kind
* is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
* with operations in other streams.
*
* \param dst - Destination memory address
* \param symbol - Device symbol address
* \param count - Size in bytes to copy
* \param offset - Offset from start of symbol in bytes
* \param kind - Type of transfer
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSymbol,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidMemcpyDirection
* \notefnerr
* \note_async
* \note_null_stream
* \note_string_api_deprecation
*
* \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
* ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
* ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
* ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
* ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
* ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
* ::cudaMemcpyToSymbolAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
/**
* \brief Initializes or sets device memory to a value
*
* Fills the first \p count bytes of the memory area pointed to by \p devPtr
* with the constant byte value \p value.
*
* Note that this function is asynchronous with respect to the host unless
* \p devPtr refers to pinned host memory.
*
* \param devPtr - Pointer to device memory
* \param value - Value to set for each byte of specified memory
* \param count - Size in bytes to set
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer
* \notefnerr
* \note_memset
*
* \sa ::cudaMemset2D, ::cudaMemset3D, ::cudaMemsetAsync,
* ::cudaMemset2DAsync, ::cudaMemset3DAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
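/*
 * Illustrative usage sketch (not part of the original header): zeroing a
 * device buffer. \p value is applied per byte, so only byte-replicating
 * patterns such as 0x00 or 0xFF are meaningful for wider element types.
 *
 * \code
    const size_t bytes = 256 * sizeof(int);
    int *d_buf;
    cudaMalloc((void**)&d_buf, bytes);
    cudaMemset(d_buf, 0, bytes);   // every byte, hence every int, becomes 0
 * \endcode
 */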
/**
* \brief Initializes or sets device memory to a value
*
 * Sets each byte of a matrix (\p height rows of \p width bytes each)
 * pointed to by \p devPtr to the specified value \p value. \p pitch is the
 * width in bytes of the 2D array pointed to by \p devPtr, including any
 * padding added to the end of each row. This function performs fastest
 * when the pitch is one that has been passed back by ::cudaMallocPitch().
*
* Note that this function is asynchronous with respect to the host unless
* \p devPtr refers to pinned host memory.
*
* \param devPtr - Pointer to 2D device memory
* \param pitch - Pitch in bytes of 2D device memory
* \param value - Value to set for each byte of specified memory
* \param width - Width of matrix set (columns in bytes)
* \param height - Height of matrix set (rows)
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer
* \notefnerr
* \note_memset
*
* \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync,
* ::cudaMemset2DAsync, ::cudaMemset3DAsync
*/
extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
/**
* \brief Initializes or sets device memory to a value
*
* Initializes each element of a 3D array to the specified value \p value.
* The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
* of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
* to by \p pitchedDevPtr, including any padding added to the end of each row.
* The \p xsize field specifies the logical width of each row in bytes, while
* the \p ysize field specifies the height of each 2D slice in rows.
*
* The extents of the initialized region are specified as a \p width in bytes,
* a \p height in rows, and a \p depth in slices.
*
* Extents with \p width greater than or equal to the \p xsize of
* \p pitchedDevPtr may perform significantly faster than extents narrower
* than the \p xsize. Secondarily, extents with \p height equal to the
* \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
* shorter than the \p ysize.
*
* This function performs fastest when the \p pitchedDevPtr has been allocated
* by ::cudaMalloc3D().
*
* Note that this function is asynchronous with respect to the host unless
* \p pitchedDevPtr refers to pinned host memory.
*
* \param pitchedDevPtr - Pointer to pitched device memory
* \param value - Value to set for each byte of specified memory
* \param extent - Size parameters for where to set device memory (\p width field in bytes)
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer
* \notefnerr
* \note_memset
*
* \sa ::cudaMemset, ::cudaMemset2D,
* ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync,
* ::cudaMalloc3D, ::make_cudaPitchedPtr,
* ::make_cudaExtent
*/
extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
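/*
 * Illustrative usage sketch (not part of the original header): clearing a
 * 3D allocation obtained from ::cudaMalloc3D(). Note the extent width is
 * expressed in bytes.
 *
 * \code
    struct cudaExtent extent = make_cudaExtent(128 * sizeof(float), 64, 32);
    struct cudaPitchedPtr vol;
    cudaMalloc3D(&vol, extent);
    cudaMemset3D(vol, 0, extent);  // zero the entire volume
 * \endcode
 */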
/**
* \brief Initializes or sets device memory to a value
*
* Fills the first \p count bytes of the memory area pointed to by \p devPtr
* with the constant byte value \p value.
*
* ::cudaMemsetAsync() is asynchronous with respect to the host, so
* the call may return before the memset is complete. The operation can optionally
* be associated to a stream by passing a non-zero \p stream argument.
* If \p stream is non-zero, the operation may overlap with operations in other streams.
*
 * The device version of this function only operates on device memory and
 * cannot be given local or shared pointers.
*
* \param devPtr - Pointer to device memory
* \param value - Value to set for each byte of specified memory
* \param count - Size in bytes to set
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer
* \notefnerr
* \note_memset
* \note_null_stream
*
* \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
* ::cudaMemset2DAsync, ::cudaMemset3DAsync
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
/**
* \brief Initializes or sets device memory to a value
*
 * Sets each byte of a matrix (\p height rows of \p width bytes each)
 * pointed to by \p devPtr to the specified value \p value. \p pitch is the
 * width in bytes of the 2D array pointed to by \p devPtr, including any
 * padding added to the end of each row. This function performs fastest
 * when the pitch is one that has been passed back by ::cudaMallocPitch().
*
* ::cudaMemset2DAsync() is asynchronous with respect to the host, so
* the call may return before the memset is complete. The operation can optionally
* be associated to a stream by passing a non-zero \p stream argument.
* If \p stream is non-zero, the operation may overlap with operations in other streams.
*
 * The device version of this function only operates on device memory and
 * cannot be given local or shared pointers.
*
* \param devPtr - Pointer to 2D device memory
* \param pitch - Pitch in bytes of 2D device memory
* \param value - Value to set for each byte of specified memory
* \param width - Width of matrix set (columns in bytes)
* \param height - Height of matrix set (rows)
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer
* \notefnerr
* \note_memset
* \note_null_stream
*
* \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
* ::cudaMemsetAsync, ::cudaMemset3DAsync
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
/**
* \brief Initializes or sets device memory to a value
*
* Initializes each element of a 3D array to the specified value \p value.
* The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
* of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
* to by \p pitchedDevPtr, including any padding added to the end of each row.
* The \p xsize field specifies the logical width of each row in bytes, while
* the \p ysize field specifies the height of each 2D slice in rows.
*
* The extents of the initialized region are specified as a \p width in bytes,
* a \p height in rows, and a \p depth in slices.
*
* Extents with \p width greater than or equal to the \p xsize of
* \p pitchedDevPtr may perform significantly faster than extents narrower
* than the \p xsize. Secondarily, extents with \p height equal to the
* \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
* shorter than the \p ysize.
*
* This function performs fastest when the \p pitchedDevPtr has been allocated
* by ::cudaMalloc3D().
*
* ::cudaMemset3DAsync() is asynchronous with respect to the host, so
* the call may return before the memset is complete. The operation can optionally
* be associated to a stream by passing a non-zero \p stream argument.
* If \p stream is non-zero, the operation may overlap with operations in other streams.
*
 * The device version of this function only operates on device memory and
 * cannot be given local or shared pointers.
*
* \param pitchedDevPtr - Pointer to pitched device memory
* \param value - Value to set for each byte of specified memory
* \param extent - Size parameters for where to set device memory (\p width field in bytes)
* \param stream - Stream identifier
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer
* \notefnerr
* \note_memset
* \note_null_stream
*
* \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
* ::cudaMemsetAsync, ::cudaMemset2DAsync,
* ::cudaMalloc3D, ::make_cudaPitchedPtr,
* ::make_cudaExtent
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
/**
* \brief Finds the address associated with a CUDA symbol
*
* Returns in \p *devPtr the address of symbol \p symbol on the device.
* \p symbol is a variable that resides in global or constant memory space.
* If \p symbol cannot be found, or if \p symbol is not declared in the
* global or constant memory space, \p *devPtr is unchanged and the error
* ::cudaErrorInvalidSymbol is returned.
*
* \param devPtr - Return device pointer associated with symbol
* \param symbol - Device symbol address
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidSymbol
* \notefnerr
* \note_string_api_deprecation
*
* \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
* \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol);
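/*
 * Illustrative usage sketch (not part of the original header; the symbol
 * name is hypothetical): obtaining a plain device pointer to a __device__
 * variable so that generic memory functions can operate on it.
 *
 * \code
    __device__ float g_table[256];

    void clear_table(void)
    {
        void *d_ptr;
        cudaGetSymbolAddress(&d_ptr, g_table);
        cudaMemset(d_ptr, 0, 256 * sizeof(float));
    }
 * \endcode
 */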
/**
* \brief Finds the size of the object associated with a CUDA symbol
*
* Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that
* resides in global or constant memory space. If \p symbol cannot be found, or
* if \p symbol is not declared in global or constant memory space, \p *size is
* unchanged and the error ::cudaErrorInvalidSymbol is returned.
*
* \param size - Size of object associated with symbol
* \param symbol - Device symbol address
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidSymbol
* \notefnerr
* \note_string_api_deprecation
*
* \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
* \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol);
/** @} */ /* END CUDART_MEMORY */
/**
* \defgroup CUDART_UNIFIED Unified Addressing
*
* ___MANBRIEF___ unified addressing functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the unified addressing functions of the CUDA
* runtime application programming interface.
*
* @{
*
* \section CUDART_UNIFIED_overview Overview
*
* CUDA devices can share a unified address space with the host.
* For these devices there is no distinction between a device
* pointer and a host pointer -- the same pointer value may be
* used to access memory from the host program and from a kernel
* running on the device (with exceptions enumerated below).
*
* \section CUDART_UNIFIED_support Supported Platforms
*
* Whether or not a device supports unified addressing may be
* queried by calling ::cudaGetDeviceProperties() with the device
* property ::cudaDeviceProp::unifiedAddressing.
*
 * Unified addressing is automatically enabled in 64-bit processes.
*
* Unified addressing is not yet supported on Windows Vista or
* Windows 7 for devices that do not use the TCC driver model.
*
* \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values
*
* It is possible to look up information about the memory which backs a
* pointer value. For instance, one may want to know if a pointer points
* to host or device memory. As another example, in the case of device
* memory, one may want to know on which CUDA device the memory
* resides. These properties may be queried using the function
 * ::cudaPointerGetAttributes().
*
* Since pointers are unique, it is not necessary to specify information
* about the pointers specified to ::cudaMemcpy() and other copy functions.
* The copy direction ::cudaMemcpyDefault may be used to specify that the
* CUDA runtime should infer the location of the pointer from its value.
*
* \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
*
* All host memory allocated through all devices using ::cudaMallocHost() and
* ::cudaHostAlloc() is always directly accessible from all devices that
* support unified addressing. This is the case regardless of whether or
* not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are
* specified.
*
* The pointer value through which allocated host memory may be accessed
* in kernels on all devices that support unified addressing is the same
* as the pointer value through which that memory is accessed on the host.
* It is not necessary to call ::cudaHostGetDevicePointer() to get the device
* pointer for these allocations.
*
* Note that this is not the case for memory allocated using the flag
* ::cudaHostAllocWriteCombined, as discussed below.
*
* \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory
* Upon enabling direct access from a device that supports unified addressing
* to another peer device that supports unified addressing using
 * ::cudaDeviceEnablePeerAccess(), all memory allocated in the peer device using
* ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible
* by the current device. The device pointer value through
* which any peer's memory may be accessed in the current device
* is the same pointer value through which that memory may be
* accessed from the peer device.
*
* \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing
*
 * Not all memory may be accessed on devices through the same pointer
 * value through which it is accessed on the host. These exceptions
* are host memory registered using ::cudaHostRegister() and host memory
* allocated using the flag ::cudaHostAllocWriteCombined. For these
* exceptions, there exists a distinct host and device address for the
* memory. The device address is guaranteed to not overlap any valid host
* pointer range and is guaranteed to have the same value across all devices
* that support unified addressing.
*
* This device address may be queried using ::cudaHostGetDevicePointer()
* when a device using unified addressing is current. Either the host
* or the unified device pointer value may be used to refer to this memory
* in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault
* memory direction.
*
*/
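/*
 * Illustrative sketch of the behavior described above (not part of the
 * original header; assumes a device with unified addressing): with
 * ::cudaMemcpyDefault the runtime infers the transfer direction from the
 * pointer values themselves.
 *
 * \code
    float *h_buf, *d_buf;
    cudaMallocHost((void**)&h_buf, 1024 * sizeof(float));
    cudaMalloc((void**)&d_buf, 1024 * sizeof(float));
    // Direction is inferred from the pointers: host -> device here.
    cudaMemcpy(d_buf, h_buf, 1024 * sizeof(float), cudaMemcpyDefault);
 * \endcode
 */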
/**
* \brief Returns attributes about a specified pointer
*
* Returns in \p *attributes the attributes of the pointer \p ptr.
 * If the pointer was not allocated in, mapped by, or registered with a
 * context supporting unified addressing, ::cudaErrorInvalidValue is returned.
*
* The ::cudaPointerAttributes structure is defined as:
* \code
struct cudaPointerAttributes {
enum cudaMemoryType memoryType;
int device;
void *devicePointer;
void *hostPointer;
int isManaged;
    };
\endcode
* In this structure, the individual fields mean
*
* - \ref ::cudaPointerAttributes::memoryType "memoryType" identifies the physical
* location of the memory associated with pointer \p ptr. It can be
* ::cudaMemoryTypeHost for host memory or ::cudaMemoryTypeDevice for device
* memory.
*
* - \ref ::cudaPointerAttributes::device "device" is the device against which
* \p ptr was allocated. If \p ptr has memory type ::cudaMemoryTypeDevice
* then this identifies the device on which the memory referred to by \p ptr
* physically resides. If \p ptr has memory type ::cudaMemoryTypeHost then this
* identifies the device which was current when the allocation was made
* (and if that device is deinitialized then this allocation will vanish
* with that device's state).
*
* - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is
* the device pointer alias through which the memory referred to by \p ptr
* may be accessed on the current device.
* If the memory referred to by \p ptr cannot be accessed directly by the
* current device then this is NULL.
*
* - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is
* the host pointer alias through which the memory referred to by \p ptr
* may be accessed on the host.
* If the memory referred to by \p ptr cannot be accessed directly by the
* host then this is NULL.
*
* - \ref ::cudaPointerAttributes::isManaged "isManaged" indicates if
* the pointer \p ptr points to managed memory or not.
*
* \param attributes - Attributes for the specified pointer
* \param ptr - Pointer to get attributes for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorInvalidValue
*
* \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
* ::cudaChooseDevice
*/
extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr);
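/*
 * Illustrative usage sketch (not part of the original header): classifying
 * an arbitrary pointer at run time.
 *
 * \code
    #include <stdio.h>

    void describe_pointer(const void *ptr)
    {
        struct cudaPointerAttributes attr;
        if (cudaPointerGetAttributes(&attr, ptr) == cudaSuccess) {
            if (attr.memoryType == cudaMemoryTypeDevice)
                printf("device memory on device %d\n", attr.device);
            else
                printf("host memory\n");
        }
    }
 * \endcode
 */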
/** @} */ /* END CUDART_UNIFIED */
/**
* \defgroup CUDART_PEER Peer Device Memory Access
*
* ___MANBRIEF___ peer device memory access functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the peer device memory access functions of the CUDA runtime
* application programming interface.
*
* @{
*/
/**
* \brief Queries if a device may directly access a peer device's memory.
*
* Returns in \p *canAccessPeer a value of 1 if device \p device is capable of
* directly accessing memory from \p peerDevice and 0 otherwise. If direct
* access of \p peerDevice from \p device is possible, then access may be
* enabled by calling ::cudaDeviceEnablePeerAccess().
*
* \param canAccessPeer - Returned access capability
* \param device - Device from which allocations on \p peerDevice are to
* be directly accessed.
* \param peerDevice - Device on which the allocations to be directly accessed
* by \p device reside.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice
* \notefnerr
*
* \sa ::cudaDeviceEnablePeerAccess,
* ::cudaDeviceDisablePeerAccess
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);
/**
* \brief Enables direct access to memory allocations on a peer device.
*
* On success, all allocations from \p peerDevice will immediately be accessible by
* the current device. They will remain accessible until access is explicitly
* disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using
* ::cudaDeviceReset().
*
* Note that access granted by this call is unidirectional and that in order to access
* memory on the current device from \p peerDevice, a separate symmetric call
* to ::cudaDeviceEnablePeerAccess() is required.
*
* Each device can support a system-wide maximum of eight peer connections.
*
 * Peer access is not supported in 32-bit applications.
*
* Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates
* that the current device cannot directly access memory from \p peerDevice.
*
* Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of
* \p peerDevice from the current device has already been enabled.
*
* Returns ::cudaErrorInvalidValue if \p flags is not 0.
*
* \param peerDevice - Peer device to enable direct access to from the current device
* \param flags - Reserved for future use and must be set to 0
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidDevice,
* ::cudaErrorPeerAccessAlreadyEnabled,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaDeviceCanAccessPeer,
* ::cudaDeviceDisablePeerAccess
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);
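/*
 * Illustrative usage sketch (not part of the original header): enabling
 * peer access between devices 0 and 1 in both directions, since each call
 * only grants one-way access.
 *
 * \code
    int can01 = 0, can10 = 0;
    cudaDeviceCanAccessPeer(&can01, 0, 1);
    cudaDeviceCanAccessPeer(&can10, 1, 0);
    if (can01 && can10) {
        cudaSetDevice(0);
        cudaDeviceEnablePeerAccess(1, 0);  // flags must be 0
        cudaSetDevice(1);
        cudaDeviceEnablePeerAccess(0, 0);
    }
 * \endcode
 */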
/**
* \brief Disables direct access to memory allocations on a peer device.
*
* Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on
* \p peerDevice has not yet been enabled from the current device.
*
* \param peerDevice - Peer device to disable direct access to
*
* \return
* ::cudaSuccess,
* ::cudaErrorPeerAccessNotEnabled,
* ::cudaErrorInvalidDevice
* \notefnerr
*
* \sa ::cudaDeviceCanAccessPeer,
* ::cudaDeviceEnablePeerAccess
*/
extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
/** @} */ /* END CUDART_PEER */
/** \defgroup CUDART_OPENGL OpenGL Interoperability */
/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */
/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */
/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */
/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */
/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */
/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */
/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */
/** \defgroup CUDART_VDPAU VDPAU Interoperability */
/**
* \defgroup CUDART_INTEROP Graphics Interoperability
*
* ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the graphics interoperability functions of the CUDA
* runtime application programming interface.
*
* @{
*/
/**
* \brief Unregisters a graphics resource for access by CUDA
*
* Unregisters the graphics resource \p resource so it is not accessible by
* CUDA unless registered again.
*
* If \p resource is invalid then ::cudaErrorInvalidResourceHandle is
* returned.
*
* \param resource - Resource to unregister
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsD3D9RegisterResource,
* ::cudaGraphicsD3D10RegisterResource,
* ::cudaGraphicsD3D11RegisterResource,
* ::cudaGraphicsGLRegisterBuffer,
* ::cudaGraphicsGLRegisterImage
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource);
/**
* \brief Set usage flags for mapping a graphics resource
*
* Set \p flags for mapping the graphics resource \p resource.
*
* Changes to \p flags will take effect the next time \p resource is mapped.
* The \p flags argument may be any of the following:
* - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will
* be used. It is therefore assumed that CUDA may read from or write to \p resource.
* - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource.
* - ::cudaGraphicsMapFlagsWriteDiscard: Specifies CUDA will not read from \p resource and will
* write over the entire contents of \p resource, so none of the data
* previously stored in \p resource will be preserved.
*
* If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
* If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned.
*
* \param resource - Registered resource to set flags for
* \param flags - Parameters for resource mapping
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
 * ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsMapResources
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
/**
* \brief Map graphics resources for access by CUDA
*
* Maps the \p count graphics resources in \p resources for access by CUDA.
*
* The resources in \p resources may be accessed by CUDA until they
* are unmapped. The graphics API from which \p resources were registered
* should not access any resources while they are mapped by CUDA. If an
* application does so, the results are undefined.
*
* This function provides the synchronization guarantee that any graphics calls
* issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA
* work issued in \p stream begins.
*
* If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
* is returned. If any of \p resources are presently mapped for access by
* CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to map
* \param resources - Resources to map for CUDA
* \param stream - Stream for synchronization
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \note_null_stream
* \notefnerr
*
* \sa
* ::cudaGraphicsResourceGetMappedPointer,
* ::cudaGraphicsSubResourceGetMappedArray,
* ::cudaGraphicsUnmapResources
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
/**
* \brief Unmap graphics resources.
*
* Unmaps the \p count graphics resources in \p resources.
*
* Once unmapped, the resources in \p resources may not be accessed by CUDA
* until they are mapped again.
*
* This function provides the synchronization guarantee that any CUDA work issued
* in \p stream before ::cudaGraphicsUnmapResources() will complete before any
* subsequently issued graphics work begins.
*
* If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
* is returned. If any of \p resources are not presently mapped for access by
* CUDA then ::cudaErrorUnknown is returned.
*
* \param count - Number of resources to unmap
* \param resources - Resources to unmap
* \param stream - Stream for synchronization
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \note_null_stream
* \notefnerr
*
* \sa
* ::cudaGraphicsMapResources
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
/**
 * \brief Get a device pointer through which to access a mapped graphics resource.
*
* Returns in \p *devPtr a pointer through which the mapped graphics resource
* \p resource may be accessed.
* Returns in \p *size the size of the memory in bytes which may be accessed from that pointer.
* The value set in \p devPtr may change every time that \p resource is mapped.
*
* If \p resource is not a buffer then it cannot be accessed via a pointer and
* ::cudaErrorUnknown is returned.
* If \p resource is not mapped then ::cudaErrorUnknown is returned.
 *
* \param devPtr - Returned pointer through which \p resource may be accessed
* \param size - Returned size of the buffer accessible starting at \p *devPtr
* \param resource - Mapped resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa
* ::cudaGraphicsMapResources,
* ::cudaGraphicsSubResourceGetMappedArray
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
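/*
 * Illustrative usage sketch (not part of the original header): the typical
 * map / access / unmap cycle for a buffer resource that was registered
 * elsewhere (e.g. via ::cudaGraphicsGLRegisterBuffer).
 *
 * \code
    void clear_resource(cudaGraphicsResource_t res, cudaStream_t stream)
    {
        void  *d_ptr;
        size_t size;
        cudaGraphicsMapResources(1, &res, stream);
        cudaGraphicsResourceGetMappedPointer(&d_ptr, &size, res);
        cudaMemsetAsync(d_ptr, 0, size, stream);  // CUDA access while mapped
        cudaGraphicsUnmapResources(1, &res, stream);
    }
 * \endcode
 */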
/**
* \brief Get an array through which to access a subresource of a mapped graphics resource.
*
* Returns in \p *array an array through which the subresource of the mapped
* graphics resource \p resource which corresponds to array index \p arrayIndex
* and mipmap level \p mipLevel may be accessed. The value set in \p array may
* change every time that \p resource is mapped.
*
* If \p resource is not a texture then it cannot be accessed via an array and
* ::cudaErrorUnknown is returned.
* If \p arrayIndex is not a valid array index for \p resource then
* ::cudaErrorInvalidValue is returned.
* If \p mipLevel is not a valid mipmap level for \p resource then
* ::cudaErrorInvalidValue is returned.
* If \p resource is not mapped then ::cudaErrorUnknown is returned.
*
* \param array - Returned array through which a subresource of \p resource may be accessed
* \param resource - Mapped resource to access
* \param arrayIndex - Array index for array textures or cubemap face
* index as defined by ::cudaGraphicsCubeFace for
* cubemap textures for the subresource to access
* \param mipLevel - Mipmap level for the subresource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceGetMappedPointer
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
/**
* \brief Get a mipmapped array through which to access a mapped graphics resource.
*
* Returns in \p *mipmappedArray a mipmapped array through which the mapped
* graphics resource \p resource may be accessed. The value set in \p mipmappedArray may
* change every time that \p resource is mapped.
*
* If \p resource is not a texture then it cannot be accessed via an array and
* ::cudaErrorUnknown is returned.
* If \p resource is not mapped then ::cudaErrorUnknown is returned.
*
* \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed
* \param resource - Mapped resource to access
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidResourceHandle,
* ::cudaErrorUnknown
* \notefnerr
*
* \sa ::cudaGraphicsResourceGetMappedPointer
*/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource);
/** @} */ /* END CUDART_INTEROP */
/**
* \defgroup CUDART_TEXTURE Texture Reference Management
*
* ___MANBRIEF___ texture reference management functions of the CUDA runtime
* API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the low level texture reference management functions
* of the CUDA runtime application programming interface.
*
* Some functions have overloaded C++ API template versions documented separately in the
* \ref CUDART_HIGHLEVEL "C++ API Routines" module.
*
* @{
*/
/**
* \brief Get the channel descriptor of an array
*
* Returns in \p *desc the channel descriptor of the CUDA array \p array.
*
* \param desc - Channel format
* \param array - Memory array on device
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array);
/**
* \brief Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \param x - X component
* \param y - Y component
* \param z - Z component
* \param w - W component
* \param f - Channel format
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
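/*
 * Illustrative usage sketch (the array dimensions are arbitrary): building a
 * single-component 32-bit float channel descriptor and using it to allocate
 * a 2D CUDA array.
 *
 * \code
 struct cudaChannelFormatDesc desc =
     cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 cudaArray_t arr;
 cudaMallocArray(&arr, &desc, 1024, 768);  // width, height in elements
 // ... use arr ...
 cudaFreeArray(arr);
 * \endcode
 */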
/**
* \brief Binds a memory area to a texture
*
* Binds \p size bytes of the memory area pointed to by \p devPtr to the
* texture reference \p texref. \p desc describes how the memory is interpreted
* when fetching values from the texture. Any memory previously bound to
* \p texref is unbound.
*
* Since the hardware enforces an alignment requirement on texture base
* addresses,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
* returns in \p *offset a byte offset that
* must be applied to texture fetches in order to read from the desired memory.
 * This offset must be divided by the texel size and passed to kernels that
 * read from the texture so it can be applied to the ::tex1Dfetch() function.
* If the device memory pointer was returned from ::cudaMalloc(), the offset is
* guaranteed to be 0 and NULL may be passed as the \p offset parameter.
*
* The total number of elements (or texels) in the linear address range
* cannot exceed ::cudaDeviceProp::maxTexture1DLinear[0].
* The number of elements is computed as (\p size / elementSize),
* where elementSize is determined from \p desc.
*
* \param offset - Offset in bytes
* \param texref - Texture to bind
* \param devPtr - Memory area on device
* \param desc - Channel format
* \param size - Size of the memory area pointed to by devPtr
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidTexture
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));
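/*
 * Illustrative usage sketch (\p texRef and \p N are hypothetical): binding a
 * linear device buffer to a 1D texture reference via the C++ API overload.
 * Because the pointer comes from ::cudaMalloc(), the returned offset is
 * guaranteed to be 0 and NULL is passed for it.
 *
 * \code
 texture<float, 1, cudaReadModeElementType> texRef;  // file-scope declaration

 float *d_buf;
 size_t N = 4096;
 cudaMalloc((void**)&d_buf, N * sizeof(float));
 struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
 cudaBindTexture(NULL, texRef, d_buf, desc, N * sizeof(float));
 // ... launch kernels that read via tex1Dfetch(texRef, i) ...
 cudaUnbindTexture(texRef);
 * \endcode
 */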
/**
* \brief Binds a 2D memory area to a texture
*
* Binds the 2D memory area pointed to by \p devPtr to the
* texture reference \p texref. The size of the area is constrained by
* \p width in texel units, \p height in texel units, and \p pitch in byte
* units. \p desc describes how the memory is interpreted when fetching values
* from the texture. Any memory previously bound to \p texref is unbound.
*
* Since the hardware enforces an alignment requirement on texture base
* addresses, ::cudaBindTexture2D() returns in \p *offset a byte offset that
* must be applied to texture fetches in order to read from the desired memory.
 * This offset must be divided by the texel size and passed to kernels that
 * read from the texture so it can be applied to the ::tex2D() function.
* If the device memory pointer was returned from ::cudaMalloc(), the offset is
* guaranteed to be 0 and NULL may be passed as the \p offset parameter.
*
* \p width and \p height, which are specified in elements (or texels), cannot
* exceed ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1]
* respectively. \p pitch, which is specified in bytes, cannot exceed
* ::cudaDeviceProp::maxTexture2DLinear[2].
*
* The driver returns ::cudaErrorInvalidValue if \p pitch is not a multiple of
* ::cudaDeviceProp::texturePitchAlignment.
*
* \param offset - Offset in bytes
* \param texref - Texture reference to bind
* \param devPtr - 2D memory area on device
* \param desc - Channel format
* \param width - Width in texel units
* \param height - Height in texel units
* \param pitch - Pitch in bytes
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidTexture
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
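/*
 * Illustrative usage sketch (\p texRef2D, \p width, and \p height are
 * hypothetical): binding pitched 2D device memory to a 2D texture reference.
 * ::cudaMallocPitch() returns a pitch that already satisfies
 * ::cudaDeviceProp::texturePitchAlignment.
 *
 * \code
 texture<float, 2, cudaReadModeElementType> texRef2D;  // file-scope declaration

 float *d_buf;
 size_t pitch;
 size_t width = 640, height = 480;
 cudaMallocPitch((void**)&d_buf, &pitch, width * sizeof(float), height);
 struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
 cudaBindTexture2D(NULL, texRef2D, d_buf, desc, width, height, pitch);
 // ... launch kernels that read via tex2D(texRef2D, x, y) ...
 cudaUnbindTexture(texRef2D);
 * \endcode
 */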
/**
* \brief Binds an array to a texture
*
* Binds the CUDA array \p array to the texture reference \p texref.
* \p desc describes how the memory is interpreted when fetching values from
* the texture. Any CUDA array previously bound to \p texref is unbound.
*
* \param texref - Texture to bind
* \param array - Memory array on device
* \param desc - Channel format
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidTexture
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
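/*
 * Illustrative usage sketch (\p cuArray is a hypothetical CUDA array):
 * binding an array to a texture reference using the array's own channel
 * descriptor, via the C++ API overload.
 *
 * \code
 texture<float, 2, cudaReadModeElementType> texRef2D;  // file-scope declaration

 struct cudaChannelFormatDesc desc;
 cudaGetChannelDesc(&desc, cuArray);
 cudaBindTextureToArray(texRef2D, cuArray, desc);
 * \endcode
 */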
/**
* \brief Binds a mipmapped array to a texture
*
* Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p texref.
* \p desc describes how the memory is interpreted when fetching values from
* the texture. Any CUDA mipmapped array previously bound to \p texref is unbound.
*
* \param texref - Texture to bind
* \param mipmappedArray - Memory mipmapped array on device
* \param desc - Channel format
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidDevicePointer,
* ::cudaErrorInvalidTexture
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc);
/**
* \brief Unbinds a texture
*
* Unbinds the texture bound to \p texref.
*
* \param texref - Texture to unbind
*
* \return
* ::cudaSuccess
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref);
/**
* \brief Get the alignment offset of a texture
*
* Returns in \p *offset the offset that was returned when texture reference
* \p texref was bound.
*
* \param offset - Offset of texture reference in bytes
* \param texref - Texture to get offset of
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidTexture,
* ::cudaErrorInvalidTextureBinding
* \notefnerr
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
/**
* \brief Get the texture reference associated with a symbol
*
 * Returns in \p *texref the structure associated with the texture reference
* defined by symbol \p symbol.
*
* \param texref - Texture reference associated with symbol
* \param symbol - Texture to get reference for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidTexture
* \notefnerr
* \note_string_api_deprecation_50
*
* \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
* ::cudaGetChannelDesc,
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
* \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
* \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
* \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
* \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol);
/** @} */ /* END CUDART_TEXTURE */
/**
* \defgroup CUDART_SURFACE Surface Reference Management
*
* ___MANBRIEF___ surface reference management functions of the CUDA runtime
* API (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the low level surface reference management functions
* of the CUDA runtime application programming interface.
*
* Some functions have overloaded C++ API template versions documented separately in the
* \ref CUDART_HIGHLEVEL "C++ API Routines" module.
*
* @{
*/
/**
* \brief Binds an array to a surface
*
* Binds the CUDA array \p array to the surface reference \p surfref.
* \p desc describes how the memory is interpreted when fetching values from
* the surface. Any CUDA array previously bound to \p surfref is unbound.
*
* \param surfref - Surface to bind
* \param array - Memory array on device
* \param desc - Channel format
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSurface
* \notefnerr
*
* \sa \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)",
* \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)",
* ::cudaGetSurfaceReference
*/
extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
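/*
 * Illustrative usage sketch (\p cuArray is a hypothetical CUDA array created
 * with the cudaArraySurfaceLoadStore flag): binding the array to a surface
 * reference using the C++ API overload with the inherited channel descriptor.
 *
 * \code
 surface<void, 2> surfRef;  // file-scope declaration

 cudaBindSurfaceToArray(surfRef, cuArray);
 * \endcode
 */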
/**
* \brief Get the surface reference associated with a symbol
*
 * Returns in \p *surfref the structure associated with the surface reference
* defined by symbol \p symbol.
*
* \param surfref - Surface reference associated with symbol
* \param symbol - Surface to get reference for
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidSurface
* \notefnerr
* \note_string_api_deprecation_50
*
* \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)"
*/
extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol);
/** @} */ /* END CUDART_SURFACE */
/**
* \defgroup CUDART_TEXTURE_OBJECT Texture Object Management
*
* ___MANBRIEF___ texture object management functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the low level texture object management functions
* of the CUDA runtime application programming interface. The texture
* object API is only supported on devices of compute capability 3.0 or higher.
*
* @{
*/
/**
* \brief Creates a texture object
*
* Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
* the data to texture from. \p pTexDesc describes how the data should be sampled.
* \p pResViewDesc is an optional argument that specifies an alternate format for
* the data described by \p pResDesc, and also describes the subresource region
* to restrict access to when texturing. \p pResViewDesc can only be specified if
* the type of resource is a CUDA array or a CUDA mipmapped array.
*
* Texture objects are only supported on devices of compute capability 3.0 or higher.
* Additionally, a texture object is an opaque value, and, as such, should only be
* accessed through CUDA API calls.
*
* The ::cudaResourceDesc structure is defined as:
* \code
struct cudaResourceDesc {
enum cudaResourceType resType;
union {
struct {
cudaArray_t array;
} array;
struct {
cudaMipmappedArray_t mipmap;
} mipmap;
struct {
void *devPtr;
struct cudaChannelFormatDesc desc;
size_t sizeInBytes;
} linear;
struct {
void *devPtr;
struct cudaChannelFormatDesc desc;
size_t width;
size_t height;
size_t pitchInBytes;
} pitch2D;
} res;
};
* \endcode
* where:
* - ::cudaResourceDesc::resType specifies the type of resource to texture from.
 * ::cudaResourceType is defined as:
* \code
enum cudaResourceType {
cudaResourceTypeArray = 0x00,
cudaResourceTypeMipmappedArray = 0x01,
cudaResourceTypeLinear = 0x02,
cudaResourceTypePitch2D = 0x03
};
* \endcode
*
* \par
* If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array
* must be set to a valid CUDA array handle.
*
* \par
* If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap
* must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true.
*
* \par
* If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr
* must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
* ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes
* specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
* ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)).
*
* \par
* If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
* must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
* ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
* and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
* ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
* ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
* ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
*
*
* The ::cudaTextureDesc struct is defined as
* \code
struct cudaTextureDesc {
enum cudaTextureAddressMode addressMode[3];
enum cudaTextureFilterMode filterMode;
enum cudaTextureReadMode readMode;
int sRGB;
int normalizedCoords;
unsigned int maxAnisotropy;
enum cudaTextureFilterMode mipmapFilterMode;
float mipmapLevelBias;
float minMipmapLevelClamp;
float maxMipmapLevelClamp;
};
* \endcode
* where
* - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
* \code
enum cudaTextureAddressMode {
cudaAddressModeWrap = 0,
cudaAddressModeClamp = 1,
cudaAddressModeMirror = 2,
cudaAddressModeBorder = 3
};
* \endcode
* This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
* is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
*
* - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
* \code
enum cudaTextureFilterMode {
cudaFilterModePoint = 0,
cudaFilterModeLinear = 1
};
* \endcode
* This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
*
* - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
* \code
enum cudaTextureReadMode {
cudaReadModeElementType = 0,
cudaReadModeNormalizedFloat = 1
};
* \endcode
 * Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer formats would not be promoted, regardless of
 * whether or not ::cudaTextureDesc::readMode is set to ::cudaReadModeNormalizedFloat.
*
* - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
*
* - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
*
 * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
* clamped to the range [1,16].
*
* - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
*
* - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
*
* - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
*
* - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
*
*
* The ::cudaResourceViewDesc struct is defined as
* \code
struct cudaResourceViewDesc {
enum cudaResourceViewFormat format;
size_t width;
size_t height;
size_t depth;
unsigned int firstMipmapLevel;
unsigned int lastMipmapLevel;
unsigned int firstLayer;
unsigned int lastLayer;
};
* \endcode
* where:
* - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
* be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
* compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
 * with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
* a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
* format but with 4 channels.
*
* - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
* compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
* this value has to be equal to that of the original resource.
*
* - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
* compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
* this value has to be equal to that of the original resource.
*
* - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
* original resource.
*
* - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
 * For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp
 * will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
* then the actual minimum mipmap level clamp will be 3.2.
*
* - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
* has to be zero.
*
* - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
* For non-layered resources, this value has to be zero.
*
* - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources,
* this value has to be zero.
*
*
* \param pTexObject - Texture object to create
* \param pResDesc - Resource descriptor
* \param pTexDesc - Texture descriptor
* \param pResViewDesc - Resource view descriptor
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaDestroyTextureObject
*/
extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
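/*
 * Illustrative usage sketch (\p cuArray is a hypothetical CUDA array):
 * creating a texture object that samples the array with linear filtering,
 * clamp addressing, and normalized float reads.
 *
 * \code
 struct cudaResourceDesc resDesc;
 memset(&resDesc, 0, sizeof(resDesc));
 resDesc.resType = cudaResourceTypeArray;
 resDesc.res.array.array = cuArray;

 struct cudaTextureDesc texDesc;
 memset(&texDesc, 0, sizeof(texDesc));
 texDesc.addressMode[0]   = cudaAddressModeClamp;
 texDesc.addressMode[1]   = cudaAddressModeClamp;
 texDesc.filterMode       = cudaFilterModeLinear;
 texDesc.readMode         = cudaReadModeNormalizedFloat;
 texDesc.normalizedCoords = 1;

 cudaTextureObject_t texObj = 0;
 cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
 // ... pass texObj to kernels that call tex2D<float>(texObj, u, v) ...
 cudaDestroyTextureObject(texObj);
 * \endcode
 */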
/**
* \brief Destroys a texture object
*
* Destroys the texture object specified by \p texObject.
*
* \param texObject - Texture object to destroy
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaCreateTextureObject
*/
extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
/**
* \brief Returns a texture object's resource descriptor
*
* Returns the resource descriptor for the texture object specified by \p texObject.
*
* \param pResDesc - Resource descriptor
* \param texObject - Texture object
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaCreateTextureObject
*/
extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
/**
* \brief Returns a texture object's texture descriptor
*
* Returns the texture descriptor for the texture object specified by \p texObject.
*
* \param pTexDesc - Texture descriptor
* \param texObject - Texture object
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaCreateTextureObject
*/
extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
/**
* \brief Returns a texture object's resource view descriptor
*
* Returns the resource view descriptor for the texture object specified by \p texObject.
* If no resource view was specified, ::cudaErrorInvalidValue is returned.
*
* \param pResViewDesc - Resource view descriptor
* \param texObject - Texture object
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaCreateTextureObject
*/
extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
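/*
 * Illustrative usage sketch (\p texObj is a hypothetical texture object):
 * reading back the descriptors the object was created with.
 *
 * \code
 struct cudaResourceDesc resDesc;
 struct cudaTextureDesc texDesc;
 cudaGetTextureObjectResourceDesc(&resDesc, texObj);
 cudaGetTextureObjectTextureDesc(&texDesc, texObj);
 * \endcode
 */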
/** @} */ /* END CUDART_TEXTURE_OBJECT */
/**
* \defgroup CUDART_SURFACE_OBJECT Surface Object Management
*
* ___MANBRIEF___ surface object management functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
 * This section describes the low level surface object management functions
* of the CUDA runtime application programming interface. The surface object
* API is only supported on devices of compute capability 3.0 or higher.
*
* @{
*/
/**
* \brief Creates a surface object
*
* Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
* the data to perform surface load/stores on. ::cudaResourceDesc::resType must be
* ::cudaResourceTypeArray and ::cudaResourceDesc::res::array::array
* must be set to a valid CUDA array handle.
*
* Surface objects are only supported on devices of compute capability 3.0 or higher.
* Additionally, a surface object is an opaque value, and, as such, should only be
* accessed through CUDA API calls.
*
* \param pSurfObject - Surface object to create
* \param pResDesc - Resource descriptor
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaDestroySurfaceObject
*/
extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
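/*
 * Illustrative usage sketch (\p cuArray is a hypothetical CUDA array
 * allocated with the cudaArraySurfaceLoadStore flag): creating and destroying
 * a surface object over the array.
 *
 * \code
 struct cudaResourceDesc resDesc;
 memset(&resDesc, 0, sizeof(resDesc));
 resDesc.resType = cudaResourceTypeArray;
 resDesc.res.array.array = cuArray;

 cudaSurfaceObject_t surfObj = 0;
 cudaCreateSurfaceObject(&surfObj, &resDesc);
 // ... pass surfObj to kernels that call surf2Dread()/surf2Dwrite() ...
 cudaDestroySurfaceObject(surfObj);
 * \endcode
 */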
/**
* \brief Destroys a surface object
*
* Destroys the surface object specified by \p surfObject.
*
* \param surfObject - Surface object to destroy
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaCreateSurfaceObject
*/
extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
/**
* \brief Returns a surface object's resource descriptor
* Returns the resource descriptor for the surface object specified by \p surfObject.
*
* \param pResDesc - Resource descriptor
* \param surfObject - Surface object
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaCreateSurfaceObject
*/
extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
/** @} */ /* END CUDART_SURFACE_OBJECT */
/**
 * \defgroup CUDART__VERSION Version Management
 *
 * ___MANBRIEF___ version management functions of the CUDA runtime API
 * (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the version management functions of the CUDA runtime
 * application programming interface.
 *
 * @{
*/
/**
* \brief Returns the CUDA driver version
*
* Returns in \p *driverVersion the version number of the installed CUDA
* driver. If no driver is installed, then 0 is returned as the driver
* version (via \p driverVersion). This function automatically returns
* ::cudaErrorInvalidValue if the \p driverVersion argument is NULL.
*
* \param driverVersion - Returns the CUDA driver version.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
* \notefnerr
*
* \sa ::cudaRuntimeGetVersion
*/
extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
/**
* \brief Returns the CUDA Runtime version
*
* Returns in \p *runtimeVersion the version number of the installed CUDA
* Runtime. This function automatically returns ::cudaErrorInvalidValue if
* the \p runtimeVersion argument is NULL.
*
* \param runtimeVersion - Returns the CUDA Runtime version.
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue
*
* \sa ::cudaDriverGetVersion
*/
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
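/*
 * Illustrative usage sketch: querying both version numbers. Versions are
 * encoded as 1000 * major + 10 * minor, so e.g. 8000 corresponds to CUDA 8.0.
 *
 * \code
 int driverVersion = 0, runtimeVersion = 0;
 cudaDriverGetVersion(&driverVersion);
 cudaRuntimeGetVersion(&runtimeVersion);
 printf("driver %d, runtime %d\n", driverVersion, runtimeVersion);
 * \endcode
 */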
/** @} */ /* END CUDART__VERSION */
/** \cond impl_private */
extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
/** \endcond impl_private */
/**
* \defgroup CUDART_HIGHLEVEL C++ API Routines
*
* ___MANBRIEF___ C++ high level API functions of the CUDA runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the C++ high level API functions of the CUDA runtime
* application programming interface. To use these functions, your
* application needs to be compiled with the \p nvcc compiler.
*
* \brief C++-style interface built on top of CUDA runtime API
*/
/**
* \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
*
* ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
* (___CURRENT_FILE___) ___ENDMANBRIEF___
*
* This section describes the interactions between the CUDA Driver API and the CUDA Runtime API
*
* @{
*
* \section CUDART_CUDA_primary Primary Contexts
*
 * There exists a one-to-one relationship between CUDA devices in the CUDA Runtime
* API and ::CUcontext s in the CUDA Driver API within a process. The specific
* context which the CUDA Runtime API uses for a device is called the device's
* primary context. From the perspective of the CUDA Runtime API, a device and
* its primary context are synonymous.
*
* \section CUDART_CUDA_init Initialization and Tear-Down
*
 * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current to
 * the calling host thread.
*
* The function ::cudaSetDevice() makes the primary context for the
* specified device current to the calling thread by calling ::cuCtxSetCurrent().
*
* The CUDA Runtime API will automatically initialize the primary context for
* a device at the first CUDA Runtime API call which requires an active context.
* If no ::CUcontext is current to the calling thread when a CUDA Runtime API call
* which requires an active context is made, then the primary context for a device
* will be selected, made current to the calling thread, and initialized.
*
* The context which the CUDA Runtime API initializes will be initialized using
* the parameters specified by the CUDA Runtime API functions
* ::cudaSetDeviceFlags(),
* ::cudaD3D9SetDirect3DDevice(),
* ::cudaD3D10SetDirect3DDevice(),
* ::cudaD3D11SetDirect3DDevice(),
* ::cudaGLSetGLDevice(), and
* ::cudaVDPAUSetVDPAUDevice().
* Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are
* called when the primary context for the specified device has already been initialized.
* (or if the current device has already been initialized, in the case of
* ::cudaSetDeviceFlags()).
*
* Primary contexts will remain active until they are explicitly deinitialized
* using ::cudaDeviceReset(). The function ::cudaDeviceReset() will deinitialize the
* primary context for the calling thread's current device immediately. The context
* will remain current to all of the threads that it was current to. The next CUDA
* Runtime API call on any thread which requires an active context will trigger the
* reinitialization of that device's primary context.
*
* Note that there is no reference counting of the primary context's lifetime. It is
* recommended that the primary context not be deinitialized except just before exit
* or to recover from an unspecified launch failure.
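 *
 * A minimal initialization/teardown sketch (illustrative only):
 * \code
 cudaSetDevice(0);   // make device 0's primary context current to this thread
 // ... CUDA Runtime API calls and kernel launches ...
 cudaDeviceReset();  // deinitialize the current device's primary context
 * \endcode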
*
* \section CUDART_CUDA_context Context Interoperability
*
* Note that the use of multiple ::CUcontext s per device within a single process
* will substantially degrade performance and is strongly discouraged. Instead,
* it is highly recommended that the implicit one-to-one device-to-context mapping
* for the process provided by the CUDA Runtime API be used.
*
* If a non-primary ::CUcontext created by the CUDA Driver API is current to a
* thread then the CUDA Runtime API calls to that thread will operate on that
* ::CUcontext, with some exceptions listed below. Interoperability between data
* types is discussed in the following sections.
*
* The function ::cudaPointerGetAttributes() will return the error
* ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a
* non-primary context. The function ::cudaDeviceEnablePeerAccess() and the rest of
* the peer access API may not be called when a non-primary ::CUcontext is current.
* To use the pointer query and peer access APIs with a context created using the
* CUDA Driver API, it is necessary that the CUDA Driver API be used to access
* these features.
*
 * All CUDA Runtime API state (e.g., global variables' addresses and values) travels
* with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one
* thread to another then all CUDA Runtime API state will move to that thread as well.
*
* Please note that attaching to legacy contexts (those with a version of 3010 as returned
* by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return
* ::cudaErrorIncompatibleDriverContext in such cases.
*
* \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t
*
* The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably.
*
* \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t
*
* The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably.
*
* \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t
*
* The types ::CUarray and struct ::cudaArray * represent the same data type and may be used
* interchangeably by casting the two types between each other.
*
* In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *,
* it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
*
* In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray,
* it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray .
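 *
 * A minimal casting sketch (illustrative only; the variable names are hypothetical):
 * \code
 CUarray cuArr;      // Driver API handle
 cudaArray_t rtArr;  // Runtime API handle
 rtArr = (cudaArray_t)cuArr;  // use a CUarray with the Runtime API
 cuArr = (CUarray)rtArr;      // use a cudaArray_t with the Driver API
 * \endcode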
*
* \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t
*
* The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used
* interchangeably by casting the two types between each other.
*
* In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a
* ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource
* to a ::cudaGraphicsResource_t.
*
* In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a
* ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t
* to a ::CUgraphicsResource.
*
* @}
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
#undef cudaMemcpy
#undef cudaMemcpyToSymbol
#undef cudaMemcpyFromSymbol
#undef cudaMemcpy2D
#undef cudaMemcpyToArray
#undef cudaMemcpy2DToArray
#undef cudaMemcpyFromArray
#undef cudaMemcpy2DFromArray
#undef cudaMemcpyArrayToArray
#undef cudaMemcpy2DArrayToArray
#undef cudaMemcpy3D
#undef cudaMemcpy3DPeer
#undef cudaMemset
#undef cudaMemset2D
#undef cudaMemset3D
#undef cudaMemcpyAsync
#undef cudaMemcpyToSymbolAsync
#undef cudaMemcpyFromSymbolAsync
#undef cudaMemcpy2DAsync
#undef cudaMemcpyToArrayAsync
#undef cudaMemcpy2DToArrayAsync
#undef cudaMemcpyFromArrayAsync
#undef cudaMemcpy2DFromArrayAsync
#undef cudaMemcpy3DAsync
#undef cudaMemcpy3DPeerAsync
#undef cudaMemsetAsync
#undef cudaMemset2DAsync
#undef cudaMemset3DAsync
#undef cudaStreamQuery
#undef cudaStreamGetFlags
#undef cudaStreamGetPriority
#undef cudaEventRecord
#undef cudaStreamWaitEvent
#undef cudaStreamAddCallback
#undef cudaStreamAttachMemAsync
#undef cudaStreamSynchronize
#undef cudaLaunch
#undef cudaLaunchKernel
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags);
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func);
extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
// nvcc stubs reference the 'cudaLaunch' identifier even if it was defined
// to 'cudaLaunch_ptsz'. Redirect through a static inline function.
#undef cudaLaunch
static __inline__ __host__ cudaError_t cudaLaunch(const void *func)
{
return cudaLaunch_ptsz(func);
}
#define cudaLaunch __CUDART_API_PTSZ(cudaLaunch)
#endif
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#undef __dv
#endif /* !__CUDA_RUNTIME_API_H__ */