diff --git a/CMakeLists.txt b/CMakeLists.txt
index e252063df..76575c5e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,14 +8,6 @@ add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/lib/external/)
 
-#CUDA and OpenCL
-find_package(CUDA QUIET)
-find_package(OpenCL QUIET REQUIRED)
-if(CUDA_FOUND)
-    set(BACKEND_DEFINES "-DISAAC_WITH_CUDA")
-    include_directories(${CUDA_INCLUDE_DIRS})
-endif()
-
 #Compiler flags
 add_definitions(${BACKEND_DEFINES})
 if(WIN32)
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index 153487968..16e6a5369 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -51,5 +51,5 @@ foreach(PROG blas)
         add_executable(${PROG}-bench ${PROG}.cpp)
         set_target_properties(${PROG}-bench PROPERTIES COMPILE_FLAGS "${BLAS_DEF_STR}")
     endif()
-    target_link_libraries(${PROG}-bench ${BLAS_LIBS} isaac ${OPENCL_LIBRARIES} )
+    target_link_libraries(${PROG}-bench ${BLAS_LIBS} isaac)
 endforeach(PROG)
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
deleted file mode 100644
index abe82c1f9..000000000
--- a/include/CL/cl.hpp
+++ /dev/null
@@ -1,11758 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2008-2013 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- ******************************************************************************/
-
-/*! \file
- *
- *   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and
- *       OpenCL 1.2 (rev 15)
- *   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
- *
- *   Additions and fixes from:
- *       Brian Cole, March 3rd 2010 and April 2012
- *       Matt Gruenke, April 2012.
- *       Bruce Merry, February 2013.
- *       Tom Deakin and Simon McIntosh-Smith, July 2013
- *
- *   \version 1.2.6
- *   \date August 2013
- *
- *   Optional extension support
- *
- *         cl
- *         cl_ext_device_fission
- *              #define USE_CL_DEVICE_FISSION
- */
-
-/*! \mainpage
- * \section intro Introduction
- * For many large applications C++ is the language of choice and so it seems
- * reasonable to define C++ bindings for OpenCL.
- *
- *
- * The interface is contained within a single C++ header file \em cl.hpp and all
- * definitions are contained within the namespace \em cl. There is no additional
- * requirement to include \em cl.h and to use either the C++ or original C
- * bindings it is enough to simply include \em cl.hpp.
- *
- * The bindings themselves are lightweight and correspond closely to the
- * underlying C API. Using the C++ bindings introduces no additional execution
- * overhead.
- *
- * For detailed documentation on the bindings see:
- *
- * The OpenCL C++ Wrapper API 1.2 (revision 09)
- *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
- *
- * \section example Example
- *
- * The following example shows a general use case for the C++
- * bindings, including support for the optional exception feature and
- * also the supplied vector and string classes, see following sections for
- * descriptions of these features.
- *
- * \code
- * #define __CL_ENABLE_EXCEPTIONS
- *
- * #if defined(__APPLE__) || defined(__MACOSX)
- * #include <OpenCL/cl.hpp>
- * #else
- * #include <CL/cl.hpp>
- * #endif
- * #include <cstdio>
- * #include <cstdlib>
- * #include <iostream>
- *
- *  const char * helloStr  = "__kernel void "
- *                           "hello(void) "
- *                           "{ "
- *                           "  "
- *                           "} ";
- *
- *  int
- *  main(void)
- *  {
- *     cl_int err = CL_SUCCESS;
- *     try {
- *
- *       std::vector<cl::Platform> platforms;
- *       cl::Platform::get(&platforms);
- *       if (platforms.size() == 0) {
- *           std::cout << "Platform size 0\n";
- *           return -1;
- *       }
- *
- *       cl_context_properties properties[] =
- *          { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
- *       cl::Context context(CL_DEVICE_TYPE_CPU, properties);
- *
- *       std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
- *
- *       cl::Program::Sources source(1,
- *           std::make_pair(helloStr,strlen(helloStr)));
- *       cl::Program program_ = cl::Program(context, source);
- *       program_.build(devices);
- *
- *       cl::Kernel kernel(program_, "hello", &err);
- *
- *       cl::Event event;
- *       cl::CommandQueue queue(context, devices[0], 0, &err);
- *       queue.enqueueNDRangeKernel(
- *           kernel,
- *           cl::NullRange,
- *           cl::NDRange(4,4),
- *           cl::NullRange,
- *           NULL,
- *           &event);
- *
- *       event.wait();
- *     }
- *     catch (cl::Error err) {
- *        std::cerr
- *           << "ERROR: "
- *           << err.what()
- *           << "("
- *           << err.err()
- *           << ")"
- *           << std::endl;
- *     }
- *
- *    return EXIT_SUCCESS;
- *  }
- *
- * \endcode
- *
- */
-#ifndef CL_HPP_
-#define CL_HPP_
-
-#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
-
-#ifdef _WIN32
-
-#include <windows.h>
-#include <malloc.h>
-#include <iterator>
-#include <intrin.h>
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
-#include <exception>
-#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
-
-#pragma push_macro("max")
-#undef max
-
-#endif // _WIN32
-
-#if defined(__APPLE__) || defined(__MACOSX)
-#include <OpenGL/OpenGL.h>
-#include <OpenCL/opencl.h>
-#else
-#include <CL/opencl.h>
-#endif // !__APPLE__
-
-//
-#if defined(USE_CL_DEVICE_FISSION)
-#include <CL/cl_ext.h>
-#endif
-
-// To avoid accidentally taking ownership of core OpenCL types
-// such as cl_kernel constructors are made explicit
-// under OpenCL 1.2
-#undef CL_VERSION_1_2
-
-#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS explicit
-#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-#define __CL_EXPLICIT_CONSTRUCTORS
-#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-
-// Define deprecated prefixes and suffixes to ensure compilation
-// in case they are not pre-defined
-#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
-#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
-#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
-
-#if !defined(CL_CALLBACK)
-#define CL_CALLBACK
-#endif //CL_CALLBACK
-
-#include <utility>
-#include <limits>
-
-#if !defined(__NO_STD_VECTOR)
-#include <vector>
-#endif
-
-#if !defined(__NO_STD_STRING)
-#include <string>
-#endif
-
-#if defined(ANDROID)
-#include <malloc.h>
-#endif
-
-#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
-#include <alloca.h>
-
-#include <emmintrin.h>
-#include <xmmintrin.h>
-#endif // linux
-
-#include <cstring>
-
-
-/*! \namespace cl
- *
- * \brief The OpenCL C++ bindings are defined within this namespace.
- *
- */
-namespace cl {
-
-class Memory;
-
-/**
- * Deprecated APIs for 1.2
- */
-#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
-#define __INIT_CL_EXT_FCN_PTR(name) \
-    if(!pfn_##name) { \
-        pfn_##name = (PFN_##name) \
-            clGetExtensionFunctionAddress(#name); \
-        if(!pfn_##name) { \
-        } \
-    }
-#endif // #if defined(CL_VERSION_1_1)
-
-#if defined(CL_VERSION_1_2)
-#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
-    if(!pfn_##name) { \
-        pfn_##name = (PFN_##name) \
-            clGetExtensionFunctionAddressForPlatform(platform, #name); \
-        if(!pfn_##name) { \
-        } \
-    }
-#endif // #if defined(CL_VERSION_1_1)
-
-class Program;
-class Device;
-class Context;
-class CommandQueue;
-class Memory;
-class Buffer;
-
-#if defined(__CL_ENABLE_EXCEPTIONS)
-/*! \brief Exception class
- *
- *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
- */
-class Error : public std::exception
-{
-private:
-    cl_int err_;
-    const char * errStr_;
-public:
-    /*! \brief Create a new CL error exception for a given error code
-     *  and corresponding message.
-     *
-     *  \param err error code value.
-     *
-     *  \param errStr a descriptive string that must remain in scope until
-     *                handling of the exception has concluded.  If set, it
-     *                will be returned by what().
-     */
-    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
-    {}
-
-    ~Error() throw() {}
-
-    /*! \brief Get error string associated with exception
-     *
-     * \return A memory pointer to the error message string.
-     */
-    virtual const char * what() const throw ()
-    {
-        if (errStr_ == NULL) {
-            return "empty";
-        }
-        else {
-            return errStr_;
-        }
-    }
-
-    /*! \brief Get error code associated with exception
-     *
-     *  \return The error code.
-     */
-    cl_int err(void) const { return err_; }
-};
-
-#define __ERR_STR(x) #x
-#else
-#define __ERR_STR(x) NULL
-#endif // __CL_ENABLE_EXCEPTIONS
-
-
-namespace detail
-{
-#if defined(__CL_ENABLE_EXCEPTIONS)
-static inline cl_int errHandler (
-    cl_int err,
-    const char * errStr = NULL)
-{
-    if (err != CL_SUCCESS) {
-        throw Error(err, errStr);
-    }
-    return err;
-}
-#else
-static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
-{
-    (void) errStr; // suppress unused variable warning
-    return err;
-}
-#endif // __CL_ENABLE_EXCEPTIONS
-}
-
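For readers skimming the removed header: the Error class and detail::errHandler above give the bindings their dual error model. Below is a minimal sketch of how that looks at a call site, assuming this header (as it existed before the deletion) and an OpenCL SDK are on the include path; it is illustrative, not part of the diff.

#define __CL_ENABLE_EXCEPTIONS     // comment this out for the cl_int return-code model
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
#include <iostream>
#include <vector>

int main()
{
    try {
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);   // errHandler throws cl::Error on failure
        std::cout << platforms.size() << " platform(s)\n";
    }
    catch (cl::Error& e) {
        // err() carries the raw cl_int; what() is the __ERR_STR call-site name
        std::cerr << e.what() << " (" << e.err() << ")\n";
        return 1;
    }
    return 0;
}

Without __CL_ENABLE_EXCEPTIONS, the same get() call simply returns the cl_int for the caller to check.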
\cond DOXYGEN_DETAIL -#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo) -#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) -#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) -#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) -#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) -#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) -#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) -#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) -#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) -#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) -#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) -#if defined(CL_VERSION_1_2) -#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo) -#endif // #if defined(CL_VERSION_1_2) -#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) -#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) -#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) -#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) - -#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext) -#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) -#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) - -#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) -#define __COPY_ERR __ERR_STR(cl::copy) -#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) -#if defined(CL_VERSION_1_2) -#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage) -#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture) -#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions) -#endif // #if defined(CL_VERSION_1_2) -#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) -#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) - -#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) -#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) -#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) -#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) - -#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) -#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) -#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) -#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) -#if defined(CL_VERSION_1_2) -#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels) -#endif // #if defined(CL_VERSION_1_2) -#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) -#if defined(CL_VERSION_1_2) -#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram) - -#endif // #if defined(CL_VERSION_1_2) -#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) - -#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) -#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) -#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) -#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) -#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer) -#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect) -#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) -#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) -#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer) -#define __ENQUEUE_READ_IMAGE_ERR 
__ERR_STR(clEnqueueReadImage) -#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) -#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) -#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage) -#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) -#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) -#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) -#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) -#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) -#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) -#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) -#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) -#if defined(CL_VERSION_1_2) -#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects) -#endif // #if defined(CL_VERSION_1_2) - -#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) -#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) - - -#define __RETAIN_ERR __ERR_STR(Retain Object) -#define __RELEASE_ERR __ERR_STR(Release Object) -#define __FLUSH_ERR __ERR_STR(clFlush) -#define __FINISH_ERR __ERR_STR(clFinish) -#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error) - -/** - * CL 1.2 version that uses device fission. - */ -#if defined(CL_VERSION_1_2) -#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices) -#else -#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) -#endif // #if defined(CL_VERSION_1_2) - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) -#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) -#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) -#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) -#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) -#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D) -#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D) -#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) -#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) -#endif // #if defined(CL_VERSION_1_1) - -#endif // __CL_USER_OVERRIDE_ERROR_STRINGS -//! \endcond - -/** - * CL 1.2 marker and barrier commands - */ -#if defined(CL_VERSION_1_2) -#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList) -#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList) -#endif // #if defined(CL_VERSION_1_2) - -#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) -typedef std::string STRING_CLASS; -#elif !defined(__USE_DEV_STRING) - -/*! \class string - * \brief Simple string class, that provides a limited subset of std::string - * functionality but avoids many of the issues that come with that class. - - * \note Deprecated. Please use std::string as default or - * re-define the string class to match the std::string - * interface by defining STRING_CLASS - */ -class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED -{ -private: - ::size_t size_; - char * str_; -public: - //! \brief Constructs an empty string, allocating no memory. - string(void) : size_(0), str_(NULL) - { - } - - /*! \brief Constructs a string populated from an arbitrary value of - * specified size. - * - * An extra '\0' is added, in case none was contained in str. - * - * \param str the initial value of the string instance. 
Note that '\0' - * characters receive no special treatment. If NULL, - * the string is left empty, with a size of 0. - * - * \param size the number of characters to copy from str. - */ - string(const char * str, ::size_t size) : - size_(size), - str_(NULL) - { - if( size > 0 ) { - str_ = new char[size_+1]; - if (str_ != NULL) { - memcpy(str_, str, size_ * sizeof(char)); - str_[size_] = '\0'; - } - else { - size_ = 0; - } - } - } - - /*! \brief Constructs a string populated from a null-terminated value. - * - * \param str the null-terminated initial value of the string instance. - * If NULL, the string is left empty, with a size of 0. - */ - string(const char * str) : - size_(0), - str_(NULL) - { - if( str ) { - size_= ::strlen(str); - } - if( size_ > 0 ) { - str_ = new char[size_ + 1]; - if (str_ != NULL) { - memcpy(str_, str, (size_ + 1) * sizeof(char)); - } - } - } - - void resize( ::size_t n ) - { - if( size_ == n ) { - return; - } - if (n == 0) { - if( str_ ) { - delete [] str_; - } - str_ = NULL; - size_ = 0; - } - else { - char *newString = new char[n + 1]; - int copySize = n; - if( size_ < n ) { - copySize = size_; - } - size_ = n; - - if(str_) { - memcpy(newString, str_, (copySize + 1) * sizeof(char)); - } - if( copySize < size_ ) { - memset(newString + copySize, 0, size_ - copySize); - } - newString[size_] = '\0'; - - delete [] str_; - str_ = newString; - } - } - - const char& operator[] ( ::size_t pos ) const - { - return str_[pos]; - } - - char& operator[] ( ::size_t pos ) - { - return str_[pos]; - } - - /*! \brief Copies the value of another string to this one. - * - * \param rhs the string to copy. - * - * \returns a reference to the modified instance. - */ - string& operator=(const string& rhs) - { - if (this == &rhs) { - return *this; - } - - if( str_ != NULL ) { - delete [] str_; - str_ = NULL; - size_ = 0; - } - - if (rhs.size_ == 0 || rhs.str_ == NULL) { - str_ = NULL; - size_ = 0; - } - else { - str_ = new char[rhs.size_ + 1]; - size_ = rhs.size_; - - if (str_ != NULL) { - memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); - } - else { - size_ = 0; - } - } - - return *this; - } - - /*! \brief Constructs a string by copying the value of another instance. - * - * \param rhs the string to copy. - */ - string(const string& rhs) : - size_(0), - str_(NULL) - { - *this = rhs; - } - - //! \brief Destructor - frees memory used to hold the current value. - ~string() - { - delete[] str_; - str_ = NULL; - } - - //! \brief Queries the length of the string, excluding any added '\0's. - ::size_t size(void) const { return size_; } - - //! \brief Queries the length of the string, excluding any added '\0's. - ::size_t length(void) const { return size(); } - - /*! \brief Returns a pointer to the private copy held by this instance, - * or "" if empty/unset. - */ - const char * c_str(void) const { return (str_) ? str_ : "";} -}; -typedef cl::string STRING_CLASS; -#endif // #elif !defined(__USE_DEV_STRING) - -#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) -#define VECTOR_CLASS std::vector -#elif !defined(__USE_DEV_VECTOR) -#define VECTOR_CLASS cl::vector - -#if !defined(__MAX_DEFAULT_VECTOR_SIZE) -#define __MAX_DEFAULT_VECTOR_SIZE 10 -#endif - -/*! \class vector - * \brief Fixed sized vector implementation that mirroring - * - * \note Deprecated. 
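A note on the deprecated cl::string class that ends here: it is only selected as STRING_CLASS when __NO_STD_STRING is defined; std::string is the default. A small sketch of its semantics, consistent with the implementation shown above (the include paths are the same assumption as before):

#define __NO_STD_STRING            // opt in to cl::string as STRING_CLASS
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.hpp>
#endif
#include <cstdio>

int main()
{
    cl::string a("hello");         // copies the bytes and null-terminates
    cl::string b(a);               // deep copy (copy ctor delegates to operator=)
    b.resize(2);                   // shrinks to "he", keeping the terminator
    std::printf("%s / %u / %s\n", a.c_str(), (unsigned) a.size(), b.c_str());
    return 0;                      // prints: hello / 5 / he
}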
Please use std::vector as default or - * re-define the vector class to match the std::vector - * interface by defining VECTOR_CLASS - - * \note Not recommended for use with custom objects as - * current implementation will construct N elements - * - * std::vector functionality. - * \brief Fixed sized vector compatible with std::vector. - * - * \note - * This differs from std::vector<> not just in memory allocation, - * but also in terms of when members are constructed, destroyed, - * and assigned instead of being copy constructed. - * - * \param T type of element contained in the vector. - * - * \param N maximum size of the vector. - */ -template -class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED -{ -private: - T data_[N]; - unsigned int size_; - -public: - //! \brief Constructs an empty vector with no memory allocated. - vector() : - size_(static_cast(0)) - {} - - //! \brief Deallocates the vector's memory and destroys all of its elements. - ~vector() - { - clear(); - } - - //! \brief Returns the number of elements currently contained. - unsigned int size(void) const - { - return size_; - } - - /*! \brief Empties the vector of all elements. - * \note - * This does not deallocate memory but will invoke destructors - * on contained elements. - */ - void clear() - { - while(!empty()) { - pop_back(); - } - } - - /*! \brief Appends an element after the last valid element. - * Calling this on a vector that has reached capacity will throw an - * exception if exceptions are enabled. - */ - void push_back (const T& x) - { - if (size() < N) { - new (&data_[size_]) T(x); - size_++; - } else { - detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); - } - } - - /*! \brief Removes the last valid element from the vector. - * Calling this on an empty vector will throw an exception - * if exceptions are enabled. - */ - void pop_back(void) - { - if (size_ != 0) { - --size_; - data_[size_].~T(); - } else { - detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); - } - } - - /*! \brief Constructs with a value copied from another. - * - * \param vec the vector to copy. - */ - vector(const vector& vec) : - size_(vec.size_) - { - if (size_ != 0) { - assign(vec.begin(), vec.end()); - } - } - - /*! \brief Constructs with a specified number of initial elements. - * - * \param size number of initial elements. - * - * \param val value of initial elements. - */ - vector(unsigned int size, const T& val = T()) : - size_(0) - { - for (unsigned int i = 0; i < size; i++) { - push_back(val); - } - } - - /*! \brief Overwrites the current content with that copied from another - * instance. - * - * \param rhs vector to copy. - * - * \returns a reference to this. - */ - vector& operator=(const vector& rhs) - { - if (this == &rhs) { - return *this; - } - - if (rhs.size_ != 0) { - assign(rhs.begin(), rhs.end()); - } else { - clear(); - } - - return *this; - } - - /*! \brief Tests equality against another instance. - * - * \param vec the vector against which to compare. - */ - bool operator==(vector &vec) - { - if (size() != vec.size()) { - return false; - } - - for( unsigned int i = 0; i < size(); ++i ) { - if( operator[](i) != vec[i] ) { - return false; - } - } - return true; - } - - //! \brief Conversion operator to T*. - operator T* () { return data_; } - - //! \brief Conversion operator to const T*. - operator const T* () const { return data_; } - - //! \brief Tests whether this instance has any elements. 
- bool empty (void) const - { - return size_==0; - } - - //! \brief Returns the maximum number of elements this instance can hold. - unsigned int max_size (void) const - { - return N; - } - - //! \brief Returns the maximum number of elements this instance can hold. - unsigned int capacity () const - { - return N; - } - - /*! \brief Returns a reference to a given element. - * - * \param index which element to access. * - * \note - * The caller is responsible for ensuring index is >= 0 and < size(). - */ - T& operator[](int index) - { - return data_[index]; - } - - /*! \brief Returns a const reference to a given element. - * - * \param index which element to access. - * - * \note - * The caller is responsible for ensuring index is >= 0 and < size(). - */ - const T& operator[](int index) const - { - return data_[index]; - } - - /*! \brief Assigns elements of the vector based on a source iterator range. - * - * \param start Beginning iterator of source range - * \param end Enditerator of source range - * - * \note - * Will throw an exception if exceptions are enabled and size exceeded. - */ - template - void assign(I start, I end) - { - clear(); - while(start != end) { - push_back(*start); - start++; - } - } - - /*! \class iterator - * \brief Const iterator class for vectors - */ - class iterator - { - private: - const vector *vec_; - int index_; - - /** - * Internal iterator constructor to capture reference - * to the vector it iterates over rather than taking - * the vector by copy. - */ - iterator (const vector &vec, int index) : - vec_(&vec) - { - if( !vec.empty() ) { - index_ = index; - } else { - index_ = -1; - } - } - - public: - iterator(void) : - index_(-1), - vec_(NULL) - { - } - - iterator(const iterator& rhs) : - vec_(rhs.vec_), - index_(rhs.index_) - { - } - - ~iterator(void) {} - - static iterator begin(const cl::vector &vec) - { - iterator i(vec, 0); - - return i; - } - - static iterator end(const cl::vector &vec) - { - iterator i(vec, vec.size()); - - return i; - } - - bool operator==(iterator i) - { - return ((vec_ == i.vec_) && - (index_ == i.index_)); - } - - bool operator!=(iterator i) - { - return (!(*this==i)); - } - - iterator& operator++() - { - ++index_; - return *this; - } - - iterator operator++(int) - { - iterator retVal(*this); - ++index_; - return retVal; - } - - iterator& operator--() - { - --index_; - return *this; - } - - iterator operator--(int) - { - iterator retVal(*this); - --index_; - return retVal; - } - - const T& operator *() const - { - return (*vec_)[index_]; - } - }; - - iterator begin(void) - { - return iterator::begin(*this); - } - - iterator begin(void) const - { - return iterator::begin(*this); - } - - iterator end(void) - { - return iterator::end(*this); - } - - iterator end(void) const - { - return iterator::end(*this); - } - - T& front(void) - { - return data_[0]; - } - - T& back(void) - { - return data_[size_]; - } - - const T& front(void) const - { - return data_[0]; - } - - const T& back(void) const - { - return data_[size_-1]; - } -}; -#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) - - - - - -namespace detail { -#define __DEFAULT_NOT_INITIALIZED 1 -#define __DEFAULT_BEING_INITIALIZED 2 -#define __DEFAULT_INITIALIZED 4 - - /* - * Compare and exchange primitives are needed for handling of defaults - */ - inline int compare_exchange(volatile int * dest, int exchange, int comparand) - { -#ifdef _WIN32 - return (int)(InterlockedCompareExchange( - (volatile long*)dest, - (long)exchange, - (long)comparand)); -#elif 
defined(__APPLE__) || defined(__MACOSX) - return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest); -#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX) - return (__sync_val_compare_and_swap( - dest, - comparand, - exchange)); -#endif // !_WIN32 - } - - #ifdef __SSE2__ - inline void fence() { _mm_mfence(); } - #else - inline void fence() { } // NOOP - #endif -} // namespace detail - - -/*! \brief class used to interface between C++ and - * OpenCL C calls that require arrays of size_t values, whose - * size is known statically. - */ -template -class size_t -{ -private: - ::size_t data_[N]; - -public: - //! \brief Initialize size_t to all 0s - size_t() - { - for( int i = 0; i < N; ++i ) { - data_[i] = 0; - } - } - - ::size_t& operator[](int index) - { - return data_[index]; - } - - const ::size_t& operator[](int index) const - { - return data_[index]; - } - - //! \brief Conversion operator to T*. - operator ::size_t* () { return data_; } - - //! \brief Conversion operator to const T*. - operator const ::size_t* () const { return data_; } -}; - -namespace detail { - -// Generic getInfoHelper. The final parameter is used to guide overload -// resolution: the actual parameter passed is an int, which makes this -// a worse conversion sequence than a specialization that declares the -// parameter as an int. -template -inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) -{ - return f(name, sizeof(T), param, NULL); -} - -// Specialized getInfoHelper for VECTOR_CLASS params -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, long) -{ - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - T* value = (T*) alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - param->assign(&value[0], &value[required/sizeof(T)]); - return CL_SUCCESS; -} - -/* Specialization for reference-counted types. This depends on the - * existence of Wrapper::cl_type, and none of the other types having the - * cl_type member. Note that simplify specifying the parameter as Wrapper - * does not work, because when using a derived type (e.g. Context) the generic - * template will provide a better match. 
- */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int, typename T::cl_type = 0) -{ - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - typename T::cl_type * value = (typename T::cl_type *) alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - ::size_t elements = required / sizeof(typename T::cl_type); - param->assign(&value[0], &value[elements]); - for (::size_t i = 0; i < elements; i++) - { - if (value[i] != NULL) - { - err = (*param)[i].retain(); - if (err != CL_SUCCESS) { - return err; - } - } - } - return CL_SUCCESS; -} - -// Specialized for getInfo -template -inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS* param, int) -{ - cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL); - - if (err != CL_SUCCESS) { - return err; - } - - return CL_SUCCESS; -} - -// Specialized GetInfoHelper for STRING_CLASS params -template -inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long) -{ - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - char* value = (char*) alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - *param = value; - return CL_SUCCESS; -} - -// Specialized GetInfoHelper for cl::size_t params -template -inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) -{ - ::size_t required; - cl_int err = f(name, 0, NULL, &required); - if (err != CL_SUCCESS) { - return err; - } - - ::size_t* value = (::size_t*) alloca(required); - err = f(name, required, value, NULL); - if (err != CL_SUCCESS) { - return err; - } - - for(int i = 0; i < N; ++i) { - (*param)[i] = value[i]; - } - - return CL_SUCCESS; -} - -template struct ReferenceHandler; - -/* Specialization for reference-counted types. This depends on the - * existence of Wrapper::cl_type, and none of the other types having the - * cl_type member. Note that simplify specifying the parameter as Wrapper - * does not work, because when using a derived type (e.g. Context) the generic - * template will provide a better match. 
- */ -template -inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) -{ - typename T::cl_type value; - cl_int err = f(name, sizeof(value), &value, NULL); - if (err != CL_SUCCESS) { - return err; - } - *param = value; - if (value != NULL) - { - err = param->retain(); - if (err != CL_SUCCESS) { - return err; - } - } - return CL_SUCCESS; -} - -#define __PARAM_NAME_INFO_1_0(F) \ - F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \ - F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \ - \ - F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ - F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \ - F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ - F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \ - F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \ - F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ - F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ - F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ - F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ - F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ - F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ - F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ - F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \ - F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ - F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ - F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ - F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ 
- F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ - F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ - F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ - \ - F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ - F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ - F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ - \ - F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ - F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ - F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ - F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ - \ - F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ - F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ - \ - F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ - F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ - F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ - F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ - F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ - F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ - F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ - \ - F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ - F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ - F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ - F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ - F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ - F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ - F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ - \ - F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ - F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ - F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ - F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ - F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ - \ - F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ - F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ - F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ - F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ - F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ - F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ - F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ - \ - F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ - F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ - \ - F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ - F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ - F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ - F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ - F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ - F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ - F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ - \ - F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ - F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ - F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ - F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) - -#if defined(CL_VERSION_1_1) -#define __PARAM_NAME_INFO_1_1(F) \ - F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ - 
F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ - F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ - F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ - F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ - F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ - \ - F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ - F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ - \ - F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ - F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ - \ - F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) -#endif // CL_VERSION_1_1 - - -#if defined(CL_VERSION_1_2) -#define __PARAM_NAME_INFO_1_2(F) \ - F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ - \ - F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ - F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ - \ - F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ - \ - F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ - \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ - F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ - \ - F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ - F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ - F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ - F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ - F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) -#endif // #if defined(CL_VERSION_1_2) - -#if defined(USE_CL_DEVICE_FISSION) -#define __PARAM_NAME_DEVICE_FISSION(F) \ - F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ - F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ - F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ - F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) -#endif // USE_CL_DEVICE_FISSION - -template -struct param_traits {}; - -#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ -struct token; \ -template<> \ -struct param_traits \ -{ \ - enum { value = param_name }; \ - typedef T param_type; \ -}; - -__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) -#if defined(CL_VERSION_1_1) -__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) -#endif // CL_VERSION_1_1 -#if defined(CL_VERSION_1_2) -__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) -#endif // CL_VERSION_1_1 - -#if defined(USE_CL_DEVICE_FISSION) -__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); -#endif // USE_CL_DEVICE_FISSION - -#ifdef CL_PLATFORM_ICD_SUFFIX_KHR -__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) -#endif - 
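The param_traits specializations generated above are what let the typed getInfo<Token>() overloads deduce their return type at compile time. Here is a self-contained mock of that mechanism; the *_MOCK and *_tag names (and getInfoMock) are hypothetical stand-ins, not the header's real identifiers.

// Standalone mock of the param_traits technique; compiles on its own.
#include <iostream>
#include <string>

typedef unsigned int cl_uint_mock;
const cl_uint_mock CL_DEVICE_NAME_MOCK = 0x102B;   // illustrative token value

// Primary template is declared but never defined: only declared tokens compile.
template <typename Tag, cl_uint_mock Name> struct param_traits;

// What __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_NAME, STRING_CLASS)
// generates, in spirit:
struct cl_device_info_tag;
template <>
struct param_traits<cl_device_info_tag, CL_DEVICE_NAME_MOCK>
{
    typedef std::string param_type;                // STRING_CLASS in the header
};

// getInfo<name>() can then return param_traits<...>::param_type by value.
template <cl_uint_mock Name>
typename param_traits<cl_device_info_tag, Name>::param_type getInfoMock()
{
    typedef typename param_traits<cl_device_info_tag, Name>::param_type result_type;
    return result_type("Mock Device");
}

int main()
{
    std::cout << getInfoMock<CL_DEVICE_NAME_MOCK>() << "\n";  // prints a std::string
    return 0;
}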
-#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) -#endif - -#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) -#endif -#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_SIMD_WIDTH_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) -#endif -#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) -#endif - -#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) -#endif -#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) -#endif -#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) -#endif -#ifdef CL_DEVICE_WARP_SIZE_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) -#endif -#ifdef CL_DEVICE_GPU_OVERLAP_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) -#endif -#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) -#endif -#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV -__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) -#endif - -// Convenience functions - -template -inline cl_int -getInfo(Func f, cl_uint name, T* param) -{ - return getInfoHelper(f, name, param, 0); -} - -template -struct GetInfoFunctor0 -{ - Func f_; const Arg0& arg0_; - cl_int operator ()( - cl_uint param, ::size_t size, void* value, ::size_t* size_ret) - { return f_(arg0_, param, size, value, size_ret); } -}; - -template -struct GetInfoFunctor1 -{ - Func f_; const Arg0& arg0_; const Arg1& arg1_; - cl_int operator ()( - cl_uint param, ::size_t size, void* value, ::size_t* size_ret) - { return f_(arg0_, arg1_, param, size, value, size_ret); } -}; - -template -inline cl_int -getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) -{ - GetInfoFunctor0 f0 = { f, arg0 }; - return getInfoHelper(f0, name, param, 0); -} - -template -inline cl_int -getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) -{ - GetInfoFunctor1 f0 = { f, arg0, arg1 }; - return getInfoHelper(f0, name, param, 0); -} - -template -struct ReferenceHandler -{ }; - -#if defined(CL_VERSION_1_2) -/** - * OpenCL 1.2 
devices do have retain/release. - */ -template <> -struct ReferenceHandler -{ - /** - * Retain the device. - * \param device A valid device created using createSubDevices - * \return - * CL_SUCCESS if the function executed successfully. - * CL_INVALID_DEVICE if device was not a valid subdevice - * CL_OUT_OF_RESOURCES - * CL_OUT_OF_HOST_MEMORY - */ - static cl_int retain(cl_device_id device) - { return ::clRetainDevice(device); } - /** - * Retain the device. - * \param device A valid device created using createSubDevices - * \return - * CL_SUCCESS if the function executed successfully. - * CL_INVALID_DEVICE if device was not a valid subdevice - * CL_OUT_OF_RESOURCES - * CL_OUT_OF_HOST_MEMORY - */ - static cl_int release(cl_device_id device) - { return ::clReleaseDevice(device); } -}; -#else // #if defined(CL_VERSION_1_2) -/** - * OpenCL 1.1 devices do not have retain/release. - */ -template <> -struct ReferenceHandler -{ - // cl_device_id does not have retain(). - static cl_int retain(cl_device_id) - { return CL_SUCCESS; } - // cl_device_id does not have release(). - static cl_int release(cl_device_id) - { return CL_SUCCESS; } -}; -#endif // #if defined(CL_VERSION_1_2) - -template <> -struct ReferenceHandler -{ - // cl_platform_id does not have retain(). - static cl_int retain(cl_platform_id) - { return CL_SUCCESS; } - // cl_platform_id does not have release(). - static cl_int release(cl_platform_id) - { return CL_SUCCESS; } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_context context) - { return ::clRetainContext(context); } - static cl_int release(cl_context context) - { return ::clReleaseContext(context); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_command_queue queue) - { return ::clRetainCommandQueue(queue); } - static cl_int release(cl_command_queue queue) - { return ::clReleaseCommandQueue(queue); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_mem memory) - { return ::clRetainMemObject(memory); } - static cl_int release(cl_mem memory) - { return ::clReleaseMemObject(memory); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_sampler sampler) - { return ::clRetainSampler(sampler); } - static cl_int release(cl_sampler sampler) - { return ::clReleaseSampler(sampler); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_program program) - { return ::clRetainProgram(program); } - static cl_int release(cl_program program) - { return ::clReleaseProgram(program); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_kernel kernel) - { return ::clRetainKernel(kernel); } - static cl_int release(cl_kernel kernel) - { return ::clReleaseKernel(kernel); } -}; - -template <> -struct ReferenceHandler -{ - static cl_int retain(cl_event event) - { return ::clRetainEvent(event); } - static cl_int release(cl_event event) - { return ::clReleaseEvent(event); } -}; - - -// Extracts version number with major in the upper 16 bits, minor in the lower 16 -static cl_uint getVersion(const char *versionInfo) -{ - int highVersion = 0; - int lowVersion = 0; - int index = 7; - while(versionInfo[index] != '.' 
) { - highVersion *= 10; - highVersion += versionInfo[index]-'0'; - ++index; - } - ++index; - while(versionInfo[index] != ' ' ) { - lowVersion *= 10; - lowVersion += versionInfo[index]-'0'; - ++index; - } - return (highVersion << 16) | lowVersion; -} - -static cl_uint getPlatformVersion(cl_platform_id platform) -{ - ::size_t size = 0; - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); - char *versionInfo = (char *) alloca(size); - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); - return getVersion(versionInfo); -} - -static cl_uint getDevicePlatformVersion(cl_device_id device) -{ - cl_platform_id platform; - clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); - return getPlatformVersion(platform); -} - -#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) -static cl_uint getContextPlatformVersion(cl_context context) -{ - // The platform cannot be queried directly, so we first have to grab a - // device and obtain its context - ::size_t size = 0; - clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); - if (size == 0) - return 0; - cl_device_id *devices = (cl_device_id *) alloca(size); - clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); - return getDevicePlatformVersion(devices[0]); -} -#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - -template -class Wrapper -{ -public: - typedef T cl_type; - -protected: - cl_type object_; - -public: - Wrapper() : object_(NULL) { } - - Wrapper(const cl_type &obj) : object_(obj) { } - - ~Wrapper() - { - if (object_ != NULL) { release(); } - } - - Wrapper(const Wrapper& rhs) - { - object_ = rhs.object_; - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - } - - Wrapper& operator = (const Wrapper& rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs.object_; - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - return *this; - } - - Wrapper& operator = (const cl_type &rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs; - return *this; - } - - cl_type operator ()() const { return object_; } - - cl_type& operator ()() { return object_; } - -protected: - template - friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); - - cl_int retain() const - { - return ReferenceHandler::retain(object_); - } - - cl_int release() const - { - return ReferenceHandler::release(object_); - } -}; - -template <> -class Wrapper -{ -public: - typedef cl_device_id cl_type; - -protected: - cl_type object_; - bool referenceCountable_; - - static bool isReferenceCountable(cl_device_id device) - { - bool retVal = false; - if (device != NULL) { - int version = getDevicePlatformVersion(device); - if(version > ((1 << 16) + 1)) { - retVal = true; - } - } - return retVal; - } - -public: - Wrapper() : object_(NULL), referenceCountable_(false) - { - } - - Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) - { - referenceCountable_ = isReferenceCountable(obj); - } - - ~Wrapper() - { - if (object_ != NULL) { release(); } - } - - Wrapper(const Wrapper& rhs) - { - object_ = rhs.object_; - referenceCountable_ = isReferenceCountable(object_); - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - } - - Wrapper& operator = (const Wrapper& rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs.object_; - 
referenceCountable_ = rhs.referenceCountable_; - if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } - return *this; - } - - Wrapper& operator = (const cl_type &rhs) - { - if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } - object_ = rhs; - referenceCountable_ = isReferenceCountable(object_); - return *this; - } - - cl_type operator ()() const { return object_; } - - cl_type& operator ()() { return object_; } - -protected: - template - friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); - - template - friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); - - cl_int retain() const - { - if( referenceCountable_ ) { - return ReferenceHandler::retain(object_); - } - else { - return CL_SUCCESS; - } - } - - cl_int release() const - { - if( referenceCountable_ ) { - return ReferenceHandler::release(object_); - } - else { - return CL_SUCCESS; - } - } -}; - -} // namespace detail -//! \endcond - -/*! \stuct ImageFormat - * \brief Adds constructors and member functions for cl_image_format. - * - * \see cl_image_format - */ -struct ImageFormat : public cl_image_format -{ - //! \brief Default constructor - performs no initialization. - ImageFormat(){} - - //! \brief Initializing constructor. - ImageFormat(cl_channel_order order, cl_channel_type type) - { - image_channel_order = order; - image_channel_data_type = type; - } - - //! \brief Assignment operator. - ImageFormat& operator = (const ImageFormat& rhs) - { - if (this != &rhs) { - this->image_channel_data_type = rhs.image_channel_data_type; - this->image_channel_order = rhs.image_channel_order; - } - return *this; - } -}; - -/*! \brief Class interface for cl_device_id. - * - * \note Copies of these objects are inexpensive, since they don't 'own' - * any underlying resources or data structures. - * - * \see cl_device_id - */ -class Device : public detail::Wrapper -{ -public: - //! \brief Default constructor - initializes to NULL. - Device() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device(const Device& device) : detail::Wrapper(device) { } - - /*! \brief Constructor from cl_device_id. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device(const cl_device_id &device) : detail::Wrapper(device) { } - - /*! \brief Returns the first device on the default context. - * - * \see Context::getDefault() - */ - static Device getDefault(cl_int * err = NULL); - - /*! \brief Assignment operator from Device. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device& operator = (const Device& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_device_id. - * - * This simply copies the device ID value, which is an inexpensive operation. - */ - Device& operator = (const cl_device_id& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetDeviceInfo(). - template - cl_int getInfo(cl_device_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetDeviceInfo, object_, name, param), - __GET_DEVICE_INFO_ERR); - } - - //! \brief Wrapper for clGetDeviceInfo() that returns by value. 
- template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_device_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /** - * CL 1.2 version - */ -#if defined(CL_VERSION_1_2) - //! \brief Wrapper for clCreateSubDevicesEXT(). - cl_int createSubDevices( - const cl_device_partition_property * properties, - VECTOR_CLASS* devices) - { - cl_uint n = 0; - cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); - err = clCreateSubDevices(object_, properties, n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } -#endif // #if defined(CL_VERSION_1_2) - -/** - * CL 1.1 version that uses device fission. - */ -#if defined(CL_VERSION_1_1) -#if defined(USE_CL_DEVICE_FISSION) - cl_int createSubDevices( - const cl_device_partition_property_ext * properties, - VECTOR_CLASS* devices) - { - typedef CL_API_ENTRY cl_int - ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( - cl_device_id /*in_device*/, - const cl_device_partition_property_ext * /* properties */, - cl_uint /*num_entries*/, - cl_device_id * /*out_devices*/, - cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; - - static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; - __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); - - cl_uint n = 0; - cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); - err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_SUB_DEVICES); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } -#endif // #if defined(USE_CL_DEVICE_FISSION) -#endif // #if defined(CL_VERSION_1_1) -}; - -/*! \brief Class interface for cl_platform_id. - * - * \note Copies of these objects are inexpensive, since they don't 'own' - * any underlying resources or data structures. - * - * \see cl_platform_id - */ -class Platform : public detail::Wrapper -{ -public: - //! \brief Default constructor - initializes to NULL. - Platform() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform(const Platform& platform) : detail::Wrapper(platform) { } - - /*! \brief Constructor from cl_platform_id. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform(const cl_platform_id &platform) : detail::Wrapper(platform) { } - - /*! \brief Assignment operator from Platform. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform& operator = (const Platform& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_platform_id. - * - * This simply copies the platform ID value, which is an inexpensive operation. - */ - Platform& operator = (const cl_platform_id& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetPlatformInfo(). 
- cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetPlatformInfo, object_, name, param), - __GET_PLATFORM_INFO_ERR); - } - - //! \brief Wrapper for clGetPlatformInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_platform_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Gets a list of devices for this platform. - * - * Wraps clGetDeviceIDs(). - */ - cl_int getDevices( - cl_device_type type, - VECTOR_CLASS* devices) const - { - cl_uint n = 0; - if( devices == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); - } - cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); - err = ::clGetDeviceIDs(object_, type, n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } - -#if defined(USE_DX_INTEROP) - /*! \brief Get the list of available D3D10 devices. - * - * \param d3d_device_source. - * - * \param d3d_object. - * - * \param d3d_device_set. - * - * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device - * values returned in devices can be used to identify a specific OpenCL - * device. If \a devices argument is NULL, this argument is ignored. - * - * \return One of the following values: - * - CL_SUCCESS if the function is executed successfully. - * - * The application can query specific capabilities of the OpenCL device(s) - * returned by cl::getDevices. This can be used by the application to - * determine which device(s) to use. - * - * \note In the case that exceptions are enabled and a return value - * other than CL_SUCCESS is generated, then cl::Error exception is - * generated. - */ - cl_int getDevices( - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - VECTOR_CLASS* devices) const - { - typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( - cl_platform_id platform, - cl_d3d10_device_source_khr d3d_device_source, - void * d3d_object, - cl_d3d10_device_set_khr d3d_device_set, - cl_uint num_entries, - cl_device_id * devices, - cl_uint* num_devices); - - if( devices == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); - } - - static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; - __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); - - cl_uint n = 0; - cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( - object_, - d3d_device_source, - d3d_object, - d3d_device_set, - 0, - NULL, - &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); - err = pfn_clGetDeviceIDsFromD3D10KHR( - object_, - d3d_device_source, - d3d_object, - d3d_device_set, - n, - ids, - NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_DEVICE_IDS_ERR); - } - - devices->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } -#endif - - /*! \brief Gets a list of available platforms. - * - * Wraps clGetPlatformIDs(). 
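 *
 *  A minimal enumeration sketch (assuming the default VECTOR_CLASS and
 *  STRING_CLASS of std::vector and std::string, and at least one
 *  installed platform; headers as in the example on the main page):
 *
 *  \code
 *  std::vector<cl::Platform> platforms;
 *  cl_int err = cl::Platform::get(&platforms);
 *  if (err == CL_SUCCESS) {
 *      for (::size_t i = 0; i < platforms.size(); ++i) {
 *          std::string name;
 *          platforms[i].getInfo(CL_PLATFORM_NAME, &name);
 *          std::cout << name << std::endl;
 *      }
 *  }
 *  \endcode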
- */ - static cl_int get( - VECTOR_CLASS* platforms) - { - cl_uint n = 0; - - if( platforms == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); - } - - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - cl_platform_id* ids = (cl_platform_id*) alloca( - n * sizeof(cl_platform_id)); - err = ::clGetPlatformIDs(n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - platforms->assign(&ids[0], &ids[n]); - return CL_SUCCESS; - } - - /*! \brief Gets the first available platform. - * - * Wraps clGetPlatformIDs(), returning the first result. - */ - static cl_int get( - Platform * platform) - { - cl_uint n = 0; - - if( platform == NULL ) { - return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); - } - - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - cl_platform_id* ids = (cl_platform_id*) alloca( - n * sizeof(cl_platform_id)); - err = ::clGetPlatformIDs(n, ids, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - *platform = ids[0]; - return CL_SUCCESS; - } - - /*! \brief Gets the first available platform, returning it by value. - * - * Wraps clGetPlatformIDs(), returning the first result. - */ - static Platform get( - cl_int * errResult = NULL) - { - Platform platform; - cl_uint n = 0; - cl_int err = ::clGetPlatformIDs(0, NULL, &n); - if (err != CL_SUCCESS) { - detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - if (errResult != NULL) { - *errResult = err; - } - } - - cl_platform_id* ids = (cl_platform_id*) alloca( - n * sizeof(cl_platform_id)); - err = ::clGetPlatformIDs(n, ids, NULL); - - if (err != CL_SUCCESS) { - detail::errHandler(err, __GET_PLATFORM_IDS_ERR); - } - - if (errResult != NULL) { - *errResult = err; - } - - return ids[0]; - } - - static Platform getDefault( - cl_int *errResult = NULL ) - { - return get(errResult); - } - - -#if defined(CL_VERSION_1_2) - //! \brief Wrapper for clUnloadCompiler(). - cl_int - unloadCompiler() - { - return ::clUnloadPlatformCompiler(object_); - } -#endif // #if defined(CL_VERSION_1_2) -}; // class Platform - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) -/** - * Unload the OpenCL compiler. - * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. - */ -inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int -UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; -inline cl_int -UnloadCompiler() -{ - return ::clUnloadCompiler(); -} -#endif // #if defined(CL_VERSION_1_1) - -/*! \brief Class interface for cl_context. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_context as the original. For details, see - * clRetainContext() and clReleaseContext(). - * - * \see cl_context - */ -class Context - : public detail::Wrapper -{ -private: - static volatile int default_initialized_; - static Context default_; - static volatile cl_int default_error_; -public: - /*! \brief Destructor. - * - * This calls clReleaseContext() on the value held by this instance. - */ - ~Context() { } - - /*! \brief Constructs a context including a list of specified devices. - * - * Wraps clCreateContext(). 
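 *
 *  A construction sketch (assuming platform 0 exposes at least one GPU
 *  device; error handling elided for brevity):
 *
 *  \code
 *  std::vector<cl::Platform> platforms;
 *  cl::Platform::get(&platforms);
 *
 *  std::vector<cl::Device> devices;
 *  platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
 *
 *  cl_int err = CL_SUCCESS;
 *  cl::Context context(devices, NULL, NULL, NULL, &err);
 *  \endcode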
- */ - Context( - const VECTOR_CLASS& devices, - cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - ::size_t, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - - ::size_t numDevices = devices.size(); - cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); - for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - object_ = ::clCreateContext( - properties, (cl_uint) numDevices, - deviceIDs, - notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { - *err = error; - } - } - - Context( - const Device& device, - cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - ::size_t, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - - cl_device_id deviceID = device(); - - object_ = ::clCreateContext( - properties, 1, - &deviceID, - notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a context including all or a subset of devices of a specified type. - * - * Wraps clCreateContextFromType(). - */ - Context( - cl_device_type type, - cl_context_properties* properties = NULL, - void (CL_CALLBACK * notifyFptr)( - const char *, - const void *, - ::size_t, - void *) = NULL, - void* data = NULL, - cl_int* err = NULL) - { - cl_int error; - -#if !defined(__APPLE__) || !defined(__MACOS) - cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; - - if (properties == NULL) { - // Get a valid platform ID as we cannot send in a blank one - VECTOR_CLASS platforms; - error = Platform::get(&platforms); - if (error != CL_SUCCESS) { - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - return; - } - - // Check the platforms we found for a device of our specified type - cl_context_properties platform_id = 0; - for (unsigned int i = 0; i < platforms.size(); i++) { - - VECTOR_CLASS devices; - -#if defined(__CL_ENABLE_EXCEPTIONS) - try { -#endif - - error = platforms[i].getDevices(type, &devices); - -#if defined(__CL_ENABLE_EXCEPTIONS) - } catch (Error) {} - // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type - // We do error checking next anyway, and can throw there if needed -#endif - - // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND - if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) { - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - } - - if (devices.size() > 0) { - platform_id = (cl_context_properties)platforms[i](); - break; - } - } - - if (platform_id == 0) { - detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = CL_DEVICE_NOT_FOUND; - } - return; - } - - prop[1] = platform_id; - properties = &prop[0]; - } -#endif - object_ = ::clCreateContextFromType( - properties, type, notifyFptr, data, &error); - - detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. - * - * \note All calls to this function return the same cl_context as the first. 
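 *
 *  For example (a sketch; both objects below wrap the same cl_context,
 *  created on first use from CL_DEVICE_TYPE_DEFAULT):
 *
 *  \code
 *  cl_int err = CL_SUCCESS;
 *  cl::Context first  = cl::Context::getDefault(&err);
 *  cl::Context second = cl::Context::getDefault();
 *  // first() == second() holds: the singleton is shared.
 *  \endcode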
- */ - static Context getDefault(cl_int * err = NULL) - { - int state = detail::compare_exchange( - &default_initialized_, - __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); - - if (state & __DEFAULT_INITIALIZED) { - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - if (state & __DEFAULT_BEING_INITIALIZED) { - // Assume writes will propagate eventually... - while(default_initialized_ != __DEFAULT_INITIALIZED) { - detail::fence(); - } - - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - cl_int error; - default_ = Context( - CL_DEVICE_TYPE_DEFAULT, - NULL, - NULL, - NULL, - &error); - - detail::fence(); - - default_error_ = error; - // Assume writes will propagate eventually... - default_initialized_ = __DEFAULT_INITIALIZED; - - detail::fence(); - - if (err != NULL) { - *err = default_error_; - } - return default_; - - } - - //! \brief Default constructor - initializes to NULL. - Context() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This calls clRetainContext() on the parameter's cl_context. - */ - Context(const Context& context) : detail::Wrapper(context) { } - - /*! \brief Constructor from cl_context - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_context - * into the new Context object. - */ - __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper(context) { } - - /*! \brief Assignment operator from Context. - * - * This calls clRetainContext() on the parameter and clReleaseContext() on - * the previous value held by this instance. - */ - Context& operator = (const Context& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_context - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseContext() on the value previously held by this instance. - */ - Context& operator = (const cl_context& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetContextInfo(). - template - cl_int getInfo(cl_context_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetContextInfo, object_, name, param), - __GET_CONTEXT_INFO_ERR); - } - - //! \brief Wrapper for clGetContextInfo() that returns by value. - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_context_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Gets a list of supported image formats. - * - * Wraps clGetSupportedImageFormats(). 
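 *
 *  A query sketch (context is assumed to be a valid cl::Context, and
 *  VECTOR_CLASS its std::vector default):
 *
 *  \code
 *  std::vector<cl::ImageFormat> formats;
 *  cl_int err = context.getSupportedImageFormats(
 *      CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &formats);
 *  // On CL_SUCCESS, formats holds every (channel order, channel type)
 *  // pair supported for read-only 2D images in this context.
 *  \endcode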
- */ - cl_int getSupportedImageFormats( - cl_mem_flags flags, - cl_mem_object_type type, - VECTOR_CLASS* formats) const - { - cl_uint numEntries; - cl_int err = ::clGetSupportedImageFormats( - object_, - flags, - type, - 0, - NULL, - &numEntries); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); - } - - ImageFormat* value = (ImageFormat*) - alloca(numEntries * sizeof(ImageFormat)); - err = ::clGetSupportedImageFormats( - object_, - flags, - type, - numEntries, - (cl_image_format*) value, - NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); - } - - formats->assign(&value[0], &value[numEntries]); - return CL_SUCCESS; - } -}; - -inline Device Device::getDefault(cl_int * err) -{ - cl_int error; - Device device; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - device = context.getInfo()[0]; - if (err != NULL) { - *err = CL_SUCCESS; - } - } - - return device; -} - - -#ifdef _WIN32 -__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; -__declspec(selectany) Context Context::default_; -__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; -#else -__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; -__attribute__((weak)) Context Context::default_; -__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; -#endif - -/*! \brief Class interface for cl_event. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_event as the original. For details, see - * clRetainEvent() and clReleaseEvent(). - * - * \see cl_event - */ -class Event : public detail::Wrapper -{ -public: - /*! \brief Destructor. - * - * This calls clReleaseEvent() on the value held by this instance. - */ - ~Event() { } - - //! \brief Default constructor - initializes to NULL. - Event() : detail::Wrapper() { } - - /*! \brief Copy constructor. - * - * This calls clRetainEvent() on the parameter's cl_event. - */ - Event(const Event& event) : detail::Wrapper(event) { } - - /*! \brief Constructor from cl_event - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_event - * into the new Event object. - */ - Event(const cl_event& event) : detail::Wrapper(event) { } - - /*! \brief Assignment operator from cl_event - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseEvent() on the value previously held by this instance. - */ - Event& operator = (const Event& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_event. - * - * This calls clRetainEvent() on the parameter and clReleaseEvent() on - * the previous value held by this instance. - */ - Event& operator = (const cl_event& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - //! \brief Wrapper for clGetEventInfo(). - template - cl_int getInfo(cl_event_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetEventInfo, object_, name, param), - __GET_EVENT_INFO_ERR); - } - - //! \brief Wrapper for clGetEventInfo() that returns by value. 
- template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_event_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - //! \brief Wrapper for clGetEventProfilingInfo(). - template - cl_int getProfilingInfo(cl_profiling_info name, T* param) const - { - return detail::errHandler(detail::getInfo( - &::clGetEventProfilingInfo, object_, name, param), - __GET_EVENT_PROFILE_INFO_ERR); - } - - //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. - template typename - detail::param_traits::param_type - getProfilingInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_profiling_info, name>::param_type param; - cl_int result = getProfilingInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - /*! \brief Blocks the calling thread until this event completes. - * - * Wraps clWaitForEvents(). - */ - cl_int wait() const - { - return detail::errHandler( - ::clWaitForEvents(1, &object_), - __WAIT_FOR_EVENTS_ERR); - } - -#if defined(CL_VERSION_1_1) - /*! \brief Registers a user callback function for a specific command execution status. - * - * Wraps clSetEventCallback(). - */ - cl_int setCallback( - cl_int type, - void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), - void * user_data = NULL) - { - return detail::errHandler( - ::clSetEventCallback( - object_, - type, - pfn_notify, - user_data), - __SET_EVENT_CALLBACK_ERR); - } -#endif - - /*! \brief Blocks the calling thread until every event specified is complete. - * - * Wraps clWaitForEvents(). - */ - static cl_int - waitForEvents(const VECTOR_CLASS& events) - { - return detail::errHandler( - ::clWaitForEvents( - (cl_uint) events.size(), (cl_event*)&events.front()), - __WAIT_FOR_EVENTS_ERR); - } -}; - -#if defined(CL_VERSION_1_1) -/*! \brief Class interface for user events (a subset of cl_event's). - * - * See Event for details about copy semantics, etc. - */ -class UserEvent : public Event -{ -public: - /*! \brief Constructs a user event on a given context. - * - * Wraps clCreateUserEvent(). - */ - UserEvent( - const Context& context, - cl_int * err = NULL) - { - cl_int error; - object_ = ::clCreateUserEvent( - context(), - &error); - - detail::errHandler(error, __CREATE_USER_EVENT_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - UserEvent() : Event() { } - - //! \brief Copy constructor - performs shallow copy. - UserEvent(const UserEvent& event) : Event(event) { } - - //! \brief Assignment Operator - performs shallow copy. - UserEvent& operator = (const UserEvent& rhs) - { - if (this != &rhs) { - Event::operator=(rhs); - } - return *this; - } - - /*! \brief Sets the execution status of a user event object. - * - * Wraps clSetUserEventStatus(). - */ - cl_int setStatus(cl_int status) - { - return detail::errHandler( - ::clSetUserEventStatus(object_,status), - __SET_USER_EVENT_STATUS_ERR); - } -}; -#endif - -/*! \brief Blocks the calling thread until every event specified is complete. - * - * Wraps clWaitForEvents(). - */ -inline static cl_int -WaitForEvents(const VECTOR_CLASS& events) -{ - return detail::errHandler( - ::clWaitForEvents( - (cl_uint) events.size(), (cl_event*)&events.front()), - __WAIT_FOR_EVENTS_ERR); -} - -/*! \brief Class interface for cl_mem. 
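 *
 *  To illustrate the shallow-copy semantics noted below (a sketch; buf
 *  is assumed to be a valid, previously created cl::Buffer):
 *
 *  \code
 *  cl::Memory shared = buf;  // clRetainMemObject() on buf's cl_mem
 *  // shared() == buf(): both objects wrap the same cl_mem handle,
 *  // and the refcount drops again when shared is destroyed.
 *  \endcode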
- *
- * \note Copies of these objects are shallow, meaning that the copy will refer
- *       to the same underlying cl_mem as the original.  For details, see
- *       clRetainMemObject() and clReleaseMemObject().
- *
- * \see cl_mem
- */
-class Memory : public detail::Wrapper<cl_mem>
-{
-public:
-
-    /*! \brief Destructor.
-     *
-     *  This calls clReleaseMemObject() on the value held by this instance.
-     */
-    ~Memory() {}
-
-    //! \brief Default constructor - initializes to NULL.
-    Memory() : detail::Wrapper<cl_type>() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  This calls clRetainMemObject() on the parameter's cl_mem.
-     */
-    Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the cl_mem
-     *  into the new Memory object.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
-
-    /*! \brief Assignment operator from Memory.
-     *
-     *  This calls clRetainMemObject() on the parameter and clReleaseMemObject()
-     *  on the previous value held by this instance.
-     */
-    Memory& operator = (const Memory& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment operator from cl_mem - takes ownership.
-     *
-     *  This effectively transfers ownership of a refcount on the rhs and calls
-     *  clReleaseMemObject() on the value previously held by this instance.
-     */
-    Memory& operator = (const cl_mem& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-
-    //! \brief Wrapper for clGetMemObjectInfo().
-    template <typename T>
-    cl_int getInfo(cl_mem_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
-            __GET_MEM_OBJECT_INFO_ERR);
-    }
-
-    //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_mem_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_mem_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-
-#if defined(CL_VERSION_1_1)
-    /*! \brief Registers a callback function to be called when the memory object
-     *         is no longer needed.
-     *
-     *  Wraps clSetMemObjectDestructorCallback().
-     *
-     *  Repeated calls to this function, for a given cl_mem value, will append
-     *  to the list of functions called (in reverse order) when memory object's
-     *  resources are freed and the memory object is deleted.
-     *
-     *  \note
-     *  The registered callbacks are associated with the underlying cl_mem
-     *  value - not the Memory class instance.
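     *
     *  A registration sketch (onMemFreed is a hypothetical user function;
     *  memory is assumed to be a valid cl::Memory on an OpenCL 1.1 platform):
     *
     *  \code
     *  void CL_CALLBACK onMemFreed(cl_mem memobj, void* user_data)
     *  {
     *      // Called when the cl_mem's resources are about to be freed.
     *  }
     *  ...
     *  memory.setDestructorCallback(&onMemFreed, NULL);
     *  \endcode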
- */ - cl_int setDestructorCallback( - void (CL_CALLBACK * pfn_notify)(cl_mem, void *), - void * user_data = NULL) - { - return detail::errHandler( - ::clSetMemObjectDestructorCallback( - object_, - pfn_notify, - user_data), - __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); - } -#endif - -}; - -// Pre-declare copy functions -class Buffer; -template< typename IteratorType > -cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); -template< typename IteratorType > -cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); -template< typename IteratorType > -cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); -template< typename IteratorType > -cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); - - -/*! \brief Class interface for Buffer Memory Objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Buffer : public Memory -{ -public: - - /*! \brief Constructs a Buffer in a specified context. - * - * Wraps clCreateBuffer(). - * - * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - */ - Buffer( - const Context& context, - cl_mem_flags flags, - ::size_t size, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! \brief Constructs a Buffer in the default context. - * - * Wraps clCreateBuffer(). - * - * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was - * specified. Note alignment & exclusivity requirements. - * - * \see Context::getDefault() - */ - Buffer( - cl_mem_flags flags, - ::size_t size, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(err); - - object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - - /*! - * \brief Construct a Buffer from a host container via iterators. - * IteratorType must be random access. - * If useHostPtr is specified iterators must represent contiguous data. - */ - template< typename IteratorType > - Buffer( - IteratorType startIterator, - IteratorType endIterator, - bool readOnly, - bool useHostPtr = false, - cl_int* err = NULL) - { - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - cl_mem_flags flags = 0; - if( readOnly ) { - flags |= CL_MEM_READ_ONLY; - } - else { - flags |= CL_MEM_READ_WRITE; - } - if( useHostPtr ) { - flags |= CL_MEM_USE_HOST_PTR; - } - - ::size_t size = sizeof(DataType)*(endIterator - startIterator); - - Context context = Context::getDefault(err); - - if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); - } else { - object_ = ::clCreateBuffer(context(), flags, size, 0, &error); - } - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - if( !useHostPtr ) { - error = cl::copy(startIterator, endIterator, *this); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } - } - - /*! - * \brief Construct a Buffer from a host container via iterators using a specified context. 
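 *
 *  A construction sketch (context is assumed to be a valid cl::Context;
 *  because useHostPtr is false, the host data is copied to the device):
 *
 *  \code
 *  std::vector<float> host(1024, 0.0f);
 *  cl_int err = CL_SUCCESS;
 *  cl::Buffer buffer(context, host.begin(), host.end(),
 *                    true,   // readOnly -> CL_MEM_READ_ONLY
 *                    false,  // useHostPtr not set -> data is copied
 *                    &err);
 *  \endcode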
- * IteratorType must be random access. - * If useHostPtr is specified iterators must represent contiguous data. - */ - template< typename IteratorType > - Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator, - bool readOnly, bool useHostPtr = false, cl_int* err = NULL); - - //! \brief Default constructor - initializes to NULL. - Buffer() : Memory() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Buffer(const Buffer& buffer) : Memory(buffer) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } - - /*! \brief Assignment from Buffer - performs shallow copy. - * - * See Memory for further details. - */ - Buffer& operator = (const Buffer& rhs) - { - if (this != &rhs) { - Memory::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Buffer& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - -#if defined(CL_VERSION_1_1) - /*! \brief Creates a new buffer object from this. - * - * Wraps clCreateSubBuffer(). - */ - Buffer createSubBuffer( - cl_mem_flags flags, - cl_buffer_create_type buffer_create_type, - const void * buffer_create_info, - cl_int * err = NULL) - { - Buffer result; - cl_int error; - result.object_ = ::clCreateSubBuffer( - object_, - flags, - buffer_create_type, - buffer_create_info, - &error); - - detail::errHandler(error, __CREATE_SUBBUFFER_ERR); - if (err != NULL) { - *err = error; - } - - return result; - } -#endif -}; - - -/*! \brief C++ base class for Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image : public Memory -{ -protected: - //! \brief Default constructor - initializes to NULL. - Image() : Memory() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image(const Image& image) : Memory(image) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } - - /*! \brief Assignment from Image - performs shallow copy. - * - * See Memory for further details. - */ - Image& operator = (const Image& rhs) - { - if (this != &rhs) { - Memory::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image& operator = (const cl_mem& rhs) - { - Memory::operator=(rhs); - return *this; - } - -public: - //! \brief Wrapper for clGetImageInfo(). - template - cl_int getImageInfo(cl_image_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetImageInfo, object_, name, param), - __GET_IMAGE_INFO_ERR); - } - - //! \brief Wrapper for clGetImageInfo() that returns by value. - template typename - detail::param_traits::param_type - getImageInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_image_info, name>::param_type param; - cl_int result = getImageInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -}; - -#if defined(CL_VERSION_1_2) -/*! \brief Class interface for 1D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image1D : public Image -{ -public: - /*! 
\brief Constructs a 1D Image in a specified context. - * - * Wraps clCreateImage(). - */ - Image1D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D, - width, - 0, 0, 0, 0, 0, 0, 0, 0 - }; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - //! \brief Default constructor - initializes to NULL. - Image1D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image1D(const Image1D& image1D) : Image(image1D) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } - - /*! \brief Assignment from Image1D - performs shallow copy. - * - * See Memory for further details. - */ - Image1D& operator = (const Image1D& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image1D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } -}; - -/*! \class Image1DBuffer - * \brief Image interface for 1D buffer images. - */ -class Image1DBuffer : public Image -{ -public: - Image1DBuffer( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - const Buffer &buffer, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_BUFFER, - width, - 0, 0, 0, 0, 0, 0, 0, - buffer() - }; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - NULL, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image1DBuffer() { } - - Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } - - __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } - - Image1DBuffer& operator = (const Image1DBuffer& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - Image1DBuffer& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } -}; - -/*! \class Image1DArray - * \brief Image interface for arrays of 1D images. - */ -class Image1DArray : public Image -{ -public: - Image1DArray( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t arraySize, - ::size_t width, - ::size_t rowPitch, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE1D_ARRAY, - width, - 0, 0, // height, depth (unused) - arraySize, - rowPitch, - 0, 0, 0, 0 - }; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image1DArray() { } - - Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } - - __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } - - Image1DArray& operator = (const Image1DArray& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - Image1DArray& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } -}; -#endif // #if defined(CL_VERSION_1_2) - - -/*! 
\brief Class interface for 2D Image Memory objects.
- *
- *  See Memory for details about copy semantics, etc.
- *
- *  \see Memory
- */
-class Image2D : public Image
-{
-public:
-    /*! \brief Constructs a 2D Image in a specified context.
-     *
-     *  Wraps clCreateImage().
-     */
-    Image2D(
-        const Context& context,
-        cl_mem_flags flags,
-        ImageFormat format,
-        ::size_t width,
-        ::size_t height,
-        ::size_t row_pitch = 0,
-        void* host_ptr = NULL,
-        cl_int* err = NULL)
-    {
-        cl_int error;
-        bool useCreateImage;
-
-#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-        // Run-time decision based on the actual platform
-        {
-            cl_uint version = detail::getContextPlatformVersion(context());
-            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
-        }
-#elif defined(CL_VERSION_1_2)
-        useCreateImage = true;
-#else
-        useCreateImage = false;
-#endif
-
-#if defined(CL_VERSION_1_2)
-        if (useCreateImage)
-        {
-            cl_image_desc desc =
-            {
-                CL_MEM_OBJECT_IMAGE2D,
-                width,
-                height,
-                0, 0, // depth, array size (unused)
-                row_pitch,
-                0, 0, 0, 0
-            };
-            object_ = ::clCreateImage(
-                context(),
-                flags,
-                &format,
-                &desc,
-                host_ptr,
-                &error);
-
-            detail::errHandler(error, __CREATE_IMAGE_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-#endif // #if defined(CL_VERSION_1_2)
-#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-        if (!useCreateImage)
-        {
-            object_ = ::clCreateImage2D(
-                context(), flags, &format, width, height, row_pitch, host_ptr, &error);
-
-            detail::errHandler(error, __CREATE_IMAGE2D_ERR);
-            if (err != NULL) {
-                *err = error;
-            }
-        }
-#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
-    }
-
-    //! \brief Default constructor - initializes to NULL.
-    Image2D() { }
-
-    /*! \brief Copy constructor - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2D(const Image2D& image2D) : Image(image2D) { }
-
-    /*! \brief Constructor from cl_mem - takes ownership.
-     *
-     *  See Memory for further details.
-     */
-    __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
-
-    /*! \brief Assignment from Image2D - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2D& operator = (const Image2D& rhs)
-    {
-        if (this != &rhs) {
-            Image::operator=(rhs);
-        }
-        return *this;
-    }
-
-    /*! \brief Assignment from cl_mem - performs shallow copy.
-     *
-     *  See Memory for further details.
-     */
-    Image2D& operator = (const cl_mem& rhs)
-    {
-        Image::operator=(rhs);
-        return *this;
-    }
-};
-
-
-
-#if defined(CL_VERSION_1_2)
-/*! \class Image2DArray
- *  \brief Image interface for arrays of 2D images.
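 *
 *  A creation sketch (context is assumed valid; pitches of 0 let the
 *  runtime compute them, which requires host_ptr to be NULL):
 *
 *  \code
 *  cl_int err = CL_SUCCESS;
 *  cl::Image2DArray slices(context,
 *                          CL_MEM_READ_ONLY,
 *                          cl::ImageFormat(CL_RGBA, CL_UNORM_INT8),
 *                          4,     // arraySize
 *                          512,   // width
 *                          512,   // height
 *                          0, 0,  // rowPitch, slicePitch
 *                          NULL, &err);
 *  \endcode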
- */ -class Image2DArray : public Image -{ -public: - Image2DArray( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t arraySize, - ::size_t width, - ::size_t height, - ::size_t rowPitch, - ::size_t slicePitch, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE2D_ARRAY, - width, - height, - 0, // depth (unused) - arraySize, - rowPitch, - slicePitch, - 0, 0, 0 - }; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } - - Image2DArray() { } - - Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } - - __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } - - Image2DArray& operator = (const Image2DArray& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - Image2DArray& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } -}; -#endif // #if defined(CL_VERSION_1_2) - -/*! \brief Class interface for 3D Image Memory objects. - * - * See Memory for details about copy semantics, etc. - * - * \see Memory - */ -class Image3D : public Image -{ -public: - /*! \brief Constructs a 3D Image in a specified context. - * - * Wraps clCreateImage(). - */ - Image3D( - const Context& context, - cl_mem_flags flags, - ImageFormat format, - ::size_t width, - ::size_t height, - ::size_t depth, - ::size_t row_pitch = 0, - ::size_t slice_pitch = 0, - void* host_ptr = NULL, - cl_int* err = NULL) - { - cl_int error; - bool useCreateImage; - -#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - // Run-time decision based on the actual platform - { - cl_uint version = detail::getContextPlatformVersion(context()); - useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above - } -#elif defined(CL_VERSION_1_2) - useCreateImage = true; -#else - useCreateImage = false; -#endif - -#if defined(CL_VERSION_1_2) - if (useCreateImage) - { - cl_image_desc desc = - { - CL_MEM_OBJECT_IMAGE3D, - width, - height, - depth, - 0, // array size (unused) - row_pitch, - slice_pitch, - 0, 0, 0 - }; - object_ = ::clCreateImage( - context(), - flags, - &format, - &desc, - host_ptr, - &error); - - detail::errHandler(error, __CREATE_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if defined(CL_VERSION_1_2) -#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - if (!useCreateImage) - { - object_ = ::clCreateImage3D( - context(), flags, &format, width, height, depth, row_pitch, - slice_pitch, host_ptr, &error); - - detail::errHandler(error, __CREATE_IMAGE3D_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) - } - - //! \brief Default constructor - initializes to NULL. - Image3D() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * See Memory for further details. - */ - Image3D(const Image3D& image3D) : Image(image3D) { } - - /*! \brief Constructor from cl_mem - takes ownership. - * - * See Memory for further details. - */ - __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } - - /*! \brief Assignment from Image3D - performs shallow copy. - * - * See Memory for further details. - */ - Image3D& operator = (const Image3D& rhs) - { - if (this != &rhs) { - Image::operator=(rhs); - } - return *this; - } - - /*! 
\brief Assignment from cl_mem - performs shallow copy. - * - * See Memory for further details. - */ - Image3D& operator = (const cl_mem& rhs) - { - Image::operator=(rhs); - return *this; - } -}; - -class Program; -class CommandQueue; -class Kernel; - -//! \brief Class interface for specifying NDRange values. -class NDRange -{ -private: - size_t<3> sizes_; - cl_uint dimensions_; - -public: - //! \brief Default constructor - resulting range has zero dimensions. - NDRange() - : dimensions_(0) - { } - - //! \brief Constructs one-dimensional range. - NDRange(::size_t size0) - : dimensions_(1) - { - sizes_[0] = size0; - } - - //! \brief Constructs two-dimensional range. - NDRange(::size_t size0, ::size_t size1) - : dimensions_(2) - { - sizes_[0] = size0; - sizes_[1] = size1; - } - - //! \brief Constructs three-dimensional range. - NDRange(::size_t size0, ::size_t size1, ::size_t size2) - : dimensions_(3) - { - sizes_[0] = size0; - sizes_[1] = size1; - sizes_[2] = size2; - } - - /*! \brief Conversion operator to const ::size_t *. - * - * \returns a pointer to the size of the first dimension. - */ - operator const ::size_t*() const { - return (const ::size_t*) sizes_; - } - - //! \brief Queries the number of dimensions in the range. - ::size_t dimensions() const { return dimensions_; } -}; - -//! \brief A zero-dimensional range. -static const NDRange NullRange; - -//! \brief Local address wrapper for use with Kernel::setArg -struct LocalSpaceArg -{ - ::size_t size_; -}; - -namespace detail { - -template -struct KernelArgumentHandler -{ - static ::size_t size(const T&) { return sizeof(T); } - static T* ptr(T& value) { return &value; } -}; - -template <> -struct KernelArgumentHandler -{ - static ::size_t size(const LocalSpaceArg& value) { return value.size_; } - static void* ptr(LocalSpaceArg&) { return NULL; } -}; - -} -//! \endcond - -/*! __local - * \brief Helper function for generating LocalSpaceArg objects. - * Deprecated. Replaced with Local. - */ -inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg -__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; -inline LocalSpaceArg -__local(::size_t size) -{ - LocalSpaceArg ret = { size }; - return ret; -} - -/*! Local - * \brief Helper function for generating LocalSpaceArg objects. - */ -inline LocalSpaceArg -Local(::size_t size) -{ - LocalSpaceArg ret = { size }; - return ret; -} - -//class KernelFunctor; - -/*! \brief Class interface for cl_kernel. - * - * \note Copies of these objects are shallow, meaning that the copy will refer - * to the same underlying cl_kernel as the original. For details, see - * clRetainKernel() and clReleaseKernel(). - * - * \see cl_kernel - */ -class Kernel : public detail::Wrapper -{ -public: - inline Kernel(const Program& program, const char* name, cl_int* err = NULL); - - /*! \brief Destructor. - * - * This calls clReleaseKernel() on the value held by this instance. - */ - ~Kernel() { } - - //! \brief Default constructor - initializes to NULL. - Kernel() { } - - /*! \brief Copy constructor - performs shallow copy. - * - * This calls clRetainKernel() on the parameter's cl_kernel. - */ - Kernel(const Kernel& kernel) : detail::Wrapper(kernel) { } - - /*! \brief Constructor from cl_kernel - takes ownership. - * - * This effectively transfers ownership of a refcount on the cl_kernel - * into the new Kernel object. - */ - __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper(kernel) { } - - /*! \brief Assignment operator from Kernel. 
- * - * This calls clRetainKernel() on the parameter and clReleaseKernel() - * on the previous value held by this instance. - */ - Kernel& operator = (const Kernel& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - /*! \brief Assignment operator from cl_kernel - takes ownership. - * - * This effectively transfers ownership of a refcount on the rhs and calls - * clReleaseKernel() on the value previously held by this instance. - */ - Kernel& operator = (const cl_kernel& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - template - cl_int getInfo(cl_kernel_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetKernelInfo, object_, name, param), - __GET_KERNEL_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - -#if defined(CL_VERSION_1_2) - template - cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), - __GET_KERNEL_ARG_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getArgInfo(cl_uint argIndex, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_arg_info, name>::param_type param; - cl_int result = getArgInfo(argIndex, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } -#endif // #if defined(CL_VERSION_1_2) - - template - cl_int getWorkGroupInfo( - const Device& device, cl_kernel_work_group_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetKernelWorkGroupInfo, object_, device(), name, param), - __GET_KERNEL_WORK_GROUP_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getWorkGroupInfo(const Device& device, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_kernel_work_group_info, name>::param_type param; - cl_int result = getWorkGroupInfo(device, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - template - cl_int setArg(cl_uint index, T value) - { - return detail::errHandler( - ::clSetKernelArg( - object_, - index, - detail::KernelArgumentHandler::size(value), - detail::KernelArgumentHandler::ptr(value)), - __SET_KERNEL_ARGS_ERR); - } - - cl_int setArg(cl_uint index, ::size_t size, void* argPtr) - { - return detail::errHandler( - ::clSetKernelArg(object_, index, size, argPtr), - __SET_KERNEL_ARGS_ERR); - } -}; - -/*! \class Program - * \brief Program interface that implements cl_program. 
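 *
 *  A typical build sequence, as a sketch (kernelSource is a hypothetical
 *  string of OpenCL C; context and device are assumed valid; exceptions
 *  are assumed disabled, so errors are reported through return codes):
 *
 *  \code
 *  cl_int err = CL_SUCCESS;
 *  cl::Program program(context, std::string(kernelSource), false, &err);
 *
 *  err = program.build();
 *  if (err != CL_SUCCESS) {
 *      std::string log =
 *          program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
 *      std::cerr << log << std::endl;
 *  }
 *
 *  cl::Kernel kernel(program, "my_kernel", &err);
 *  \endcode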
- */ -class Program : public detail::Wrapper -{ -public: - typedef VECTOR_CLASS > Binaries; - typedef VECTOR_CLASS > Sources; - - Program( - const STRING_CLASS& source, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const ::size_t length = source.size(); - - Context context = Context::getDefault(err); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS && build) { - - error = ::clBuildProgram( - object_, - 0, - NULL, - "", - NULL, - NULL); - - detail::errHandler(error, __BUILD_PROGRAM_ERR); - } - - if (err != NULL) { - *err = error; - } - } - - Program( - const Context& context, - const STRING_CLASS& source, - bool build = false, - cl_int* err = NULL) - { - cl_int error; - - const char * strings = source.c_str(); - const ::size_t length = source.size(); - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)1, &strings, &length, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - - if (error == CL_SUCCESS && build) { - - error = ::clBuildProgram( - object_, - 0, - NULL, - "", - NULL, - NULL); - - detail::errHandler(error, __BUILD_PROGRAM_ERR); - } - - if (err != NULL) { - *err = error; - } - } - - Program( - const Context& context, - const Sources& sources, - cl_int* err = NULL) - { - cl_int error; - - const ::size_t n = (::size_t)sources.size(); - ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t)); - const char** strings = (const char**) alloca(n * sizeof(const char*)); - - for (::size_t i = 0; i < n; ++i) { - strings[i] = sources[(int)i].first; - lengths[i] = sources[(int)i].second; - } - - object_ = ::clCreateProgramWithSource( - context(), (cl_uint)n, strings, lengths, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); - if (err != NULL) { - *err = error; - } - } - - /** - * Construct a program object from a list of devices and a per-device list of binaries. - * \param context A valid OpenCL context in which to construct the program. - * \param devices A vector of OpenCL device objects for which the program will be created. - * \param binaries A vector of pairs of a pointer to a binary object and its length. - * \param binaryStatus An optional vector that on completion will be resized to - * match the size of binaries and filled with values to specify if each binary - * was successfully loaded. - * Set to CL_SUCCESS if the binary was successfully loaded. - * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. - * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. - * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: - * CL_INVALID_CONTEXT if context is not a valid context. - * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; - * or if any entry in binaries is NULL or has length 0. - * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. - * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. - * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. 
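 *
 *  A construction sketch (binaryBlob and binarySize are hypothetical,
 *  previously loaded program binaries for the single device in use):
 *
 *  \code
 *  cl::Program::Binaries binaries;
 *  binaries.push_back(std::make_pair(binaryBlob, binarySize));
 *
 *  std::vector<cl::Device> devices;
 *  devices.push_back(device);
 *
 *  std::vector<cl_int> binaryStatus;
 *  cl_int err = CL_SUCCESS;
 *  cl::Program program(context, devices, binaries, &binaryStatus, &err);
 *  \endcode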
- */ - Program( - const Context& context, - const VECTOR_CLASS& devices, - const Binaries& binaries, - VECTOR_CLASS* binaryStatus = NULL, - cl_int* err = NULL) - { - cl_int error; - - const ::size_t numDevices = devices.size(); - - // Catch size mismatch early and return - if(binaries.size() != numDevices) { - error = CL_INVALID_VALUE; - detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { - *err = error; - } - return; - } - - ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t)); - const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**)); - - for (::size_t i = 0; i < numDevices; ++i) { - images[i] = (const unsigned char*)binaries[i].first; - lengths[i] = binaries[(int)i].second; - } - - cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); - for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - if(binaryStatus) { - binaryStatus->resize(numDevices); - } - - object_ = ::clCreateProgramWithBinary( - context(), (cl_uint) devices.size(), - deviceIDs, - lengths, images, binaryStatus != NULL - ? &binaryStatus->front() - : NULL, &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); - if (err != NULL) { - *err = error; - } - } - - -#if defined(CL_VERSION_1_2) - /** - * Create program using builtin kernels. - * \param kernelNames Semi-colon separated list of builtin kernel names - */ - Program( - const Context& context, - const VECTOR_CLASS& devices, - const STRING_CLASS& kernelNames, - cl_int* err = NULL) - { - cl_int error; - - - ::size_t numDevices = devices.size(); - cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); - for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - object_ = ::clCreateProgramWithBuiltInKernels( - context(), - (cl_uint) devices.size(), - deviceIDs, - kernelNames.c_str(), - &error); - - detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); - if (err != NULL) { - *err = error; - } - } -#endif // #if defined(CL_VERSION_1_2) - - Program() { } - - Program(const Program& program) : detail::Wrapper(program) { } - - __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper(program) { } - - Program& operator = (const Program& rhs) - { - if (this != &rhs) { - detail::Wrapper::operator=(rhs); - } - return *this; - } - - Program& operator = (const cl_program& rhs) - { - detail::Wrapper::operator=(rhs); - return *this; - } - - cl_int build( - const VECTOR_CLASS& devices, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - ::size_t numDevices = devices.size(); - cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); - for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { - deviceIDs[deviceIndex] = (devices[deviceIndex])(); - } - - return detail::errHandler( - ::clBuildProgram( - object_, - (cl_uint) - devices.size(), - deviceIDs, - options, - notifyFptr, - data), - __BUILD_PROGRAM_ERR); - } - - cl_int build( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - return detail::errHandler( - ::clBuildProgram( - object_, - 0, - NULL, - options, - notifyFptr, - data), - __BUILD_PROGRAM_ERR); - } - -#if 
defined(CL_VERSION_1_2) - cl_int compile( - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL) const - { - return detail::errHandler( - ::clCompileProgram( - object_, - 0, - NULL, - options, - 0, - NULL, - NULL, - notifyFptr, - data), - __COMPILE_PROGRAM_ERR); - } -#endif - - template - cl_int getInfo(cl_program_info name, T* param) const - { - return detail::errHandler( - detail::getInfo(&::clGetProgramInfo, object_, name, param), - __GET_PROGRAM_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getInfo(cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_program_info, name>::param_type param; - cl_int result = getInfo(name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - template - cl_int getBuildInfo( - const Device& device, cl_program_build_info name, T* param) const - { - return detail::errHandler( - detail::getInfo( - &::clGetProgramBuildInfo, object_, device(), name, param), - __GET_PROGRAM_BUILD_INFO_ERR); - } - - template typename - detail::param_traits::param_type - getBuildInfo(const Device& device, cl_int* err = NULL) const - { - typename detail::param_traits< - detail::cl_program_build_info, name>::param_type param; - cl_int result = getBuildInfo(device, name, ¶m); - if (err != NULL) { - *err = result; - } - return param; - } - - cl_int createKernels(VECTOR_CLASS* kernels) - { - cl_uint numKernels; - cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); - } - - Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel)); - err = ::clCreateKernelsInProgram( - object_, numKernels, (cl_kernel*) value, NULL); - if (err != CL_SUCCESS) { - return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); - } - - kernels->assign(&value[0], &value[numKernels]); - return CL_SUCCESS; - } -}; - -#if defined(CL_VERSION_1_2) -inline Program linkProgram( - Program input1, - Program input2, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) -{ - cl_int err_local = CL_SUCCESS; - - cl_program programs[2] = { input1(), input2() }; - - Context ctx = input1.getInfo(); - - cl_program prog = ::clLinkProgram( - ctx(), - 0, - NULL, - options, - 2, - programs, - notifyFptr, - data, - &err_local); - - detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); - if (err != NULL) { - *err = err_local; - } - - return Program(prog); -} - -inline Program linkProgram( - VECTOR_CLASS inputPrograms, - const char* options = NULL, - void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, - void* data = NULL, - cl_int* err = NULL) -{ - cl_int err_local = CL_SUCCESS; - - cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program)); - - if (programs != NULL) { - for (unsigned int i = 0; i < inputPrograms.size(); i++) { - programs[i] = inputPrograms[i](); - } - } - - cl_program prog = ::clLinkProgram( - Context::getDefault()(), - 0, - NULL, - options, - (cl_uint)inputPrograms.size(), - programs, - notifyFptr, - data, - &err_local); - - detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); - if (err != NULL) { - *err = err_local; - } - - return Program(prog); -} -#endif - -template<> -inline VECTOR_CLASS cl::Program::getInfo(cl_int* err) const -{ - VECTOR_CLASS< ::size_t> sizes = getInfo(); - VECTOR_CLASS binaries; - for (VECTOR_CLASS< ::size_t>::iterator s = 
sizes.begin(); s != sizes.end(); ++s) - { - char *ptr = NULL; - if (*s != 0) - ptr = new char[*s]; - binaries.push_back(ptr); - } - - cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries); - if (err != NULL) { - *err = result; - } - return binaries; -} - -inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) -{ - cl_int error; - - object_ = ::clCreateKernel(program(), name, &error); - detail::errHandler(error, __CREATE_KERNEL_ERR); - - if (err != NULL) { - *err = error; - } - -} - -/*! \class CommandQueue - * \brief CommandQueue interface for cl_command_queue. - */ -class CommandQueue : public detail::Wrapper -{ -private: - static volatile int default_initialized_; - static CommandQueue default_; - static volatile cl_int default_error_; -public: - CommandQueue( - cl_command_queue_properties properties, - cl_int* err = NULL) - { - cl_int error; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - Device device = context.getInfo()[0]; - - object_ = ::clCreateCommandQueue( - context(), device(), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } - } - /*! - * \brief Constructs a CommandQueue for an implementation defined device in the given context - */ - explicit CommandQueue( - const Context& context, - cl_command_queue_properties properties = 0, - cl_int* err = NULL) - { - cl_int error; - VECTOR_CLASS devices; - error = context.getInfo(CL_CONTEXT_DEVICES, &devices); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) - { - if (err != NULL) { - *err = error; - } - return; - } - - object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (err != NULL) { - *err = error; - } - - } - - CommandQueue( - const Context& context, - const Device& device, - cl_command_queue_properties properties = 0, - cl_int* err = NULL) - { - cl_int error; - object_ = ::clCreateCommandQueue( - context(), device(), properties, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } - - static CommandQueue getDefault(cl_int * err = NULL) - { - int state = detail::compare_exchange( - &default_initialized_, - __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); - - if (state & __DEFAULT_INITIALIZED) { - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - if (state & __DEFAULT_BEING_INITIALIZED) { - // Assume writes will propagate eventually... - while(default_initialized_ != __DEFAULT_INITIALIZED) { - detail::fence(); - } - - if (err != NULL) { - *err = default_error_; - } - return default_; - } - - cl_int error; - - Context context = Context::getDefault(&error); - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - - if (error != CL_SUCCESS) { - if (err != NULL) { - *err = error; - } - } - else { - Device device = context.getInfo()[0]; - - default_ = CommandQueue(context, device, 0, &error); - - detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); - if (err != NULL) { - *err = error; - } - } - - detail::fence(); - - default_error_ = error; - // Assume writes will propagate eventually... 
-        default_initialized_ = __DEFAULT_INITIALIZED;
-
-        detail::fence();
-
-        if (err != NULL) {
-            *err = default_error_;
-        }
-        return default_;
-
-    }
-
-    CommandQueue() { }
-
-    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-
-    CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
-
-    CommandQueue& operator = (const CommandQueue& rhs)
-    {
-        if (this != &rhs) {
-            detail::Wrapper<cl_type>::operator=(rhs);
-        }
-        return *this;
-    }
-
-    CommandQueue& operator = (const cl_command_queue& rhs)
-    {
-        detail::Wrapper<cl_type>::operator=(rhs);
-        return *this;
-    }
-
-    template <typename T>
-    cl_int getInfo(cl_command_queue_info name, T* param) const
-    {
-        return detail::errHandler(
-            detail::getInfo(
-                &::clGetCommandQueueInfo, object_, name, param),
-                __GET_COMMAND_QUEUE_INFO_ERR);
-    }
-
-    template <cl_int name> typename
-    detail::param_traits<detail::cl_command_queue_info, name>::param_type
-    getInfo(cl_int* err = NULL) const
-    {
-        typename detail::param_traits<
-            detail::cl_command_queue_info, name>::param_type param;
-        cl_int result = getInfo(name, &param);
-        if (err != NULL) {
-            *err = result;
-        }
-        return param;
-    }
-
-    cl_int enqueueReadBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueReadBuffer(
-                object_, buffer(), blocking, offset, size,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_READ_BUFFER_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
-    }
-
-    cl_int enqueueWriteBuffer(
-        const Buffer& buffer,
-        cl_bool blocking,
-        ::size_t offset,
-        ::size_t size,
-        const void* ptr,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueWriteBuffer(
-                object_, buffer(), blocking, offset, size,
-                ptr,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ? &tmp : NULL),
-            __ENQUEUE_WRITE_BUFFER_ERR);
-
-        if (event != NULL && err == CL_SUCCESS)
-            *event = tmp;
-
-        return err;
-    }
-
-    cl_int enqueueCopyBuffer(
-        const Buffer& src,
-        const Buffer& dst,
-        ::size_t src_offset,
-        ::size_t dst_offset,
-        ::size_t size,
-        const VECTOR_CLASS<Event>* events = NULL,
-        Event* event = NULL) const
-    {
-        cl_event tmp;
-        cl_int err = detail::errHandler(
-            ::clEnqueueCopyBuffer(
-                object_, src(), dst(), src_offset, dst_offset, size,
-                (events != NULL) ? (cl_uint) events->size() : 0,
-                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
-                (event != NULL) ?
&tmp : NULL), - __ENQEUE_COPY_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueReadBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadBufferRect( - object_, - buffer(), - blocking, - (const ::size_t *)buffer_offset, - (const ::size_t *)host_offset, - (const ::size_t *)region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteBufferRect( - object_, - buffer(), - blocking, - (const ::size_t *)buffer_offset, - (const ::size_t *)host_offset, - (const ::size_t *)region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_WRITE_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBufferRect( - const Buffer& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - ::size_t src_row_pitch, - ::size_t src_slice_pitch, - ::size_t dst_row_pitch, - ::size_t dst_slice_pitch, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBufferRect( - object_, - src(), - dst(), - (const ::size_t *)src_origin, - (const ::size_t *)dst_origin, - (const ::size_t *)region, - src_row_pitch, - src_slice_pitch, - dst_row_pitch, - dst_slice_pitch, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQEUE_COPY_BUFFER_RECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_VERSION_1_2) - /** - * Enqueue a command to fill a buffer object with a pattern - * of a given size. The pattern is specified a as vector. - * \tparam PatternType The datatype of the pattern field. - * The pattern type must be an accepted OpenCL data type. 
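- *
- * Usage sketch (an editorial illustration added to this doc comment; it
- * assumes an in-scope cl::CommandQueue "queue" and a cl::Buffer "buf" of at
- * least 1024 cl_floats, and omits checking the returned cl_int):
- * \code
- * cl_float zero = 0.0f;
- * // Repeats the sizeof(cl_float)-byte pattern across the first
- * // 1024 * sizeof(cl_float) bytes of buf. Both offset and size are in
- * // bytes and must be multiples of the pattern size.
- * queue.enqueueFillBuffer(buf, zero, 0, 1024 * sizeof(cl_float));
- * queue.finish(); // the fill is enqueued, not complete, until a sync point
- * \endcode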
- */ - template - cl_int enqueueFillBuffer( - const Buffer& buffer, - PatternType pattern, - ::size_t offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillBuffer( - object_, - buffer(), - static_cast(&pattern), - sizeof(PatternType), - offset, - size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_VERSION_1_2) - - cl_int enqueueReadImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueReadImage( - object_, image(), blocking, (const ::size_t *) origin, - (const ::size_t *) region, row_pitch, slice_pitch, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_READ_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueWriteImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueWriteImage( - object_, image(), blocking, (const ::size_t *) origin, - (const ::size_t *) region, row_pitch, slice_pitch, ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_WRITE_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyImage( - const Image& src, - const Image& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyImage( - object_, src(), dst(), (const ::size_t *) src_origin, - (const ::size_t *)dst_origin, (const ::size_t *) region, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_VERSION_1_2) - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA floating-point color value if - * the image channel data type is not an unnormalized signed or - * unsigned data type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_float4 fillColor, - const size_t<3>& origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - (const ::size_t *) origin, - (const ::size_t *) region, - (events != NULL) ? 
(cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA signed integer color value if - * the image channel data type is an unnormalized signed integer - * type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_int4 fillColor, - const size_t<3>& origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - (const ::size_t *) origin, - (const ::size_t *) region, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueue a command to fill an image object with a specified color. - * \param fillColor is the color to use to fill the image. - * This is a four component RGBA unsigned integer color value if - * the image channel data type is an unnormalized unsigned integer - * type. - */ - cl_int enqueueFillImage( - const Image& image, - cl_uint4 fillColor, - const size_t<3>& origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueFillImage( - object_, - image(), - static_cast(&fillColor), - (const ::size_t *) origin, - (const ::size_t *) region, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_FILL_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_VERSION_1_2) - - cl_int enqueueCopyImageToBuffer( - const Image& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& region, - ::size_t dst_offset, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyImageToBuffer( - object_, src(), dst(), (const ::size_t *) src_origin, - (const ::size_t *) region, dst_offset, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueCopyBufferToImage( - const Buffer& src, - const Image& dst, - ::size_t src_offset, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueCopyBufferToImage( - object_, src(), dst(), src_offset, - (const ::size_t *) dst_origin, (const ::size_t *) region, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? 
&tmp : NULL), - __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - void* enqueueMapBuffer( - const Buffer& buffer, - cl_bool blocking, - cl_map_flags flags, - ::size_t offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const - { - cl_int error; - void * result = ::clEnqueueMapBuffer( - object_, buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (cl_event*) event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - return result; - } - - void* enqueueMapImage( - const Image& buffer, - cl_bool blocking, - cl_map_flags flags, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t * row_pitch, - ::size_t * slice_pitch, - const VECTOR_CLASS* events = NULL, - Event* event = NULL, - cl_int* err = NULL) const - { - cl_int error; - void * result = ::clEnqueueMapImage( - object_, buffer(), blocking, flags, - (const ::size_t *) origin, (const ::size_t *) region, - row_pitch, slice_pitch, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (cl_event*) event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); - if (err != NULL) { - *err = error; - } - return result; - } - - cl_int enqueueUnmapMemObject( - const Memory& memory, - void* mapped_ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueUnmapMemObject( - object_, memory(), mapped_ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -#if defined(CL_VERSION_1_2) - /** - * Enqueues a marker command which waits for either a list of events to complete, - * or all previously enqueued commands to complete. - * - * Enqueues a marker command which waits for either a list of events to complete, - * or if the list is empty it waits for all commands previously enqueued in command_queue - * to complete before it completes. This command returns an event which can be waited on, - * i.e. this event can be waited on to insure that all events either in the event_wait_list - * or all previously enqueued commands, queued before this command to command_queue, - * have completed. - */ - cl_int enqueueMarkerWithWaitList( - const VECTOR_CLASS *events = 0, - Event *event = 0) - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueMarkerWithWaitList( - object_, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_MARKER_WAIT_LIST_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * A synchronization point that enqueues a barrier operation. - * - * Enqueues a barrier command which waits for either a list of events to complete, - * or if the list is empty it waits for all commands previously enqueued in command_queue - * to complete before it completes. 
This command blocks command execution, that is, any - * following commands enqueued after it do not execute until it completes. This command - * returns an event which can be waited on, i.e. this event can be waited on to insure that - * all events either in the event_wait_list or all previously enqueued commands, queued - * before this command to command_queue, have completed. - */ - cl_int enqueueBarrierWithWaitList( - const VECTOR_CLASS *events = 0, - Event *event = 0) - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueBarrierWithWaitList( - object_, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_BARRIER_WAIT_LIST_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - /** - * Enqueues a command to indicate with which device a set of memory objects - * should be associated. - */ - cl_int enqueueMigrateMemObjects( - const VECTOR_CLASS &memObjects, - cl_mem_migration_flags flags, - const VECTOR_CLASS* events = NULL, - Event* event = NULL - ) - { - cl_event tmp; - - cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); - for( int i = 0; i < (int)memObjects.size(); ++i ) { - localMemObjects[i] = memObjects[i](); - } - - - cl_int err = detail::errHandler( - ::clEnqueueMigrateMemObjects( - object_, - (cl_uint)memObjects.size(), - static_cast(localMemObjects), - flags, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } -#endif // #if defined(CL_VERSION_1_2) - - cl_int enqueueNDRangeKernel( - const Kernel& kernel, - const NDRange& offset, - const NDRange& global, - const NDRange& local = NullRange, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueNDRangeKernel( - object_, kernel(), (cl_uint) global.dimensions(), - offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, - (const ::size_t*) global, - local.dimensions() != 0 ? (const ::size_t*) local : NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_NDRANGE_KERNEL_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueTask( - const Kernel& kernel, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueTask( - object_, kernel(), - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_TASK_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - - cl_int enqueueNativeKernel( - void (CL_CALLBACK *userFptr)(void *), - std::pair args, - const VECTOR_CLASS* mem_objects = NULL, - const VECTOR_CLASS* mem_locs = NULL, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) const - { - cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) - ? 
(cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem)) - : NULL; - - if (mems != NULL) { - for (unsigned int i = 0; i < mem_objects->size(); i++) { - mems[i] = ((*mem_objects)[i])(); - } - } - - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueNativeKernel( - object_, userFptr, args.first, args.second, - (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, - mems, - (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_NATIVE_KERNEL); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; - } - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueMarker(object_, (cl_event*) event), - __ENQUEUE_MARKER_ERR); - } - - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueWaitForEvents(const VECTOR_CLASS& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueWaitForEvents( - object_, - (cl_uint) events.size(), - (const cl_event*) &events.front()), - __ENQUEUE_WAIT_FOR_EVENTS_ERR); - } -#endif // #if defined(CL_VERSION_1_1) - - - - -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) - CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - { - return detail::errHandler( - ::clEnqueueBarrier(object_), - __ENQUEUE_BARRIER_ERR); - } -#endif // #if defined(CL_VERSION_1_1) - - cl_int flush() const - { - return detail::errHandler(::clFlush(object_), __FLUSH_ERR); - } - - cl_int finish() const - { - return detail::errHandler(::clFinish(object_), __FINISH_ERR); - } -}; - -#ifdef _WIN32 -__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; -__declspec(selectany) CommandQueue CommandQueue::default_; -__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; -#else -__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; -__attribute__((weak)) CommandQueue CommandQueue::default_; -__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; -#endif - -template< typename IteratorType > -Buffer::Buffer( - const Context &context, - IteratorType startIterator, - IteratorType endIterator, - bool readOnly, - bool useHostPtr, - cl_int* err) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - cl_mem_flags flags = 0; - if( readOnly ) { - flags |= CL_MEM_READ_ONLY; - } - else { - flags |= CL_MEM_READ_WRITE; - } - if( useHostPtr ) { - flags |= CL_MEM_USE_HOST_PTR; - } - - ::size_t size = sizeof(DataType)*(endIterator - startIterator); - - if( useHostPtr ) { - object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); - } else { - object_ = ::clCreateBuffer(context(), flags, size, 0, &error); - } - - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - if( !useHostPtr ) { - CommandQueue queue(context, 0, &error); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - error = 
cl::copy(queue, startIterator, endIterator, *this); - detail::errHandler(error, __CREATE_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - } -} - -inline cl_int enqueueReadBuffer( - const Buffer& buffer, - cl_bool blocking, - ::size_t offset, - ::size_t size, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); -} - -inline cl_int enqueueWriteBuffer( - const Buffer& buffer, - cl_bool blocking, - ::size_t offset, - ::size_t size, - const void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); -} - -inline void* enqueueMapBuffer( - const Buffer& buffer, - cl_bool blocking, - cl_map_flags flags, - ::size_t offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL, - cl_int* err = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - - void * result = ::clEnqueueMapBuffer( - queue(), buffer(), blocking, flags, offset, size, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (cl_event*) event, - &error); - - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (err != NULL) { - *err = error; - } - return result; -} - -inline cl_int enqueueUnmapMemObject( - const Memory& memory, - void* mapped_ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); - if (error != CL_SUCCESS) { - return error; - } - - cl_event tmp; - cl_int err = detail::errHandler( - ::clEnqueueUnmapMemObject( - queue(), memory(), mapped_ptr, - (events != NULL) ? (cl_uint) events->size() : 0, - (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, - (event != NULL) ? &tmp : NULL), - __ENQUEUE_UNMAP_MEM_OBJECT_ERR); - - if (event != NULL && err == CL_SUCCESS) - *event = tmp; - - return err; -} - -inline cl_int enqueueCopyBuffer( - const Buffer& src, - const Buffer& dst, - ::size_t src_offset, - ::size_t dst_offset, - ::size_t size, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); -} - -/** - * Blocking copy operation between iterators and a buffer. - * Host to Device. - * Uses default command queue. - */ -template< typename IteratorType > -inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) - return error; - - return cl::copy(queue, startIterator, endIterator, buffer); -} - -/** - * Blocking copy operation between iterators and a buffer. - * Device to Host. - * Uses default command queue. 
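- *
- * Usage sketch (an editorial illustration added to this doc comment; it
- * assumes a cl::Buffer "buf" created on the default context and holding at
- * least 1024 floats, and omits checking the returned cl_int):
- * \code
- * std::vector<float> host(1024);
- * // Maps buf on the default queue, copies 1024 floats into host, unmaps,
- * // and blocks until the transfer has completed.
- * cl::copy(buf, host.begin(), host.end());
- * \endcode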
- */ -template< typename IteratorType > -inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - if (error != CL_SUCCESS) - return error; - - return cl::copy(queue, buffer, startIterator, endIterator); -} - -/** - * Blocking copy operation between iterators and a buffer. - * Host to Device. - * Uses specified queue. - */ -template< typename IteratorType > -inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - ::size_t length = endIterator-startIterator; - ::size_t byteLength = length*sizeof(DataType); - - DataType *pointer = - static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); - // if exceptions enabled, enqueueMapBuffer will throw - if( error != CL_SUCCESS ) { - return error; - } -#if defined(_MSC_VER) - std::copy( - startIterator, - endIterator, - stdext::checked_array_iterator( - pointer, length)); -#else - std::copy(startIterator, endIterator, pointer); -#endif - Event endEvent; - error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); - // if exceptions enabled, enqueueUnmapMemObject will throw - if( error != CL_SUCCESS ) { - return error; - } - endEvent.wait(); - return CL_SUCCESS; -} - -/** - * Blocking copy operation between iterators and a buffer. - * Device to Host. - * Uses specified queue. - */ -template< typename IteratorType > -inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) -{ - typedef typename std::iterator_traits::value_type DataType; - cl_int error; - - ::size_t length = endIterator-startIterator; - ::size_t byteLength = length*sizeof(DataType); - - DataType *pointer = - static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); - // if exceptions enabled, enqueueMapBuffer will throw - if( error != CL_SUCCESS ) { - return error; - } - std::copy(pointer, pointer + length, startIterator); - Event endEvent; - error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); - // if exceptions enabled, enqueueUnmapMemObject will throw - if( error != CL_SUCCESS ) { - return error; - } - endEvent.wait(); - return CL_SUCCESS; -} - -#if defined(CL_VERSION_1_1) -inline cl_int enqueueReadBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadBufferRect( - buffer, - blocking, - buffer_offset, - host_offset, - region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueWriteBufferRect( - const Buffer& buffer, - cl_bool blocking, - const size_t<3>& buffer_offset, - const size_t<3>& host_offset, - const size_t<3>& region, - ::size_t buffer_row_pitch, - ::size_t buffer_slice_pitch, - ::size_t host_row_pitch, - ::size_t host_slice_pitch, - void *ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - 
CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteBufferRect( - buffer, - blocking, - buffer_offset, - host_offset, - region, - buffer_row_pitch, - buffer_slice_pitch, - host_row_pitch, - host_slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueCopyBufferRect( - const Buffer& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - ::size_t src_row_pitch, - ::size_t src_slice_pitch, - ::size_t dst_row_pitch, - ::size_t dst_slice_pitch, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBufferRect( - src, - dst, - src_origin, - dst_origin, - region, - src_row_pitch, - src_slice_pitch, - dst_row_pitch, - dst_slice_pitch, - events, - event); -} -#endif - -inline cl_int enqueueReadImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueReadImage( - image, - blocking, - origin, - region, - row_pitch, - slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueWriteImage( - const Image& image, - cl_bool blocking, - const size_t<3>& origin, - const size_t<3>& region, - ::size_t row_pitch, - ::size_t slice_pitch, - void* ptr, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueWriteImage( - image, - blocking, - origin, - region, - row_pitch, - slice_pitch, - ptr, - events, - event); -} - -inline cl_int enqueueCopyImage( - const Image& src, - const Image& dst, - const size_t<3>& src_origin, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyImage( - src, - dst, - src_origin, - dst_origin, - region, - events, - event); -} - -inline cl_int enqueueCopyImageToBuffer( - const Image& src, - const Buffer& dst, - const size_t<3>& src_origin, - const size_t<3>& region, - ::size_t dst_offset, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyImageToBuffer( - src, - dst, - src_origin, - region, - dst_offset, - events, - event); -} - -inline cl_int enqueueCopyBufferToImage( - const Buffer& src, - const Image& dst, - ::size_t src_offset, - const size_t<3>& dst_origin, - const size_t<3>& region, - const VECTOR_CLASS* events = NULL, - Event* event = NULL) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - return queue.enqueueCopyBufferToImage( - src, - dst, - src_offset, - dst_origin, - region, - events, - event); -} - - -inline cl_int flush(void) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return 
error; - } - - return queue.flush(); -} - -inline cl_int finish(void) -{ - cl_int error; - CommandQueue queue = CommandQueue::getDefault(&error); - - if (error != CL_SUCCESS) { - return error; - } - - - return queue.finish(); -} - -// Kernel Functor support -// New interface as of September 2011 -// Requires the C++11 std::tr1::function (note do not support TR1) -// Visual Studio 2010 and GCC 4.2 - -struct EnqueueArgs -{ - CommandQueue queue_; - const NDRange offset_; - const NDRange global_; - const NDRange local_; - VECTOR_CLASS events_; - - EnqueueArgs(NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange) - { - - } - - EnqueueArgs(NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local) - { - - } - - EnqueueArgs(NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local) - { - - } - - EnqueueArgs(Event e, NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange) - { - events_.push_back(e); - } - - EnqueueArgs(Event e, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(const VECTOR_CLASS &events, NDRange global) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(NullRange), - events_(events) - { - - } - - EnqueueArgs(const VECTOR_CLASS &events, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(NullRange), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : - queue_(CommandQueue::getDefault()), - offset_(offset), - global_(global), - local_(local), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local) - { - - } - - EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local) - { - - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local) - { - events_.push_back(e); - } - - EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global) : - queue_(queue), - offset_(NullRange), - global_(global), - local_(NullRange), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global, NDRange local) : - queue_(queue), - offset_(NullRange), - global_(global), - 
local_(local), - events_(events) - { - - } - - EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : - queue_(queue), - offset_(offset), - global_(global), - local_(local), - events_(events) - { - - } -}; - -namespace detail { - -class NullType {}; - -template -struct SetArg -{ - static void set (Kernel kernel, T0 arg) - { - kernel.setArg(index, arg); - } -}; - -template -struct SetArg -{ - static void set (Kernel, NullType) - { - } -}; - -template < - typename T0, typename T1, typename T2, typename T3, - typename T4, typename T5, typename T6, typename T7, - typename T8, typename T9, typename T10, typename T11, - typename T12, typename T13, typename T14, typename T15, - typename T16, typename T17, typename T18, typename T19, - typename T20, typename T21, typename T22, typename T23, - typename T24, typename T25, typename T26, typename T27, - typename T28, typename T29, typename T30, typename T31 -> -class KernelFunctorGlobal -{ -private: - Kernel kernel_; - -public: - KernelFunctorGlobal( - Kernel kernel) : - kernel_(kernel) - {} - - KernelFunctorGlobal( - const Program& program, - const STRING_CLASS name, - cl_int * err = NULL) : - kernel_(program, name.c_str(), err) - {} - - Event operator() ( - const EnqueueArgs& args, - T0 t0, - T1 t1 = NullType(), - T2 t2 = NullType(), - T3 t3 = NullType(), - T4 t4 = NullType(), - T5 t5 = NullType(), - T6 t6 = NullType(), - T7 t7 = NullType(), - T8 t8 = NullType(), - T9 t9 = NullType(), - T10 t10 = NullType(), - T11 t11 = NullType(), - T12 t12 = NullType(), - T13 t13 = NullType(), - T14 t14 = NullType(), - T15 t15 = NullType(), - T16 t16 = NullType(), - T17 t17 = NullType(), - T18 t18 = NullType(), - T19 t19 = NullType(), - T20 t20 = NullType(), - T21 t21 = NullType(), - T22 t22 = NullType(), - T23 t23 = NullType(), - T24 t24 = NullType(), - T25 t25 = NullType(), - T26 t26 = NullType(), - T27 t27 = NullType(), - T28 t28 = NullType(), - T29 t29 = NullType(), - T30 t30 = NullType(), - T31 t31 = NullType() - ) - { - Event event; - SetArg<0, T0>::set(kernel_, t0); - SetArg<1, T1>::set(kernel_, t1); - SetArg<2, T2>::set(kernel_, t2); - SetArg<3, T3>::set(kernel_, t3); - SetArg<4, T4>::set(kernel_, t4); - SetArg<5, T5>::set(kernel_, t5); - SetArg<6, T6>::set(kernel_, t6); - SetArg<7, T7>::set(kernel_, t7); - SetArg<8, T8>::set(kernel_, t8); - SetArg<9, T9>::set(kernel_, t9); - SetArg<10, T10>::set(kernel_, t10); - SetArg<11, T11>::set(kernel_, t11); - SetArg<12, T12>::set(kernel_, t12); - SetArg<13, T13>::set(kernel_, t13); - SetArg<14, T14>::set(kernel_, t14); - SetArg<15, T15>::set(kernel_, t15); - SetArg<16, T16>::set(kernel_, t16); - SetArg<17, T17>::set(kernel_, t17); - SetArg<18, T18>::set(kernel_, t18); - SetArg<19, T19>::set(kernel_, t19); - SetArg<20, T20>::set(kernel_, t20); - SetArg<21, T21>::set(kernel_, t21); - SetArg<22, T22>::set(kernel_, t22); - SetArg<23, T23>::set(kernel_, t23); - SetArg<24, T24>::set(kernel_, t24); - SetArg<25, T25>::set(kernel_, t25); - SetArg<26, T26>::set(kernel_, t26); - SetArg<27, T27>::set(kernel_, t27); - SetArg<28, T28>::set(kernel_, t28); - SetArg<29, T29>::set(kernel_, t29); - SetArg<30, T30>::set(kernel_, t30); - SetArg<31, T31>::set(kernel_, t31); - - args.queue_.enqueueNDRangeKernel( - kernel_, - args.offset_, - args.global_, - args.local_, - &args.events_, - &event); - - return event; - } - -}; - -//------------------------------------------------------------------------------------------------------ - - -template< - typename T0, - typename T1, - 
typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28, - typename T29, - typename T30, - typename T31> -struct functionImplementation_ -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - T31> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - T31); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28, - T29 arg29, - T30 arg30, - T31 arg31) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28, - arg29, - arg30, - arg31); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28, - typename T29, - typename T30> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if 
(defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - T30); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28, - T29 arg29, - T30 arg30) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28, - arg29, - arg30); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28, - typename T29> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - T29); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28, - T29 arg29) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28, - arg29); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27, - typename T28> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - T28); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27, - T28 arg28) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27, - arg28); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26, - typename T27> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - T27); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26, - T27 arg27) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26, - arg27); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25, - typename T26> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - T26); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25, - T26 arg26) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25, - arg26); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24, - typename T25> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - T25); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24, - T25 arg25) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24, - arg25); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23, - typename T24> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - T24); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23, - T24 arg24) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23, - arg24); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22, - typename T23> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - T23); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22, - T23 arg23) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22, - arg23); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21, - typename T22> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - T22); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21, - T22 arg22) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21, - arg22); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20, - typename T21> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - T21); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20, - T21 arg21) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20, - arg21); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19, - typename T20> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - T20); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19, - T20 arg20) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19, - arg20); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18, - typename T19> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - T19); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18, - T19 arg19) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18, - arg19); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17, - typename T18> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - T18); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17, - T18 arg18) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17, - arg18); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16, - typename T17> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - T17); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16, - T17 arg17) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16, - arg17); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15, - typename T16> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - T16); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15, - T16 arg16) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15, - arg16); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14, - typename T15> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - T15); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14, - T15 arg15) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14, - arg15); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13, - typename T14> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - T14); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13, - T14 arg14) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13, - arg14); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12, - typename T13> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - T13); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12, - T13 arg13) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12, - arg13); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11, - typename T12> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - T12); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11, - T12 arg12) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11, - arg12); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10, - typename T11> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - T11); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10, - T11 arg11) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10, - arg11); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9, - typename T10> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - T10); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9, - T10 arg10) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9, - arg10); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8, - typename T9> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - T9); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8, - T9 arg9) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8, - arg9); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7, - typename T8> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. 
Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - T8); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7, - T8 arg8) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7, - arg8); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6, - typename T7> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. 
- typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6, - T7); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6, - T7 arg7) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6, - arg7); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5, - typename T6> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - T6, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - T6, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5, - T6); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5, - T6 arg6) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5, - arg6); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4, - typename T5> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - T5, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - T5, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! 
\brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4, - T5); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4, - T5 arg5) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4, - arg5); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3, - typename T4> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - T4, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - T4, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3, - T4); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3, - T4 arg4) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3, - arg4); - } - - -}; - -template< - typename T0, - typename T1, - typename T2, - typename T3> -struct functionImplementation_ -< T0, - T1, - T2, - T3, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - T3, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! 
\brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2, - T3); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2, - T3 arg3) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2, - arg3); - } - - -}; - -template< - typename T0, - typename T1, - typename T2> -struct functionImplementation_ -< T0, - T1, - T2, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - T2, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1, - T2); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1, - T2 arg2) - { - return functor_( - enqueueArgs, - arg0, - arg1, - arg2); - } - - -}; - -template< - typename T0, - typename T1> -struct functionImplementation_ -< T0, - T1, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - T1, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! 
\brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0, - T1); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0, - T1 arg1) - { - return functor_( - enqueueArgs, - arg0, - arg1); - } - - -}; - -template< - typename T0> -struct functionImplementation_ -< T0, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> -{ - typedef detail::KernelFunctorGlobal< - T0, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType, - NullType> FunctorType; - - FunctorType functor_; - - functionImplementation_(const FunctorType &functor) : - functor_(functor) - { - - #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) - // Fail variadic expansion for dev11 - static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); - #endif - - } - - //! \brief Return type of the functor - typedef Event result_type; - - //! \brief Function signature of kernel functor with no event dependency. - typedef Event type_( - const EnqueueArgs&, - T0); - - Event operator()( - const EnqueueArgs& enqueueArgs, - T0 arg0) - { - return functor_( - enqueueArgs, - arg0); - } - - -}; - - - - - -} // namespace detail - -//---------------------------------------------------------------------------------------------- - -template < - typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, - typename T3 = detail::NullType, typename T4 = detail::NullType, - typename T5 = detail::NullType, typename T6 = detail::NullType, - typename T7 = detail::NullType, typename T8 = detail::NullType, - typename T9 = detail::NullType, typename T10 = detail::NullType, - typename T11 = detail::NullType, typename T12 = detail::NullType, - typename T13 = detail::NullType, typename T14 = detail::NullType, - typename T15 = detail::NullType, typename T16 = detail::NullType, - typename T17 = detail::NullType, typename T18 = detail::NullType, - typename T19 = detail::NullType, typename T20 = detail::NullType, - typename T21 = detail::NullType, typename T22 = detail::NullType, - typename T23 = detail::NullType, typename T24 = detail::NullType, - typename T25 = detail::NullType, typename T26 = detail::NullType, - typename T27 = detail::NullType, typename T28 = detail::NullType, - typename T29 = detail::NullType, typename T30 = detail::NullType, - typename T31 = detail::NullType -> -struct make_kernel : - public detail::functionImplementation_< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - > -{ -public: - typedef detail::KernelFunctorGlobal< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, 
T27, - T28, T29, T30, T31 - > FunctorType; - - make_kernel( - const Program& program, - const STRING_CLASS name, - cl_int * err = NULL) : - detail::functionImplementation_< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - >( - FunctorType(program, name, err)) - {} - - make_kernel( - const Kernel kernel) : - detail::functionImplementation_< - T0, T1, T2, T3, - T4, T5, T6, T7, - T8, T9, T10, T11, - T12, T13, T14, T15, - T16, T17, T18, T19, - T20, T21, T22, T23, - T24, T25, T26, T27, - T28, T29, T30, T31 - >( - FunctorType(kernel)) - {} -}; - - -//---------------------------------------------------------------------------------------------------------------------- - -#undef __ERR_STR -#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) -#undef __GET_DEVICE_INFO_ERR -#undef __GET_PLATFORM_INFO_ERR -#undef __GET_DEVICE_IDS_ERR -#undef __GET_CONTEXT_INFO_ERR -#undef __GET_EVENT_INFO_ERR -#undef __GET_EVENT_PROFILE_INFO_ERR -#undef __GET_MEM_OBJECT_INFO_ERR -#undef __GET_IMAGE_INFO_ERR -#undef __GET_SAMPLER_INFO_ERR -#undef __GET_KERNEL_INFO_ERR -#undef __GET_KERNEL_ARG_INFO_ERR -#undef __GET_KERNEL_WORK_GROUP_INFO_ERR -#undef __GET_PROGRAM_INFO_ERR -#undef __GET_PROGRAM_BUILD_INFO_ERR -#undef __GET_COMMAND_QUEUE_INFO_ERR - -#undef __CREATE_CONTEXT_ERR -#undef __CREATE_CONTEXT_FROM_TYPE_ERR -#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR - -#undef __CREATE_BUFFER_ERR -#undef __CREATE_SUBBUFFER_ERR -#undef __CREATE_IMAGE2D_ERR -#undef __CREATE_IMAGE3D_ERR -#undef __CREATE_SAMPLER_ERR -#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR - -#undef __CREATE_USER_EVENT_ERR -#undef __SET_USER_EVENT_STATUS_ERR -#undef __SET_EVENT_CALLBACK_ERR -#undef __SET_PRINTF_CALLBACK_ERR - -#undef __WAIT_FOR_EVENTS_ERR - -#undef __CREATE_KERNEL_ERR -#undef __SET_KERNEL_ARGS_ERR -#undef __CREATE_PROGRAM_WITH_SOURCE_ERR -#undef __CREATE_PROGRAM_WITH_BINARY_ERR -#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR -#undef __BUILD_PROGRAM_ERR -#undef __CREATE_KERNELS_IN_PROGRAM_ERR - -#undef __CREATE_COMMAND_QUEUE_ERR -#undef __SET_COMMAND_QUEUE_PROPERTY_ERR -#undef __ENQUEUE_READ_BUFFER_ERR -#undef __ENQUEUE_WRITE_BUFFER_ERR -#undef __ENQUEUE_READ_BUFFER_RECT_ERR -#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR -#undef __ENQEUE_COPY_BUFFER_ERR -#undef __ENQEUE_COPY_BUFFER_RECT_ERR -#undef __ENQUEUE_READ_IMAGE_ERR -#undef __ENQUEUE_WRITE_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_ERR -#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR -#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR -#undef __ENQUEUE_MAP_BUFFER_ERR -#undef __ENQUEUE_MAP_IMAGE_ERR -#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR -#undef __ENQUEUE_NDRANGE_KERNEL_ERR -#undef __ENQUEUE_TASK_ERR -#undef __ENQUEUE_NATIVE_KERNEL - -#undef __CL_EXPLICIT_CONSTRUCTORS - -#undef __UNLOAD_COMPILER_ERR -#endif //__CL_USER_OVERRIDE_ERROR_STRINGS - -#undef __CL_FUNCTION_TYPE - -// Extensions -/** - * Deprecated APIs for 1.2 - */ -#if defined(CL_VERSION_1_1) -#undef __INIT_CL_EXT_FCN_PTR -#endif // #if defined(CL_VERSION_1_1) -#undef __CREATE_SUB_DEVICES - -#if defined(USE_CL_DEVICE_FISSION) -#undef __PARAM_NAME_DEVICE_FISSION -#endif // USE_CL_DEVICE_FISSION - -#undef __DEFAULT_NOT_INITIALIZED -#undef __DEFAULT_BEING_INITIALIZED -#undef __DEFAULT_INITIALIZED - -} // namespace cl - -#ifdef _WIN32 -#pragma pop_macro("max") -#endif // _WIN32 - -#endif // CL_HPP_ diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h deleted file mode 100644 index 81b0d3721..000000000 --- a/include/CL/cl_d3d10.h +++ 
/dev/null
@@ -1,126 +0,0 @@
-/**********************************************************************************
- * Copyright (c) 2008-2012 The Khronos Group Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and/or associated documentation files (the
- * "Materials"), to deal in the Materials without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Materials, and to
- * permit persons to whom the Materials are furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Materials.
- *
- * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
- **********************************************************************************/
-
-/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
-
-#ifndef __OPENCL_CL_D3D10_H
-#define __OPENCL_CL_D3D10_H
-
-#include <d3d10.h>
-#include <CL/cl.h>
-#include <CL/cl_platform.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/******************************************************************************
- * cl_khr_d3d10_sharing */
-#define cl_khr_d3d10_sharing 1
-
-typedef cl_uint cl_d3d10_device_source_khr;
-typedef cl_uint cl_d3d10_device_set_khr;
-
-/******************************************************************************/
-
-// Error Codes
-#define CL_INVALID_D3D10_DEVICE_KHR -1002
-#define CL_INVALID_D3D10_RESOURCE_KHR -1003
-#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
-#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
-
-// cl_d3d10_device_source_nv
-#define CL_D3D10_DEVICE_KHR 0x4010
-#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
-
-// cl_d3d10_device_set_nv
-#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
-#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
-
-// cl_context_info
-#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
-#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
-
-// cl_mem_info
-#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
-
-// cl_image_info
-#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
-
-// cl_command_type
-#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
-#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
-
-/******************************************************************************/
-
-typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
-    cl_platform_id platform,
-    cl_d3d10_device_source_khr d3d_device_source,
-    void * d3d_object,
-    cl_d3d10_device_set_khr d3d_device_set,
-    cl_uint num_entries,
-    cl_device_id * devices,
-    cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
-    cl_context context,
-    cl_mem_flags flags,
-    ID3D10Buffer * resource,
-    cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
-    cl_context context,
-    cl_mem_flags flags,
-    ID3D10Texture2D * resource,
-    UINT subresource,
-    cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
-
-typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( - cl_context context, - cl_mem_flags flags, - ID3D10Texture3D * resource, - UINT subresource, - cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( - cl_command_queue command_queue, - cl_uint num_objects, - const cl_mem * mem_objects, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) CL_API_SUFFIX__VERSION_1_0; - -#ifdef __cplusplus -} -#endif - -#endif // __OPENCL_CL_D3D10_H - diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h deleted file mode 100644 index af2036cc9..000000000 --- a/include/CL/cl_gl.h +++ /dev/null @@ -1,162 +0,0 @@ -/********************************************************************************** - * Copyright (c) 2008 - 2012 The Khronos Group Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
- **********************************************************************************/ - -#ifndef __OPENCL_CL_GL_H -#define __OPENCL_CL_GL_H - -#ifdef __APPLE__ -#include -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef cl_uint cl_gl_object_type; -typedef cl_uint cl_gl_texture_info; -typedef cl_uint cl_gl_platform_info; -typedef struct __GLsync *cl_GLsync; - -/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ -#define CL_GL_OBJECT_BUFFER 0x2000 -#define CL_GL_OBJECT_TEXTURE2D 0x2001 -#define CL_GL_OBJECT_TEXTURE3D 0x2002 -#define CL_GL_OBJECT_RENDERBUFFER 0x2003 -#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E -#define CL_GL_OBJECT_TEXTURE1D 0x200F -#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 -#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 - -/* cl_gl_texture_info */ -#define CL_GL_TEXTURE_TARGET 0x2004 -#define CL_GL_MIPMAP_LEVEL 0x2005 -#define CL_GL_NUM_SAMPLES 0x2012 - - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLBuffer(cl_context /* context */, - cl_mem_flags /* flags */, - cl_GLuint /* bufobj */, - int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLTexture(cl_context /* context */, - cl_mem_flags /* flags */, - cl_GLenum /* target */, - cl_GLint /* miplevel */, - cl_GLuint /* texture */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; - -extern CL_API_ENTRY cl_mem CL_API_CALL -clCreateFromGLRenderbuffer(cl_context /* context */, - cl_mem_flags /* flags */, - cl_GLuint /* renderbuffer */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLObjectInfo(cl_mem /* memobj */, - cl_gl_object_type * /* gl_object_type */, - cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLTextureInfo(cl_mem /* memobj */, - cl_gl_texture_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, - cl_uint /* num_objects */, - const cl_mem * /* mem_objects */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, - cl_uint /* num_objects */, - const cl_mem * /* mem_objects */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - - -// Deprecated OpenCL 1.1 APIs -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture2D(cl_context /* context */, - cl_mem_flags /* flags */, - cl_GLenum /* target */, - cl_GLint /* miplevel */, - cl_GLuint /* texture */, - cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateFromGLTexture3D(cl_context /* context */, - cl_mem_flags /* flags */, - cl_GLenum /* target */, - cl_GLint /* miplevel */, - cl_GLuint /* texture */, - cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; - -/* cl_khr_gl_sharing extension */ - -#define cl_khr_gl_sharing 1 - -typedef cl_uint cl_gl_context_info; - -/* Additional Error Codes */ -#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 - -/* cl_gl_context_info */ -#define 
CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 -#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 - -/* Additional cl_context_properties */ -#define CL_GL_CONTEXT_KHR 0x2008 -#define CL_EGL_DISPLAY_KHR 0x2009 -#define CL_GLX_DISPLAY_KHR 0x200A -#define CL_WGL_HDC_KHR 0x200B -#define CL_CGL_SHAREGROUP_KHR 0x200C - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetGLContextInfoKHR(const cl_context_properties * /* properties */, - cl_gl_context_info /* param_name */, - size_t /* param_value_size */, - void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - -typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( - const cl_context_properties * properties, - cl_gl_context_info param_name, - size_t param_value_size, - void * param_value, - size_t * param_value_size_ret); - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_GL_H */ diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h deleted file mode 100644 index 77d53536f..000000000 --- a/include/CL/cl_gl_ext.h +++ /dev/null @@ -1,69 +0,0 @@ -/********************************************************************************** - * Copyright (c) 2008-2012 The Khronos Group Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. - **********************************************************************************/ - -/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ - -/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ -/* OpenGL dependencies. */ - -#ifndef __OPENCL_CL_GL_EXT_H -#define __OPENCL_CL_GL_EXT_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __APPLE__ - #include -#else - #include -#endif - -/* - * For each extension, follow this template - * cl_VEN_extname extension */ -/* #define cl_VEN_extname 1 - * ... define new types, if any - * ... define new tokens, if any - * ... define new APIs, if any - * - * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header - * This allows us to avoid having to decide whether to include GL headers or GLES here. 
- */ - -/* - * cl_khr_gl_event extension - * See section 9.9 in the OpenCL 1.1 spec for more information - */ -#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D - -extern CL_API_ENTRY cl_event CL_API_CALL -clCreateEventFromGLsyncKHR(cl_context /* context */, - cl_GLsync /* cl_GLsync */, - cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h deleted file mode 100644 index cf2b7210a..000000000 --- a/include/CL/cl_platform.h +++ /dev/null @@ -1,1254 +0,0 @@ -/********************************************************************************** - * Copyright (c) 2008-2012 The Khronos Group Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
- **********************************************************************************/ - -/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ - -#ifndef __CL_PLATFORM_H -#define __CL_PLATFORM_H - -#ifdef __APPLE__ - /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ - #include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(_WIN32) - #define CL_API_ENTRY - #define CL_API_CALL __stdcall - #define CL_CALLBACK __stdcall -#else - #define CL_API_ENTRY - #define CL_API_CALL - #define CL_CALLBACK -#endif - -#ifdef __APPLE__ - #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) - #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER - #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER - #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 - - #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER - #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER - #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER - #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 - #else - #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
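
These paired *_PREFIX/*_SUFFIX deprecation macros (the GCC and MSVC branches follow just below) exist because compilers hang deprecation attributes on opposite ends of a declaration: GCC's __attribute__((deprecated)) trails it, MSVC's __declspec(deprecated) leads it, and each branch defines the unused half of the pair empty. A sketch of how a consuming header brackets a deprecated entry point; clMyOldFunction is a hypothetical name:

    /* Hypothetical deprecated API, bracketed by both macros so that whichever
     * compiler is in use sees its attribute in the position it requires. */
    extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
    clMyOldFunction(cl_context context) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

Declared this way, a call site warns under GCC and MSVC alike, and on OS X it picks up the weak-linked availability markup instead.
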
- #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER - #endif -#else - #define CL_EXTENSION_WEAK_LINK - #define CL_API_SUFFIX__VERSION_1_0 - #define CL_EXT_SUFFIX__VERSION_1_0 - #define CL_API_SUFFIX__VERSION_1_1 - #define CL_EXT_SUFFIX__VERSION_1_1 - #define CL_API_SUFFIX__VERSION_1_2 - #define CL_EXT_SUFFIX__VERSION_1_2 - - #ifdef __GNUC__ - #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #endif - #elif _WIN32 - #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) - #endif - - #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #else - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) - #endif - #else - #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED - - #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED - #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED - #endif -#endif - -#if (defined (_WIN32) && defined(_MSC_VER)) - -/* scalar types */ -typedef signed __int8 cl_char; -typedef unsigned __int8 cl_uchar; -typedef signed __int16 cl_short; -typedef unsigned __int16 cl_ushort; -typedef signed __int32 cl_int; -typedef unsigned __int32 cl_uint; -typedef signed __int64 cl_long; -typedef unsigned __int64 cl_ulong; - -typedef unsigned __int16 cl_half; -typedef float cl_float; -typedef double cl_double; - -/* Macro names and corresponding values defined by OpenCL */ -#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#define CL_FLT_MAX 340282346638528859811704183484516925440.0f -#define CL_FLT_MIN 1.175494350822287507969e-38f -#define CL_FLT_EPSILON 0x1.0p-23f - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define 
CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 -#define CL_DBL_MIN 2.225073858507201383090e-308 -#define CL_DBL_EPSILON 2.220446049250313080847e-16 - -#define CL_M_E 2.718281828459045090796 -#define CL_M_LOG2E 1.442695040888963387005 -#define CL_M_LOG10E 0.434294481903251816668 -#define CL_M_LN2 0.693147180559945286227 -#define CL_M_LN10 2.302585092994045901094 -#define CL_M_PI 3.141592653589793115998 -#define CL_M_PI_2 1.570796326794896557999 -#define CL_M_PI_4 0.785398163397448278999 -#define CL_M_1_PI 0.318309886183790691216 -#define CL_M_2_PI 0.636619772367581382433 -#define CL_M_2_SQRTPI 1.128379167095512558561 -#define CL_M_SQRT2 1.414213562373095145475 -#define CL_M_SQRT1_2 0.707106781186547572737 - -#define CL_M_E_F 2.71828174591064f -#define CL_M_LOG2E_F 1.44269502162933f -#define CL_M_LOG10E_F 0.43429449200630f -#define CL_M_LN2_F 0.69314718246460f -#define CL_M_LN10_F 2.30258512496948f -#define CL_M_PI_F 3.14159274101257f -#define CL_M_PI_2_F 1.57079637050629f -#define CL_M_PI_4_F 0.78539818525314f -#define CL_M_1_PI_F 0.31830987334251f -#define CL_M_2_PI_F 0.63661974668503f -#define CL_M_2_SQRTPI_F 1.12837922573090f -#define CL_M_SQRT2_F 1.41421353816986f -#define CL_M_SQRT1_2_F 0.70710676908493f - -#define CL_NAN (CL_INFINITY - CL_INFINITY) -#define CL_HUGE_VALF ((cl_float) 1e50) -#define CL_HUGE_VAL ((cl_double) 1e500) -#define CL_MAXFLOAT CL_FLT_MAX -#define CL_INFINITY CL_HUGE_VALF - -#else - -#include - -/* scalar types */ -typedef int8_t cl_char; -typedef uint8_t cl_uchar; -typedef int16_t cl_short __attribute__((aligned(2))); -typedef uint16_t cl_ushort __attribute__((aligned(2))); -typedef int32_t cl_int __attribute__((aligned(4))); -typedef uint32_t cl_uint __attribute__((aligned(4))); -typedef int64_t cl_long __attribute__((aligned(8))); -typedef uint64_t cl_ulong __attribute__((aligned(8))); - -typedef uint16_t cl_half __attribute__((aligned(2))); -typedef float cl_float __attribute__((aligned(4))); -typedef double cl_double __attribute__((aligned(8))); - -/* Macro names and corresponding values defined by OpenCL */ -#define CL_CHAR_BIT 8 -#define CL_SCHAR_MAX 127 -#define CL_SCHAR_MIN (-127-1) -#define CL_CHAR_MAX CL_SCHAR_MAX -#define CL_CHAR_MIN CL_SCHAR_MIN -#define CL_UCHAR_MAX 255 -#define CL_SHRT_MAX 32767 -#define CL_SHRT_MIN (-32767-1) -#define CL_USHRT_MAX 65535 -#define CL_INT_MAX 2147483647 -#define CL_INT_MIN (-2147483647-1) -#define CL_UINT_MAX 0xffffffffU -#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) -#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) -#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) - -#define CL_FLT_DIG 6 -#define CL_FLT_MANT_DIG 24 -#define CL_FLT_MAX_10_EXP +38 -#define CL_FLT_MAX_EXP +128 -#define CL_FLT_MIN_10_EXP -37 -#define CL_FLT_MIN_EXP -125 -#define CL_FLT_RADIX 2 -#define CL_FLT_MAX 0x1.fffffep127f -#define CL_FLT_MIN 0x1.0p-126f -#define CL_FLT_EPSILON 0x1.0p-23f - -#define CL_DBL_DIG 15 -#define CL_DBL_MANT_DIG 53 -#define CL_DBL_MAX_10_EXP +308 -#define CL_DBL_MAX_EXP +1024 -#define CL_DBL_MIN_10_EXP -307 -#define CL_DBL_MIN_EXP -1021 -#define CL_DBL_RADIX 2 -#define CL_DBL_MAX 0x1.fffffffffffffp1023 -#define CL_DBL_MIN 0x1.0p-1022 -#define CL_DBL_EPSILON 0x1.0p-52 - 
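
The aligned(n) attributes on the stdint-based typedefs above matter because OpenCL requires its types to be naturally aligned on both sides of the host/device boundary; pinning the alignment in the typedef keeps struct layouts predictable wherever the attribute is honored. Some illustrative compile-time checks (not part of the header), assuming C++11:

    #include <CL/cl_platform.h>

    static_assert(sizeof(cl_int) == 4 && alignof(cl_int) == 4,
                  "cl_int is a naturally aligned 32-bit integer");
    static_assert(sizeof(cl_ulong) == 8 && alignof(cl_ulong) == 8,
                  "cl_ulong is a naturally aligned 64-bit integer");
    static_assert(sizeof(cl_half) == 2 && alignof(cl_half) == 2,
                  "cl_half is a 16-bit storage-only type");
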
-#define CL_M_E 2.718281828459045090796 -#define CL_M_LOG2E 1.442695040888963387005 -#define CL_M_LOG10E 0.434294481903251816668 -#define CL_M_LN2 0.693147180559945286227 -#define CL_M_LN10 2.302585092994045901094 -#define CL_M_PI 3.141592653589793115998 -#define CL_M_PI_2 1.570796326794896557999 -#define CL_M_PI_4 0.785398163397448278999 -#define CL_M_1_PI 0.318309886183790691216 -#define CL_M_2_PI 0.636619772367581382433 -#define CL_M_2_SQRTPI 1.128379167095512558561 -#define CL_M_SQRT2 1.414213562373095145475 -#define CL_M_SQRT1_2 0.707106781186547572737 - -#define CL_M_E_F 2.71828174591064f -#define CL_M_LOG2E_F 1.44269502162933f -#define CL_M_LOG10E_F 0.43429449200630f -#define CL_M_LN2_F 0.69314718246460f -#define CL_M_LN10_F 2.30258512496948f -#define CL_M_PI_F 3.14159274101257f -#define CL_M_PI_2_F 1.57079637050629f -#define CL_M_PI_4_F 0.78539818525314f -#define CL_M_1_PI_F 0.31830987334251f -#define CL_M_2_PI_F 0.63661974668503f -#define CL_M_2_SQRTPI_F 1.12837922573090f -#define CL_M_SQRT2_F 1.41421353816986f -#define CL_M_SQRT1_2_F 0.70710676908493f - -#if defined( __GNUC__ ) - #define CL_HUGE_VALF __builtin_huge_valf() - #define CL_HUGE_VAL __builtin_huge_val() - #define CL_NAN __builtin_nanf( "" ) -#else - #define CL_HUGE_VALF ((cl_float) 1e50) - #define CL_HUGE_VAL ((cl_double) 1e500) - float nanf( const char * ); - #define CL_NAN nanf( "" ) -#endif -#define CL_MAXFLOAT CL_FLT_MAX -#define CL_INFINITY CL_HUGE_VALF - -#endif - -#include - -/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ -typedef unsigned int cl_GLuint; -typedef int cl_GLint; -typedef unsigned int cl_GLenum; - -/* - * Vector types - * - * Note: OpenCL requires that all types be naturally aligned. - * This means that vector types must be naturally aligned. - * For example, a vector of four floats must be aligned to - * a 16 byte boundary (calculated as 4 * the natural 4-byte - * alignment of the float). The alignment qualifiers here - * will only function properly if your compiler supports them - * and if you don't actively work to defeat them. For example, - * in order for a cl_float4 to be 16 byte aligned in a struct, - * the start of the struct must itself be 16-byte aligned. - * - * Maintaining proper alignment is the user's responsibility. - */ - -/* Define basic vector types */ -#if defined( __VEC__ ) - #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. 
*/ - typedef vector unsigned char __cl_uchar16; - typedef vector signed char __cl_char16; - typedef vector unsigned short __cl_ushort8; - typedef vector signed short __cl_short8; - typedef vector unsigned int __cl_uint4; - typedef vector signed int __cl_int4; - typedef vector float __cl_float4; - #define __CL_UCHAR16__ 1 - #define __CL_CHAR16__ 1 - #define __CL_USHORT8__ 1 - #define __CL_SHORT8__ 1 - #define __CL_UINT4__ 1 - #define __CL_INT4__ 1 - #define __CL_FLOAT4__ 1 -#endif - -#if defined( __SSE__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef float __cl_float4 __attribute__((vector_size(16))); - #else - typedef __m128 __cl_float4; - #endif - #define __CL_FLOAT4__ 1 -#endif - -#if defined( __SSE2__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); - typedef cl_char __cl_char16 __attribute__((vector_size(16))); - typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); - typedef cl_short __cl_short8 __attribute__((vector_size(16))); - typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); - typedef cl_int __cl_int4 __attribute__((vector_size(16))); - typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); - typedef cl_long __cl_long2 __attribute__((vector_size(16))); - typedef cl_double __cl_double2 __attribute__((vector_size(16))); - #else - typedef __m128i __cl_uchar16; - typedef __m128i __cl_char16; - typedef __m128i __cl_ushort8; - typedef __m128i __cl_short8; - typedef __m128i __cl_uint4; - typedef __m128i __cl_int4; - typedef __m128i __cl_ulong2; - typedef __m128i __cl_long2; - typedef __m128d __cl_double2; - #endif - #define __CL_UCHAR16__ 1 - #define __CL_CHAR16__ 1 - #define __CL_USHORT8__ 1 - #define __CL_SHORT8__ 1 - #define __CL_INT4__ 1 - #define __CL_UINT4__ 1 - #define __CL_ULONG2__ 1 - #define __CL_LONG2__ 1 - #define __CL_DOUBLE2__ 1 -#endif - -#if defined( __MMX__ ) - #include - #if defined( __GNUC__ ) - typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); - typedef cl_char __cl_char8 __attribute__((vector_size(8))); - typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); - typedef cl_short __cl_short4 __attribute__((vector_size(8))); - typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); - typedef cl_int __cl_int2 __attribute__((vector_size(8))); - typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); - typedef cl_long __cl_long1 __attribute__((vector_size(8))); - typedef cl_float __cl_float2 __attribute__((vector_size(8))); - #else - typedef __m64 __cl_uchar8; - typedef __m64 __cl_char8; - typedef __m64 __cl_ushort4; - typedef __m64 __cl_short4; - typedef __m64 __cl_uint2; - typedef __m64 __cl_int2; - typedef __m64 __cl_ulong1; - typedef __m64 __cl_long1; - typedef __m64 __cl_float2; - #endif - #define __CL_UCHAR8__ 1 - #define __CL_CHAR8__ 1 - #define __CL_USHORT4__ 1 - #define __CL_SHORT4__ 1 - #define __CL_INT2__ 1 - #define __CL_UINT2__ 1 - #define __CL_ULONG1__ 1 - #define __CL_LONG1__ 1 - #define __CL_FLOAT2__ 1 -#endif - -#if defined( __AVX__ ) - #if defined( __MINGW64__ ) - #include - #else - #include - #endif - #if defined( __GNUC__ ) - typedef cl_float __cl_float8 __attribute__((vector_size(32))); - typedef cl_double __cl_double4 __attribute__((vector_size(32))); - #else - typedef __m256 __cl_float8; - typedef __m256d __cl_double4; - #endif - #define __CL_FLOAT8__ 1 - #define __CL_DOUBLE4__ 1 -#endif - -/* Define alignment keys */ 
-#if defined( __GNUC__ ) - #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) -#elif defined( _WIN32) && (_MSC_VER) - /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ - /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ - /* #include */ - /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ - #define CL_ALIGNED(_x) -#else - #warning Need to implement some method to align data here - #define CL_ALIGNED(_x) -#endif - -/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - /* .xyzw and .s0123...{f|F} are supported */ - #define CL_HAS_NAMED_VECTOR_FIELDS 1 - /* .hi and .lo are supported */ - #define CL_HAS_HI_LO_VECTOR_FIELDS 1 -#endif - -/* Define cl_vector types */ - -/* ---- cl_charn ---- */ -typedef union -{ - cl_char CL_ALIGNED(2) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_char x, y; }; - __extension__ struct{ cl_char s0, s1; }; - __extension__ struct{ cl_char lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2; -#endif -}cl_char2; - -typedef union -{ - cl_char CL_ALIGNED(4) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_char x, y, z, w; }; - __extension__ struct{ cl_char s0, s1, s2, s3; }; - __extension__ struct{ cl_char2 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[2]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4; -#endif -}cl_char4; - -/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ -typedef cl_char4 cl_char3; - -typedef union -{ - cl_char CL_ALIGNED(8) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_char x, y, z, w; }; - __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_char4 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[4]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4[2]; -#endif -#if defined( __CL_CHAR8__ ) - __cl_char8 v8; -#endif -}cl_char8; - -typedef union -{ - cl_char CL_ALIGNED(16) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_char8 lo, hi; }; -#endif -#if defined( __CL_CHAR2__) - __cl_char2 v2[8]; -#endif -#if defined( __CL_CHAR4__) - __cl_char4 v4[4]; -#endif -#if defined( __CL_CHAR8__ ) - __cl_char8 v8[2]; -#endif -#if defined( __CL_CHAR16__ ) - __cl_char16 v16; -#endif -}cl_char16; - - -/* ---- cl_ucharn ---- */ -typedef union -{ - cl_uchar CL_ALIGNED(2) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uchar x, y; }; - __extension__ struct{ cl_uchar s0, s1; }; - __extension__ struct{ cl_uchar lo, hi; }; -#endif -#if defined( __cl_uchar2__) - __cl_uchar2 v2; -#endif -}cl_uchar2; - -typedef union -{ - cl_uchar CL_ALIGNED(4) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uchar x, y, z, w; }; - __extension__ struct{ cl_uchar s0, s1, s2, s3; }; - __extension__ struct{ cl_uchar2 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[2]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4; -#endif -}cl_uchar4; - -/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. 
*/ -typedef cl_uchar4 cl_uchar3; - -typedef union -{ - cl_uchar CL_ALIGNED(8) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uchar x, y, z, w; }; - __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_uchar4 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[4]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4[2]; -#endif -#if defined( __CL_UCHAR8__ ) - __cl_uchar8 v8; -#endif -}cl_uchar8; - -typedef union -{ - cl_uchar CL_ALIGNED(16) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_uchar8 lo, hi; }; -#endif -#if defined( __CL_UCHAR2__) - __cl_uchar2 v2[8]; -#endif -#if defined( __CL_UCHAR4__) - __cl_uchar4 v4[4]; -#endif -#if defined( __CL_UCHAR8__ ) - __cl_uchar8 v8[2]; -#endif -#if defined( __CL_UCHAR16__ ) - __cl_uchar16 v16; -#endif -}cl_uchar16; - - -/* ---- cl_shortn ---- */ -typedef union -{ - cl_short CL_ALIGNED(4) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_short x, y; }; - __extension__ struct{ cl_short s0, s1; }; - __extension__ struct{ cl_short lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2; -#endif -}cl_short2; - -typedef union -{ - cl_short CL_ALIGNED(8) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_short x, y, z, w; }; - __extension__ struct{ cl_short s0, s1, s2, s3; }; - __extension__ struct{ cl_short2 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[2]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4; -#endif -}cl_short4; - -/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ -typedef cl_short4 cl_short3; - -typedef union -{ - cl_short CL_ALIGNED(16) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_short x, y, z, w; }; - __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_short4 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[4]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4[2]; -#endif -#if defined( __CL_SHORT8__ ) - __cl_short8 v8; -#endif -}cl_short8; - -typedef union -{ - cl_short CL_ALIGNED(32) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_short8 lo, hi; }; -#endif -#if defined( __CL_SHORT2__) - __cl_short2 v2[8]; -#endif -#if defined( __CL_SHORT4__) - __cl_short4 v4[4]; -#endif -#if defined( __CL_SHORT8__ ) - __cl_short8 v8[2]; -#endif -#if defined( __CL_SHORT16__ ) - __cl_short16 v16; -#endif -}cl_short16; - - -/* ---- cl_ushortn ---- */ -typedef union -{ - cl_ushort CL_ALIGNED(4) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ushort x, y; }; - __extension__ struct{ cl_ushort s0, s1; }; - __extension__ struct{ cl_ushort lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2; -#endif -}cl_ushort2; - -typedef union -{ - cl_ushort CL_ALIGNED(8) s[4]; -#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ushort x, y, z, w; }; - __extension__ struct{ cl_ushort s0, s1, s2, s3; }; - __extension__ struct{ cl_ushort2 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[2]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4; -#endif -}cl_ushort4; - -/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ -typedef cl_ushort4 cl_ushort3; - -typedef union -{ - cl_ushort CL_ALIGNED(16) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ushort x, y, z, w; }; - __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_ushort4 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[4]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4[2]; -#endif -#if defined( __CL_USHORT8__ ) - __cl_ushort8 v8; -#endif -}cl_ushort8; - -typedef union -{ - cl_ushort CL_ALIGNED(32) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_ushort8 lo, hi; }; -#endif -#if defined( __CL_USHORT2__) - __cl_ushort2 v2[8]; -#endif -#if defined( __CL_USHORT4__) - __cl_ushort4 v4[4]; -#endif -#if defined( __CL_USHORT8__ ) - __cl_ushort8 v8[2]; -#endif -#if defined( __CL_USHORT16__ ) - __cl_ushort16 v16; -#endif -}cl_ushort16; - -/* ---- cl_intn ---- */ -typedef union -{ - cl_int CL_ALIGNED(8) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_int x, y; }; - __extension__ struct{ cl_int s0, s1; }; - __extension__ struct{ cl_int lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2; -#endif -}cl_int2; - -typedef union -{ - cl_int CL_ALIGNED(16) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_int x, y, z, w; }; - __extension__ struct{ cl_int s0, s1, s2, s3; }; - __extension__ struct{ cl_int2 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[2]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4; -#endif -}cl_int4; - -/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ -typedef cl_int4 cl_int3; - -typedef union -{ - cl_int CL_ALIGNED(32) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_int x, y, z, w; }; - __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_int4 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[4]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4[2]; -#endif -#if defined( __CL_INT8__ ) - __cl_int8 v8; -#endif -}cl_int8; - -typedef union -{ - cl_int CL_ALIGNED(64) s[16]; -#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_int8 lo, hi; }; -#endif -#if defined( __CL_INT2__) - __cl_int2 v2[8]; -#endif -#if defined( __CL_INT4__) - __cl_int4 v4[4]; -#endif -#if defined( __CL_INT8__ ) - __cl_int8 v8[2]; -#endif -#if defined( __CL_INT16__ ) - __cl_int16 v16; -#endif -}cl_int16; - - -/* ---- cl_uintn ---- */ -typedef union -{ - cl_uint CL_ALIGNED(8) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uint x, y; }; - __extension__ struct{ cl_uint s0, s1; }; - __extension__ struct{ cl_uint lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2; -#endif -}cl_uint2; - -typedef union -{ - cl_uint CL_ALIGNED(16) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uint x, y, z, w; }; - __extension__ struct{ cl_uint s0, s1, s2, s3; }; - __extension__ struct{ cl_uint2 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[2]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4; -#endif -}cl_uint4; - -/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ -typedef cl_uint4 cl_uint3; - -typedef union -{ - cl_uint CL_ALIGNED(32) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uint x, y, z, w; }; - __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_uint4 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[4]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4[2]; -#endif -#if defined( __CL_UINT8__ ) - __cl_uint8 v8; -#endif -}cl_uint8; - -typedef union -{ - cl_uint CL_ALIGNED(64) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_uint8 lo, hi; }; -#endif -#if defined( __CL_UINT2__) - __cl_uint2 v2[8]; -#endif -#if defined( __CL_UINT4__) - __cl_uint4 v4[4]; -#endif -#if defined( __CL_UINT8__ ) - __cl_uint8 v8[2]; -#endif -#if defined( __CL_UINT16__ ) - __cl_uint16 v16; -#endif -}cl_uint16; - -/* ---- cl_longn ---- */ -typedef union -{ - cl_long CL_ALIGNED(16) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_long x, y; }; - __extension__ struct{ cl_long s0, s1; }; - __extension__ struct{ cl_long lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2; -#endif -}cl_long2; - -typedef union -{ - cl_long CL_ALIGNED(32) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_long x, y, z, w; }; - __extension__ struct{ cl_long s0, s1, s2, s3; }; - __extension__ struct{ cl_long2 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[2]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4; -#endif -}cl_long4; - -/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */ -typedef cl_long4 cl_long3; - -typedef union -{ - cl_long CL_ALIGNED(64) s[8]; -#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_long x, y, z, w; }; - __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_long4 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[4]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4[2]; -#endif -#if defined( __CL_LONG8__ ) - __cl_long8 v8; -#endif -}cl_long8; - -typedef union -{ - cl_long CL_ALIGNED(128) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_long8 lo, hi; }; -#endif -#if defined( __CL_LONG2__) - __cl_long2 v2[8]; -#endif -#if defined( __CL_LONG4__) - __cl_long4 v4[4]; -#endif -#if defined( __CL_LONG8__ ) - __cl_long8 v8[2]; -#endif -#if defined( __CL_LONG16__ ) - __cl_long16 v16; -#endif -}cl_long16; - - -/* ---- cl_ulongn ---- */ -typedef union -{ - cl_ulong CL_ALIGNED(16) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ulong x, y; }; - __extension__ struct{ cl_ulong s0, s1; }; - __extension__ struct{ cl_ulong lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2; -#endif -}cl_ulong2; - -typedef union -{ - cl_ulong CL_ALIGNED(32) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ulong x, y, z, w; }; - __extension__ struct{ cl_ulong s0, s1, s2, s3; }; - __extension__ struct{ cl_ulong2 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[2]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4; -#endif -}cl_ulong4; - -/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ -typedef cl_ulong4 cl_ulong3; - -typedef union -{ - cl_ulong CL_ALIGNED(64) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ulong x, y, z, w; }; - __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_ulong4 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[4]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4[2]; -#endif -#if defined( __CL_ULONG8__ ) - __cl_ulong8 v8; -#endif -}cl_ulong8; - -typedef union -{ - cl_ulong CL_ALIGNED(128) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_ulong8 lo, hi; }; -#endif -#if defined( __CL_ULONG2__) - __cl_ulong2 v2[8]; -#endif -#if defined( __CL_ULONG4__) - __cl_ulong4 v4[4]; -#endif -#if defined( __CL_ULONG8__ ) - __cl_ulong8 v8[2]; -#endif -#if defined( __CL_ULONG16__ ) - __cl_ulong16 v16; -#endif -}cl_ulong16; - - -/* --- cl_floatn ---- */ - -typedef union -{ - cl_float CL_ALIGNED(8) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_float x, y; }; - __extension__ struct{ cl_float s0, s1; }; - __extension__ struct{ cl_float lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2; -#endif -}cl_float2; - -typedef union -{ - cl_float CL_ALIGNED(16) s[4]; -#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_float x, y, z, w; }; - __extension__ struct{ cl_float s0, s1, s2, s3; }; - __extension__ struct{ cl_float2 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[2]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4; -#endif -}cl_float4; - -/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ -typedef cl_float4 cl_float3; - -typedef union -{ - cl_float CL_ALIGNED(32) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_float x, y, z, w; }; - __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_float4 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[4]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4[2]; -#endif -#if defined( __CL_FLOAT8__ ) - __cl_float8 v8; -#endif -}cl_float8; - -typedef union -{ - cl_float CL_ALIGNED(64) s[16]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_float8 lo, hi; }; -#endif -#if defined( __CL_FLOAT2__) - __cl_float2 v2[8]; -#endif -#if defined( __CL_FLOAT4__) - __cl_float4 v4[4]; -#endif -#if defined( __CL_FLOAT8__ ) - __cl_float8 v8[2]; -#endif -#if defined( __CL_FLOAT16__ ) - __cl_float16 v16; -#endif -}cl_float16; - -/* --- cl_doublen ---- */ - -typedef union -{ - cl_double CL_ALIGNED(16) s[2]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_double x, y; }; - __extension__ struct{ cl_double s0, s1; }; - __extension__ struct{ cl_double lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2; -#endif -}cl_double2; - -typedef union -{ - cl_double CL_ALIGNED(32) s[4]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_double x, y, z, w; }; - __extension__ struct{ cl_double s0, s1, s2, s3; }; - __extension__ struct{ cl_double2 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[2]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4; -#endif -}cl_double4; - -/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ -typedef cl_double4 cl_double3; - -typedef union -{ - cl_double CL_ALIGNED(64) s[8]; -#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_double x, y, z, w; }; - __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; - __extension__ struct{ cl_double4 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[4]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4[2]; -#endif -#if defined( __CL_DOUBLE8__ ) - __cl_double8 v8; -#endif -}cl_double8; - -typedef union -{ - cl_double CL_ALIGNED(128) s[16]; -#if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) - __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; - __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; - __extension__ struct{ cl_double8 lo, hi; }; -#endif -#if defined( __CL_DOUBLE2__) - __cl_double2 v2[8]; -#endif -#if defined( __CL_DOUBLE4__) - __cl_double4 v4[4]; -#endif -#if defined( __CL_DOUBLE8__ ) - __cl_double8 v8[2]; -#endif -#if defined( __CL_DOUBLE16__ ) - __cl_double16 v16; -#endif -}cl_double16; - -/* Macro to facilitate debugging - * Usage: - * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. - * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" - * Each line thereafter of OpenCL C source must end with: \n\ - * The last line ends in "; - * - * Example: - * - * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ - * kernel void foo( int a, float * b ) \n\ - * { \n\ - * // my comment \n\ - * *b[ get_global_id(0)] = a; \n\ - * } \n\ - * "; - * - * This should correctly set up the line, (column) and file information for your source - * string so you can do source level debugging. - */ -#define __CL_STRINGIFY( _x ) # _x -#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) -#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" - -#ifdef __cplusplus -} -#endif - -#endif /* __CL_PLATFORM_H */ diff --git a/include/CL/opencl.h b/include/CL/opencl.h deleted file mode 100644 index 3f0052471..000000000 --- a/include/CL/opencl.h +++ /dev/null @@ -1,54 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2012 The Khronos Group Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and/or associated documentation files (the - * "Materials"), to deal in the Materials without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Materials, and to - * permit persons to whom the Materials are furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Materials. - * - * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
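
Each cl_<type>n union above deliberately exposes the same storage three ways: the always-available indexed view .s[], the named views (.x/.s0/.lo/.hi) guarded by CL_HAS_NAMED_VECTOR_FIELDS, and a SIMD-backed .vN member whenever the corresponding __CL_*N__ macro was set by the AltiVec/SSE/AVX detection earlier. A small sketch of portable access; the function name is illustrative:

    #include <CL/cl_platform.h>

    cl_float horizontal_sum2(cl_float4 v)
    {
    #if defined(CL_HAS_NAMED_VECTOR_FIELDS)
        return v.x + v.y;        /* named view: GCC, non-strict-ANSI only */
    #else
        return v.s[0] + v.s[1];  /* indexed view: always present */
    #endif
    }
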
- ******************************************************************************/ - -/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ - -#ifndef __OPENCL_H -#define __OPENCL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef __APPLE__ - -#include -#include -#include -#include - -#else - -#include -#include -#include -#include - -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* __OPENCL_H */ - diff --git a/include/isaac/driver/backend.h b/include/isaac/driver/backend.h index 5ccbb8199..56e80eaf5 100644 --- a/include/isaac/driver/backend.h +++ b/include/isaac/driver/backend.h @@ -8,7 +8,7 @@ #include "isaac/common/expression_type.h" #include "isaac/common/numeric_type.h" -#include "isaac/driver/common.h" +#include "isaac/driver/dispatch.h" #include "isaac/defines.h" #include "isaac/types.h" diff --git a/include/isaac/driver/common.h b/include/isaac/driver/common.h index 3c8dde3b2..29e7a174f 100644 --- a/include/isaac/driver/common.h +++ b/include/isaac/driver/common.h @@ -2,13 +2,7 @@ #define ISAAC_DRIVER_COMMON_H #include -//OpenCL Backend -#include "isaac/driver/external/CL/cl.h" -#include "isaac/driver/external/CL/cl_ext.h" -//CUDA Backend -#include "isaac/driver/external/CUDA/cuda.h" -#include "isaac/driver/external/CUDA/nvrtc.h" - +#include "isaac/driver/dispatch.h" #include "isaac/defines.h" DISABLE_MSVC_WARNING_C4275 @@ -20,10 +14,8 @@ namespace driver enum backend_type { - OPENCL -#ifdef ISAAC_WITH_CUDA - ,CUDA -#endif + OPENCL, + CUDA }; diff --git a/include/isaac/driver/device.h b/include/isaac/driver/device.h index 7aa16d77e..1ca2e0654 100644 --- a/include/isaac/driver/device.h +++ b/include/isaac/driver/device.h @@ -26,6 +26,7 @@ public: CPU = CL_DEVICE_TYPE_CPU, ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR }; + enum class Vendor { AMD, @@ -42,14 +43,11 @@ public: }; private: -#ifdef ISAAC_WITH_CUDA template int cuGetInfo() const; -#endif + public: -#ifdef ISAAC_WITH_CUDA Device(int ordinal); -#endif Device(cl_device_id const & device, bool take_ownership = true); bool operator==(Device const &) const; diff --git a/include/isaac/driver/dispatch.h b/include/isaac/driver/dispatch.h new file mode 100644 index 000000000..c1e7f0927 --- /dev/null +++ b/include/isaac/driver/dispatch.h @@ -0,0 +1,203 @@ +#ifndef ISAAC_DRIVER_DISPATCHER_H +#define ISAAC_DRIVER_DISPATCHER_H + +#include +#include + +//OpenCL Backend +#include "isaac/driver/external/CL/cl.h" +#include "isaac/driver/external/CL/cl_ext.h" +//CUDA Backend +#include "isaac/driver/external/CUDA/cuda.h" +#include "isaac/driver/external/CUDA/nvrtc.h" + +#include + +namespace isaac +{ +namespace driver +{ + + +class dispatch +{ +private: + template + struct return_type; + + template + struct return_type + { + typedef R type; + }; + + typedef bool (*f_init_t)(); + + template + static typename return_type::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... 
args) + { + initializer(); + if(cache == nullptr) + cache = dlsym(lib_h, name); + return (*(FunPtrT)cache)(args...); + } + +public: + static bool clinit(); + static bool cuinit(); + static bool nvrtcinit(); + + static void release(); + + //OpenCL + static cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *); + static cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); + static cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *); + static cl_int clReleaseMemObject(cl_mem); + static cl_int clFinish(cl_command_queue); + static cl_int clGetMemObjectInfo(cl_mem, cl_mem_info, size_t, void *, size_t *); + static cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *); + static cl_int clReleaseContext(cl_context); + static cl_int clReleaseEvent(cl_event); + static cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); + static cl_int clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); + static cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); + static cl_int clReleaseDevice(cl_device_id); + static cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *); + static cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); + static cl_int clGetContextInfo(cl_context, cl_context_info, size_t, void *, size_t *); + static cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *); + static cl_int clReleaseCommandQueue(cl_command_queue); + static cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *); + static cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *); + static cl_int clGetEventProfilingInfo(cl_event, cl_profiling_info, size_t, void *, size_t *); + static cl_program clCreateProgramWithBinary(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); + static cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *); + static cl_int clRetainEvent(cl_event); + static cl_int clReleaseProgram(cl_program); + static cl_int clFlush(cl_command_queue); + static cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *); + static cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *); + static cl_int clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); + static cl_kernel clCreateKernel(cl_program, const char *, cl_int *); + static cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *); + static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *); + static cl_int clReleaseKernel(cl_kernel); + + //CUDA + static CUresult cuCtxDestroy_v2(CUcontext ctx); + static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); + static CUresult cuDeviceGet(CUdevice *device, int ordinal); + static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); + static 
CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + static CUresult cuMemFree_v2(CUdeviceptr dptr); + static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + static CUresult cuDriverGetVersion(int *driverVersion); + static CUresult cuDeviceGetName(char *name, int len, CUdevice dev); + static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + static CUresult cuModuleLoad(CUmodule *module, const char *fname); + static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + static CUresult cuModuleUnload(CUmodule hmod); + static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + static CUresult cuDeviceGetCount(int *count); + static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + static CUresult cuInit(unsigned int Flags); + static CUresult cuEventRecord(CUevent hEvent, CUstream hStream); + static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev); + static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + static CUresult cuStreamSynchronize(CUstream hStream); + static CUresult cuStreamDestroy_v2(CUstream hStream); + static CUresult cuEventDestroy_v2(CUevent hEvent); + static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize); + + static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options); + static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); + static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); + static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); + static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames); + static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); + +private: + static void* opencl_; + static void* cuda_; + static void* nvrtc_; + + //OpenCL + static void* clBuildProgram_; + static void* clEnqueueNDRangeKernel_; + static void* clSetKernelArg_; + static void* clReleaseMemObject_; + static void* clFinish_; + static void* clGetMemObjectInfo_; + static void* clGetCommandQueueInfo_; + static void* clReleaseContext_; + static void* clReleaseEvent_; + static void* clEnqueueWriteBuffer_; + static void* clEnqueueReadBuffer_; + static void* clGetProgramBuildInfo_; + static void* clReleaseDevice_; + static void* clCreateContext_; + static void* clGetDeviceIDs_; + static void* clGetContextInfo_; + static void* clGetDeviceInfo_; + static void* clReleaseCommandQueue_; + static void* clGetPlatformIDs_; + static void* clGetPlatformInfo_; + static void* clGetEventProfilingInfo_; + static void* clCreateProgramWithBinary_; + static void* clCreateCommandQueue_; + static void* clRetainEvent_; + static void* clReleaseProgram_; + static void* clFlush_; + static void* clGetProgramInfo_; + static void* clGetKernelInfo_; + static void* clGetKernelWorkGroupInfo_; + static void* clCreateKernel_; + static void* clCreateBuffer_; + static void* 
clCreateProgramWithSource_; + static void* clReleaseKernel_; + + //CUDA + static void* cuCtxDestroy_v2_; + static void* cuEventCreate_; + static void* cuDeviceGet_; + static void* cuMemcpyDtoH_v2_; + static void* cuStreamCreate_; + static void* cuEventElapsedTime_; + static void* cuMemFree_v2_; + static void* cuMemcpyDtoHAsync_v2_; + static void* cuDriverGetVersion_; + static void* cuDeviceGetName_; + static void* cuMemcpyHtoDAsync_v2_; + static void* cuModuleLoad_; + static void* cuLaunchKernel_; + static void* cuModuleUnload_; + static void* cuModuleLoadDataEx_; + static void* cuDeviceGetAttribute_; + static void* cuDeviceGetCount_; + static void* cuMemcpyHtoD_v2_; + static void* cuInit_; + static void* cuEventRecord_; + static void* cuCtxCreate_v2_; + static void* cuModuleGetFunction_; + static void* cuStreamSynchronize_; + static void* cuStreamDestroy_v2_; + static void* cuEventDestroy_v2_; + static void* cuMemAlloc_v2_; + + static void* nvrtcCompileProgram_; + static void* nvrtcGetProgramLogSize_; + static void* nvrtcGetPTX_; + static void* nvrtcGetPTXSize_; + static void* nvrtcCreateProgram_; + static void* nvrtcGetProgramLog_; +}; + +} +} + + + +#endif diff --git a/include/CL/cl.h b/include/isaac/driver/external/CL/cl.h similarity index 100% rename from include/CL/cl.h rename to include/isaac/driver/external/CL/cl.h diff --git a/include/CL/cl_ext.h b/include/isaac/driver/external/CL/cl_ext.h similarity index 100% rename from include/CL/cl_ext.h rename to include/isaac/driver/external/CL/cl_ext.h diff --git a/include/isaac/driver/external/CUDA/cuda.h b/include/isaac/driver/external/CUDA/cuda.h new file mode 100644 index 000000000..5aedd5d0d --- /dev/null +++ b/include/isaac/driver/external/CUDA/cuda.h @@ -0,0 +1,10796 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. 
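The dispatch class above follows one lazy-binding pattern throughout: initializer() dlopen()s the backend library once, each wrapper resolves its symbol by name through dlsym() on first call and caches the raw pointer in the matching void* member, and the cached pointer is cast to the proper function-pointer type before the call is forwarded. Below is a minimal self-contained sketch of that pattern; the library name "libOpenCL.so.1", the struct name, the simplified signature (void** standing in for cl_platform_id* to stay header-free) and the single wrapped entry point are illustrative stand-ins, not isaac's actual code.

#include <dlfcn.h>
#include <stdexcept>

struct dispatch_sketch
{
  //Open the backend library once; fail loudly if no runtime is installed.
  static void init()
  {
    if(lib_ == nullptr)
      lib_ = dlopen("libOpenCL.so.1", RTLD_LAZY);
    if(lib_ == nullptr)
      throw std::runtime_error("OpenCL runtime not found");
  }

  //Wrapper: resolve on first use, cache, cast, forward.
  static int clGetPlatformIDs(unsigned int num_entries, void** platforms, unsigned int* num_platforms)
  {
    init();
    if(clGetPlatformIDs_ == nullptr)
      clGetPlatformIDs_ = dlsym(lib_, "clGetPlatformIDs");
    typedef int (*fptr_t)(unsigned int, void**, unsigned int*);
    return (*(fptr_t)clGetPlatformIDs_)(num_entries, platforms, num_platforms);
  }

private:
  static void* lib_;              //library handle from dlopen
  static void* clGetPlatformIDs_; //cached symbol from dlsym
};
void* dispatch_sketch::lib_ = nullptr;
void* dispatch_sketch::clGetPlatformIDs_ = nullptr;

Note that the CUDA wrappers bind the versioned driver symbols (cuCtxCreate_v2, cuMemAlloc_v2, ...) directly: those are the entry points that the vendored cuda.h below maps the unversioned names onto for API version >= 3020, so resolving the _v2 names by string keeps the loader independent of that macro indirection.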
These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __cuda_cuda_h__ +#define __cuda_cuda_h__ + +#include <stdlib.h> + +/** + * CUDA API versioning support + */ +#if defined(CUDA_FORCE_API_VERSION) + #if (CUDA_FORCE_API_VERSION == 3010) + #define __CUDA_API_VERSION 3010 + #else + #error "Unsupported value of CUDA_FORCE_API_VERSION" + #endif +#else + #define __CUDA_API_VERSION 7000 +#endif /* CUDA_FORCE_API_VERSION */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __CUDA_API_PER_THREAD_DEFAULT_STREAM + #define __CUDA_API_PTDS(api) api ## _ptds + #define __CUDA_API_PTSZ(api) api ## _ptsz +#else + #define __CUDA_API_PTDS(api) api + #define __CUDA_API_PTSZ(api) api +#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020 + #define cuDeviceTotalMem cuDeviceTotalMem_v2 + #define cuCtxCreate cuCtxCreate_v2 + #define cuModuleGetGlobal cuModuleGetGlobal_v2 + #define cuMemGetInfo cuMemGetInfo_v2 + #define cuMemAlloc cuMemAlloc_v2 + #define cuMemAllocPitch cuMemAllocPitch_v2 + #define cuMemFree cuMemFree_v2 + #define cuMemGetAddressRange cuMemGetAddressRange_v2 + #define cuMemAllocHost cuMemAllocHost_v2 + #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 + #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) + #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) + #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) + #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) + #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) + #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) + #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) + #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) + #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) + #define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) + #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) + #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) + #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) + #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) + #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) + #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) + #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) + #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) + #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) + #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) + #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) + #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) + #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) + #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) + #define cuArrayCreate cuArrayCreate_v2 + #define cuArrayGetDescriptor cuArrayGetDescriptor_v2 + #define cuArray3DCreate cuArray3DCreate_v2 + #define 
cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 + #define cuTexRefSetAddress cuTexRefSetAddress_v2 + #define cuTexRefGetAddress cuTexRefGetAddress_v2 + #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000 + #define cuCtxDestroy cuCtxDestroy_v2 + #define cuCtxPopCurrent cuCtxPopCurrent_v2 + #define cuCtxPushCurrent cuCtxPushCurrent_v2 + #define cuStreamDestroy cuStreamDestroy_v2 + #define cuEventDestroy cuEventDestroy_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuLinkCreate cuLinkCreate_v2 + #define cuLinkAddData cuLinkAddData_v2 + #define cuLinkAddFile cuLinkAddFile_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuMemHostRegister cuMemHostRegister_v2 + #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ + +#if !defined(__CUDA_API_VERSION_INTERNAL) +#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2 +#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */ +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) + #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) + #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) + #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) + #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) + #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) + + #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) + #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) + #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) + #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) + #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) + #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) + + #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) + #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) + #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) + #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) + #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) + #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) + #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) +#endif + +/** + * \file cuda.h + * \brief Header file for the CUDA Toolkit application programming interface. + * + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. 
+ * + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_TYPES Data types used by CUDA driver + * @{ + */ + +/** + * CUDA API version number + */ +#define CUDA_VERSION 7000 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * CUDA device pointer + * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. + */ +#if __CUDA_API_VERSION >= 3020 + +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr; +#else +typedef unsigned int CUdeviceptr; +#endif + +#endif /* __CUDA_API_VERSION >= 3020 */ + +typedef int CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ + +typedef struct CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +} CUuuid; + + +#if __CUDA_API_VERSION >= 4010 + +/** + * CUDA IPC handle size + */ +#define CU_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef struct CUipcEventHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcEventHandle; + +/** + * CUDA IPC mem handle + */ +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle; + +/** + * CUDA Ipc Mem Flags + */ +typedef enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ +} CUipcMem_flags; + +#endif + +/** + * CUDA Mem Attach Flags + */ +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ + CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ + CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ +} CUmemAttach_flags; + +/** + * Context creation flags + */ +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. 
*/ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_FLAGS_MASK = 0x1f +} CUctx_flags; + +/** + * Stream creation flags + */ +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} CUstream_flags; + +/** + * Legacy stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_LEGACY ((CUstream)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_PER_THREAD ((CUstream)0x2) + +/** + * Event creation flags + */ +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ +} CUevent_flags; + +/** + * Occupancy calculator flag + */ +typedef enum CUoccupancy_flags_enum { + CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ + CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ +} CUoccupancy_flags; + +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ +} CUarray_format; + +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; + +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; + +/** + * Device properties + */ +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum 
shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + 
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ 
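As a usage sketch for this attribute enum (error checking elided; "dispatch" is assumed to be the name of the dlopen-based wrapper class declared earlier in this header, with its cuinit() already successful):

//Sketch: read two launch-relevant limits of device 0 through the wrappers.
CUdevice dev;
int max_threads = 0, num_sm = 0;
dispatch::cuDeviceGet(&dev, 0);
dispatch::cuDeviceGetAttribute(&max_threads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
dispatch::cuDeviceGetAttribute(&num_sm, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);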
+ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MAX +} CUdevice_attribute; + +/** + * Legacy device properties + */ +typedef struct CUdevprop_st { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ +} CUdevprop; + +/** + * Pointer information + */ +typedef enum CUpointer_attribute_enum { + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8 /**< Indicates if the pointer points to managed memory */ +} CUpointer_attribute; + +/** + * Function properties + */ +typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. 
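The first of these function attributes is the usual pre-launch sanity check. A sketch of reading it with the driver API's cuFuncGetAttribute, which is a genuine driver entry point although not one of the wrappers excerpted above; f is assumed to come from cuModuleGetFunction:

//Sketch: reject block sizes the kernel cannot satisfy.
int max_threads = 0;
cuFuncGetAttribute(&max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, f);
//Launching with more threads per block than max_threads would fail with
//CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES (defined further down in this header).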
This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + CU_FUNC_ATTRIBUTE_MAX +} CUfunction_attribute; + +/** + * Function cache configurations + */ +typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} CUfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} CUsharedconfig; + +/** + * Memory types + */ +typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ +} CUmemorytype; + +/** + * Compute Modes + */ +typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_EXCLUSIVE = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */ + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ +} CUcomputemode; + +/** + * Online compiler and linker options + */ +typedef enum CUjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: 
Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization of the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. 
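The IN/OUT log-buffer options above are passed to cuModuleLoadDataEx (wrapped by the dispatch table earlier in this header) as two parallel arrays: one of option tags, one of type-punned values. A sketch of capturing the JIT error log when loading PTX; a current CUcontext and a null-terminated PTX string ptx_src are assumed:

//Sketch (fragment): load a PTX image and keep the JIT error log.
char error_log[8192] = {0};
CUjit_option opt[] = { CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
void* optval[] = { (void*)error_log, (void*)(size_t)sizeof(error_log) }; //size passed by value
CUmodule module;
CUresult res = dispatch::cuModuleLoadDataEx(&module, ptx_src, 2, opt, optval);
if(res != CUDA_SUCCESS)
  fprintf(stderr, "JIT failed:\n%s\n", error_log); //the log explains the failure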
This option cannot be + * used with cuLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback\n + * Applies to: compiler only + */ + CU_JIT_FALLBACK_STRATEGY, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_GENERATE_DEBUG_INFO, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LOG_VERBOSE, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_GENERATE_LINE_INFO, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::CUjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + * Applies to: compiler only + */ + CU_JIT_CACHE_MODE, + + CU_JIT_NUM_OPTIONS + +} CUjit_option; + +/** + * Online compilation targets + */ +typedef enum CUjit_target_enum +{ + CU_TARGET_COMPUTE_10 = 10, /**< Compute device class 1.0 */ + CU_TARGET_COMPUTE_11 = 11, /**< Compute device class 1.1 */ + CU_TARGET_COMPUTE_12 = 12, /**< Compute device class 1.2 */ + CU_TARGET_COMPUTE_13 = 13, /**< Compute device class 1.3 */ + CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ + CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52 /**< Compute device class 5.2 */ +} CUjit_target; + +/** + * Cubin matching fallback strategies + */ +typedef enum CUjit_fallback_enum +{ + CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ + + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ + +} CUjit_fallback; + +/** + * Caching modes for dlcm + */ +typedef enum CUjit_cacheMode_enum +{ + CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ + CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ + CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ +} CUjit_cacheMode; + +/** + * Device code formats + */ +typedef enum CUjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY, + + CU_JIT_NUM_INPUT_TYPES +} CUjitInputType; + +#if __CUDA_API_VERSION >= 5050 +typedef struct CUlinkState_st *CUlinkState; +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** + * Flags to register a graphics resource + */ +typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + 
CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 +} CUgraphicsRegisterFlags; + +/** + * Flags for mapping and unmapping interop resources + */ +typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 +} CUgraphicsMapResourceFlags; + +/** + * Array indices for cube faces + */ +typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ +} CUarray_cubemap_face; + +/** + * Limits + */ +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX +} CUlimit; + +/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; + +/** + * Error codes + */ +typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. 
It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. + */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. 
+ */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that the device kernel source is invalid. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. + */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * The context cannot be used, so it must be destroyed (and a new one should be created). + * All existing device memory allocations from this context are invalid + * and must be reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The + * context cannot be used (and must be destroyed similar to + * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from + * this context are invalid and must be reconstructed if the program is to + * continue using CUDA. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. 
+ */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * The context cannot be used, so it must be destroyed (and a new one should be created). + * All existing device memory allocations from this context are invalid + * and must be reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * The context cannot be used, so it must be destroyed (and a new one should be created). + * All existing device memory allocations from this context are invalid + * and must be reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * The context cannot be used, so it must be destroyed (and a new one should be created). + * All existing device memory allocations from this context are invalid + * and must be reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * The context cannot be used, so it must be destroyed (and a new one should be created). + * All existing device memory allocations from this context are invalid + * and must be reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * The context cannot be used, so it must be destroyed (and a new one should be created). + * All existing device memory allocations from this context are invalid + * and must be reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. 
The context cannot be used, so it must + * be destroyed (and a new one should be created). All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This indicates that an unknown internal error has occurred. + */ + CUDA_ERROR_UNKNOWN = 999 +} CUresult; + +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +/** + * CUDA stream callback + * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + * \param status ::CUDA_SUCCESS or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ +typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); + +/** + * Block size to per-block dynamic shared memory mapping for a certain + * kernel \param blockSize Block size of the kernel. + * + * \return The dynamic shared memory needed by a block. + */ +typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 + +/** + * If set, host memory is allocated as write-combined - fast to write, + * faster to DMA, slow to read except via SSE4 streaming load instruction + * (MOVNTDQA). + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. 
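Since every driver entry point returns one of the CUresult codes above, callers typically funnel calls through a small checking helper rather than testing each return by hand. A minimal sketch follows; the real driver API also offers cuGetErrorString from CUDA 6.0 onwards, but it is not part of the dispatch table excerpted above, so this version just reports the numeric code:

//Sketch: surface driver errors as exceptions instead of ignored codes.
#include <sstream>
#include <stdexcept>

inline void cu_check(CUresult res, const char* what)
{
  if(res != CUDA_SUCCESS)
  {
    std::ostringstream msg;
    msg << what << " failed with CUresult " << (int)res;
    throw std::runtime_error(msg.str());
  }
}
//Usage: cu_check(dispatch::cuInit(0), "cuInit");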
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02
+
+#if __CUDA_API_VERSION >= 3020
+
+/**
+ * 2D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY2D_st {
+ size_t srcXInBytes; /**< Source X in bytes */
+ size_t srcY; /**< Source Y */
+
+ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+ const void *srcHost; /**< Source host pointer */
+ CUdeviceptr srcDevice; /**< Source device pointer */
+ CUarray srcArray; /**< Source array reference */
+ size_t srcPitch; /**< Source pitch (ignored when src is array) */
+
+ size_t dstXInBytes; /**< Destination X in bytes */
+ size_t dstY; /**< Destination Y */
+
+ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+ void *dstHost; /**< Destination host pointer */
+ CUdeviceptr dstDevice; /**< Destination device pointer */
+ CUarray dstArray; /**< Destination array reference */
+ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
+
+ size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
+ size_t Height; /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+
+/**
+ * 3D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_st {
+ size_t srcXInBytes; /**< Source X in bytes */
+ size_t srcY; /**< Source Y */
+ size_t srcZ; /**< Source Z */
+ size_t srcLOD; /**< Source LOD */
+ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+ const void *srcHost; /**< Source host pointer */
+ CUdeviceptr srcDevice; /**< Source device pointer */
+ CUarray srcArray; /**< Source array reference */
+ void *reserved0; /**< Must be NULL */
+ size_t srcPitch; /**< Source pitch (ignored when src is array) */
+ size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+ size_t dstXInBytes; /**< Destination X in bytes */
+ size_t dstY; /**< Destination Y */
+ size_t dstZ; /**< Destination Z */
+ size_t dstLOD; /**< Destination LOD */
+ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+ void *dstHost; /**< Destination host pointer */
+ CUdeviceptr dstDevice; /**< Destination device pointer */
+ CUarray dstArray; /**< Destination array reference */
+ void *reserved1; /**< Must be NULL */
+ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
+ size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+ size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
+ size_t Height; /**< Height of 3D memory copy */
+ size_t Depth; /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D;
+
+/**
+ * 3D memory cross-context copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st {
+ size_t srcXInBytes; /**< Source X in bytes */
+ size_t srcY; /**< Source Y */
+ size_t srcZ; /**< Source Z */
+ size_t srcLOD; /**< Source LOD */
+ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+ const void *srcHost; /**< Source host pointer */
+ CUdeviceptr srcDevice; /**< Source device pointer */
+ CUarray srcArray; /**< Source array reference */
+ CUcontext srcContext; /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+ size_t srcPitch; /**< Source pitch (ignored when src is array) */
+ size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+ size_t dstXInBytes; /**< Destination X in bytes */
+ size_t dstY; /**< Destination Y */
+ size_t dstZ; /**< Destination Z */
+ size_t dstLOD; /**< Destination LOD */
+ CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+ void *dstHost; /**< Destination host pointer */
+ CUdeviceptr dstDevice; /**< Destination device pointer */
+ CUarray dstArray; /**< Destination array reference */
+ CUcontext dstContext; /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+ size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
+ size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+ size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
+ size_t Height; /**< Height of 3D memory copy */
+ size_t Depth; /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER;
+
+/**
+ * Array descriptor
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+ size_t Width; /**< Width of array */
+ size_t Height; /**< Height of array */
+
+ CUarray_format Format; /**< Array format */
+ unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+/**
+ * 3D array descriptor
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+ size_t Width; /**< Width of 3D array */
+ size_t Height; /**< Height of 3D array */
+ size_t Depth; /**< Depth of 3D array */
+
+ CUarray_format Format; /**< Array format */
+ unsigned int NumChannels; /**< Channels per array element */
+ unsigned int Flags; /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR;
+
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+#if __CUDA_API_VERSION >= 5000
+
+/**
+ * CUDA Resource descriptor
+ */
+typedef struct CUDA_RESOURCE_DESC_st
+{
+ CUresourcetype resType; /**< Resource type */
+
+ union {
+ struct {
+ CUarray hArray; /**< CUDA array */
+ } array;
+ struct {
+ CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+ } mipmap;
+ struct {
+ CUdeviceptr devPtr; /**< Device pointer */
+ CUarray_format format; /**< Array format */
+ unsigned int numChannels; /**< Channels per array element */
+ size_t sizeInBytes; /**< Size in bytes */
+ } linear;
+ struct {
+ CUdeviceptr devPtr; /**< Device pointer */
+ CUarray_format format; /**< Array format */
+ unsigned int numChannels; /**< Channels per array element */
+ size_t width; /**< Width of the array in elements */
+ size_t height; /**< Height of the array in elements */
+ size_t pitchInBytes; /**< Pitch between two rows in bytes */
+ } pitch2D;
+ struct {
+ int reserved[32];
+ } reserved;
+ } res;
+
+ unsigned int flags; /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC;
+
+/**
+ * Texture descriptor
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+ CUaddress_mode addressMode[3]; /**< Address modes */
+ CUfilter_mode filterMode; /**< Filter mode */
+ unsigned int flags; /**< Flags */
+ unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */
+ CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+ float mipmapLevelBias; /**< Mipmap level bias */
+ float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
+ float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
+ int reserved[16];
+} CUDA_TEXTURE_DESC;
+
+/**
+ * Resource view format
+ */
+typedef enum CUresourceViewFormat_enum
+{
+ CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
+ CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
+ CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
+ CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
+ CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
+ CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit
integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; + +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC; + +/** + * GPU Direct v3 tokens + */ +typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** + * If set, the CUDA array is a collection of layers, where each layer is either a 1D + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * of layers, not the depth of a 3D array. 
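+ *
+ * For instance, a minimal sketch of describing an eight-layer 256x256 array of
+ * floats (illustrative values; error handling elided; ::cuArray3DCreate() is
+ * declared elsewhere in this header):
+ *
+ * \code
+ CUDA_ARRAY3D_DESCRIPTOR desc = {0};
+ desc.Width = 256;                  /* width of each layer in elements */
+ desc.Height = 256;                 /* height of each layer in elements */
+ desc.Depth = 8;                    /* number of layers, not a 3D depth */
+ desc.Format = CU_AD_FORMAT_FLOAT;
+ desc.NumChannels = 1;
+ desc.Flags = CUDA_ARRAY3D_LAYERED;
+ CUarray layered;
+ CUresult status = cuArray3DCreate(&layered, &desc);
+ * \endcode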
+ */
+#define CUDA_ARRAY3D_LAYERED 0x01
+
+/**
+ * Deprecated, use CUDA_ARRAY3D_LAYERED
+ */
+#define CUDA_ARRAY3D_2DARRAY 0x01
+
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST 0x02
+
+/**
+ * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
+ * width of such a CUDA array must be equal to its height, and Depth must be six.
+ * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
+ * and Depth must be a multiple of six.
+ */
+#define CUDA_ARRAY3D_CUBEMAP 0x04
+
+/**
+ * This flag must be set in order to perform texture gather operations
+ * on a CUDA array.
+ */
+#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
+
+/**
+ * If set, this flag indicates that the CUDA
+ * array is a DEPTH_TEXTURE.
+*/
+#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10
+
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER 0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES 0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_SRGB 0x10
+
+/**
+ * End of array terminator for the \p extra parameter to
+ * ::cuLaunchKernel
+ */
+#define CU_LAUNCH_PARAM_END ((void*)0x00)
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
+ * parameters used for launching kernel \p f. This buffer needs to
+ * honor all alignment/padding requirements of the individual parameters.
+ * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
+ * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
+ * effect.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a size_t which contains the
+ * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+ * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
+ * in the \p extra array if the value associated with
+ * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/** @} */ /* END CUDA_TYPES */
+
+#ifdef _WIN32
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+/**
+ * \defgroup CUDA_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the string description of an error code
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string description
+ * of the error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
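+ *
+ * As an illustrative sketch of a typical error-reporting pattern (assuming
+ * <stdio.h>; the checked call is arbitrary; ::cuGetErrorName() is documented
+ * below):
+ *
+ * \code
+ CUresult status = cuInit(0);
+ if (status != CUDA_SUCCESS) {
+     const char *name = NULL;
+     const char *desc = NULL;
+     cuGetErrorName(status, &name);   /* e.g. "CUDA_ERROR_NO_DEVICE" */
+     cuGetErrorString(status, &desc); /* human-readable description */
+     fprintf(stderr, "cuInit failed: %s (%s)\n",
+             name ? name : "?", desc ? desc : "?");
+ }
+ * \endcode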
+ * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::CUresult + */ +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); + +/** + * \brief Gets the string representation of an error code enum name + * + * Sets \p *pStr to the address of a NULL-terminated string representation + * of the name of the enum error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::CUresult + */ +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); + +/** @} */ /* END CUDA_ERROR */ + +/** + * \defgroup CUDA_INITIALIZE Initialization + * + * ___MANBRIEF___ initialization functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the initialization functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Initialize the CUDA driver API + * + * Initializes the driver API and must be called before any other function from + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() + * has not been called, any function from the driver API will return + * ::CUDA_ERROR_NOT_INITIALIZED. + * + * \param Flags - Initialization flag for CUDA. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + */ +CUresult CUDAAPI cuInit(unsigned int Flags); + +/** @} */ /* END CUDA_INITIALIZE */ + +/** + * \defgroup CUDA_VERSION Version Management + * + * ___MANBRIEF___ version management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the version management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the CUDA driver version + * + * Returns in \p *driverVersion the version number of the installed CUDA + * driver. This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * the \p driverVersion argument is NULL. + * + * \param driverVersion - Returns the CUDA driver version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + */ +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); + +/** @} */ /* END CUDA_VERSION */ + +/** + * \defgroup CUDA_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given an ordinal in the range [0, + * ::cuDeviceGetCount()-1]. 
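+ *
+ * A minimal enumeration sketch (assuming <stdio.h>; error checking elided
+ * for brevity):
+ *
+ * \code
+ int count = 0;
+ int i;
+ cuInit(0);                        /* must precede all other driver calls */
+ cuDeviceGetCount(&count);
+ for (i = 0; i < count; ++i) {
+     CUdevice dev;
+     char name[256];
+     cuDeviceGet(&dev, i);         /* ordinal -> device handle */
+     cuDeviceGetName(name, (int)sizeof(name), dev);
+     printf("device %d: %s\n", i, name);
+ }
+ * \endcode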
+ *
+ * \param device - Returned device handle
+ * \param ordinal - Device number to get handle for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 1.0 that are available for execution. If there is no such
+ * device, ::cuDeviceGetCount() returns 0.
+ *
+ * \param count - Returned number of compute-capable devices
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetCount(int *count);
+
+/**
+ * \brief Returns an identifier string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p name. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param name - Returned identifier string for the device
+ * \param len - Maximum length of string to store in \p name
+ * \param dev - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Returns the total amount of memory on the device
+ *
+ * Returns in \p *bytes the total amount of memory available on the device
+ * \p dev in bytes.
+ *
+ * \param bytes - Returned memory available on device in bytes
+ * \param dev - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ */
+CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on device
+ * \p dev.
The supported attributes are: + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per + * block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared memory available to a thread block in bytes; + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes; + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated through + * ::cuMemAllocPitch(); + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width + * for a 1D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * mipmapped 1D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch + * in bytes for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * mipmapped 2D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum + * mipmapped 2D texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * texture depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * Alternate maximum 3D texture width, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * Alternate maximum 3D texture height, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * Alternate maximum 3D texture depth, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: + * Maximum cubemap texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * Maximum 1D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * Maximum 2D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * Maximum 2D layered texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + 
* Maximum cubemap layered texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: + * Maximum 1D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: + * Maximum 2D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: + * Maximum 2D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: + * Maximum 3D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: + * Maximum 3D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: + * Maximum 3D surface depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: + * Maximum 1D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: + * Maximum 2D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: + * Maximum 2D layered surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: + * Maximum cubemap surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers available to a thread block; + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to ::textureAlign bytes do not need an offset + * applied to texture fetches; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement + * for 2D texture references bound to pitched memory; + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device; + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. Available modes are as follows: + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and + * can have multiple CUDA contexts present at a single time. + * - ::CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have + * only one CUDA context present on it at a time. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device + * can have only one context used by a single process at a time. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports + * executing multiple kernels within the same context simultaneously, or 0 if + * not. 
It is not guaranteed that multiple kernels will be resident + * on the device concurrently so this feature should not be relied upon for + * correctness; + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the + * device, 0 if error correction is disabled or not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier + * of the device; + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC + * is only available on Tesla hardware running Windows Vista or later; + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with + * the host, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals + * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals + * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of + * shared memory available to a multiprocessor in bytes; this amount is shared + * by all thread blocks simultaneously resident on a multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit + * registers available to a multiprocessor; this number is shared by all thread + * blocks simultaneously resident on a multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory + * on this system, 0 if allocating managed memory is not supported by the device on this system. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices + * associated with the same board. Devices on the same multi-GPU board will share the same identifier. + * + * \param pi - Returned device attribute value + * \param attrib - Device attribute to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE */ + +/** + * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated device management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. 
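+ *
+ * New code should query properties one at a time through
+ * ::cuDeviceGetAttribute() instead; a short sketch (assuming \p dev was
+ * obtained from ::cuDeviceGet(); error checking elided):
+ *
+ * \code
+ int smCount = 0, major = 0, minor = 0;
+ cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+ * \endcode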
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns properties for a selected device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
+ * structure is defined as:
+ *
+ * \code
+ typedef struct CUdevprop_st {
+ int maxThreadsPerBlock;
+ int maxThreadsDim[3];
+ int maxGridSize[3];
+ int sharedMemPerBlock;
+ int totalConstantMemory;
+ int SIMDWidth;
+ int memPitch;
+ int regsPerBlock;
+ int clockRate;
+ int textureAlign;
+ } CUdevprop;
+ * \endcode
+ * where:
+ *
+ * - ::maxThreadsPerBlock is the maximum number of threads per block;
+ * - ::maxThreadsDim[3] is the maximum size of each dimension of a block;
+ * - ::maxGridSize[3] is the maximum size of each dimension of a grid;
+ * - ::sharedMemPerBlock is the total amount of shared memory available per
+ * block in bytes;
+ * - ::totalConstantMemory is the total amount of constant memory available on
+ * the device in bytes;
+ * - ::SIMDWidth is the warp size;
+ * - ::memPitch is the maximum pitch allowed by the memory copy functions that
+ * involve memory regions allocated through ::cuMemAllocPitch();
+ * - ::regsPerBlock is the total number of registers available per block;
+ * - ::clockRate is the clock frequency in kilohertz;
+ * - ::textureAlign is the alignment requirement; texture base addresses that
+ * are aligned to ::textureAlign bytes do not need an offset applied to
+ * texture fetches.
+ *
+ * \param prop - Returned properties of device
+ * \param dev - Device to get properties for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+
+/**
+ * \brief Returns the compute capability of the device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and its functionality superseded
+ * by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *major and \p *minor the major and minor revision numbers that
+ * define the compute capability of the device \p dev.
+ *
+ * \param major - Major revision number
+ * \param minor - Minor revision number
+ * \param dev - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+
+/** @} */ /* END CUDA_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_PRIMARY_CTX Primary Context Management
+ *
+ * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the primary context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The primary context is unique per device and is shared with the CUDA runtime API.
+ * These functions allow seamless integration with other libraries using CUDA.
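+ *
+ * A sketch of the typical retain/release pattern (error checking elided;
+ * the flag choice is arbitrary):
+ *
+ * \code
+ CUdevice dev;
+ CUcontext pctx;
+ cuInit(0);
+ cuDeviceGet(&dev, 0);
+ /* Optionally choose scheduling behavior before the context exists. */
+ cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC);
+ cuDevicePrimaryCtxRetain(&pctx, dev); /* create (or reuse) and retain */
+ cuCtxSetCurrent(pctx);                /* retaining does not bind it */
+ /* ... allocate memory, launch kernels ... */
+ cuDevicePrimaryCtxRelease(dev);       /* balance the retain */
+ * \endcode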
+ *
+ * @{
+ */
+
+#if __CUDA_API_VERSION >= 7000
+
+/**
+ * \brief Retain the primary context on the GPU
+ *
+ * Retains the primary context on the device, creating it if necessary,
+ * increasing its usage count. The caller must call
+ * ::cuDevicePrimaryCtxRelease() when done using the context.
+ * Unlike ::cuCtxCreate(), the newly created context is not pushed onto the stack.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. Similarly, context creation will
+ * also fail with ::CUDA_ERROR_UNKNOWN if the compute mode for the device is
+ * set to ::CU_COMPUTEMODE_EXCLUSIVE and there is already an active, non-primary,
+ * context on the device. The function ::cuDeviceGetAttribute() can be used with
+ * ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode of the
+ * device. The nvidia-smi tool can be used to set the compute mode for
+ * devices. Documentation for nvidia-smi can be obtained by passing a
+ * -h option to it.
+ *
+ * Please note that the primary context always supports pinned allocations. Other
+ * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
+ *
+ * \param pctx - Returned context handle of the new context
+ * \param dev - Device for which primary context is requested
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRelease,
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
+
+/**
+ * \brief Release the primary context on the GPU
+ *
+ * Releases the primary context on the device by decreasing the usage
+ * count by 1. If the usage drops to 0 the primary context of device \p dev
+ * will be destroyed regardless of how many threads it is current to.
+ *
+ * Please note that unlike ::cuCtxDestroy() this method does not pop the context
+ * from stack in any circumstances.
+ *
+ * \param dev - Device whose primary context is released
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+
+/**
+ * \brief Set flags for the primary context
+ *
+ * Sets the flags for the primary context on the device, overwriting previously
+ * set ones. If the primary context is already created
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * However, on low power devices like Tegra, it always defaults to
+ * ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage.
+ *
+ * \param dev - Device for which the primary context flags are set
+ * \param flags - New flags for the device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxGetState,
+ * ::cuCtxCreate,
+ * ::cuCtxGetFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Get the state of the primary context
+ *
+ * Returns in \p *flags the flags for the primary context of \p dev, and in
+ * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag
+ * values.
+ *
+ * \param dev - Device to get primary context flags for
+ * \param flags - Pointer to store flags
+ * \param active - Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxGetFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
+
+/**
+ * \brief Destroy all allocations and reset all state on the primary context
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.
+ *
+ * Note that it is the responsibility of the calling function to ensure that no
+ * other module in the process is using the device any more. For that reason
+ * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
+ * However, it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
+ * even after resetting the device.
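+ *
+ * For illustration, a sketch that checks the state before resetting (assuming
+ * \p dev is a valid device handle; error checking elided):
+ *
+ * \code
+ unsigned int flags = 0;
+ int active = 0;
+ cuDevicePrimaryCtxGetState(dev, &flags, &active);
+ if (active) {
+     /* Invalidates every allocation made in the primary context. */
+     cuDevicePrimaryCtxReset(dev);
+ }
+ * \endcode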
+ *
+ * \param dev - Device for which primary context is destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxRelease,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ *
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+
+#endif /* __CUDA_API_VERSION >= 7000 */
+
+/** @} */ /* END CUDA_PRIMARY_CTX */
+
+
+/**
+ * \defgroup CUDA_CTX Context Management
+ *
+ * ___MANBRIEF___ context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Create a CUDA context
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * However, on low power devices like Tegra, it always defaults to
+ * ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. Similarly, context creation will
+ * also fail with ::CUDA_ERROR_UNKNOWN if the compute mode for the device is
+ * set to ::CU_COMPUTEMODE_EXCLUSIVE and there is already an active context on
+ * the device. The function ::cuDeviceGetAttribute() can be used with
+ * ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode of the
+ * device. The nvidia-smi tool can be used to set the compute mode for
+ * devices. Documentation for nvidia-smi can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx - Returned context handle of the new context
+ * \param flags - Context creation flags
+ * \param dev - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Destroy a CUDA context
+ *
+ * Destroys the CUDA context specified by \p ctx. The context \p ctx will be
+ * destroyed regardless of how many threads it is current to.
+ * It is the responsibility of the calling function to ensure that no API
+ * call is issued using \p ctx while ::cuCtxDestroy() is executing.
+ *
+ * If \p ctx is current to the calling thread then \p ctx will also be
+ * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
+ * were called). If \p ctx is current to other threads, then \p ctx will
+ * remain current to those threads, and attempting to access \p ctx from
+ * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
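+ *
+ * A sketch of the usual pairing with ::cuCtxCreate() (assuming \p dev came
+ * from ::cuDeviceGet(); error checking elided):
+ *
+ * \code
+ CUcontext ctx;
+ cuCtxCreate(&ctx, CU_CTX_SCHED_AUTO, dev); /* becomes current to this thread */
+ /* ... allocate memory, launch kernels ... */
+ cuCtxSynchronize();                        /* drain outstanding work */
+ cuCtxDestroy(ctx);                         /* also pops it from this thread */
+ * \endcode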
+ * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Pushes a context on the current CPU thread + * + * Pushes the given context \p ctx onto the CPU thread's stack of current + * contexts. The specified context becomes the CPU thread's current context, so + * all CUDA functions that operate on the current context are affected. + * + * The previous current context may be made current again by calling + * ::cuCtxDestroy() or ::cuCtxPopCurrent(). + * + * \param ctx - Context to push + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + +/** + * \brief Pops the current CUDA context from the current CPU thread. + * + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current + * to a different CPU thread by calling ::cuCtxPushCurrent(). + * + * If a context was current to the CPU thread before ::cuCtxCreate() or + * ::cuCtxPushCurrent() was called, this function makes that context current to + * the CPU thread again. + * + * \param pctx - Returned new context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + +/** + * \brief Binds the specified CUDA context to the calling CPU thread + * + * Binds the specified CUDA context to the calling CPU thread. + * If \p ctx is NULL then the CUDA context previously bound to the + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. + * + * If there exists a CUDA context stack on the calling CPU thread, this + * will replace the top of that stack with \p ctx. + * If \p ctx is NULL then this will be equivalent to popping the top + * of the calling CPU thread's CUDA context stack (or a no-op if the + * calling CPU thread's CUDA context stack is empty). + * + * \param ctx - Context to bind to the calling CPU thread + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCurrent, ::cuCtxCreate, ::cuCtxDestroy + */ +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); + +/** + * \brief Returns the CUDA context bound to the calling CPU thread. 
+ * + * Returns in \p *pctx the CUDA context bound to the calling CPU thread. + * If no context is bound to the calling CPU thread then \p *pctx is + * set to NULL and ::CUDA_SUCCESS is returned. + * + * \param pctx - Returned context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * \notefnerr + * + * \sa ::cuCtxSetCurrent, ::cuCtxCreate, ::cuCtxDestroy + */ +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Returns the device ID for the current context + * + * Returns in \p *device the ordinal of the current context's device. + * + * \param device - Returned device ID for the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns the flags for the current context + * + * Returns in \p *flags the flags of the current context. See ::cuCtxCreate + * for flag values. + * + * \param flags - Pointer to store flags of current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange + */ +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** + * \brief Block for a context's tasks to complete + * + * Blocks until the device has completed all preceding requested tasks. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * CPU thread will block until the GPU context has finished its work. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit + */ +CUresult CUDAAPI cuCtxSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the context. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly + * what the limit has been set to. + * + * Setting each ::CUlimit has its own specific restrictions, so each is + * discussed here. + * + * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * This limit is only applicable to devices of compute capability 2.0 and + * higher. 
Attempting to set this limit on devices of compute capability
+ * less than 2.0 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT
+ * being returned.
+ *
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
+ * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
+ * must be performed before launching any kernel that uses the ::printf()
+ * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
+ * This limit is only applicable to devices of compute capability 2.0 and
+ * higher. Attempting to set this limit on devices of compute capability
+ * less than 2.0 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT
+ * being returned.
+ *
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
+ * by the ::malloc() and ::free() device system calls. Setting
+ * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
+ * that uses the ::malloc() or ::free() device system calls, otherwise
+ * ::CUDA_ERROR_INVALID_VALUE will be returned. This limit is only applicable
+ * to devices of compute capability 2.0 and higher. Attempting to set this
+ * limit on devices of compute capability less than 2.0 will result in the
+ * error ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
+ * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ * this limit must be performed before any launch of a kernel that uses the
+ * device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ * with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ * violated. This limit can be set smaller than the default or up to the
+ * maximum launch depth of 24. When setting this limit, keep in mind that
+ * additional levels of sync depth require the driver to reserve large amounts
+ * of device memory which can no longer be used for user allocations. If these
+ * reservations of device memory fail, ::cuCtxSetLimit will return
+ * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ * This limit is only applicable to devices of compute capability 3.5 and
+ * higher. Attempting to set this limit on devices of compute capability less
+ * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ * returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
+ * outstanding device runtime launches that can be made from the current
+ * context. A grid is outstanding from the point of launch up until the grid
+ * is known to have been completed. Device runtime launches which violate
+ * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ * ::cudaGetLastError() is called after launch. If more pending launches than
+ * the default (2048 launches) are needed for a module using the device
+ * runtime, this limit can be increased. Keep in mind that being able to
+ * sustain additional pending launches will require the driver to reserve
+ * larger amounts of device memory upfront which can no longer be used for
+ * allocations. If these reservations fail, ::cuCtxSetLimit will return
+ * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ * This limit is only applicable to devices of compute capability 3.5 and
+ * higher.
Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pvalue the current size of \p limit. The supported + * ::CUlimit values are: + * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the + * ::printf() device system call. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread + * can issue the device runtime call ::cudaDeviceSynchronize() to wait on + * child grid launches to complete. + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding + * device runtime launches that can be made from this context. + * + * \param limit - Limit to query + * \param pvalue - Returned size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); + +/** + * \brief Returns the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this function returns through \p pconfig the preferred cache configuration + * for the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute functions. + * + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices + * where the size of the L1 cache and shared memory are fixed. 
+ * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param pconfig - Returned cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig + */ +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); + +/** + * \brief Sets the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute the function. Any function preference + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide + * setting. Setting the context-wide cache configuration to + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer + * to not change the cache configuration unless required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig + */ +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); + +#if __CUDA_API_VERSION >= 4020 +/** + * \brief Returns the current shared memory configuration for the current context. + * + * This function will return in \p pConfig the current size of shared memory banks + * in the current context. On devices with configurable shared memory banks, + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. When + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. 
+ *
+ * The returned bank configurations can be either:
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is
+ *   four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
+ *   eight bytes.
+ *
+ * \param pConfig - returned shared memory configuration
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ */
+CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current context.
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the context's shared memory bank size which is used for subsequent kernel
+ * launches.
+ *
+ * Changing the shared memory configuration between launches may insert a
+ * device-side synchronization point between those launches.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
+ *   setting (currently, four bytes).
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes.
+ *
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ */
+CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
+#endif
+
+/**
+ * \brief Gets the context's API version.
+ *
+ * Returns a version number in \p version corresponding to the capabilities of
+ * the context (e.g. 3010 or 3020), which library developers can use to direct
+ * callers to a specific API version. If \p ctx is NULL, returns the API version
+ * used to create the currently bound context.
+ *
+ * Note that new API versions are only introduced when context capabilities are
+ * changed that break binary compatibility, so the API version and driver version
+ * may be different. For example, it is valid for the API version to be 3020 while
+ * the driver version is 4020.
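+ *
+ * \note Editorial usage sketch, not part of the original header:
+ * \code
+   unsigned int ver = 0;
+   cuCtxGetApiVersion(NULL, &ver);   /* NULL queries the currently bound context */
+ * \endcode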
+ * + * \param ctx - Context to check + * \param version - Pointer to version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cuStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cuDeviceGetAttribute). + * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** @} */ /* END CUDA_CTX */ + +/** + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated context management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated context management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Increment a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Increments the usage count of the context and passes back a context handle + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is + * done with the context. ::cuCtxAttach() fails if there is no context current + * to the thread. + * + * Currently, the \p flags parameter must be 0. 
+ * + * \param pctx - Returned context handle of the current context + * \param flags - Context attach flags (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxDetach, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); + +/** + * \brief Decrement a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Decrements the usage count of the context \p ctx, and destroys the context + * if the usage count goes to 0. The context must be a handle that was passed + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the + * calling thread. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDetach(CUcontext ctx); + +/** @} */ /* END CUDA_CTX_DEPRECATED */ + + +/** + * \defgroup CUDA_MODULE Module Management + * + * ___MANBRIEF___ module management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the module management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Loads a compute module + * + * Takes a filename \p fname and loads the corresponding module \p module into + * the current context. The CUDA driver API does not attempt to lazily + * allocate the resources needed by a module; if the memory for functions and + * data (constant and global) needed by the module cannot be allocated, + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. + * + * \param module - Returned module + * \param fname - Filename of module to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_FILE_NOT_FOUND, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); + +/** + * \brief Load a module's data + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. 
The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. + * + * \param module - Returned module + * \param image - Module data to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); + +/** + * \brief Load a module's data with options + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. Options are passed as + * an array via \p options and any corresponding parameters are passed in + * \p optionValues. The number of total options is supplied via \p numOptions. + * Any outputs will be returned via \p optionValues. + * + * \param module - Returned module + * \param image - Module data to load + * \param numOptions - Number of options + * \param options - Options for JIT + * \param optionValues - Option values for JIT + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Load a module's data + * + * Takes a pointer \p fatCubin and loads the corresponding module \p module + * into the current context. The pointer represents a fat binary object, + * which is a collection of different \e cubin and/or \e PTX files, all + * representing the same device code, but compiled and optimized for different + * architectures. + * + * Prior to CUDA 4.0, there was no documented API for constructing and using + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary + * objects can be constructed by providing the -fatbin option to \b nvcc. + * More information can be found in the \b nvcc document. 
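+ *
+ * \note Editorial usage sketch for the module-loading entry points in this
+ * section, not part of the original header; the file and kernel names are
+ * hypothetical and error handling is omitted.
+ * \code
+   CUmodule mod;
+   CUfunction fn;
+   cuModuleLoad(&mod, "kernels.fatbin");         /* cubin, PTX or fatbin file */
+   cuModuleGetFunction(&fn, mod, "my_kernel");
+   /* ... launch fn ... */
+   cuModuleUnload(mod);
+ * \endcode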
+ * + * \param module - Returned module + * \param fatCubin - Fat binary to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); + +/** + * \brief Unloads a module + * + * Unloads a module \p hmod from the current context. + * + * \param hmod - Module to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary + */ +CUresult CUDAAPI cuModuleUnload(CUmodule hmod); + +/** + * \brief Returns a function handle + * + * Returns in \p *hfunc the handle of the function of name \p name located in + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() + * returns ::CUDA_ERROR_NOT_FOUND. + * + * \param hfunc - Returned function handle + * \param hmod - Module to retrieve function from + * \param name - Name of function to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Returns a global pointer from a module + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the + * global of name \p name located in module \p hmod. If no variable of that name + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both + * parameters \p dptr and \p bytes are optional. If one of them is + * NULL, it is ignored. + * + * \param dptr - Returned global device pointer + * \param bytes - Returned global size in bytes + * \param hmod - Module to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns a handle to a texture reference + * + * Returns in \p *pTexRef the handle of the texture reference of name \p name + * in the module \p hmod. 
If no texture reference of that name exists, + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference + * handle should not be destroyed, since it will be destroyed when the module + * is unloaded. + * + * \param pTexRef - Returned texture reference + * \param hmod - Module to retrieve texture reference from + * \param name - Name of texture reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetSurfRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a surface reference + * + * Returns in \p *pSurfRef the handle of the surface reference of name \p name + * in the module \p hmod. If no surface reference of that name exists, + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pSurfRef - Returned surface reference + * \param hmod - Module to retrieve surface reference from + * \param name - Name of surface reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 5050 + +/** + * \brief Creates a pending JIT linker invocation. + * + * If the call is successful, the caller owns the returned CUlinkState, which + * should eventually be destroyed with ::cuLinkDestroy. The + * device code machine size (32 or 64 bit) will match the calling application. + * + * Both linker and compiler options may be specified. Compiler options will + * be applied to inputs to this linker action which must be compiled from PTX. + * The options ::CU_JIT_WALL_TIME, + * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + * will accumulate data until the CUlinkState is destroyed. + * + * \p optionValues must remain valid for the life of the CUlinkState if output + * options are used. No other references to inputs are maintained after this + * call returns. + * + * \param numOptions Size of options arrays + * \param options Array of linker and compiler options + * \param optionValues Array of option values, each cast to void * + * \param stateOut On success, this will contain a CUlinkState to specify + * and complete this action + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + +/** + * \brief Add an input to a pending linker invocation + * + * Ownership of \p data is retained by the caller. 
No reference is retained to any + * inputs after this call returns. + * + * This method accepts only compiler options, which are used if the data must + * be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * \param state A pending linker action. + * \param type The type of the input data. + * \param data The input data. PTX must be NULL-terminated. + * \param size The length of the input data. + * \param name An optional name for this input in log messages. + * \param numOptions Size of options. + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). + * \param optionValues Array of option values, each cast to void *. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Add a file input to a pending linker invocation + * + * No reference is retained to any inputs after this call returns. + * + * This method accepts only compiler options, which are used if the input + * must be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * This method is equivalent to invoking ::cuLinkAddData on the contents + * of the file. + * + * \param state A pending linker action + * \param type The type of the input data + * \param path Path to the input file + * \param numOptions Size of options + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) + * \param optionValues Array of option values, each cast to void * + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_FILE_NOT_FOUND + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Complete a pending linker invocation + * + * Completes the pending linker action and returns the cubin image for the linked + * device code, which can be used with ::cuModuleLoadData. The cubin is owned by + * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. + * This call does not destroy \p state. 
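+ *
+ * \note Editorial usage sketch of a complete link cycle, not part of the
+ * original header; "kernels.ptx" is a hypothetical input file and error
+ * handling is omitted.
+ * \code
+   CUlinkState ls;
+   void *cubin;
+   size_t sz;
+   CUmodule mod;
+   cuLinkCreate(0, NULL, NULL, &ls);
+   cuLinkAddFile(ls, CU_JIT_INPUT_PTX, "kernels.ptx", 0, NULL, NULL);
+   cuLinkComplete(ls, &cubin, &sz);
+   cuModuleLoadData(&mod, cubin);   /* load before destroying the link state */
+   cuLinkDestroy(ls);
+ * \endcode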
+ * + * \param state A pending linker invocation + * \param cubinOut On success, this will point to the output image + * \param sizeOut Optional parameter to receive the size of the generated image + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkDestroy, + * ::cuModuleLoadData + */ +CUresult CUDAAPI +cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); + +/** + * \brief Destroys state for a JIT linker invocation. + * + * \param state State object for the linker invocation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa ::cuLinkCreate + */ +CUresult CUDAAPI +cuLinkDestroy(CUlinkState state); + +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** @} */ /* END CUDA_MODULE */ + + +/** + * \defgroup CUDA_MEM Memory Management + * + * ___MANBRIEF___ memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets free and total memory + * + * Returns in \p *free and \p *total respectively, the free and total amount of + * memory available for allocation by the CUDA context, in bytes. + * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Allocates device memory + * + * Allocates \p bytesize bytes of linear memory on the device and returns in + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. 
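+ *
+ * \note Editorial usage sketch, not part of the original header; error
+ * handling omitted.
+ * \code
+   size_t freeMem, totalMem;
+   CUdeviceptr d;
+   cuMemGetInfo(&freeMem, &totalMem);
+   if (freeMem >= (size_t)1 << 20)
+       cuMemAlloc(&d, (size_t)1 << 20);   /* 1 MiB; release with cuMemFree(d) */
+ * \endcode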
+ * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); + +/** + * \brief Allocates pitched device memory + * + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on + * the device and returns in \p *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. \p ElementSizeBytes + * specifies the size of the largest reads and writes that will be performed + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced + * memory transactions are not possible on other data sizes). If + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, + * the kernel will run correctly, but possibly at reduced speed. The pitch + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the + * allocation. The intended usage of pitch is as a separate parameter of the + * allocation, used to compute addresses within the 2D array. Given the row + * and column of an array element of type \b T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + * \endcode + * + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is + * recommended that programmers consider performing pitch allocations using + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing 2D memory copies + * between different regions of device memory (whether linear memory or CUDA + * arrays). + * + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed + * to match or exceed the alignment requirement for texture binding with + * ::cuTexRefSetAddress2D(). 
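+ *
+ * \note Editorial usage sketch, not part of the original header: a pitched 2D
+ * array of floats; \c width and \c height are hypothetical.
+ * \code
+   CUdeviceptr d;
+   size_t pitch;
+   cuMemAllocPitch(&d, &pitch, width * sizeof(float), height, sizeof(float));
+   /* element (row, col) lives at d + row * pitch + col * sizeof(float) */
+ * \endcode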
+ * + * \param dptr - Returned device pointer + * \param pPitch - Returned pitch of allocation in bytes + * \param WidthInBytes - Requested allocation width in bytes + * \param Height - Requested allocation height in rows + * \param ElementSizeBytes - Size of largest reads/writes for range + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); + +/** + * \brief Frees device memory + * + * Frees the memory space pointed to by \p dptr, which must have been returned + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). + * + * \param dptr - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); + +/** + * \brief Get information on memory allocations + * + * Returns the base address in \p *pbase and size in \p *psize of the + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one + * of them is NULL, it is ignored. 
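+ *
+ * \note Editorial usage sketch, not part of the original header: recovering
+ * the enclosing allocation of an interior pointer \c p (hypothetical).
+ * \code
+   CUdeviceptr base;
+   size_t size;
+   cuMemGetAddressRange(&base, &size, p);
+ * \endcode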
+ * + * \param pbase - Returned base address + * \param psize - Returned size of device memory allocation + * \param dptr - Device pointer to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by + * the device, it can be read or written with much higher bandwidth than + * pageable memory obtained with functions such as ::malloc(). Allocating + * excessive amounts of memory with ::cuMemAllocHost() may degrade system + * performance, since it reduces the amount of memory available to the system + * for paging. As a result, this function is best used sparingly to allocate + * staging areas for data exchange between host and device. + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * The device pointer that may be used to access this host memory from those + * contexts is always equal to the returned host pointer \p *pp. + * See \ref CUDA_UNIFIED for additional details. 
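+ *
+ * \note Editorial usage sketch, not part of the original header: page-locked
+ * staging for a host-to-device copy; error handling omitted.
+ * \code
+   void *h;
+   CUdeviceptr d;
+   cuMemAllocHost(&h, 4096);
+   cuMemAlloc(&d, 4096);
+   /* ... fill h ... */
+   cuMemcpyHtoD(d, h, 4096);   /* accelerated by the page-locked source */
+ * \endcode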
+ * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Frees page-locked host memory + * + * Frees the memory space pointed to by \p p, which must have been returned by + * a previous call to ::cuMemAllocHost(). + * + * \param p - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemFreeHost(void *p); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address + * space. 
The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs + * with compute capability greater than or equal to 1.1. + * + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system configurations, but cannot be read efficiently by + * most CPUs. WC memory is a good option for buffers that will be written by + * the CPU and read by the GPU via mapped pinned memory or host->device + * transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with ::cuMemFreeHost(). + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + * is specified, then the function ::cuMemHostGetDevicePointer() must be used + * to query the device pointer, even if the context supports unified addressing. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Passes back device pointer of mapped pinned memory + * + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned + * host buffer \p p allocated by ::cuMemHostAlloc. 
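+ *
+ * \note Editorial usage sketch, not part of the original header; assumes the
+ * context was created with ::CU_CTX_MAP_HOST and omits error handling.
+ * \code
+   void *h;
+   CUdeviceptr d;
+   cuMemHostAlloc(&h, 4096, CU_MEMHOSTALLOC_DEVICEMAP);
+   cuMemHostGetDevicePointer(&d, h, 0);   /* Flags must currently be 0 */
+ * \endcode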
+ * + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP + * flag was not specified at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. + * + * \p Flags provides for future releases. For now, it must be set to 0. + * + * \param pdptr - Returned device pointer + * \param p - Host pointer + * \param Flags - Options (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Passes back flags that were used for a pinned allocation + * + * Passes back the flags \p pFlags that were specified when allocating + * the pinned host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetFlags() will fail if the pointer does not reside in + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). + * + * \param pFlags - Returned flags word + * \param p - Host pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuMemAllocHost, ::cuMemHostAlloc + */ +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p bytesize bytes of managed memory on the device and returns in + * \p *dptr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support + * for managed memory can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If + * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from + * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the + * allocation is created with initial visibility restricted to host access only; + * an explicit call to ::cuStreamAttachMemAsync will be required to enable access + * on the device. 
+ *
+ * If the association is later changed via ::cuStreamAttachMemAsync to
+ * a single stream, the default association as specified during ::cuMemAllocManaged
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
+ *
+ * On a multi-GPU system with peer-to-peer support, where multiple GPUs support
+ * managed memory, the physical storage is created on the GPU which is active
+ * at the time ::cuMemAllocManaged is called. All other GPUs will reference the
+ * data at reduced bandwidth via peer mappings over the PCIe bus. The Unified
+ * Memory management system does not migrate memory between GPUs.
+ *
+ * On a multi-GPU system where multiple GPUs support managed memory, but not
+ * all pairs of such GPUs have peer-to-peer support between them, the physical
+ * storage is created in 'zero-copy' or system memory. All GPUs will reference
+ * the data at reduced bandwidth over the PCIe bus. In these circumstances,
+ * use of the environment variable, CUDA_VISIBLE_DEVICES, is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
+ * non-zero value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all contexts created in
+ * that process on devices that support managed memory have to be peer-to-peer compatible
+ * with each other. Context creation will fail if a context is created on a device that
+ * supports managed memory and is not peer-to-peer compatible with any of the other
+ * managed memory supporting devices on which contexts were previously created, even if
+ * those contexts have been destroyed. These environment variables are described
+ * in the CUDA programming guide under the "CUDA environment variables" section.
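+ *
+ * \note Editorial usage sketch, not part of the original header; error
+ * handling omitted.
+ * \code
+   CUdeviceptr m;
+   cuMemAllocManaged(&m, 4096, CU_MEM_ATTACH_GLOBAL);
+   /* the same pointer is usable on the CPU and on managed-memory GPUs */
+   cuMemFree(m);
+ * \endcode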
+ * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync + */ +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +#if __CUDA_API_VERSION >= 4010 + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given a PCI bus ID string. + * + * \param dev - Returned device handle + * + * \param pciBusId - String in one of the following forms: + * [domain]:[bus]:[device].[function] + * [domain]:[bus]:[device] + * [bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetPCIBusId + */ +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); + +/** + * \brief Returns a PCI Bus Id string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p pciBusId. \p len specifies the maximum length of the + * string that may be returned. + * + * \param pciBusId - Returned identifier string for the device in the following format + * [domain]:[bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. + * pciBusId should be large enough to store 13 characters including the NULL-terminator. + * + * \param len - Maximum length of string to store in \p name + * + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetByPCIBusId + */ +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); + +/** + * \brief Gets an interprocess handle for a previously allocated event + * + * Takes as input a previously allocated event. This event must have been + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + * flags set. 
This opaque handle may be copied into other processes and + * opened with ::cuIpcOpenEventHandle to allow efficient hardware + * synchronization between GPU work in different processes. + * + * After the event has been opened in the importing process, + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + * ::cuEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed + * with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. + * + * \param pHandle - Pointer to a user allocated CUipcEventHandle + * in which to return the opaque event handle + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and + * ::CU_EVENT_DISABLE_TIMING flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle + */ +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); + +/** + * \brief Opens an interprocess event handle for use in the current process + * + * Opens an interprocess event handle exported from another process with + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + * This event must be freed with ::cuEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. + * + * \param phEvent - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcGetEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle + */ +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cuMemFree and a subsequent call + * to ::cuMemAlloc returns memory with the same device address, + * ::cuIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. + * + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return + * the handle in. 
+ * \param dptr - Base pointer to previously allocated device memory
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle
+ */
+CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ * and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cuIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * ::cuIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
+ * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
+ * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open ::CUipcMemHandles are restricted in the following way.
+ * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
+ * by one ::CUcontext per ::CUdevice per other process.
+ *
+ * Memory returned from ::cuIpcOpenMemHandle must be freed with
+ * ::cuIpcCloseMemHandle.
+ *
+ * Calling ::cuMemFree on an exported memory region before calling
+ * ::cuIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems.
+ *
+ * \param pdptr - Returned device pointer
+ * \param handle - ::CUipcMemHandle to open
+ * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_TOO_MANY_PEERS
+ *
+ * \note No guarantees are made about the address returned in \p *pdptr.
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cuCtxEnablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ */
+CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+
+/**
+ * \brief Close memory mapped with ::cuIpcOpenMemHandle
+ *
+ * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems.
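+ *
+ * \note Editorial usage sketch of the IPC memory round trip, not part of the
+ * original header; how the handle travels between processes is application
+ * defined, and error handling is omitted.
+ * \code
+   /* exporting process (dptr from a prior cuMemAlloc) */
+   CUipcMemHandle h;
+   cuIpcGetMemHandle(&h, dptr);
+   /* importing process, after receiving h */
+   CUdeviceptr d;
+   cuIpcOpenMemHandle(&d, h, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+   /* ... use d ... */
+   cuIpcCloseMemHandle(d);
+ * \endcode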
+ * + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + */ +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); + +#endif /* __CUDA_API_VERSION >= 4010 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Registers an existing host memory range for use by CUDA + * + * Page-locks the memory range specified by \p p and \p bytesize and maps it + * for the device(s) as specified by \p Flags. This memory range also is added + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth + * than pageable memory that has not been registered. Page-locking excessive + * amounts of memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to register staging areas for data exchange between + * host and device. + * + * This function has limited support on Mac OS X. OS 10.7 or higher is required. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs + * with compute capability greater than or equal to 1.1. + * + * All of these flags are orthogonal to one another: a developer may page-lock + * memory that is portable or mapped with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. + * + * The memory page-locked by this function must be unregistered with + * ::cuMemHostUnregister(). + * + * \param p - Host pointer to memory to page-lock + * \param bytesize - Size in bytes of the address range to page-lock + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED + * \notefnerr + * + * \sa ::cuMemHostUnregister, ::cuMemHostGetFlags, ::cuMemHostGetDevicePointer + */ +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + +/** + * \brief Unregisters a memory range that was registered with cuMemHostRegister. + * + * Unmaps the memory range whose base address is specified by \p p, and makes + * it pageable again. 
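+ *
+ * \note Editorial usage sketch pairing ::cuMemHostRegister with
+ * ::cuMemHostUnregister, not part of the original header; error handling
+ * omitted.
+ * \code
+   char *buf = (char *)malloc(4096);
+   cuMemHostRegister(buf, 4096, 0);
+   /* ... accelerated copies to/from buf ... */
+   cuMemHostUnregister(buf);
+   free(buf);
+ * \endcode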
+ *
+ * The base address must be the same one specified to ::cuMemHostRegister().
+ *
+ * \param p - Host pointer to memory to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+ * \notefnerr
+ *
+ * \sa ::cuMemHostRegister
+ */
+CUresult CUDAAPI cuMemHostUnregister(void *p);
+
+/**
+ * \brief Copies memory
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ * device, device to device, or device to host) from the pointer values. This
+ * function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+
+/**
+ * \brief Copies device memory between two contexts
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context. \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync
+ */
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
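+ *
+ * \par
+ * A minimal sketch, assuming a current context:
+ * \code
 float host[256];
 CUdeviceptr dev;
 // ... fill host ...
 cuMemAlloc(&dev, sizeof(host));
 cuMemcpyHtoD(dev, host, sizeof(host));
+ * \endcode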
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
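+ *
+ * \par
+ * For example (a sketch, assuming both pointers come from ::cuMemAlloc in
+ * the current context):
+ * \code
 CUdeviceptr src, dst;
 cuMemAlloc(&src, 1024);
 cuMemAlloc(&dst, 1024);
 cuMemcpyDtoD(dst, src, 1024);  // on-device copy, no host round trip
+ * \endcode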
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Array + * + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting index of the destination data. + * \p srcDevice specifies the base pointer of the source. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Array to Device + * + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the CUDA + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle + * and the offset in bytes into the array where the copy is to begin. + * \p ByteCount specifies the number of bytes to copy and must be evenly + * divisible by the array element size. 
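+ *
+ * \par
+ * A sketch, where \c arr is assumed to be a 1D CUDA array of 256 float
+ * elements:
+ * \code
 CUdeviceptr dev;
 cuMemAlloc(&dev, 256 * sizeof(float));
 cuMemcpyAtoD(dev, arr, 0, 256 * sizeof(float));  // whole array, offset 0
+ * \endcode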
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Array
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination
+ * data. \p srcHost specifies the base address of the source. \p ByteCount specifies
+ * the number of bytes to copy.
+ *
+ * \param dstArray - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Host
+ *
+ * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
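+ *
+ * \par
+ * A sketch, where \c arr is assumed to be a 1D CUDA array of 256 float
+ * elements:
+ * \code
 float out[256];
 cuMemcpyAtoH(out, arr, 0, sizeof(out));  // copy the whole array back
+ * \endcode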
+ *
+ * \param dstHost - Destination host pointer
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The CUDA arrays need not have the same format, but the
+ * elements must be the same size; and \p ByteCount must be evenly divisible
+ * by that size.
+ *
+ * \param dstArray - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
 typedef struct CUDA_MEMCPY2D_st {
 unsigned int srcXInBytes, srcY;
 CUmemorytype srcMemoryType;
 const void *srcHost;
 CUdeviceptr srcDevice;
 CUarray srcArray;
 unsigned int srcPitch;
+
 unsigned int dstXInBytes, dstY;
 CUmemorytype dstMemoryType;
 void *dstHost;
 CUdeviceptr dstDevice;
 CUarray dstArray;
 unsigned int dstPitch;
+
 unsigned int WidthInBytes;
 unsigned int Height;
 } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ * source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
 typedef enum CUmemorytype_enum {
 CU_MEMORYTYPE_HOST = 0x01,
 CU_MEMORYTYPE_DEVICE = 0x02,
 CU_MEMORYTYPE_ARRAY = 0x03,
 CU_MEMORYTYPE_UNIFIED = 0x04
 } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ * specify the (unified virtual address space) base address of the source data
+ * and the bytes per row to apply. ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ * the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
 void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ * for the copy.
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. + * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. 
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ * the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
 void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ * for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
 void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ * the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
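+ *
+ * \par
+ * A sketch of a host-to-device copy with an arbitrary (non-::cuMemAllocPitch)
+ * destination pitch; \c hostImg, \c devImg, \c devPitch, \c w and \c h are
+ * assumed to be set up by the caller:
+ * \code
 CUDA_MEMCPY2D cp = {0};
 cp.srcMemoryType = CU_MEMORYTYPE_HOST;
 cp.srcHost       = hostImg;
 cp.srcPitch      = w;          // tightly packed rows, w bytes wide
 cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
 cp.dstDevice     = devImg;
 cp.dstPitch      = devPitch;   // pitch not obtained from cuMemAllocPitch
 cp.WidthInBytes  = w;
 cp.Height        = h;
 cuMemcpy2DUnaligned(&cp);
+ * \endcode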
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
 typedef struct CUDA_MEMCPY3D_st {
+
 unsigned int srcXInBytes, srcY, srcZ;
 unsigned int srcLOD;
 CUmemorytype srcMemoryType;
 const void *srcHost;
 CUdeviceptr srcDevice;
 CUarray srcArray;
 unsigned int srcPitch; // ignored when src is array
 unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
 unsigned int dstXInBytes, dstY, dstZ;
 unsigned int dstLOD;
 CUmemorytype dstMemoryType;
 void *dstHost;
 CUdeviceptr dstDevice;
 CUarray dstArray;
 unsigned int dstPitch; // ignored when dst is array
 unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
 unsigned int WidthInBytes;
 unsigned int Height;
 unsigned int Depth;
 } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ * source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
 typedef enum CUmemorytype_enum {
 CU_MEMORYTYPE_HOST = 0x01,
 CU_MEMORYTYPE_DEVICE = 0x02,
 CU_MEMORYTYPE_ARRAY = 0x03,
 CU_MEMORYTYPE_UNIFIED = 0x04
 } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ * specify the (unified virtual address space) base address of the source data
+ * and the bytes per row to apply. ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ * data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
 void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ * destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
 void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ * and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
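+ *
+ * \par
+ * A sketch of a host-to-array 3D copy; \c hostVol, \c a3d (a 3D CUDA array
+ * from ::cuArray3DCreate), \c wBytes, \c h and \c d are assumed to be set up
+ * by the caller:
+ * \code
 CUDA_MEMCPY3D cp = {0};        // zero-init also clears srcLOD/dstLOD
 cp.srcMemoryType = CU_MEMORYTYPE_HOST;
 cp.srcHost       = hostVol;
 cp.srcPitch      = wBytes;
 cp.srcHeight     = h;
 cp.dstMemoryType = CU_MEMORYTYPE_ARRAY;
 cp.dstArray      = a3d;
 cp.WidthInBytes  = wBytes;
 cp.Height        = h;
 cp.Depth         = d;
 cuMemcpy3D(&cp);
+ * \endcode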
+ * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Copies memory between contexts + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + +/** + * \brief Copies memory asynchronously + * + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. 
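+ *
+ * \par
+ * A sketch, assuming unified addressing and a page-locked host buffer from
+ * ::cuMemHostAlloc so the transfer can actually overlap; \c devDst, \c bytes
+ * and \c stream are assumed to be set up by the caller:
+ * \code
 void *pinned;
 cuMemHostAlloc(&pinned, bytes, 0);
 // under unified addressing the host pointer is passed through CUdeviceptr
 cuMemcpyAsync(devDst, (CUdeviceptr)(uintptr_t)pinned, bytes, stream);
 cuStreamSynchronize(stream);   // wait before reusing pinned
+ * \endcode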
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async
+ */
+CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies device memory between two contexts asynchronously.
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context. \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpy3DPeerAsync
+ */
+CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
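+ *
+ * \par
+ * A sketch; the copy only overlaps with other work when \c src is
+ * page-locked (e.g. from ::cuMemHostAlloc or ::cuMemHostRegister). \c dev,
+ * \c src, \c bytes and \c stream are assumed to be set up by the caller:
+ * \code
 cuMemcpyHtoDAsync(dev, src, bytes, stream);
 // ... queue kernels on other streams, or do CPU work ...
 cuStreamSynchronize(stream);   // wait before reusing src
+ * \endcode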
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the + * destination data. \p srcHost specifies the base address of the source. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. 
+ *
+ * \param dstHost - Destination pointer
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async
+ */
+CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
 typedef struct CUDA_MEMCPY2D_st {
 unsigned int srcXInBytes, srcY;
 CUmemorytype srcMemoryType;
 const void *srcHost;
 CUdeviceptr srcDevice;
 CUarray srcArray;
 unsigned int srcPitch;
 unsigned int dstXInBytes, dstY;
 CUmemorytype dstMemoryType;
 void *dstHost;
 CUdeviceptr dstDevice;
 CUarray dstArray;
 unsigned int dstPitch;
 unsigned int WidthInBytes;
 unsigned int Height;
 } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ * source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
 typedef enum CUmemorytype_enum {
 CU_MEMORYTYPE_HOST = 0x01,
 CU_MEMORYTYPE_DEVICE = 0x02,
 CU_MEMORYTYPE_ARRAY = 0x03,
 CU_MEMORYTYPE_UNIFIED = 0x04
 } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ * specify the (unified virtual address space) base address of the source data
+ * and the bytes per row to apply. ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ * the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
 void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ * for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
 void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ * the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
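+ *
+ * \par
+ * A sketch pairing ::cuMemAllocPitch with an asynchronous 2D copy; \c hostImg,
+ * \c w, \c h and \c stream are assumed to be set up by the caller:
+ * \code
 CUdeviceptr dev;
 size_t pitch;
 cuMemAllocPitch(&dev, &pitch, w, h, 4);
 CUDA_MEMCPY2D cp = {0};
 cp.srcMemoryType = CU_MEMORYTYPE_HOST;
 cp.srcHost       = hostImg;
 cp.srcPitch      = w;
 cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
 cp.dstDevice     = dev;
 cp.dstPitch      = pitch;      // pitch returned by cuMemAllocPitch
 cp.WidthInBytes  = w;
 cp.Height        = h;
 cuMemcpy2DAsync(&cp, stream);
+ * \endcode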
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. 
::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ * data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
 void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ * destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
 void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
 CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ * and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Copies memory between contexts asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. 
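+ *
+ * \par
+ * For example (a sketch; \c dev and \c bytes are assumed to be set up by the
+ * caller):
+ * \code
 cuMemsetD8(dev, 0xFF, bytes);  // set every byte of the range to 0xFF
+ * \endcode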
+ * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. + * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. 
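+ *
+ * \par
+ * For example (sketch; d and N are placeholders), a buffer of floats can be
+ * filled with a non-zero constant by passing the value's bit pattern:
+ * \code
+    float v = 1.0f;
+    unsigned int bits;
+    memcpy(&bits, &v, sizeof(bits));   // reinterpret 1.0f as a 32-bit pattern
+    cuMemsetD32(d, bits, N);           // N is a count of 32-bit elements
+ * \endcode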
+ * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
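+ *
+ * \par
+ * For example (sketch; Width and Height are placeholder element counts):
+ * \code
+    CUdeviceptr d;
+    size_t pitch;
+    // WidthInBytes is Width * 2 for 16-bit elements; ElementSizeBytes must be 4, 8 or 16
+    cuMemAllocPitch(&d, &pitch, Width * 2, Height, 4);
+    cuMemsetD2D16(d, pitch, 0, Width, Height);   // Width in elements, pitch in bytes
+ * \endcode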
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. 
+ * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. + * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. 
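+ *
+ * \par
+ * For example (sketch; d, N and hStream are placeholders):
+ * \code
+    cuMemsetD32Async(d, 0xFFFFFFFF, N, hStream);  // enqueued in hStream, returns immediately
+    cuStreamSynchronize(hStream);                 // wait for the fill to complete
+ * \endcode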
+ * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. 
This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async + */ +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch - Pitch of destination device pointer
+ * \param ui - Value to set
+ * \param Width - Width of row
+ * \param Height - Number of rows
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async
+ */
+CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Creates a 1D or 2D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        CUarray_format Format;
+        unsigned int NumChannels;
+    } CUDA_ARRAY_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width and \p Height are the width and height of the CUDA array (in
+ * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
+ * otherwise;
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20
+    } CUarray_format;
+ * \endcode
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 1;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
+ * float16's:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 16-bit elements, each
+ * of which is two 8-bit unsigned chars:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format =
CU_AD_FORMAT_UNSIGNED_INT8; + desc.NumChannels = 2; + desc.Width = width; + desc.Height = height; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - Array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 1D or 2D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. + * + * \param pArrayDescriptor - Returned array descriptor + * \param hArray - Array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + + +/** + * \brief Destroys a CUDA array + * + * Destroys the CUDA array \p hArray. 
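+ *
+ * \par
+ * A typical handle lifetime, as a sketch:
+ * \code
+    CUarray arr;
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 1;
+    cuArrayCreate(&arr, &desc);
+    /* ... use the array ... */
+    cuArrayDestroy(arr);   // fails with CUDA_ERROR_ARRAY_IS_MAPPED while the array is mapped
+ * \endcode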
+ * + * \param hArray - Array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuArrayDestroy(CUarray hArray); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Creates a 3D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D array is allocated if only \p Depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. 
+ *
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20
+    } CUarray_format;
+ * \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - ::Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
+ *     If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
+ *     to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
+ * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>
+ * {(width range in elements), (height range), (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td>{ (1,TEXTURE1D_WIDTH), 0, 0 }</td>
+ * <td>{ (1,SURFACE1D_WIDTH), 0, 0 }</td></tr>
+ * <tr><td>2D</td>
+ * <td>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</td>
+ * <td>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</td></tr>
+ * <tr><td>3D</td>
+ * <td>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>
+ * { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</td>
+ * <td>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</td></tr>
+ * <tr><td>1D Layered</td>
+ * <td>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</td>
+ * <td>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</td></tr>
+ * <tr><td>2D Layered</td>
+ * <td>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</td>
+ * <td>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</td></tr>
+ * <tr><td>Cubemap</td>
+ * <td>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</td>
+ * <td>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</td>
+ * <td>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</td></tr>
+ * </table>
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 0;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
+ * 4x16-bit float16's:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+    desc.Depth = depth;
+ * \endcode
+ *
+ * \param pHandle - Returned array
+ * \param pAllocateArray - 3D array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 3D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * This function may be called on 1D and 2D arrays, in which case the \p Height
+ * and/or \p Depth members of the descriptor struct will be set to 0.
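+ *
+ * \par
+ * For example (sketch; hArray is a placeholder handle):
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR query;
+    cuArray3DGetDescriptor(&query, hArray);
+    if (query.Height == 0 && query.Depth == 0) {
+        /* hArray is a 1D array of query.Width elements */
+    }
+ * \endcode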
+ * + * \param pArrayDescriptor - Returned 3D array descriptor + * \param hArray - 3D array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 5000 + +/** + * \brief Creates a CUDA mipmapped array + * + * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. + * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. + * + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D mipmapped array is allocated if only \p Depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. 
The first six layers represent the first
+ * cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20
+    } CUarray_format;
+ * \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - ::Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of
+ *     the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to
+ *     bind a mipmap level of the CUDA mipmapped array to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA mipmapped arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</td></tr>
+ * <tr><td>2D</td>
+ * <td>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</td></tr>
+ * <tr><td>3D</td>
+ * <td>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>
+ * { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</td></tr>
+ * <tr><td>1D Layered</td>
+ * <td>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</td></tr>
+ * <tr><td>2D Layered</td>
+ * <td>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</td></tr>
+ * <tr><td>Cubemap</td>
+ * <td>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</td></tr>
+ * </table>
+ * + * + * \param pHandle - Returned mipmapped array + * \param pMipmappedArrayDesc - mipmapped array descriptor + * \param numMipmapLevels - Number of mipmap levels + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuMipmappedArrayDestroy, ::cuMipmappedArrayGetLevel, ::cuArrayCreate, + */ +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p hMipmappedArray. + * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pLevelArray - Returned mipmap level CUDA array + * \param hMipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuMipmappedArrayCreate, ::cuMipmappedArrayDestroy, ::cuArrayCreate, + */ +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); + +/** + * \brief Destroys a CUDA mipmapped array + * + * Destroys the CUDA mipmapped array \p hMipmappedArray. + * + * \param hMipmappedArray - Mipmapped array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED + * \notefnerr + * + * \sa ::cuMipmappedArrayCreate, ::cuMipmappedArrayGetLevel, ::cuArrayCreate, + */ +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); + +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** @} */ /* END CUDA_MEM */ + +/** + * \defgroup CUDA_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDA_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + * + * Unified addressing is automatically enabled in 64-bit processes + * on devices with compute capability greater than or equal to 2.0. + * + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. For instance, one may want to know if a pointer points + * to host or device memory. 
As another example, in the case of device
+ * memory, one may want to know on which CUDA device the memory
+ * resides. These properties may be queried using the function
+ * ::cuPointerGetAttribute().
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers passed to the various copy functions in the
+ * CUDA API. The function ::cuMemcpy() may be used to perform a copy
+ * between two pointers, ignoring whether they point to host or device
+ * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
+ * unnecessary for devices supporting unified addressing). For
+ * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
+ * used to specify that the CUDA driver should infer the location of the
+ * pointer from its value.
+ *
+ * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated in all contexts using ::cuMemAllocHost() and
+ * ::cuMemHostAlloc() is always directly accessible from all contexts on
+ * all devices that support unified addressing. This is the case regardless
+ * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
+ * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
+ *
+ * The pointer value through which allocated host memory may be accessed
+ * in kernels on all devices that support unified addressing is the same
+ * as the pointer value through which that memory is accessed on the host,
+ * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
+ * pointer for these allocations.
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
+ *
+ * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
+ *
+ * Upon enabling direct access from a context that supports unified addressing
+ * to another peer context that supports unified addressing using
+ * ::cuCtxEnablePeerAccess(), all memory allocated in the peer context using
+ * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
+ * by the current context. The device pointer value through
+ * which any peer memory may be accessed in the current context
+ * is the same pointer value through which that memory may be
+ * accessed in the peer context.
+ *
+ * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ *
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which it is accessed on the host. These exceptions
+ * are host memory registered using ::cuMemHostRegister() and host memory
+ * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these
+ * exceptions, there exists a distinct host and device address for the
+ * memory. The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all
+ * contexts that support unified addressing.
+ *
+ * This device address may be queried using ::cuMemHostGetDevicePointer()
+ * when a context using unified addressing is current. Either the host
+ * or the unified device pointer value may be used to refer to this memory
+ * through ::cuMemcpy() and similar functions using the
+ * ::CU_MEMORYTYPE_UNIFIED memory type.
+ *
+ */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Returns information about a pointer
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
+ *
+ * Returns in \p *data the ::CUcontext in which \p ptr was allocated or
+ * registered.
+ * The type of \p data must be ::CUcontext *.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with
+ * a ::CUcontext which uses unified virtual addressing then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+ *
+ * Returns in \p *data the physical memory type of the memory that
+ * \p ptr addresses as a ::CUmemorytype enumerated value.
+ * The type of \p data must be unsigned int.
+ *
+ * If \p ptr addresses device memory then \p *data is set to
+ * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the
+ * memory resides is the ::CUdevice of the ::CUcontext returned by the
+ * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+ *
+ * If \p ptr addresses host memory then \p *data is set to
+ * ::CU_MEMORYTYPE_HOST.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with
+ * a ::CUcontext which uses unified virtual addressing then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * If the current ::CUcontext does not support unified virtual
+ * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+ *
+ * Returns in \p *data the device pointer value through which
+ * \p ptr may be accessed by kernels running in the current
+ * ::CUcontext.
+ * The type of \p data must be CUdeviceptr *.
+ *
+ * If there exists no device pointer value through which
+ * kernels running in the current ::CUcontext may access
+ * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * If there is no current ::CUcontext then
+ * ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * Except in the exceptional disjoint addressing cases discussed
+ * below, the value returned in \p *data will equal the input
+ * value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+ *
+ * Returns in \p *data the host pointer value through which
+ * \p ptr may be accessed by the host program.
+ * The type of \p data must be void **.
+ * If there exists no host pointer value through which
+ * the host program may directly access \p ptr then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Except in the exceptional disjoint addressing cases discussed
+ * below, the value returned in \p *data will equal the input
+ * value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+ *
+ * Returns in \p *data two tokens for use with the nv-p2p.h Linux
+ * kernel interface. \p data must be a struct of type
+ * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+ *
+ * \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+ * Note that p2pToken and vaSpaceToken are only valid for the
+ * lifetime of the source allocation. A subsequent allocation at
+ * the same address may return completely different tokens.
+ * Querying this attribute has a side effect of setting the attribute
+ * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+ * \p ptr points to.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ * A boolean attribute which, when set, ensures that synchronous memory operations
+ * initiated on the region of memory that \p ptr points to will always synchronize.
+ * See further documentation in the section titled "API synchronization behavior"
+ * to learn more about cases when synchronous memory operations can
+ * exhibit asynchronous behavior.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
+ *
+ * Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
+ * \p data must point to an unsigned long long.
+ *
+ * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
+ * Every memory allocation from any of the CUDA memory allocation APIs will
+ * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs
+ * from previous freed allocations. IDs are only unique within a single process.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED:
+ *
+ * Returns in \p *data a boolean that indicates whether the pointer points to
+ * managed memory or not.
+ *
+ * \par
+ *
+ * Note that for most allocations in the unified virtual address space
+ * the host and device pointer for accessing the allocation will be the
+ * same. The exceptions to this are
+ *  - user memory registered using ::cuMemHostRegister
+ *  - host memory allocated using ::cuMemHostAlloc with the
+ *    ::CU_MEMHOSTALLOC_WRITECOMBINED flag
+ * For these types of allocation there will exist separate, disjoint host
+ * and device addresses for accessing the allocation. In particular
+ *  - The host address will correspond to an invalid unmapped device address
+ *    (which will result in an exception if accessed from the device)
+ *  - The device address will correspond to an invalid unmapped host address
+ *    (which will result in an exception if accessed from the host).
+ * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
+ * and device addresses from either address.
+ *
+ * \param data - Returned pointer attribute value
+ * \param attribute - Pointer attribute to query
+ * \param ptr - Pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuPointerSetAttribute,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister
+ */
+CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+#if __CUDA_API_VERSION >= 6000
+/**
+ * \brief Set attributes on a previously allocated memory region
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ * A boolean attribute that can either be set (1) or unset (0). When set,
+ * the region of memory that \p ptr points to is guaranteed to always synchronize
+ * memory operations that are synchronous. If there are some previously initiated
+ * synchronous memory operations that are pending when this attribute is set, the
+ * function does not return until those memory operations are complete.
+ * See further documentation in the section titled "API synchronization behavior"
+ * to learn more about cases when synchronous memory operations can
+ * exhibit asynchronous behavior.
+ * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
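+ *
+ * \par
+ * For example (sketch; dptr is a placeholder device allocation):
+ * \code
+    unsigned int enable = 1;
+    cuPointerSetAttribute(&enable, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr);
+ * \endcode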
+ * + * \param value - Pointer to memory containing the value to be set + * \param attribute - Pointer attribute to set + * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuPointerGetAttribute, + * ::cuPointerGetAttributes, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister + */ +CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 6000 */ + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns information about a pointer. + * + * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * + * \param numAttributes - Number of attributes to query + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. + * \param ptr - Pointer to query + * + * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr + * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values + * and CUDA_SUCCESS is returned. + * + * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA + * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuPointerGetAttribute, + * ::cuPointerSetAttribute + */ +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** @} */ /* END CUDA_UNIFIED */ + +/** + * \defgroup CUDA_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Create a stream + * + * Creates a stream and returns a handle in \p phStream. The \p Flags argument + * determines behaviors of the stream. Valid values for \p Flags are: + * - ::CU_STREAM_DEFAULT: Default stream creation flag. + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. 
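+ *
+ * \par
+ * A minimal sketch:
+ * \code
+    CUstream s;
+    cuStreamCreate(&s, CU_STREAM_NON_BLOCKING);  // no implicit sync with the NULL stream
+    /* ... enqueue asynchronous work on s ... */
+    cuStreamDestroy(s);
+ * \endcode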
+ * + * \param phStream - Returned newly created stream + * \param Flags - Parameters for stream creation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); + +/** + * \brief Create a stream with the given priority + * + * Creates a stream with the specified priority and returns a handle in \p phStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param phStream - Returned newly created stream + * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of + * valid flags + * \param priority - Stream priority. Lower numbers represent higher priorities. + * See ::cuCtxGetStreamPriorityRange for more information about + * meaningful stream priorities that can be passed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \note Stream priorities are supported only on Quadro and Tesla GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); + + +/** + * \brief Query the priority of a given stream + * + * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the priority in \p priority. Note that if the stream was created with a + * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cuStreamCreateWithPriority for details about priority clamping. 
+ * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamCreateWithPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags + */ +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + +/** + * \brief Query the flags of a given stream + * + * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the flags in \p flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * The value returned in \p flags is a logical 'OR' of all flags that + * were used while creating this stream. See ::cuStreamCreate for the list + * of valid flags + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority + */ +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p hStream wait until \p hEvent + * reports completion before beginning execution. This synchronization + * will be performed efficiently on the device. The event \p hEvent may + * be from a different context than \p hStream, in which case this function + * will perform cross-device synchronization. + * + * The stream \p hStream will wait only for the completion of the most recent + * host call to ::cuEventRecord() on \p hEvent. Once this call has returned, + * any functions (including ::cuEventRecord() and ::cuEventDestroy()) may be + * called on \p hEvent again, and subsequent calls will not have any + * effect on \p hStream. + * + * If ::cuEventRecord() has not been called on \p hEvent, this call acts as if + * the record has already completed, and so is a functional no-op. + * + * \param hStream - Stream to wait + * \param hEvent - Event to wait on (may not be NULL) + * \param Flags - Parameters for the operation (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuEventRecord, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cuStreamDestroy + */ +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + +/** + * \brief Add a callback to a compute stream + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cuStreamAddCallback call, the callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::CUDA_SUCCESS or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::CUresult. 
+ * + * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API + * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * This API requires compute capability 1.1 or greater. See + * ::cuDeviceGetAttribute or ::cuDeviceGetProperties to query compute + * capability. Attempting to use this API with earlier compute versions will + * return ::CUDA_ERROR_NOT_SUPPORTED. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + *
+ * - The callback stream is considered idle for the duration of the
+ *   callback. Thus, for example, a callback may always use memory attached
+ *   to the callback stream.
+ * - The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback. It thus synchronizes streams which have been "joined"
+ *   prior to the callback.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding callbacks have executed. Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if it has been properly ordered with an
+ *   event.
+ * - Completion of a callback does not cause a stream to become
+ *   active except as described above. The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between. Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.
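+ *
+ * A minimal sketch of a host callback (the function name is hypothetical; note
+ * that the callback itself must not call into the CUDA API):
+ * \code
+   void CUDA_CB myCallback(CUstream hStream, CUresult status, void *userData)
+   {
+       // runs on a host thread once all prior work in hStream has completed
+   }
+
+   cuStreamAddCallback(hStream, myCallback, NULL, 0);
+ * \endcode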
+ * + * \param hStream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync + */ +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p hStream to specify stream association of + * \p length bytes of memory starting from \p dptr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p dptr must point to an address within managed memory space declared + * using the __managed__ keyword or allocated with ::cuMemAllocManaged. + * + * \p length must be zero, to indicate that the entire allocation's + * stream association is being changed. Currently, it's not possible + * to change stream association for a portion of an allocation. + * + * The stream association is specified using \p flags which must be + * one of ::CUmemAttach_flags. + * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream. + * If the ::CU_MEM_ATTACH_SINGLE flag is specified, the program makes a guarantee + * that it will only access the memory on the device from \p hStream. It is illegal + * to attach singly to the NULL stream, because the NULL stream is a virtual global + * stream and not a specific stream. An error will be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p hStream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p hStream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cuMemAllocManaged. 
For __managed__ variables, the default + * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param hStream - Stream in which to enqueue the attach operation + * \param dptr - Pointer to memory (must be a pointer to managed memory) + * \param length - Length of memory (must be zero) + * \param flags - Must be one of ::CUmemAttach_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged + */ +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +/** + * \brief Determine status of a compute stream + * + * Returns ::CUDA_SUCCESS if all operations in the stream specified by + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuStreamSynchronize(). + * + * \param hStream - Stream to query status of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuStreamQuery(CUstream hStream); + +/** + * \brief Wait until a stream's tasks are completed + * + * Waits until the device has completed all operations in the stream specified + * by \p hStream. If the context was created with the + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the + * stream is finished with all of its tasks. + * + * \param hStream - Stream to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamDestroy, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys a stream + * + * Destroys the stream specified by \p hStream. + * + * In case the device is still doing work in the stream \p hStream + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically + * once the device has completed all work in \p hStream. 
+ * + * \param hStream - Stream to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** @} */ /* END CUDA_STREAM */ + + +/** + * \defgroup CUDA_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates an event + * + * Creates an event *phEvent with the flags specified via \p Flags. Valid flags + * include: + * - ::CU_EVENT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been recorded. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). + * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an + * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must + * be specified along with ::CU_EVENT_DISABLE_TIMING. + * + * \param phEvent - Returns newly created event + * \param Flags - Event creation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime + */ +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); + +/** + * \brief Records an event + * + * Records an event. See note on NULL stream behavior. Since operation is + * asynchronous, ::cuEventQuery or ::cuEventSynchronize() must be used + * to determine when the event has actually been recorded. + * + * If ::cuEventRecord() has previously been called on \p hEvent, then this + * call will overwrite any existing state in \p hEvent. Any subsequent calls + * which examine the status of \p hEvent will only examine the completion of + * this most recent call to ::cuEventRecord(). + * + * It is necessary that \p hEvent and \p hStream be created on the same context. 
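+ *
+ * A typical timing sketch (assumes a valid \p hStream; error checking omitted):
+ * \code
+   CUevent start, stop;
+   cuEventCreate(&start, CU_EVENT_DEFAULT);
+   cuEventCreate(&stop, CU_EVENT_DEFAULT);
+   cuEventRecord(start, hStream);
+   // ... launch work into hStream ...
+   cuEventRecord(stop, hStream);
+   cuEventSynchronize(stop);
+   float milliseconds;
+   cuEventElapsedTime(&milliseconds, start, stop);
+ * \endcode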
+ * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime + */ +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + +/** + * \brief Queries an event's status + * + * Query the status of all device work preceding the most recent + * call to ::cuEventRecord() (in the appropriate compute streams, + * as specified by the arguments to ::cuEventRecord()). + * + * If this work has successfully been completed by the device, or if + * ::cuEventRecord() has not been called on \p hEvent, then ::CUDA_SUCCESS is + * returned. If this work has not yet been completed by the device then + * ::CUDA_ERROR_NOT_READY is returned. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuEventSynchronize(). + * + * \param hEvent - Event to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime + */ +CUresult CUDAAPI cuEventQuery(CUevent hEvent); + +/** + * \brief Waits for an event to complete + * + * Wait until the completion of all device work preceding the most recent + * call to ::cuEventRecord() (in the appropriate compute streams, as specified + * by the arguments to ::cuEventRecord()). + * + * If ::cuEventRecord() has not been called on \p hEvent, ::CUDA_SUCCESS is + * returned immediately. + * + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. + * + * \param hEvent - Event to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventDestroy, + * ::cuEventElapsedTime + */ +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys an event + * + * Destroys the event specified by \p hEvent. + * + * In case \p hEvent has been recorded but has not yet been completed + * when ::cuEventDestroy() is called, the function will return immediately and + * the resources associated with \p hEvent will be released automatically once + * the device has completed \p hEvent. 
+ * + * \param hEvent - Event to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventElapsedTime + */ +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Computes the elapsed time between two events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cuEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cuEventRecord() has not been called on either event then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + * on both events but one or both of them has not yet been completed (that is, + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms + * \param hStart - Starting event + * \param hEnd - Ending event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy + */ +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXEC Execution Control + * + * ___MANBRIEF___ execution control functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns information about a function + * + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel + * given by \p hfunc. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the function is + * currently loaded. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this function. + * This does not include dynamically-allocated shared memory requested by + * the user at runtime. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + * constant memory required by this function. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + * used by each thread of this function. 
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ *   of this function.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ *   which the function was compiled. This value is the major PTX version * 10
+ *   + the minor PTX version, so a PTX version 1.3 function would return the
+ *   value 13. Note that this may return the undefined value of 0 for cubins
+ *   compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ *   which the function was compiled. This value is the major binary
+ *   version * 10 + the minor binary version, so a binary version 1.3 function
+ *   would return the value 13. Note that this will return a value of 10 for
+ *   legacy cubins that do not have a properly-encoded binary architecture
+ *   version.
+ * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the
+ *   function has been compiled with the user-specified option
+ *   "-Xptxas --dlcm=ca" set.
+ *
+ * \param pi - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param hfunc - Function to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel
+ */
+CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device function \p hfunc. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p hfunc. Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
+ * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
+ * that case, the current context-wide setting will be used.
+ *
+ * This setting does nothing on devices where the sizes of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param hfunc - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel
+ */
+CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+
+#if __CUDA_API_VERSION >= 4020
+/**
+ * \brief Sets the shared memory configuration for a device function
+ *
+ * On devices with configurable shared memory banks, this function will
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device-side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via
+ * ::cuFuncSetSharedMemConfig will override the context-wide setting set with
+ * ::cuCtxSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
+ *   configuration when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes when launching this function.
+ *
+ * \param hfunc - Kernel to be given a shared memory configuration
+ * \param config - Requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxSetSharedMemConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel
+ */
+CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
+#endif
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Launches a CUDA function
+ *
+ * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
+ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p f can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied. The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into
+ * a single buffer that is passed in via the \p extra parameter.
+ * This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer.
Here is + * an example of using the \p extra parameter in this manner: + * \code + size_t argBufferSize; + char argBuffer[256]; + + // populate argBuffer and argBufferSize + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, + CU_LAUNCH_PARAM_END + }; + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); + * \endcode + * + * The \p extra parameter exists to allow ::cuLaunchKernel to take + * additional less commonly used arguments. \p extra specifies a list of + * names of extra settings and their corresponding values. Each extra + * setting name is immediately followed by the corresponding value. The + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer containing all + * the kernel parameters for launching kernel \p f; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t containing the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel + * parameters are specified with both \p kernelParams and \p extra + * (i.e. both \p kernelParams and \p extra are non-NULL). + * + * Calling ::cuLaunchKernel() sets persistent function state that is + * the same as function state set through the following deprecated APIs: + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), + * ::cuParamSetv(). + * + * When the kernel \p f is launched via ::cuLaunchKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. 
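+ *
+ * For comparison, a sketch of the more common \p kernelParams path (the kernel
+ * signature and variable names are hypothetical):
+ * \code
+   // __global__ void vecScale(float *x, int n);
+   CUdeviceptr dX;   // previously allocated with cuMemAlloc
+   int n = 1024;
+   void *params[] = { &dX, &n };
+   status = cuLaunchKernel(f, (n + 255) / 256, 1, 1,   // grid dimensions
+                           256, 1, 1,                  // block dimensions
+                           0, NULL,                    // shared memory, stream
+                           params, NULL);              // kernelParams, extra
+ * \endcode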
+ * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute + */ +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** @} */ /* END CUDA_EXEC */ + +/** + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the block-dimensions for the function + * + * \deprecated + * + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are + * created when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dimensions of + * \param x - X dimension + * \param y - Y dimension + * \param z - Z dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetSharedSize, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); + +/** + * \brief Sets the dynamic shared-memory size for the function + * + * \deprecated + * + * Sets through \p bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by \p hfunc is launched. 
+ * + * \param hfunc - Kernel to specify dynamic shared-memory size for + * \param bytes - Dynamic shared-memory size per thread in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); + +/** + * \brief Sets the parameter size for the function + * + * \deprecated + * + * Sets through \p numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to \p hfunc. + * + * \param hfunc - Kernel to set parameter size for + * \param numbytes - Size of parameter list in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); + +/** + * \brief Adds an integer parameter to the function's argument list + * + * \deprecated + * + * Sets an integer parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); + +/** + * \brief Adds a floating-point parameter to the function's argument list + * + * \deprecated + * + * Sets a floating-point parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); + +/** + * \brief Adds arbitrary data to the function's argument list + * + * \deprecated + * + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr + * into the parameter space of the kernel corresponding to \p hfunc. 
\p offset + * is a byte offset. + * + * \param hfunc - Kernel to add data to + * \param offset - Offset to add data to argument list + * \param ptr - Pointer to arbitrary data + * \param numbytes - Size of data to copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block + * contains the number of threads specified by a previous call to + * ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuLaunch(CUfunction f); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). 
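+ *
+ * For reference, a sketch of the legacy launch sequence these deprecated entry
+ * points imply (a single int parameter; variable names hypothetical):
+ * \code
+   cuFuncSetBlockShape(f, 256, 1, 1);
+   cuParamSetSize(f, sizeof(int));
+   cuParamSeti(f, 0, n);
+   cuLaunchGridAsync(f, gridWidth, gridHeight, hStream);
+ * \endcode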
+ * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. In order to force the CUDA driver to retain + * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). + * + * \note_null_stream + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); + + +/** + * \brief Adds a texture-reference to the function's argument list + * + * \deprecated + * + * Makes the CUDA array or linear memory bound to the texture reference + * \p hTexRef available to a device program as a texture. In this version of + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. + * + * \param hfunc - Kernel to add texture-reference to + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) + * \param hTexRef - Texture-reference to add to argument list + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + */ +CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +/** @} */ /* END CUDA_EXEC_DEPRECATED */ + + +#if __CUDA_API_VERSION >= 6050 +/** + * \defgroup CUDA_OCCUPANCY Occupancy + * + * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the occupancy calculation functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * The \p Flags parameter controls how special cases are handled. 
The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, if caching is enabled, but
+ *   per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching
+ *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
+ *   the occupancy calculator return 0 in such cases. More information
+ *   about this feature can be found in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks - Returned occupancy
+ * \param func - Kernel for which occupancy is calculated
+ * \param blockSize - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * Returns in \p *blockSize a reasonable block size that can achieve
+ * the maximum occupancy (or, the maximum number of active warps with
+ * the fewest blocks per multiprocessor), and in \p *minGridSize the
+ * minimum grid size to achieve the maximum occupancy.
+ *
+ * If \p blockSizeLimit is 0, the configurator will use the maximum
+ * block size permitted by the device / function instead.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the
+ * user should leave both \p blockSizeToDynamicSMemSize and \p
+ * dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if
+ * the dynamic shared memory size is constant regardless of block
+ * size, the size should be passed through \p dynamicSMemSize, and \p
+ * blockSizeToDynamicSMemSize should be NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with
+ * different block sizes, the user needs to provide a unary function
+ * through \p blockSizeToDynamicSMemSize that computes the dynamic
+ * shared memory needed by \p func for any given block size. \p
+ * dynamicSMemSize is ignored.
An example signature is:
+ *
+ * \code
+ * // Take block size, returns dynamic shared memory needed
+ * size_t blockToSmem(int blockSize);
+ * \endcode
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
+ * \param func - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit - The maximum block size \p func is designed to handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
+ * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
+ * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags
+ * parameter.
+ *
+ * The \p Flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxPotentialBlockSize;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, the launch configuration that
+ *   produces maximal occupancy might not support global
+ *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+ *   guarantees that the produced launch configuration is global
+ *   caching compatible at a potential cost of occupancy. More information
+ *   about this feature can be found in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
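+ *
+ * A typical use of the occupancy calculator (a sketch; no dynamic shared
+ * memory, no block size limit; \p f, \p n, and \p params are hypothetical):
+ * \code
+   int minGridSize, blockSize;
+   cuOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, f, NULL, 0, 0);
+   int gridSize = (n + blockSize - 1) / blockSize;   // enough blocks to cover n elements
+   cuLaunchKernel(f, gridSize, 1, 1, blockSize, 1, 1, 0, NULL, params, NULL);
+ * \endcode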
+ * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * \param flags - Options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + +/** @} */ /* END CUDA_OCCUPANCY */ +#endif /* __CUDA_API_VERSION >= 6050 */ + +/** + * \defgroup CUDA_TEXREF Texture Reference Management + * + * ___MANBRIEF___ texture reference management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the texture reference management functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Binds an array as a texture reference + * + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is + * unbound. + * + * \param hTexRef - Texture reference to bind + * \param hArray - Array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Binds a mipmapped array to a texture reference + * + * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. + * Any previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * Any CUDA array previously bound to \p hTexRef is unbound. 
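+ *
+ * A typical binding sketch (handles hypothetical; \p hTexRef obtained via
+ * ::cuModuleGetTexRef; error checking omitted):
+ * \code
+   cuTexRefSetFormat(hTexRef, CU_AD_FORMAT_FLOAT, 1);
+   cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
+   cuTexRefSetAddressMode(hTexRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
+   cuTexRefSetMipmappedArray(hTexRef, hMipmappedArray, CU_TRSA_OVERRIDE_FORMAT);
+ * \endcode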
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param hMipmappedArray - Mipmapped array to bind
+ * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Binds an address as a texture reference
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses, ::cuTexRefSetAddress() passes back a byte offset in
+ * \p *ByteOffset that must be applied to texture fetches in order to read from
+ * the desired memory. This offset must be divided by the texel size and
+ * passed to kernels that read from the texture so it can be applied to the
+ * ::tex1Dfetch() function.
+ *
+ * If the device memory pointer was returned from ::cuMemAlloc(), the offset
+ * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
+ *
+ * The total number of elements (or texels) in the linear address range
+ * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
+ * The number of elements is computed as (\p bytes / bytesPerElement),
+ * where bytesPerElement is determined from the data format and number of
+ * components set using ::cuTexRefSetFormat().
+ *
+ * \param ByteOffset - Returned byte offset
+ * \param hTexRef - Texture reference to bind
+ * \param dptr - Device pointer to bind
+ * \param bytes - Size of memory to bind in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+
+/**
+ * \brief Binds an address as a 2D texture reference
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Using a ::tex2D() function inside a kernel requires a call to either
+ * ::cuTexRefSetArray() to bind the corresponding texture reference to an
+ * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
+ * memory.
+ *
+ * Function calls to ::cuTexRefSetFormat() cannot follow calls to
+ * ::cuTexRefSetAddress2D() for the same texture reference.
+ * + * It is required that \p dptr be aligned to the appropriate hardware-specific + * texture alignment. You can query this value using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Width and Height, which are specified in elements (or texels), cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * \p Pitch, which is specified in bytes, cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * \param hTexRef - Texture reference to bind + * \param desc - Descriptor of CUDA array + * \param dptr - Device pointer to bind + * \param Pitch - Line pitch in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Sets the format for a texture reference + * + * Specifies the format of the data to be read by the texture reference + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components per + * array element. + * + * \param hTexRef - Texture reference + * \param fmt - Format to set + * \param NumPackedComponents - Number of components per array element + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); + +/** + * \brief Sets the addressing mode for a texture reference + * + * Specifies the addressing mode \p am for the given dimension \p dim of the + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from the + * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined + * as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. 
+ * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * \param hTexRef - Texture reference + * \param dim - Dimension + * \param am - Addressing mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); + +/** + * \brief Sets the filtering mode for a texture reference + * + * Specifies the filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap filtering mode for a texture reference + * + * Specifies the mipmap filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap level bias for a texture reference + * + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * reading memory through the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
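+ *
+ * For example, a sketch (illustrative; error checking omitted) that enables
+ * trilinear-style sampling on a reference bound to a mipmapped array and
+ * applies a slight sharpening bias:
+ * \code
+    void configureMipmaps(CUtexref hTexRef)
+    {
+        cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
+        cuTexRefSetMipmapFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
+        // A negative bias selects more detailed mipmap levels.
+        cuTexRefSetMipmapLevelBias(hTexRef, -0.5f);
+    }
+ * \endcode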
+ *
+ * \param hTexRef - Texture reference
+ * \param bias - Mipmap level bias
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
+
+/**
+ * \brief Sets the min/max mipmap level clamps for a texture reference
+ *
+ * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
+ * respectively, to be used when reading memory through the texture reference
+ * \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param minMipmapLevelClamp - Mipmap min level clamp
+ * \param maxMipmapLevelClamp - Mipmap max level clamp
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+
+/**
+ * \brief Sets the maximum anisotropy for a texture reference
+ *
+ * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef - Texture reference
+ * \param maxAniso - Maximum anisotropy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+
+/**
+ * \brief Sets the flags for a texture reference
+ *
+ * Specifies optional flags via \p Flags to control the behavior of data
+ * returned through the texture reference \p hTexRef. The valid flags are:
+ *
+ * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ * having the texture promote integer data to floating point data in the
+ * range [0, 1]. Note that textures with a 32-bit integer format
+ * are not promoted, regardless of whether or not this
+ * flag is specified;
+ * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
+ * default behavior of having the texture coordinates range
+ * from [0, Dim) where Dim is the width or height of the CUDA
+ * array.
Instead, the texture coordinates [0, 1.0) reference + * the entire breadth of the array dimension; + * + * \param hTexRef - Texture reference + * \param Flags - Optional flags to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets the address associated with a texture reference + * + * Returns in \p *pdptr the base address bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. + * + * \param pdptr - Returned device address + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Gets the array bound to a texture reference + * + * Returns in \p *phArray the CUDA array bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. + * + * \param phArray - Returned array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); + +/** + * \brief Gets the mipmapped array bound to a texture reference + * + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA mipmapped array. 
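+ *
+ * For example, a sketch (illustrative) that uses the error code to determine
+ * whether a mipmapped array is currently bound to a texture reference:
+ * \code
+    int hasMipmappedArray(CUtexref hTexRef)
+    {
+        CUmipmappedArray hMipArr;
+        // CUDA_ERROR_INVALID_VALUE means no mipmapped array is bound.
+        return cuTexRefGetMipmappedArray(&hMipArr, hTexRef) == CUDA_SUCCESS;
+    }
+ * \endcode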
+ *
+ * \param phMipmappedArray - Returned mipmapped array
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the addressing mode used by a texture reference
+ *
+ * Returns in \p *pam the addressing mode corresponding to the
+ * dimension \p dim of the texture reference \p hTexRef. Currently, the only
+ * valid values for \p dim are 0 and 1.
+ *
+ * \param pam - Returned addressing mode
+ * \param hTexRef - Texture reference
+ * \param dim - Dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+
+/**
+ * \brief Gets the filtering mode used by a texture reference
+ *
+ * Returns in \p *pfm the filtering mode of the texture reference
+ * \p hTexRef.
+ *
+ * \param pfm - Returned filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the format used by a texture reference
+ *
+ * Returns in \p *pFormat and \p *pNumChannels the format and number
+ * of components of the CUDA array bound to the texture reference \p hTexRef.
+ * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
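+ *
+ * For example, a sketch (illustrative; error checking omitted) that derives
+ * the texel size in bytes from the queried format and channel count:
+ * \code
+    size_t texelSize(CUtexref hTexRef)
+    {
+        CUarray_format fmt;
+        int numChannels;
+        cuTexRefGetFormat(&fmt, &numChannels, hTexRef);
+        size_t perChannel =
+            (fmt == CU_AD_FORMAT_UNSIGNED_INT8  || fmt == CU_AD_FORMAT_SIGNED_INT8) ? 1 :
+            (fmt == CU_AD_FORMAT_UNSIGNED_INT16 || fmt == CU_AD_FORMAT_SIGNED_INT16 ||
+             fmt == CU_AD_FORMAT_HALF) ? 2 : 4;  // remaining formats are 4 bytes
+        return perChannel * (size_t)numChannels;
+    }
+ * \endcode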
+ *
+ * \param pFormat - Returned format
+ * \param pNumChannels - Returned number of components
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
+ */
+CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap filtering mode for a texture reference
+ *
+ * Returns the mipmap filtering mode in \p pfm that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pfm - Returned mipmap filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap level bias for a texture reference
+ *
+ * Returns the mipmap level bias in \p pbias that's added to the specified mipmap
+ * level when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pbias - Returned mipmap level bias
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+
+/**
+ * \brief Gets the min/max mipmap level clamps for a texture reference
+ *
+ * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
+ * that are used when reading memory through the texture reference \p hTexRef.
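+ *
+ * For example, a sketch (illustrative) that reads back the full mipmap
+ * level-selection state of a texture reference in one place:
+ * \code
+    void queryMipState(CUtexref hTexRef)
+    {
+        CUfilter_mode mipFilter;
+        float bias, minClamp, maxClamp;
+        cuTexRefGetMipmapFilterMode(&mipFilter, hTexRef);
+        cuTexRefGetMipmapLevelBias(&bias, hTexRef);
+        cuTexRefGetMipmapLevelClamp(&minClamp, &maxClamp, hTexRef);
+        // mipFilter, bias, minClamp and maxClamp now hold the sampling state.
+    }
+ * \endcode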
+ * + * \param pminMipmapLevelClamp - Returned mipmap min level clamp + * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); + +/** + * \brief Gets the maximum anisotropy for a texture reference + * + * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pmaxAniso - Returned maximum anisotropy + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); + +/** + * \brief Gets the flags used by a texture reference + * + * Returns in \p *pFlags the flags of the texture reference \p hTexRef. + * + * \param pFlags - Returned flags + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF */ + +/** + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a texture reference + * + * \deprecated + * + * Creates a texture reference and returns its handle in \p *pTexRef. Once + * created, the application must call ::cuTexRefSetArray() or + * ::cuTexRefSetAddress() to associate the reference with allocated memory. + * Other texture reference functions are used to specify the format and + * interpretation (addressing, filtering, etc.) to be used when the memory is + * read through this texture reference. 
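+ *
+ * For example, a sketch of the create/configure/destroy lifecycle (names are
+ * illustrative; error checking omitted; this API is deprecated, so new code
+ * should prefer texture objects or module texture references):
+ * \code
+    void legacyTexRef(CUarray hArray)
+    {
+        CUtexref hTexRef;
+        cuTexRefCreate(&hTexRef);
+        cuTexRefSetFormat(hTexRef, CU_AD_FORMAT_FLOAT, 1);
+        cuTexRefSetArray(hTexRef, hArray, CU_TRSA_OVERRIDE_FORMAT);
+        // ... launch kernels that fetch through hTexRef ...
+        cuTexRefDestroy(hTexRef);
+    }
+ * \endcode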
+ *
+ * \param pTexRef - Returned texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefDestroy
+ */
+CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
+
+/**
+ * \brief Destroys a texture reference
+ *
+ * \deprecated
+ *
+ * Destroys the texture reference specified by \p hTexRef.
+ *
+ * \param hTexRef - Texture reference to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefCreate
+ */
+CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
+
+/** @} */ /* END CUDA_TEXREF_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_SURFREF Surface Reference Management
+ *
+ * ___MANBRIEF___ surface reference management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface reference management functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the CUDA array for a surface reference.
+ *
+ * Sets the CUDA array \p hArray to be read and written by the surface reference
+ * \p hSurfRef. Any previous CUDA array state associated with the surface
+ * reference is superseded by this function. \p Flags must be set to 0.
+ * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
+ * Any CUDA array previously bound to \p hSurfRef is unbound.
+ *
+ * \param hSurfRef - Surface reference handle
+ * \param hArray - CUDA array handle
+ * \param Flags - Must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetSurfRef, ::cuSurfRefGetArray
+ */
+CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Passes back the CUDA array bound to a surface reference.
+ *
+ * Returns in \p *phArray the CUDA array bound to the surface reference
+ * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned CUDA array handle
+ * \param hSurfRef - Surface reference handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
+ */
+CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+
+/** @} */ /* END CUDA_SURFREF */
+
+#if __CUDA_API_VERSION >= 5000
+/**
+ * \defgroup CUDA_TEXOBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the texture object management functions of the
+ * low-level CUDA driver application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::CUDA_RESOURCE_DESC structure is defined as: + * \code + typedef struct CUDA_RESOURCE_DESC_st + { + CUresourcetype resType; + + union { + struct { + CUarray hArray; + } array; + struct { + CUmipmappedArray hMipmappedArray; + } mipmap; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t sizeInBytes; + } linear; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + + unsigned int flags; + } CUDA_RESOURCE_DESC; + + * \endcode + * where: + * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, + CU_RESOURCE_TYPE_LINEAR = 0x02, + CU_RESOURCE_TYPE_PITCH2D = 0x03 + } CUresourcetype; + * \endcode + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray + * must be set to a valid CUDA mipmapped array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width + * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. 
+ * + * - ::flags must be set to zero. + * + * + * The ::CUDA_TEXTURE_DESC struct is defined as + * \code + typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + CUfilter_mode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + } CUDA_TEXTURE_DESC; + * \endcode + * where + * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. + * + * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is + * the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note + * that for CUDA mipmapped arrays, this flag has to be set. + * + * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be + * clamped to the range [1,16]. + * + * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. + * + * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. + * + * + * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as + * \code + typedef struct CUDA_RESOURCE_VIEW_DESC_st + { + CUresourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + } CUDA_RESOURCE_VIEW_DESC; + * \endcode + * where: + * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. 
+ * The underlying CUDA array must then have 2 or 4 channels, depending on the
+ * block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
+ * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
+ * format but with 4 channels.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
+ * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ * this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
+ * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ * this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ * original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ * For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
+ * will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ * then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ * has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ * For non-layered resources, this value has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources,
+ * this value has to be zero.
+ *
+ *
+ * \param pTexObject - Texture object to create
+ * \param pResDesc - Resource descriptor
+ * \param pTexDesc - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexObjectDestroy
+ */
+CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc);
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexObjectCreate
+ */
+CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
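+ *
+ * For example, a sketch (illustrative names; error checking omitted; \c memset
+ * from \c <string.h> is assumed) that creates a texture object from a CUDA
+ * array and then queries its resource descriptor back:
+ * \code
+    CUtexObject makeTexObject(CUarray hArray)
+    {
+        CUDA_RESOURCE_DESC resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+        resDesc.res.array.hArray = hArray;
+
+        CUDA_TEXTURE_DESC texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
+        texDesc.filterMode     = CU_TR_FILTER_MODE_LINEAR;
+
+        CUtexObject texObj;
+        cuTexObjectCreate(&texObj, &resDesc, &texDesc, NULL);
+
+        CUDA_RESOURCE_DESC queried;
+        cuTexObjectGetResourceDesc(&queried, texObj);  // queried.resType == CU_RESOURCE_TYPE_ARRAY
+        return texObj;
+    }
+ * \endcode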
+ *
+ * \param pResDesc - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexObjectCreate
+ */
+CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexObjectCreate
+ */
+CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexObjectCreate
+ */
+CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject);
+
+/** @} */ /* END CUDA_TEXOBJECT */
+
+/**
+ * \defgroup CUDA_SURFOBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface object management functions of the
+ * low-level CUDA driver application programming interface. The surface
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface loads/stores on. ::CUDA_RESOURCE_DESC::resType must be
+ * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc - Resource descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuSurfObjectDestroy
+ */
+CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
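+ *
+ * For example, a sketch of the surface object lifecycle (illustrative names;
+ * error checking omitted; \c memset from \c <string.h> is assumed; the array
+ * must have been created with the ::CUDA_ARRAY3D_SURFACE_LDST flag):
+ * \code
+    void useSurfObject(CUarray hArray)
+    {
+        CUDA_RESOURCE_DESC resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+        resDesc.res.array.hArray = hArray;
+
+        CUsurfObject surfObj;
+        cuSurfObjectCreate(&surfObj, &resDesc);
+        // ... launch kernels that perform surface loads/stores via surfObj ...
+        cuSurfObjectDestroy(surfObj);
+    }
+ * \endcode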
+ * + * \param surfObject - Surface object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuSurfObjectCreate + */ +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); + +/** + * \brief Returns a surface object's resource descriptor + * + * Returns the resource descriptor for the surface object specified by \p surfObject. + * + * \param pResDesc - Resource descriptor + * \param surfObject - Surface object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuSurfObjectCreate + */ +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); + +/** @} */ /* END CUDA_SURFOBJECT */ +#endif /* __CUDA_API_VERSION >= 5000 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access + * + * ___MANBRIEF___ direct peer context memory access functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the direct peer context memory access functions + * of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Queries if a device may directly access a peer device's memory. + * + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of + * directly accessing memory from contexts on \p peerDev and 0 otherwise. + * If direct access of \p peerDev from \p dev is possible, then access may be + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param dev - Device from which allocations on \p peerDev are to + * be directly accessed. + * \param peerDev - Device on which the allocations to be directly accessed + * by \p dev reside. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess + */ +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); + +/** + * \brief Enables direct access to memory allocations in a peer context. + * + * If both the current context and \p peerContext are on devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same + * major compute capability, then on success all allocations from \p peerContext will + * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional + * details. + * + * Note that access granted by this call is unidirectional and that in order to access + * memory from the current context in \p peerContext, a separate symmetric call + * to ::cuCtxEnablePeerAccess() is required. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates + * that the ::CUdevice of the current context cannot directly access memory + * from the ::CUdevice of \p peerContext. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of + * \p peerContext from the current context has already been enabled. + * + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * because hardware resources required for peer access have been exhausted. 
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
+ * is not a valid context, or if the current context is \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
+ *
+ * \param peerContext - Peer context to enable direct access to from the current context
+ * \param Flags - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuDeviceCanAccessPeer,
+ * ::cuCtxDisablePeerAccess
+ */
+CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
+
+/**
+ * \brief Disables direct access to memory allocations in a peer context and
+ * unregisters any registered allocations.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
+ * not yet been enabled from \p peerContext to the current context.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
+ * \p peerContext is not a valid context.
+ *
+ * \param peerContext - Peer context to disable direct access to
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuDeviceCanAccessPeer,
+ * ::cuCtxEnablePeerAccess
+ */
+CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
+
+/** @} */ /* END CUDA_PEER_ACCESS */
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+/**
+ * \defgroup CUDA_GRAPHICS Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsD3D9RegisterResource,
+ * ::cuGraphicsD3D10RegisterResource,
+ * ::cuGraphicsD3D11RegisterResource,
+ * ::cuGraphicsGLRegisterBuffer,
+ * ::cuGraphicsGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pArray - Returned array through which a subresource of \p resource may be accessed
+ * \param resource - Mapped resource to access
+ * \param arrayIndex - Array index for array textures or cubemap face
+ * index as defined by ::CUarray_cubemap_face for
+ * cubemap textures for the subresource to access
+ * \param mipLevel - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceGetMappedPointer
+ */
+CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+#if __CUDA_API_VERSION >= 5000
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time
+ * that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via a mipmapped array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceGetMappedPointer
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+
+#endif /* __CUDA_API_VERSION >= 5000 */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p *pDevPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
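+ *
+ * For example, a sketch of the usual map / access / unmap sequence for a
+ * registered buffer (illustrative names; error checking omitted; \c resource
+ * is assumed to have been registered via a graphics-API-specific register call):
+ * \code
+    void accessBuffer(CUgraphicsResource resource, CUstream hStream)
+    {
+        cuGraphicsMapResources(1, &resource, hStream);
+        CUdeviceptr dptr;
+        size_t numBytes;
+        cuGraphicsResourceGetMappedPointer(&dptr, &numBytes, resource);
+        // ... enqueue CUDA work in hStream that reads/writes dptr ...
+        cuGraphicsUnmapResources(1, &resource, hStream);
+    }
+ * \endcode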
+ *
+ * \param pDevPtr - Returned pointer through which \p resource may be accessed
+ * \param pSize - Returned size of the buffer accessible starting at \p *pDevPtr
+ * \param resource - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Sets \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ * resource will be used. It is therefore assumed that this resource will be
+ * read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ * access this resource will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ * which access this resource will not read from this resource and will
+ * write over the entire contents of the resource, so none of the data
+ * previously stored in the resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then
+ * ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources
+ */
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA usage + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsUnmapResources + */ +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cuGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources + */ +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** @} */ /* END CUDA_GRAPHICS */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); + + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuMemHostRegister + #undef cuGraphicsResourceSetMapFlags + #undef cuLinkCreate + #undef cuLinkAddData + #undef cuLinkAddFile + #undef cuDeviceTotalMem + #undef cuCtxCreate + #undef cuModuleGetGlobal + #undef cuMemGetInfo + #undef cuMemAlloc + #undef cuMemAllocPitch + #undef cuMemFree + #undef cuMemGetAddressRange + #undef cuMemAllocHost + #undef cuMemHostGetDevicePointer + #undef cuMemcpyHtoD + #undef cuMemcpyDtoH + #undef cuMemcpyDtoD + #undef cuMemcpyDtoA + #undef cuMemcpyAtoD + #undef cuMemcpyHtoA + #undef cuMemcpyAtoH + #undef cuMemcpyAtoA + #undef cuMemcpyHtoAAsync + #undef cuMemcpyAtoHAsync + #undef cuMemcpy2D + #undef cuMemcpy2DUnaligned + #undef cuMemcpy3D + #undef cuMemcpyHtoDAsync + #undef cuMemcpyDtoHAsync + #undef cuMemcpyDtoDAsync + #undef cuMemcpy2DAsync + #undef cuMemcpy3DAsync + #undef cuMemsetD8 + #undef cuMemsetD16 + #undef cuMemsetD32 + #undef cuMemsetD2D8 + #undef cuMemsetD2D16 + #undef cuMemsetD2D32 + #undef cuArrayCreate + #undef cuArrayGetDescriptor + #undef cuArray3DCreate + #undef cuArray3DGetDescriptor + #undef cuTexRefSetAddress + #undef cuTexRefSetAddress2D + #undef cuTexRefGetAddress + #undef cuGraphicsResourceGetMappedPointer + #undef cuCtxDestroy + #undef cuCtxPopCurrent + #undef cuCtxPushCurrent + #undef cuStreamDestroy + #undef cuEventDestroy + #undef cuMemcpy + #undef cuMemcpyAsync + #undef cuMemcpyPeer + #undef cuMemcpyPeerAsync + #undef cuMemcpy3DPeer + #undef cuMemcpy3DPeerAsync 
+ #undef cuMemsetD8Async
+ #undef cuMemsetD16Async
+ #undef cuMemsetD32Async
+ #undef cuMemsetD2D8Async
+ #undef cuMemsetD2D16Async
+ #undef cuMemsetD2D32Async
+ #undef cuStreamGetPriority
+ #undef cuStreamGetFlags
+ #undef cuStreamWaitEvent
+ #undef cuStreamAddCallback
+ #undef cuStreamAttachMemAsync
+ #undef cuStreamQuery
+ #undef cuStreamSynchronize
+ #undef cuEventRecord
+ #undef cuLaunchKernel
+ #undef cuGraphicsMapResources
+ #undef cuGraphicsUnmapResources
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050)
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+#endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+#endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050)
+CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010)
+CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */
+
+/**
+ * CUDA API made obsolete at API version 3020
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #define CUdeviceptr                  CUdeviceptr_v1
+    #define CUDA_MEMCPY2D_st             CUDA_MEMCPY2D_v1_st
+    #define CUDA_MEMCPY2D                CUDA_MEMCPY2D_v1
+    #define CUDA_MEMCPY3D_st             CUDA_MEMCPY3D_v1_st
+    #define CUDA_MEMCPY3D                CUDA_MEMCPY3D_v1
+    #define CUDA_ARRAY_DESCRIPTOR_st     CUDA_ARRAY_DESCRIPTOR_v1_st
+    #define CUDA_ARRAY_DESCRIPTOR        CUDA_ARRAY_DESCRIPTOR_v1
+    #define CUDA_ARRAY3D_DESCRIPTOR_st   CUDA_ARRAY3D_DESCRIPTOR_v1_st
+    #define CUDA_ARRAY3D_DESCRIPTOR      CUDA_ARRAY3D_DESCRIPTOR_v1
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
+
+typedef unsigned int CUdeviceptr;
+
+typedef struct CUDA_MEMCPY2D_st
+{
+    unsigned int srcXInBytes;   /**< Source X in bytes */
+    unsigned int srcY;          /**< Source Y */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+
+    unsigned int dstXInBytes;   /**< Destination X in bytes */
+    unsigned int dstY;          /**< Destination Y */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;
/**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +typedef struct CUDA_MEMCPY3D_st +{ + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D; + +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR; + +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR; + +CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); +CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize); +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr); +CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, 
CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N); +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N); +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N); +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes); +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000 +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +CUresult CUDAAPI 
cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef CUdeviceptr + #undef CUDA_MEMCPY2D_st + #undef CUDA_MEMCPY2D + #undef CUDA_MEMCPY3D_st + #undef CUDA_MEMCPY3D + #undef CUDA_ARRAY_DESCRIPTOR_st + #undef CUDA_ARRAY_DESCRIPTOR + #undef CUDA_ARRAY3D_DESCRIPTOR_st + #undef CUDA_ARRAY3D_DESCRIPTOR +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) + CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); + CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); + CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); + CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + CUresult CUDAAPI 
cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + + CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + + CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + CUresult CUDAAPI cuStreamQuery(CUstream hStream); + CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); +#endif + +#ifdef __cplusplus +} +#endif + +#undef __CUDA_API_VERSION + +#endif /* __cuda_cuda_h__ */ + diff --git a/include/isaac/driver/external/CUDA/nvrtc.h b/include/isaac/driver/external/CUDA/nvrtc.h new file mode 100644 index 000000000..390de0cc9 --- /dev/null +++ b/include/isaac/driver/external/CUDA/nvrtc.h @@ -0,0 +1,392 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __NVRTC_H__ +#define __NVRTC_H__ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#include + + +/*****************************//** + * + * \defgroup error Error Handling + * + ********************************/ + + +/** + * \ingroup error + * \brief CUDA Online Compiler API call result code. + */ +typedef enum { + NVRTC_SUCCESS = 0, + NVRTC_ERROR_OUT_OF_MEMORY = 1, + NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, + NVRTC_ERROR_INVALID_INPUT = 3, + NVRTC_ERROR_INVALID_PROGRAM = 4, + NVRTC_ERROR_INVALID_OPTION = 5, + NVRTC_ERROR_COMPILATION = 6, + NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7 +} nvrtcResult; + + +/** + * \ingroup error + * \brief ::nvrtcGetErrorString is a helper function that stringifies the + * given #nvrtcResult code, e.g., \link #nvrtcResult NVRTC_SUCCESS + * \endlink to \c "NVRTC_SUCCESS". For unrecognized enumeration + * values, it returns \c "NVRTC_ERROR unknown". + * + * \param [in] result CUDA Online Compiler API result code. + * \return Message string for the given #nvrtcResult code. + */ +const char *nvrtcGetErrorString(nvrtcResult result); + + +/****************************************//** + * + * \defgroup query General Information Query + * + *******************************************/ + + +/** + * \ingroup query + * \brief ::nvrtcVersion sets the output parameters \p major and \p minor + * with the CUDA Online Compiler version number. + * + * \param [out] major CUDA Online Compiler major version number. + * \param [out] minor CUDA Online Compiler minor version number. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * + */ +nvrtcResult nvrtcVersion(int *major, int *minor); + + +/********************************//** + * + * \defgroup compilation Compilation + * + ***********************************/ + + +/** + * \ingroup compilation + * \brief ::nvrtcProgram is the unit of compilation, and an opaque handle for + * a program. 
+ * + * To compile a CUDA program string, an instance of nvrtcProgram must be + * created first with ::nvrtcCreateProgram, then compiled with + * ::nvrtcCompileProgram. + */ +typedef struct _nvrtcProgram *nvrtcProgram; + + +/** + * \ingroup compilation + * \brief ::nvrtcCreateProgram creates an instance of ::nvrtcProgram with the + * given input parameters, and sets the output parameter \p prog with + * it. + * + * \param [out] prog CUDA Online Compiler program. + * \param [in] src CUDA program source. + * \param [in] name CUDA program name.\n + * \p name can be \c NULL; \c "default_program" is + * used when \p name is \c NULL. + * \param [in] numHeaders Number of headers used.\n + * \p numHeaders must be greater than or equal to 0. + * \param [in] headers Sources of the headers.\n + * \p headers can be \c NULL when \p numHeaders is + * 0. + * \param [in] includeNames Name of each header by which they can be + * included in the CUDA program source.\n + * \p includeNames can be \c NULL when \p numHeaders + * is 0. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink + * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcDestroyProgram + */ +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char **headers, + const char **includeNames); + + +/** + * \ingroup compilation + * \brief ::nvrtcDestroyProgram destroys the given program. + * + * \param [in] prog CUDA Online Compiler program. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcCreateProgram + */ +nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); + + +/** + * \ingroup compilation + * \brief ::nvrtcCompileProgram compiles the given program. + * + * The valid compiler options are: + * + * - Compilation targets + * - --gpu-architecture=\ (-arch)\n + * Specify the name of the class of GPU architectures for which the + * input must be compiled.\n + * - Valid GPU architecture names: + * - compute_20 + * - compute_30 + * - compute_35 + * - compute_50 + * - Default: compute_20 + * - Separate compilation / whole-program compilation + * - --device-c (-dc)\n + * Generate relocatable code that can be linked with other relocatable + * device code. It is equivalent to --relocatable-device-code=true. + * - --device-w (-dw)\n + * Generate non-relocatable code. It is equivalent to + * --relocatable-device-code=false. + * - --relocatable-device-code=[true, false] (-rdc)\n + * Enable (disable) the generation of relocatable device code. + * - Default: false + * - Debugging support + * - --device-debug (-G)\n + * Generate debug information. + * - --generate-line-info (-lineinfo)\n + * Generate line-number information. + * - Code generation + * - --maxrregcount=\ (-maxrregcount)\n + * Specify the maximum amount of registers that GPU functions can use. + * Until a function-specific limit, a higher value will generally + * increase the performance of individual GPU threads that execute this + * function. However, because thread registers are allocated from a + * global register pool on each GPU, a higher value of this option will + * also reduce the maximum thread block size, thereby reducing the amount + * of thread parallelism. 
Hence, a good maxrregcount value is the result + * of a trade-off. If this option is not specified, then no maximum is + * assumed. Value less than the minimum registers required by ABI will + * be bumped up by the compiler to ABI minimum limit. + * - --ftz=[true, false] (-ftz)\n + * When performing single-precision floating-point operations, flush + * denormal values to zero or preserve denormal values. --use_fast_math + * implies --ftz=true. + * - Default: false + * - --prec-sqrt=[true, false] (-prec-sqrt)\n + * For single-precision floating-point square root, use IEEE + * round-to-nearest mode or use a faster approximation. --use_fast_math + * implies --prec-sqrt=false. + * - Default: true + * - --prec-div=[true, false] (-prec-div)\n + * For single-precision floating-point division and reciprocals, use IEEE + * round-to-nearest mode or use a faster approximation. --use_fast_math + * implies --prec-div=false. + * - Default: true + * - --fmad=[true, false] (-fmad)\n + * Enables (disables) the contraction of floating-point multiplies and + * adds/subtracts into floating-point multiply-add operations (FMAD, + * FFMA, or DFMA). --use_fast_math implies --fmad=true. + * - Default: true + * - --use_fast_math (-use_fast_math)\n + * Make use of fast math operations. --use_fast_math implies --ftz=true + * --prec-div=false --prec-sqrt=false --fmad=true. + * - Preprocessing + * - --define-macro=\ (-D)\n + * macrodef can be either name or + * name=definitions. + * - name\n + * Predefine name as a macro with definition 1. + * - name=definition\n + * The contents of definition are tokenized and preprocessed + * as if they appeared during translation phase three in a \c \#define + * directive. In particular, the definition will be truncated by + * embedded new line characters. + * - --undefine-macro=\ (-U)\n + * Cancel any previous definition of \em name. + * - --include-path=\ (-I)\n + * Add the directory dir to the list of directories to be + * searched for headers. These paths are searched after the list of + * headers given to ::nvrtcCreateProgram. + * - --pre-include=\ (-include)\n + * Preinclude header during preprocessing. + * - Language Dialect + * - --std=c++11 (-std=c++11)\n + * Set language dialect to C++11. + * - --builtin-move-forward=[true, false] (-builtin-move-forward)\n + * Provide builtin definitions of std::move and std::forward, when C++11 + * language dialect is selected. + * - Default : true + * - --builtin-initializer-list=[true, false] (-builtin-initializer-list)\n + * Provide builtin definitions of std::initializer_list class and member + * functions when C++11 language dialect is selected. + * - Default : true + * - Misc + * - --disable-warnings (-w)\n + * Inhibit all warning messages. + * - --restrict (-restrict)\n + * Programmer assertion that all kernel pointer parameters are restrict + * pointers. + * - --device-as-default-execution-space +* (-default-device)\n + * Treat entities with no execution space annotation as \c __device__ + * entities. + * + * \param [in] prog CUDA Online Compiler program. + * \param [in] numOptions Number of compiler options passed. + * \param [in] options Compiler options in the form of C string array.\n + * \p options can be \c NULL when \p numOptions is 0. 
+ * + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink + * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink + * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink + */ +nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, + int numOptions, const char **options); + + +/** + * \ingroup compilation + * \brief ::nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX + * generated by the previous compilation of \p prog (including the + * trailing \c NULL). + * + * \param [in] prog CUDA Online Compiler program. + * \param [out] ptxSizeRet Size of the generated PTX (including the trailing + * \c NULL). + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetPTX + */ +nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); + + +/** + * \ingroup compilation + * \brief ::nvrtcGetPTX stores the PTX generated by the previous compilation + * of \p prog in the memory pointed by \p ptx. + * + * \param [in] prog CUDA Online Compiler program. + * \param [out] ptx Compiled result. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetPTXSize + */ +nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); + + +/** + * \ingroup compilation + * \brief ::nvrtcGetProgramLogSize sets \p logSizeRet with the size of the + * log generated by the previous compilation of \p prog (including the + * trailing \c NULL). + * + * Note that compilation log may be generated with warnings and informative + * messages, even when the compilation of \p prog succeeds. + * + * \param [in] prog CUDA Online Compiler program. + * \param [out] logSizeRet Size of the compilation log + * (including the trailing \c NULL). + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcGetProgramLog + */ +nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); + + +/** + * \ingroup compilation + * \brief ::nvrtcGetProgramLog stores the log generated by the previous + * compilation of \p prog in the memory pointed by \p log. + * + * \param [in] prog CUDA Online Compiler program. + * \param [out] log Compilation log. 
+ *
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see ::nvrtcGetProgramLogSize
+ */
+nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __NVRTC_H__ */
diff --git a/include/isaac/driver/handle.h b/include/isaac/driver/handle.h
index 831e4e3df..8a2e39eb2 100644
--- a/include/isaac/driver/handle.h
+++ b/include/isaac/driver/handle.h
@@ -12,26 +12,18 @@ namespace isaac
 namespace driver
 {
 
-#ifdef ISAAC_WITH_CUDA
 struct cu_event_t{
   operator bool() const { return first && second; }
   CUevent first;
   CUevent second;
 };
-#endif
-
-#ifdef ISAAC_WITH_CUDA
 #define HANDLE_TYPE(CLTYPE, CUTYPE) Handle<CLTYPE, CUTYPE>
-#else
-#define HANDLE_TYPE(CLTYPE, CUTYPE) Handle
-#endif
 
 template<class CLType, class CUType>
 class ISAACAPI Handle
 {
 private:
-#ifdef ISAAC_WITH_CUDA
   static void _delete(CUcontext x);
   static void _delete(CUdeviceptr x);
   static void _delete(CUstream x);
@@ -40,7 +32,6 @@ private:
   static void _delete(CUfunction);
   static void _delete(CUmodule x);
   static void _delete(cu_event_t x);
-#endif
 
   static void release(cl_context x);
   static void release(cl_mem x);
@@ -56,10 +47,8 @@ public:
   bool operator<(Handle const & other) const;
   CLType & cl();
   CLType const & cl() const;
-#ifdef ISAAC_WITH_CUDA
   CUType & cu();
   CUType const & cu() const;
-#endif
   ~Handle();
 
 private:
diff --git a/include/isaac/driver/kernel.h b/include/isaac/driver/kernel.h
index 22206e0e6..ae4eb1855 100644
--- a/include/isaac/driver/kernel.h
+++ b/include/isaac/driver/kernel.h
@@ -30,10 +30,8 @@ public:
 private:
   backend_type backend_;
   unsigned int address_bits_;
-#ifdef ISAAC_WITH_CUDA
   std::vector<std::shared_ptr<void> > cu_params_store_;
   std::vector<void*> cu_params_;
-#endif
   HANDLE_TYPE(cl_kernel, CUfunction) h_;
 };
 
diff --git a/include/isaac/driver/platform.h b/include/isaac/driver/platform.h
index d496c2ac0..0d5b025f6 100644
--- a/include/isaac/driver/platform.h
+++ b/include/isaac/driver/platform.h
@@ -19,9 +19,7 @@ class ISAACAPI Platform
 {
 private:
 public:
-#ifdef ISAAC_WITH_CUDA
   Platform(backend_type);
-#endif
   Platform(cl_platform_id const &);
   std::string name() const;
   std::string version() const;
diff --git a/include/isaac/kernels/keywords.h b/include/isaac/kernels/keywords.h
index 9b9aac7c6..1cbc8735e 100644
--- a/include/isaac/kernels/keywords.h
+++ b/include/isaac/kernels/keywords.h
@@ -21,10 +21,8 @@ static inline std::string size_type(driver::Device const & device)
 {
   switch(device.backend())
   {
-#ifdef ISAAC_WITH_CUDA
     case driver::CUDA:
       return "int";
-#endif
     case driver::OPENCL:
       return "int";
     default:
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 22fdeed39..680e739d1 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -18,19 +18,13 @@ else()
 endif()
 add_dependencies(isaac database)
 
-#Linkage
-if(OPENCL_FOUND)
-  target_link_libraries(isaac ${OPENCL_LIBRARIES})
-endif()
-if(CUDA_FOUND)
-  cuda_find_library_local_first(CUDA_NVRTC_LIBRARY nvrtc "\"nvrtc\" library")
-  target_link_libraries(isaac ${CUDA_CUDA_LIBRARY} ${CUDA_NVRTC_LIBRARY})
-  #Cuda JIT headers to file
-  set(CUDA_HELPERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/driver/helpers/cuda/)
-  file(GLOB_RECURSE CUDA_HELPERS_SRC ${CUDA_HELPERS_PATH}/*.cu)
-  CODE_TO_H(SOURCES ${CUDA_HELPERS_SRC} VARNAME kernel_files EXTENSION "hpp"
-    OUTPUT_DIR ${CUDA_HELPERS_PATH} NAMESPACE "isaac helpers cuda" TARGET cuda_headers EOF "0")
-endif()
+target_link_libraries(isaac "dl")
+
+#Cuda JIT headers to file
+set(CUDA_HELPERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/driver/helpers/cuda/)
+file(GLOB_RECURSE CUDA_HELPERS_SRC ${CUDA_HELPERS_PATH}/*.cu)
+CODE_TO_H(SOURCES ${CUDA_HELPERS_SRC} VARNAME kernel_files EXTENSION "hpp"
+  OUTPUT_DIR ${CUDA_HELPERS_PATH} NAMESPACE "isaac helpers cuda" TARGET cuda_headers EOF "0")
 
 #Installation
 install(TARGETS isaac LIBRARY DESTINATION lib ARCHIVE DESTINATION lib)
diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp
index eae47885f..3c0aa5775 100644
--- a/lib/driver/backend.cpp
+++ b/lib/driver/backend.cpp
@@ -127,15 +127,28 @@ std::list backend::contexts::cache_;
 
 void backend::platforms(std::vector<Platform> & platforms)
 {
-  #ifdef ISAAC_WITH_CUDA
-  platforms.push_back(Platform(CUDA));
-  #endif
-  cl_uint nplatforms;
-  ocl::check(clGetPlatformIDs(0, NULL, &nplatforms));
-  std::vector<cl_platform_id> clplatforms(nplatforms);
-  ocl::check(clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
-  for(cl_platform_id p: clplatforms)
-    platforms.push_back(Platform(p));
+  //if CUDA is here
+  if(dispatch::cuinit())
+  {
+    if(dispatch::nvrtcinit())
+      platforms.push_back(Platform(CUDA));
+    else
+      throw std::runtime_error("ISAAC: Unable to find NVRTC. Make sure you are using CUDA >= 7.0");
+  }
+
+  //if OpenCL is here
+  if(dispatch::clinit())
+  {
+    cl_uint nplatforms;
+    ocl::check(dispatch::clGetPlatformIDs(0, NULL, &nplatforms));
+    std::vector<cl_platform_id> clplatforms(nplatforms);
+    ocl::check(dispatch::clGetPlatformIDs(nplatforms, clplatforms.data(), NULL));
+    for(cl_platform_id p: clplatforms)
+      platforms.push_back(Platform(p));
+  }
+
+  if(platforms.empty())
+    throw std::runtime_error("ISAAC: No backend available. Make sure OpenCL and/or CUDA are available in your library path");
 }
 
 void backend::synchronize(Context const & context)
diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp
index a4292fce4..64bb69829 100644
--- a/lib/driver/buffer.cpp
+++ b/lib/driver/buffer.cpp
@@ -18,14 +18,12 @@ Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
-      cuda::check(cuMemAlloc(&h_.cu(), size));
+      cuda::check(dispatch::cuMemAlloc(&h_.cu(), size));
       break;
-#endif
     case OPENCL:
       cl_int err;
-      h_.cl() = clCreateBuffer(context.h_.cl(), CL_MEM_READ_WRITE, size, NULL, &err);
+      h_.cl() = dispatch::clCreateBuffer(context.h_.cl(), CL_MEM_READ_WRITE, size, NULL, &err);
       ocl::check(err);
       break;
     default:
diff --git a/lib/driver/check.cpp b/lib/driver/check.cpp
index 4a8e376c2..42180bf15 100644
--- a/lib/driver/check.cpp
+++ b/lib/driver/check.cpp
@@ -5,8 +5,6 @@ namespace isaac
 namespace driver
 {
 
-#ifdef ISAAC_WITH_CUDA
-
 namespace nvrtc
 {
 
@@ -101,8 +99,6 @@ void check(CUresult err)
   }
 }
 
-#endif
-
 namespace ocl
 {
 
diff --git a/lib/driver/command_queue.cpp b/lib/driver/command_queue.cpp
index bc91d1b95..3fbf9db73 100644
--- a/lib/driver/command_queue.cpp
+++ b/lib/driver/command_queue.cpp
@@ -27,15 +27,14 @@ CommandQueue::CommandQueue(Context const & context, Device const & device, cl_co
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
-      cuda::check(cuStreamCreate(&h_.cu(), 0));
+      cuda::check(dispatch::cuStreamCreate(&h_.cu(), 0));
       break;
-#endif
+
     case OPENCL:
     {
       cl_int err;
-      h_.cl() = clCreateCommandQueue(context.h_.cl(), device.h_.cl(), properties, &err);
+      h_.cl() = dispatch::clCreateCommandQueue(context.h_.cl(), device.h_.cl(), properties, &err);
       ocl::check(err);
       break;
     }
@@ -57,10 +56,8 @@ void CommandQueue::synchronize()
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
-    case CUDA: cuda::check(cuStreamSynchronize(h_.cu())); break;
-#endif - case OPENCL: ocl::check(clFinish(h_.cl())); break; + case CUDA: cuda::check(dispatch::cuStreamSynchronize(h_.cu())); break; + case OPENCL: ocl::check(dispatch::clFinish(h_.cl())); break; default: throw; } } @@ -70,16 +67,14 @@ Event CommandQueue::enqueue(Kernel const & kernel, NDRange global, driver::NDRan Event event(backend_); switch(backend_) { -#ifdef ISAAC_WITH_CUDA case CUDA: - cuda::check(cuEventRecord(event.h_.cu().first, h_.cu())); - cuda::check(cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2], + cuda::check(dispatch::cuEventRecord(event.h_.cu().first, h_.cu())); + cuda::check(dispatch::cuLaunchKernel(kernel.h_.cu(), global[0]/local[0], global[1]/local[1], global[2]/local[2], local[0], local[1], local[2], 0, h_.cu(),(void**)&kernel.cu_params_[0], NULL)); - cuda::check(cuEventRecord(event.h_.cu().second, h_.cu())); + cuda::check(dispatch::cuEventRecord(event.h_.cu().second, h_.cu())); break; -#endif case OPENCL: - ocl::check(clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, &event.handle().cl())); + ocl::check(dispatch::clEnqueueNDRangeKernel(h_.cl(), kernel.h_.cl(), global.dimension(), NULL, (const size_t *)global, (const size_t *) local, 0, NULL, &event.handle().cl())); break; default: throw; } @@ -90,16 +85,14 @@ void CommandQueue::write(Buffer const & buffer, bool blocking, std::size_t offse { switch(backend_) { -#ifdef ISAAC_WITH_CUDA case CUDA: if(blocking) - cuda::check(cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size)); + cuda::check(dispatch::cuMemcpyHtoD(buffer.h_.cu() + offset, ptr, size)); else - cuda::check(cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu())); + cuda::check(dispatch::cuMemcpyHtoDAsync(buffer.h_.cu() + offset, ptr, size, h_.cu())); break; -#endif case OPENCL: - ocl::check(clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); + ocl::check(dispatch::clEnqueueWriteBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); break; default: throw; } @@ -109,16 +102,14 @@ void CommandQueue::read(Buffer const & buffer, bool blocking, std::size_t offset { switch(backend_) { -#ifdef ISAAC_WITH_CUDA case CUDA: if(blocking) - cuda::check(cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size)); + cuda::check(dispatch::cuMemcpyDtoH(ptr, buffer.h_.cu() + offset, size)); else - cuda::check(cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu())); + cuda::check(dispatch::cuMemcpyDtoHAsync(ptr, buffer.h_.cu() + offset, size, h_.cu())); break; -#endif case OPENCL: - ocl::check(clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); + ocl::check(dispatch::clEnqueueReadBuffer(h_.cl(), buffer.h_.cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); break; default: throw; } diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 9efadfcd1..aa41173c0 100644 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -21,14 +21,12 @@ Context::Context(Device const & device) : backend_(device.backend_), device_(dev { switch(backend_) { -#ifdef ISAAC_WITH_CUDA case CUDA: - cuda::check(cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu())); + cuda::check(dispatch::cuCtxCreate(&h_.cu(), CU_CTX_SCHED_AUTO, device.h_.cu())); break; -#endif case OPENCL: cl_int err; - h_.cl() = clCreateContext(NULL, 1, &device_.h_.cl(), NULL, NULL, &err); + h_.cl() = 
dispatch::clCreateContext(NULL, 1, &device_.h_.cl(), NULL, NULL, &err);
       ocl::check(err);
       break;
     default:
diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp
index ac0f7c29f..0d350fe3e 100644
--- a/lib/driver/device.cpp
+++ b/lib/driver/device.cpp
@@ -13,23 +13,19 @@ namespace driver
 {
 
-#ifdef ISAAC_WITH_CUDA
 
 template<CUdevice_attribute attr>
 int Device::cuGetInfo() const
 {
   int res;
-  cuda::check(cuDeviceGetAttribute(&res, attr, h_.cu()));
+  cuda::check(dispatch::cuDeviceGetAttribute(&res, attr, h_.cu()));
   return res;
 }
 
 Device::Device(int ordinal): backend_(CUDA), h_(backend_, true)
 {
-  cuda::check(cuDeviceGet(&h_.cu(), ordinal));
+  cuda::check(dispatch::cuDeviceGet(&h_.cu(), ordinal));
 }
 
-#endif
-
-
 Device::Device(cl_device_id const & device, bool take_ownership) : backend_(OPENCL), h_(backend_, take_ownership)
 {
   h_.cl() = device;
@@ -85,9 +81,7 @@ unsigned int Device::address_bits() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
-    case CUDA: return sizeof(long long)*8;
-#endif
+    case CUDA: return sizeof(size_t)*8;
     case OPENCL: return ocl::info<CL_DEVICE_ADDRESS_BITS>(h_.cl());
     default: throw;
   }
@@ -99,9 +93,7 @@ driver::Platform Device::platform() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA: return Platform(CUDA);
-#endif
     case OPENCL: return Platform(ocl::info<CL_DEVICE_PLATFORM>(h_.cl()));
     default: throw;
   }
@@ -111,13 +103,12 @@ std::string Device::name() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       char tmp[128];
-      cuda::check(cuDeviceGetName(tmp, 128, h_.cu()));
+      cuda::check(dispatch::cuDeviceGetName(tmp, 128, h_.cu()));
       return std::string(tmp);
-#endif
-    case OPENCL: return ocl::info<CL_DEVICE_NAME>(h_.cl());
+    case OPENCL:
+      return ocl::info<CL_DEVICE_NAME>(h_.cl());
     default: throw;
   }
 }
@@ -126,10 +117,10 @@ std::string Device::vendor_str() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
-    case CUDA: return "NVidia";
-#endif
-    case OPENCL: return ocl::info<CL_DEVICE_VENDOR>(h_.cl());
+    case CUDA:
+      return "NVidia";
+    case OPENCL:
+      return ocl::info<CL_DEVICE_VENDOR>(h_.cl());
     default: throw;
   }
 }
@@ -139,7 +130,6 @@ std::vector Device::max_work_item_sizes() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
     {
       std::vector<size_t> result(3);
@@ -148,7 +138,6 @@ std::vector Device::max_work_item_sizes() const
       result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
       return result;
     }
-#endif
     case OPENCL:
       return ocl::info<CL_DEVICE_MAX_WORK_ITEM_SIZES>(h_.cl());
     default:
@@ -160,9 +149,7 @@ Device::Type Device::type() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA: return Type::GPU;
-#endif
     case OPENCL: return static_cast<Type>(ocl::info<CL_DEVICE_TYPE>(h_.cl()));
     default: throw;
   }
@@ -172,10 +159,8 @@ std::string Device::extensions() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       return "";
-#endif
     case OPENCL:
       return ocl::info<CL_DEVICE_EXTENSIONS>(h_.cl());
     default: throw;
@@ -188,10 +173,8 @@ std::pair Device::nv_compute_capability() const
 {
     case OPENCL:
       return std::pair<unsigned int, unsigned int>(ocl::info<CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV>(h_.cl()), ocl::info<CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV>(h_.cl()));
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       return std::pair<unsigned int, unsigned int>(cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(), cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>());
-#endif
     default: throw;
   }
@@ -203,12 +186,8 @@ bool Device::fp64_support() const
 {
     case OPENCL:
       return extensions().find("cl_khr_fp64")!=std::string::npos;
-
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       return true;
-#endif
-
     default: throw;
 }
@@ -230,19 +209,12 @@ std::string Device::infos() const
 }
 
 // Properties
-
-#ifdef ISAAC_WITH_CUDA
-  #define CUDACASE(CUNAME) case CUDA: return cuGetInfo<CUNAME>();
-#else
-  #define CUDACASE(CUNAME)
-#endif
-
 #define WRAP_ATTRIBUTE(ret, fname, CUNAME, CLNAME) \
   ret Device::fname() const\
   {\
     switch(backend_)\
     {\
-      CUDACASE(CUNAME)\
+      case CUDA: return cuGetInfo<CUNAME>();\
       case OPENCL: return static_cast<ret>(ocl::info<CLNAME>(h_.cl()));\
       default: throw;\
     }\
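The dispatch.cpp file introduced below is the centerpiece of this change: instead of linking against libOpenCL, libcuda and libnvrtc at build time, every driver entry point is resolved lazily through dlopen/dlsym and cached in a function-pointer slot. What follows is a minimal, self-contained sketch of that pattern; the names load_symbol and call_cuInit are hypothetical stand-ins, and the real code wraps the same idea in the dispatch class and its DEFINEn macros.

#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Open the library once, resolve the symbol once, and cache the pointer.
static void* load_symbol(void*& lib, void*& cache, const char* libname, const char* sym)
{
  if(!cache)
  {
    if(!lib)
      lib = dlopen(libname, RTLD_LAZY);   // deferred: no link-time dependency
    if(!lib)
      throw std::runtime_error(std::string("could not load ") + libname);
    cache = dlsym(lib, sym);
    if(!cache)
      throw std::runtime_error(std::string("could not resolve ") + sym);
  }
  return cache;
}

// Call sites cast the cached pointer back to the driver signature.
// CUresult is an int-sized enum, so int is used as a stand-in here;
// the cast from void* to a function pointer is the usual POSIX dlsym idiom.
typedef int (*cuInit_t)(unsigned int);
int call_cuInit(unsigned int flags)
{
  static void* lib = nullptr;
  static void* fptr = nullptr;
  return reinterpret_cast<cuInit_t>(load_symbol(lib, fptr, "libcuda.so", "cuInit"))(flags);
}

Resolving on first use keeps a single binary usable on machines that ship only one of the two runtimes, which is exactly what backend::platforms() above exploits when it probes cuinit() and clinit() before listing platforms.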
diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp
new file mode 100644
index 000000000..53cea26ee
--- /dev/null
+++ b/lib/driver/dispatch.cpp
@@ -0,0 +1,257 @@
+#include "isaac/driver/dispatch.h"
+
+namespace isaac
+{
+namespace driver
+{
+
+//Helpers for function definition
+#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
+
+#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
+
+#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
+
+#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
+
+#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
+
+#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
+
+#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
+
+#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
+
+#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
+
+#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
+
+#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
+    {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
+
+//Specialized helpers for OpenCL
+#define OCL_DEFINE1(ret, fname, t1) DEFINE1(clinit, opencl_, ret, fname, t1)
+#define OCL_DEFINE2(ret, fname, t1, t2) DEFINE2(clinit, opencl_, ret, fname, t1, t2)
+#define OCL_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(clinit, opencl_, ret, fname, t1, t2, t3)
+#define OCL_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(clinit, opencl_, ret, fname, t1, t2, t3, t4)
+#define OCL_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5)
+#define OCL_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define OCL_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define OCL_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define OCL_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+
+//Specialized helpers for CUDA
+#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
+#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
+#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
+#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
+#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
+#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+
+#define NVRTC_DEFINE1(ret, fname, t1) DEFINE1(nvrtcinit, nvrtc_, ret, fname, t1)
+#define NVRTC_DEFINE2(ret, fname, t1, t2) DEFINE2(nvrtcinit, nvrtc_, ret, fname, t1, t2)
+#define NVRTC_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3)
+#define NVRTC_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4)
+#define NVRTC_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5)
+#define NVRTC_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define NVRTC_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define NVRTC_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define NVRTC_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+
+
+bool dispatch::clinit()
+{
+  if(opencl_==nullptr)
+    opencl_ = dlopen("libOpenCL.so", RTLD_LAZY);
+  return opencl_ != nullptr;
+}
+
+bool dispatch::cuinit()
+{
+  if(cuda_==nullptr)
+    cuda_ = dlopen("libcuda.so", RTLD_LAZY);
+  return cuda_ != nullptr;
+}
+
+bool dispatch::nvrtcinit()
+{
+  if(nvrtc_==nullptr)
+    nvrtc_ = dlopen("libnvrtc.so", RTLD_LAZY);
+  return nvrtc_ != nullptr;
+}
+
+
+//OpenCL
+
+cl_int dispatch::clBuildProgram(cl_program a, cl_uint b, const cl_device_id * c, const char * d, void (*e)(cl_program, void *), void * f)
+{ return f_impl<dispatch::clinit>(opencl_, clBuildProgram, clBuildProgram_, "clBuildProgram", a, b, c, d, e, f); }
+
+cl_context dispatch::clCreateContext(const cl_context_properties * a, cl_uint b, const cl_device_id * c, void (*d)(const char *, const void *, size_t, void *), void * e, cl_int * f)
+{ return f_impl<dispatch::clinit>(opencl_, dispatch::clCreateContext, dispatch::clCreateContext_, "clCreateContext", a, b, c, d, e, f); }
+
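// The two wrappers above are written out by hand because their callback
// parameters are function-pointer types: the DEFINEn helpers declare each
// argument as "t5 e", which is ill-formed when t5 is a type such as
//     void (*)(cl_program, void *)
// since the parameter name must sit inside the declarator, as in
//     void (*e)(cl_program, void *)
// A typedef would restore the generic form. Sketch only: cl_build_cb is a
// hypothetical alias, not something this patch defines.
//
//     typedef void (*cl_build_cb)(cl_program, void *);
//     OCL_DEFINE6(cl_int, clBuildProgram, cl_program, cl_uint,
//                 const cl_device_id *, const char *, cl_build_cb, void *)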
+OCL_DEFINE9(cl_int, clEnqueueNDRangeKernel, cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const cl_event*, cl_event*)
+OCL_DEFINE4(cl_int, clSetKernelArg, cl_kernel, cl_uint, size_t, const void *)
+OCL_DEFINE1(cl_int, clReleaseMemObject, cl_mem)
+OCL_DEFINE1(cl_int, clFinish, cl_command_queue)
+OCL_DEFINE5(cl_int, clGetMemObjectInfo, cl_mem, cl_mem_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetCommandQueueInfo, cl_command_queue, cl_command_queue_info, size_t, void *, size_t *)
+OCL_DEFINE1(cl_int, clReleaseContext, cl_context)
+OCL_DEFINE1(cl_int, clReleaseEvent, cl_event)
+OCL_DEFINE9(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *)
+OCL_DEFINE9(cl_int, clEnqueueReadBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *)
+OCL_DEFINE6(cl_int, clGetProgramBuildInfo, cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *)
+OCL_DEFINE1(cl_int, clReleaseDevice, cl_device_id)
+OCL_DEFINE5(cl_int, clGetDeviceIDs, cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *)
+OCL_DEFINE5(cl_int, clGetContextInfo, cl_context, cl_context_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetDeviceInfo, cl_device_id, cl_device_info, size_t, void *, size_t *)
+OCL_DEFINE1(cl_int, clReleaseCommandQueue, cl_command_queue)
+OCL_DEFINE3(cl_int, clGetPlatformIDs, cl_uint, cl_platform_id *, cl_uint *)
+OCL_DEFINE5(cl_int, clGetPlatformInfo, cl_platform_id, cl_platform_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetEventProfilingInfo, cl_event, cl_profiling_info, size_t, void *, size_t *)
+OCL_DEFINE7(cl_program, clCreateProgramWithBinary, cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *)
+OCL_DEFINE4(cl_command_queue, clCreateCommandQueue, cl_context, cl_device_id, cl_command_queue_properties, cl_int *)
+OCL_DEFINE1(cl_int, clRetainEvent, cl_event)
+OCL_DEFINE1(cl_int, clReleaseProgram, cl_program)
+OCL_DEFINE1(cl_int, clFlush, cl_command_queue)
+OCL_DEFINE5(cl_int, clGetProgramInfo, cl_program, cl_program_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetKernelInfo, cl_kernel, cl_kernel_info, size_t, void *, size_t *)
+OCL_DEFINE6(cl_int, clGetKernelWorkGroupInfo, cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *)
+OCL_DEFINE3(cl_kernel, clCreateKernel, cl_program, const char *, cl_int *)
+OCL_DEFINE5(cl_mem, clCreateBuffer, cl_context, cl_mem_flags, size_t, void *, cl_int *)
+OCL_DEFINE5(cl_program, clCreateProgramWithSource, cl_context, cl_uint, const char **, const size_t *, cl_int *)
+OCL_DEFINE1(cl_int, clReleaseKernel, cl_kernel)
+
+//CUDA
+CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
+CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
+CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
+CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
+CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
+CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
+CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
+CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
+CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
+CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
+CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
+CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
+CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
+CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
+CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
+CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
+CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t)
+CUDA_DEFINE1(CUresult, cuInit, unsigned int)
+CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
+CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
+CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
+CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
+CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
+CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
+
+NVRTC_DEFINE3(nvrtcResult, nvrtcCompileProgram, nvrtcProgram, int, const char **)
+NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLogSize, nvrtcProgram, size_t *)
+NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTX, nvrtcProgram, char *)
+NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
+NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
+NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
+
+void dispatch::release()
+{
+  if(opencl_)
+    dlclose(opencl_);
+}
+
+void * dispatch::opencl_;
+void * dispatch::cuda_;
+void * dispatch::nvrtc_;
+
+//OpenCL
+void* dispatch::clBuildProgram_;
+void* dispatch::clEnqueueNDRangeKernel_;
+void* dispatch::clSetKernelArg_;
+void* dispatch::clReleaseMemObject_;
+void* dispatch::clFinish_;
+void* dispatch::clGetMemObjectInfo_;
+void* dispatch::clGetCommandQueueInfo_;
+void* dispatch::clReleaseContext_;
+void* dispatch::clReleaseEvent_;
+void* dispatch::clEnqueueWriteBuffer_;
+void* dispatch::clEnqueueReadBuffer_;
+void* dispatch::clGetProgramBuildInfo_;
+void* dispatch::clReleaseDevice_;
+void* dispatch::clCreateContext_;
+void* dispatch::clGetDeviceIDs_;
+void* dispatch::clGetContextInfo_;
+void* dispatch::clGetDeviceInfo_;
+void* dispatch::clReleaseCommandQueue_;
+void* dispatch::clGetPlatformIDs_;
+void* dispatch::clGetPlatformInfo_;
+void* dispatch::clGetEventProfilingInfo_;
+void* dispatch::clCreateProgramWithBinary_;
+void* dispatch::clCreateCommandQueue_;
+void* dispatch::clRetainEvent_;
+void* dispatch::clReleaseProgram_;
+void* dispatch::clFlush_;
+void* dispatch::clGetProgramInfo_;
+void* dispatch::clGetKernelInfo_;
+void* dispatch::clGetKernelWorkGroupInfo_;
+void* dispatch::clCreateKernel_;
+void* dispatch::clCreateBuffer_;
+void* dispatch::clCreateProgramWithSource_;
+void* dispatch::clReleaseKernel_;
+
+//CUDA
+void* dispatch::cuCtxDestroy_v2_;
+void* dispatch::cuEventCreate_;
+void* dispatch::cuDeviceGet_;
+void* dispatch::cuMemcpyDtoH_v2_;
+void* dispatch::cuStreamCreate_;
+void* dispatch::cuEventElapsedTime_;
+void* dispatch::cuMemFree_v2_;
+void* dispatch::cuMemcpyDtoHAsync_v2_;
+void* dispatch::cuDriverGetVersion_;
+void* dispatch::cuDeviceGetName_;
+void* dispatch::cuMemcpyHtoDAsync_v2_;
+void* dispatch::cuModuleLoad_;
+void* dispatch::cuLaunchKernel_;
+void* dispatch::cuModuleUnload_;
+void* dispatch::cuModuleLoadDataEx_;
+void* dispatch::cuDeviceGetAttribute_;
+void* dispatch::cuDeviceGetCount_;
+void* dispatch::cuMemcpyHtoD_v2_;
+void* dispatch::cuInit_;
+void* dispatch::cuEventRecord_;
+void* dispatch::cuCtxCreate_v2_;
+void* dispatch::cuModuleGetFunction_;
+void* dispatch::cuStreamSynchronize_;
+void* dispatch::cuStreamDestroy_v2_;
+void* dispatch::cuEventDestroy_v2_;
+void* dispatch::cuMemAlloc_v2_;
+
+void* dispatch::nvrtcCompileProgram_;
+void* dispatch::nvrtcGetProgramLogSize_;
+void* dispatch::nvrtcGetPTX_;
+void* dispatch::nvrtcGetPTXSize_;
+void* dispatch::nvrtcCreateProgram_;
+void* dispatch::nvrtcGetProgramLog_;
+
+}
+}
diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp
index 2d4bfa0fb..3c21f7d42 100644
--- a/lib/driver/event.cpp
+++ b/lib/driver/event.cpp
@@ -11,14 +11,14 @@ Event::Event(backend_type backend) : backend_(backend), h_(backend_, true)
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
-      cuda::check(cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT));
-      cuda::check(cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT));
+      cuda::check(dispatch::cuEventCreate(&h_.cu().first, CU_EVENT_DEFAULT));
+      cuda::check(dispatch::cuEventCreate(&h_.cu().second, CU_EVENT_DEFAULT));
       break;
-#endif
-    case OPENCL: break;
-    default: throw;
+    case OPENCL:
+      break;
+    default:
+      throw;
   }
 }
 
@@ -31,12 +31,10 @@ long Event::elapsed_time() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       float time;
-      cuda::check(cuEventElapsedTime(&time, h_.cu().first, h_.cu().second));
+      cuda::check(dispatch::cuEventElapsedTime(&time, h_.cu().first, h_.cu().second));
       return 1e6*time;
-#endif
     case OPENCL:
       return static_cast<long>(ocl::info<CL_PROFILING_COMMAND_END>(h_.cl()) - ocl::info<CL_PROFILING_COMMAND_START>(h_.cl()));
     default:
diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp
index f551959fe..31bb30fc3 100644
--- a/lib/driver/handle.cpp
+++ b/lib/driver/handle.cpp
@@ -7,63 +7,59 @@ namespace isaac
 namespace driver
 {
 
-#ifdef ISAAC_WITH_CUDA
+//CUDA
+template<class CLType, class CUType>
+void Handle<CLType, CUType>::_delete(CUcontext x) { cuda::check(dispatch::cuCtxDestroy(x)); }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUcontext x) { cuCtxDestroy(x); }
+void Handle<CLType, CUType>::_delete(CUdeviceptr x) { cuda::check(dispatch::cuMemFree(x)); }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUdeviceptr x) { cuMemFree(x); }
+void Handle<CLType, CUType>::_delete(CUstream x) { cuda::check(dispatch::cuStreamDestroy(x)); }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUstream x) { cuStreamDestroy(x); }
+void Handle<CLType, CUType>::_delete(CUdevice) { }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUdevice) { std::cout << "CUdevice" << std::endl;}
-
-template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUevent x) { cuEventDestroy(x); }
+void Handle<CLType, CUType>::_delete(CUevent x) { cuda::check(dispatch::cuEventDestroy(x)); }
 
 template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(CUfunction) { }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUmodule x) { cuModuleUnload(x); }
+void Handle<CLType, CUType>::_delete(CUmodule x) { cuda::check(dispatch::cuModuleUnload(x)); }
 
 template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
-#endif
 
+//OpenCL
+template<class CLType, class CUType>
+void Handle<CLType, CUType>::release(cl_context x) { ocl::check(dispatch::clReleaseContext(x)); }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_context x) { ocl::check(clReleaseContext(x)); }
+void Handle<CLType, CUType>::release(cl_mem x) { ocl::check(dispatch::clReleaseMemObject(x)); }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_mem x) { ocl::check(clReleaseMemObject(x)); }
+void Handle<CLType, CUType>::release(cl_command_queue x) { ocl::check(dispatch::clReleaseCommandQueue(x)); }
 
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_command_queue x) { ocl::check(clReleaseCommandQueue(x)); }
+void Handle<CLType, CUType>::release(cl_device_id x) { ocl::check(dispatch::clReleaseDevice(x)); }
diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp
index f551959fe..31bb30fc3 100644
--- a/lib/driver/handle.cpp
+++ b/lib/driver/handle.cpp
@@ -7,63 +7,59 @@ namespace isaac
 namespace driver
 {
-#ifdef ISAAC_WITH_CUDA
+//CUDA
+template<class CLType, class CUType>
+void Handle<CLType, CUType>::_delete(CUcontext x) { cuda::check(dispatch::cuCtxDestroy(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUcontext x) { cuCtxDestroy(x); }
+void Handle<CLType, CUType>::_delete(CUdeviceptr x) { cuda::check(dispatch::cuMemFree(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUdeviceptr x) { cuMemFree(x); }
+void Handle<CLType, CUType>::_delete(CUstream x) { cuda::check(dispatch::cuStreamDestroy(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUstream x) { cuStreamDestroy(x); }
+void Handle<CLType, CUType>::_delete(CUdevice) { }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUdevice) { std::cout << "CUdevice" << std::endl;}
-
-template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUevent x) { cuEventDestroy(x); }
+void Handle<CLType, CUType>::_delete(CUevent x) { cuda::check(dispatch::cuEventDestroy(x)); }
 template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(CUfunction) { }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::_delete(CUmodule x) { cuModuleUnload(x); }
+void Handle<CLType, CUType>::_delete(CUmodule x) { cuda::check(dispatch::cuModuleUnload(x)); }
 template<class CLType, class CUType>
 void Handle<CLType, CUType>::_delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
-#endif
+//OpenCL
+template<class CLType, class CUType>
+void Handle<CLType, CUType>::release(cl_context x) { ocl::check(dispatch::clReleaseContext(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_context x) { ocl::check(clReleaseContext(x)); }
+void Handle<CLType, CUType>::release(cl_mem x) { ocl::check(dispatch::clReleaseMemObject(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_mem x) { ocl::check(clReleaseMemObject(x)); }
+void Handle<CLType, CUType>::release(cl_command_queue x) { ocl::check(dispatch::clReleaseCommandQueue(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_command_queue x) { ocl::check(clReleaseCommandQueue(x)); }
+void Handle<CLType, CUType>::release(cl_device_id x) { ocl::check(dispatch::clReleaseDevice(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_device_id x) { ocl::check(clReleaseDevice(x)); }
+void Handle<CLType, CUType>::release(cl_event x) { ocl::check(dispatch::clReleaseEvent(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_event x) { ocl::check(clReleaseEvent(x)); }
+void Handle<CLType, CUType>::release(cl_kernel x) { ocl::check(dispatch::clReleaseKernel(x)); }
 template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_kernel x) { ocl::check(clReleaseKernel(x)); }
-
-template<class CLType, class CUType>
-void Handle<CLType, CUType>::release(cl_program x) { ocl::check(clReleaseProgram(x)); }
+void Handle<CLType, CUType>::release(cl_program x) { ocl::check(dispatch::clReleaseProgram(x)); }
 template<class CLType, class CUType>
 Handle<CLType, CUType>::Handle(backend_type backend, bool take_ownership): backend_(backend), has_ownership_(take_ownership)
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA: cu_.reset(new CUType());
-#endif
     case OPENCL: cl_.reset(new CLType());
   }
 }
@@ -71,10 +67,8 @@ Handle::Handle(backend_type backend, bool take_ownership): backe
 template<class CLType, class CUType>
 bool Handle<CLType, CUType>::operator==(Handle const & other) const
 {
-#ifdef ISAAC_WITH_CUDA
   if(backend_==CUDA && other.backend_==CUDA)
     return cu()==other.cu();
-#endif
   if(backend_==OPENCL && other.backend_==OPENCL)
     return cl()==other.cl();
   return false;
@@ -83,28 +77,22 @@ bool Handle::operator==(Handle const & other) const
 template<class CLType, class CUType>
 bool Handle<CLType, CUType>::operator<(Handle const & other) const
 {
-#ifdef ISAAC_WITH_CUDA
   if(backend_==CUDA && other.backend_==CUDA)
     return (*cu_)<(*other.cu_);
-#endif
   if(backend_==OPENCL && other.backend_==OPENCL)
     return (*cl_)<(*other.cl_);
-#ifdef ISAAC_WITH_CUDA
   if(backend_==CUDA && other.backend_==OPENCL)
     return true;
-#endif
   return false;
 }
 template<class CLType, class CUType>
 Handle<CLType, CUType>::~Handle()
 {
-#ifdef ISAAC_WITH_CUDA
-  if(has_ownership_ && cu_ && cu_.unique() && *cu_){
+  if(backend_==CUDA && has_ownership_ && cu_ && cu_.unique() && *cu_){
     _delete(*cu_);
   }
-#endif
-  if(has_ownership_ && cl_ && cl_.unique() && *cl_)
+  if(backend_==OPENCL && has_ownership_ && cl_ && cl_.unique() && *cl_)
     release(*cl_);
 }
@@ -116,7 +104,6 @@ template<class CLType, class CUType>
 CLType const & Handle<CLType, CUType>::cl() const
 { return *cl_; }
-#ifdef ISAAC_WITH_CUDA
 template<class CLType, class CUType>
 CUType & Handle<CLType, CUType>::cu()
 {
@@ -136,16 +123,6 @@ template class Handle;
 template class Handle;
 template class Handle;
 template class Handle;
-#else
-template class Handle;
-template class Handle;
-template class Handle;
-template class Handle;
-template class Handle;
-template class Handle;
-template class Handle;
-#endif
-
 }
 }
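The ownership rule that ~Handle enforces above (release only when this is the last shared reference and ownership was taken) is essentially std::shared_ptr with a custom deleter. A standard-library-only sketch of the same contract, with invented names:

#include <memory>
#include <cstdio>

struct fake_handle { int id; };   // stand-in for cl_context, CUcontext, ...

// What the real code calls, e.g. dispatch::clReleaseContext.
static void release(fake_handle* h) { std::printf("released %d\n", h->id); delete h; }

// Release exactly once, when the last owning copy dies; never release
// a handle whose ownership was not taken (mirrors take_ownership above).
static std::shared_ptr<fake_handle> wrap(fake_handle* raw, bool take_ownership)
{
  if(take_ownership)
    return std::shared_ptr<fake_handle>(raw, release);
  return std::shared_ptr<fake_handle>(raw, [](fake_handle*){}); // no-op deleter
}

int main()
{
  auto a = wrap(new fake_handle{1}, true);
  auto b = a;                        // shared copy: still a single release
  fake_handle borrowed{2};
  auto c = wrap(&borrowed, false);   // borrowed: never released here
}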
diff --git a/lib/driver/helpers/ocl/infos.hpp b/lib/driver/helpers/ocl/infos.hpp
index e2aa8d08c..797e6a217 100644
--- a/lib/driver/helpers/ocl/infos.hpp
+++ b/lib/driver/helpers/ocl/infos.hpp
@@ -44,7 +44,7 @@ struct info
   static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
   {
-    cl_int err = clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -56,7 +56,7 @@ struct info
   static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret)
   {
-    cl_int err = clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -67,12 +67,12 @@ struct info
   typedef cl_kernel_info type;
   static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
   static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -83,7 +83,7 @@ struct info
   typedef cl_context_info type;
   static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -94,12 +94,12 @@ struct info
   typedef cl_program_info type;
   static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
   static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -110,7 +110,7 @@ struct info
 {
   typedef cl_profiling_info type;
   static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -120,7 +120,7 @@ struct info
 {
   typedef cl_command_queue_info type;
   static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -130,7 +130,7 @@ struct info
 {
   typedef cl_command_queue_info type;
   static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){
-    cl_int err = clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
+    cl_int err = dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret);
     check(err);
   }
 };
@@ -343,7 +343,7 @@ inline typename detail::return_type::Result inf
   std::vector<size_t> sizes = info<CL_PROGRAM_BINARY_SIZES>(handle);
   for(size_t s: sizes)
     res.push_back(new unsigned char[s]);
-  clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(unsigned char**), (void*)res.data(), NULL);
+  dispatch::clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(unsigned char**), (void*)res.data(), NULL);
   return res;
 }
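All of the specializations above follow one traits pattern: each handle type is mapped once to the clGet*Info call that can query it, so generic code can read any property of any object through a single ocl::info front end. A stripped-down illustration of the same dispatch-by-specialization idea, with invented names:

#include <string>
#include <cstdio>

// Primary template left undefined: querying an unmapped type fails to compile.
template<class Handle> struct info_traits;

struct fake_device { std::string name; };  // stand-in for cl_device_id

// One explicit specialization per handle type, like info<cl_mem>, info<cl_program>, ...
template<> struct info_traits<fake_device>
{
  static std::string get(fake_device const & h) { return h.name; }
};

// Generic front end, analogous to ocl::info<...>(handle).
template<class Handle>
std::string query(Handle const & h) { return info_traits<Handle>::get(h); }

int main()
{
  fake_device d{"SomeDevice"};
  std::printf("%s\n", query(d).c_str());
}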
diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp
index a72a3367a..bc9415712 100644
--- a/lib/driver/kernel.cpp
+++ b/lib/driver/kernel.cpp
@@ -13,16 +13,14 @@ Kernel::Kernel(Program const & program, const char * name) : backend_(program.ba
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
-      cu_params_store_.reserve(32);
-      cu_params_.reserve(32);
-      cuda::check(cuModuleGetFunction(&h_.cu(), program.h_.cu(), name));\
+      cu_params_store_.reserve(64);
+      cu_params_.reserve(64);
+      cuda::check(dispatch::cuModuleGetFunction(&h_.cu(), program.h_.cu(), name));
       break;
-#endif
     case OPENCL:
       cl_int err;
-      h_.cl() = clCreateKernel(program.h_.cl(), name, &err);
+      h_.cl() = dispatch::clCreateKernel(program.h_.cl(), name, &err);
       ocl::check(err);
       break;
     default:
@@ -34,7 +32,6 @@ void Kernel::setArg(unsigned int index, std::size_t size, void* ptr)
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       if(index + 1 > cu_params_store_.size())
       {
@@ -45,9 +42,8 @@ void Kernel::setArg(unsigned int index, std::size_t size, void* ptr)
       memcpy(cu_params_store_[index].get(), ptr, size);
       cu_params_[index] = cu_params_store_[index].get();
       break;
-#endif
     case OPENCL:
-      ocl::check(clSetKernelArg(h_.cl(), index, size, ptr));
+      ocl::check(dispatch::clSetKernelArg(h_.cl(), index, size, ptr));
       break;
     default: throw;
@@ -58,14 +54,12 @@ void Kernel::setArg(unsigned int index, Buffer const & data)
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
     {
       setArg(index, sizeof(CUdeviceptr), (void*)&data.h_.cu());
       break;
     }
-#endif
     case OPENCL:
-      ocl::check(clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl()));
+      ocl::check(dispatch::clSetKernelArg(h_.cl(), index, sizeof(cl_mem), (void*)&data.h_.cl()));
       break;
     default: throw;
   }
@@ -75,14 +69,12 @@ void Kernel::setSizeArg(unsigned int index, size_t N)
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
     {
      int NN = static_cast<int>(N);
      setArg(index, sizeof(int), &NN);
      break;
    }
-#endif
    case OPENCL:
    {
      cl_int NN = static_cast<cl_int>(N);
diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp
index b80c4b4bc..524c1d80e 100644
--- a/lib/driver/platform.cpp
+++ b/lib/driver/platform.cpp
@@ -11,13 +11,11 @@ namespace isaac
 namespace driver
 {
-#ifdef ISAAC_WITH_CUDA
 Platform::Platform(backend_type backend): backend_(backend)
 {
   if(backend==CUDA)
-    cuInit(0);
+    dispatch::cuInit(0);
 }
-#endif
 Platform::Platform(cl_platform_id const & platform) : backend_(OPENCL)
 {
@@ -28,13 +26,12 @@ std::string Platform::version() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
       int version;
-      cuDriverGetVersion(&version);
+      dispatch::cuDriverGetVersion(&version);
       return tools::to_string(version);
-#endif
-    case OPENCL: return ocl::info(cl_platform_);
+    case OPENCL:
+      return ocl::info<CL_PLATFORM_VERSION>(cl_platform_);
     default: throw;
   }
 }
@@ -42,10 +39,7 @@ std::string Platform::name() const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA: return "CUDA";
-#endif
-
     case OPENCL: return ocl::info<CL_PLATFORM_NAME>(cl_platform_);
     default: throw;
   }
@@ -60,22 +54,20 @@ void Platform::devices(std::vector<Device> & devices) const
 {
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
     {
       int N;
-      cuda::check(cuDeviceGetCount(&N));
+      cuda::check(dispatch::cuDeviceGetCount(&N));
       for(int i = 0 ; i < N ; ++i)
         devices.push_back(Device(i));
       break;
     }
-#endif
     case OPENCL:
     {
       cl_uint ndevices;
-      ocl::check(clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices));
+      ocl::check(dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &ndevices));
       std::vector<cl_device_id> device_ids(ndevices);
-      ocl::check(clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL));
+      ocl::check(dispatch::clGetDeviceIDs(cl_platform_, CL_DEVICE_TYPE_ALL, ndevices, device_ids.data(), NULL));
       for(cl_device_id d : device_ids)
         devices.push_back(Device(d));
       break;
diff --git a/lib/driver/program.cpp b/lib/driver/program.cpp
index b2b239960..01536e648 100644
--- a/lib/driver/program.cpp
+++ b/lib/driver/program.cpp
@@ -4,9 +4,7 @@
 #include "isaac/driver/program.h"
 #include "isaac/driver/context.h"
-#ifdef ISAAC_WITH_CUDA
 #include "helpers/cuda/vector.hpp"
-#endif
 #include "helpers/ocl/infos.hpp"
 #include "sha1.hpp"
@@ -23,7 +21,6 @@ Program::Program(Context const & context, std::string const & source) : backend_
   std::string cache_path = context.cache_path_;
   switch(backend_)
   {
-#ifdef ISAAC_WITH_CUDA
     case CUDA:
     {
@@ -34,7 +31,7 @@ Program::Program(Context const & context, std::string const & source) : backend_
       //Load cached program
       if(cache_path.size() && std::ifstream(fname, std::ios::binary))
       {
-        cuda::check(cuModuleLoad(&h_.cu(), fname.c_str()));
+        cuda::check(dispatch::cuModuleLoad(&h_.cu(), fname.c_str()));
         break;
       }
@@ -43,25 +40,25 @@ Program::Program(Context const & context, std::string const & source) : backend_
       const char * includes[] = {"helper_math.h"};
       const char * src[] = {helpers::cuda::vector};
-      nvrtc::check(nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes));
+      nvrtc::check(dispatch::nvrtcCreateProgram(&prog, source.c_str(), NULL, 1, src, includes));
       try{
         const char * options[] = {"--gpu-architecture=compute_52", "--restrict"};
-        nvrtc::check(nvrtcCompileProgram(prog, 2, options));
+        nvrtc::check(dispatch::nvrtcCompileProgram(prog, 2, options));
       }catch(nvrtc::exception::compilation const &)
       {
         size_t logsize;
-        nvrtc::check(nvrtcGetProgramLogSize(prog, &logsize));
+        nvrtc::check(dispatch::nvrtcGetProgramLogSize(prog, &logsize));
         std::string log(logsize, 0);
-        nvrtc::check(nvrtcGetProgramLog(prog, (char*)log.data()));
+        nvrtc::check(dispatch::nvrtcGetProgramLog(prog, (char*)log.data()));
         std::cout << "Compilation failed:" << std::endl;
         std::cout << log << std::endl;
       }
       size_t ptx_size;
-      nvrtc::check(nvrtcGetPTXSize(prog, &ptx_size));
+      nvrtc::check(dispatch::nvrtcGetPTXSize(prog, &ptx_size));
       std::vector<char> ptx(ptx_size);
-      nvrtc::check(nvrtcGetPTX(prog, ptx.data()));
-      cuda::check(cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL));
+      nvrtc::check(dispatch::nvrtcGetPTX(prog, ptx.data()));
+      cuda::check(dispatch::cuModuleLoadDataEx(&h_.cu(), ptx.data(), 0, NULL, NULL));
       //Save cached program
       if (cache_path.size())
@@ -77,8 +74,8 @@ Program::Program(Context const & context, std::string const & source) : backend_
//      oss.close();
//      system(("/usr/local/cuda-7.0/bin/nvcc " + sha1 + ".cu -gencode arch=compute_50,code=sm_50 -cubin").c_str());
-//      system(("perl /home/philippe/Development/maxas/maxas.pl -e " + sha1 + ".cubin > " + sha1 + ".sass").c_str());
-//      system(("perl /home/philippe/Development/maxas/maxas.pl -i --noreuse" + sha1 + ".sass " + sha1 + ".cubin").c_str());
+//      system(("perl /maxas.pl -e " + sha1 + ".cubin > " + sha1 + ".sass").c_str());
+//      system(("perl /maxas.pl -i --noreuse" + sha1 + ".sass " + sha1 + ".cubin").c_str());
//      std::ifstream ifs(sha1 + ".cubin");
//      std::cout << sha1 << std::endl;
@@ -90,11 +87,10 @@ Program::Program(Context const & context, std::string const & source) : backend_
//      str.assign((std::istreambuf_iterator<char>(ifs)),
//                  std::istreambuf_iterator<char>());
-//      cuda::check(cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL));
+//      cuda::check(dispatch::cuModuleLoadDataEx(&h_.cu(), str.c_str(), 0, NULL, NULL));
       break;
     }
-#endif
     case OPENCL:
     {
       cl_int err;
@@ -118,18 +114,18 @@ Program::Program(Context const & context, std::string const & source) : backend_
         buffer.resize(len);
         cached.read((char*)buffer.data(), std::streamsize(len));
         char* cbuffer = buffer.data();
-        h_.cl() = clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err);
+        h_.cl() = dispatch::clCreateProgramWithBinary(context_.h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), &len, (const unsigned char **)&cbuffer, NULL, &err);
         ocl::check(err);
-        ocl::check(clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
+        ocl::check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
         return;
       }
     }
     std::size_t srclen = source.size();
     const char * csrc = source.c_str();
-    h_.cl() = clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err);
+    h_.cl() = dispatch::clCreateProgramWithSource(context_.h_.cl(), 1, &csrc, &srclen, &err);
     try{
-      ocl::check(clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
+      ocl::check(dispatch::clBuildProgram(h_.cl(), static_cast<cl_uint>(devices.size()), devices.data(), build_opt.c_str(), NULL, NULL));
     }catch(ocl::exception::build_program_failure const &){
       for(std::vector<cl_device_id>::const_iterator it = devices.begin(); it != devices.end(); ++it)
       {
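Every driver call in the constructor above goes through cuda::check, nvrtc::check or ocl::check, which keeps call sites one line long by converting C-style status codes into typed exceptions (the compilation-failure path above catches exactly such an exception). A minimal sketch of the idiom, with invented status values:

#include <stdexcept>
#include <string>

enum status { STATUS_SUCCESS = 0, STATUS_FAILURE = 1 };  // stand-in for CUresult/nvrtcResult/cl_int

struct driver_error : std::runtime_error
{
  explicit driver_error(int code)
    : std::runtime_error("driver call failed, code " + std::to_string(code)) {}
};

// One choke point for every return code.
inline void check(status s)
{
  if(s != STATUS_SUCCESS)
    throw driver_error(static_cast<int>(s));
}

int main()
{
  check(STATUS_SUCCESS);                    // no-op on the happy path
  try { check(STATUS_FAILURE); }
  catch(driver_error const &) { /* log and recover */ }
}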
diff --git a/lib/kernels/keywords.cpp b/lib/kernels/keywords.cpp
index 348ae4993..b16f047df 100644
--- a/lib/kernels/keywords.cpp
+++ b/lib/kernels/keywords.cpp
@@ -12,10 +12,10 @@ std::string const & keyword::get() const
 {
   switch(backend_)
   {
-    case driver::OPENCL: return opencl_;
-#ifdef ISAAC_WITH_CUDA
-    case driver::CUDA: return cuda_;
-#endif
+    case driver::OPENCL:
+      return opencl_;
+    case driver::CUDA:
+      return cuda_;
     default: throw;
   }
 }
diff --git a/lib/kernels/templates/axpy.cpp b/lib/kernels/templates/axpy.cpp
index 92343c019..a92228b17 100644
--- a/lib/kernels/templates/axpy.cpp
+++ b/lib/kernels/templates/axpy.cpp
@@ -42,10 +42,10 @@ std::string axpy::generate_impl(std::string const & suffix, expressions_tuple co
   switch(backend)
   {
-#ifdef ISAAC_WITH_CUDA
-    case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
-#endif
-    case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
+    case driver::CUDA:
+      stream << "#include \"helper_math.h\"" << std::endl; break;
+    case driver::OPENCL:
+      stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
   }
   stream << KernelPrefix(backend) << " void " << "axpy" << suffix << "(" << _size_t << " N," << generate_arguments(dtype, device, mappings, expressions) << ")" << std::endl;
diff --git a/lib/kernels/templates/dot.cpp b/lib/kernels/templates/dot.cpp
index 546298282..44af2b315 100644
--- a/lib/kernels/templates/dot.cpp
+++ b/lib/kernels/templates/dot.cpp
@@ -98,10 +98,10 @@ std::string dot::generate_impl(std::string const & suffix, expressions_tuple con
   * -----------------------*/
   switch(backend)
   {
-#ifdef ISAAC_WITH_CUDA
-    case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
-#endif
-    case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl; break;
+    case driver::CUDA:
+      stream << "#include \"helper_math.h\"" << std::endl; break;
+    case driver::OPENCL:
+      stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl; break;
   }
   stream << KernelPrefix(backend) << " void " << name[0] << "(" << arguments << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
diff --git a/lib/kernels/templates/gemm.cpp b/lib/kernels/templates/gemm.cpp
index 353a58718..f90116a3a 100644
--- a/lib/kernels/templates/gemm.cpp
+++ b/lib/kernels/templates/gemm.cpp
@@ -143,10 +143,10 @@ gemm_parameters::gemm_parameters(unsigned int simd_width
   switch(backend)
   {
-  #ifdef ISAAC_WITH_CUDA
-    case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
-  #endif
-    case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
+    case driver::CUDA:
+      stream << "#include \"helper_math.h\"" << std::endl; break;
+    case driver::OPENCL:
+      stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
   }
   stream << KernelPrefix(backend) << " void " << gemm_name << "(" << _size_t << " M, " << _size_t << " N, " << _size_t << " K, "
diff --git a/lib/kernels/templates/gemv.cpp b/lib/kernels/templates/gemv.cpp
index fb2f1a95f..9930725ed 100644
--- a/lib/kernels/templates/gemv.cpp
+++ b/lib/kernels/templates/gemv.cpp
@@ -74,10 +74,10 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
   int col_simd_width = (dot_type_ == REDUCE_COLUMNS) ? 1 : p_.simd_width;
   switch(backend)
   {
-#ifdef ISAAC_WITH_CUDA
-    case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
-#endif
-    case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
+    case driver::CUDA:
+      stream << "#include \"helper_math.h\"" << std::endl; break;
+    case driver::OPENCL:
+      stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
   }
   stream << KernelPrefix(backend) << " void " << name[0] << "(" << arguments << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
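The same prologue switch recurs in axpy, dot, gemm, gemv and ger: the CUDA path pulls in helper_math.h for vector-type operators, while the OpenCL path pins the work-group size with reqd_work_group_size so the device compiler can specialize the kernel. A condensed, free-standing sketch of that shared logic (a simplification; the real generators write into a richer stream object):

#include <sstream>
#include <string>

enum backend_type { OPENCL, CUDA };

std::string kernel_prologue(backend_type backend, int ls0, int ls1)
{
  std::ostringstream stream;
  if(backend == CUDA)
    stream << "#include \"helper_math.h\"" << std::endl;   // device-side helpers
  else
    stream << " __attribute__((reqd_work_group_size("      // fix the WG size at compile time
           << ls0 << "," << ls1 << ",1)))" << std::endl;
  return stream.str();
}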
diff --git a/lib/kernels/templates/ger.cpp b/lib/kernels/templates/ger.cpp
index 3898256c7..c904dc477 100644
--- a/lib/kernels/templates/ger.cpp
+++ b/lib/kernels/templates/ger.cpp
@@ -39,10 +39,10 @@ std::string ger::generate_impl(std::string const & suffix, expressions_tuple con
   switch(backend)
   {
-#ifdef ISAAC_WITH_CUDA
-    case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
-#endif
-    case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
+    case driver::CUDA:
+      stream << "#include \"helper_math.h\"" << std::endl; break;
+    case driver::OPENCL:
+      stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
   }
   stream << KernelPrefix(backend) << " void axpy" << suffix << "(" << _size_t << " M, " << _size_t << " N, " << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
diff --git a/lib/wrap/clBLAS.cpp b/lib/wrap/clBLAS.cpp
index bd8809c0c..5bf275d7b 100644
--- a/lib/wrap/clBLAS.cpp
+++ b/lib/wrap/clBLAS.cpp
@@ -33,9 +33,9 @@ extern "C"
       if(events)
       {
         events[i] = levents.front().handle().cl();
-        clRetainEvent(events[i]);
+        is::driver::dispatch::clRetainEvent(events[i]);
       }
-      clFlush(commandQueues[i]);
+      is::driver::dispatch::clFlush(commandQueues[i]);
     }
 }
diff --git a/python/setup.py b/python/setup.py
index 0dc3fa2fb..a4c575a49 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -60,7 +60,7 @@ def main():
     def find_opencl():
         cvars = sysconfig.get_config_vars()
-        lib = find_library('OpenCL', '/opt/adreno-driver*/lib' if for_android else '/opt/AMDAPPSDK*/lib/x86_64')
+        lib = find_library('OpenCL', '' if for_android else '')
         return {'include': '', 'lib': dirname(lib)} if lib else None
     def find_in_path(name, path):
@@ -124,7 +124,7 @@ def main():
         libraries += ['gnustl_shared']
     #Source files
-    src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
+    src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
     boostsrc = 'external/boost/libs/'
     for s in ['numpy','python','smart_ptr','system','thread']:
         src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
diff --git a/python/src/bind/driver.cpp b/python/src/bind/driver.cpp
index d0de36da4..d4b86cc76 100644
--- a/python/src/bind/driver.cpp
+++ b/python/src/bind/driver.cpp
@@ -107,9 +107,7 @@ void export_driver()
   bp::enum_<sc::driver::backend_type>("backend_type")
       .value("OPENCL", sc::driver::OPENCL)
-  #ifdef ISAAC_WITH_CUDA
       .value("CUDA", sc::driver::CUDA)
-  #endif
       ;
   bp::enum_
diff --git a/tests/linalg/CMakeLists.txt b/tests/linalg/CMakeLists.txt
index 6126de834..60905cbb4 100644
--- a/tests/linalg/CMakeLists.txt
+++ b/tests/linalg/CMakeLists.txt
@@ -7,5 +7,5 @@ get_property(ISAAC_PATH TARGET isaac PROPERTY LOCATION)
 foreach(PROG axpy dot ger gemv gemm)
   add_executable(${PROG}-test ${PROG}.cpp)
   add_test(${PROG} ${PROG}-test)
-  target_link_libraries(${PROG}-test isaac ${OPENCL_LIBRARIES})
+  target_link_libraries(${PROG}-test isaac)
 endforeach(PROG)