API: adding cuBLAS interface
This commit is contained in:
@@ -20,6 +20,9 @@ else()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic")
|
||||
endif()
|
||||
|
||||
# CUDA: put include/external/ and the vendored include/external/cuda headers
# on the directory-level include path.
include_directories(
  "${CMAKE_CURRENT_SOURCE_DIR}/include/external/"
  "${CMAKE_CURRENT_SOURCE_DIR}/include/external/cuda"
)
|
||||
|
||||
#Binary to convert .cu files to const char *
|
||||
if(NOT ANDROID)
|
||||
add_executable(bin2cpp ${CMAKE_MODULE_PATH}/helpers/bin2cpp.cpp)
|
||||
@@ -39,6 +42,9 @@ foreach(FILE ${LIBISAAC_SRC})
|
||||
set(LIBISAAC_SRC_STR "${_TMP} ${LIBISAAC_SRC_STR}")
|
||||
endforeach()
|
||||
|
||||
|
||||
|
||||
# Include directories
# Copy the directory-level INCLUDE_DIRECTORIES property of the current source
# directory into INCLUDE_DIRECTORIES_LST, then unset INCLUDE_DIRECTORIES_STR.
# A single reset is sufficient: get_property() does not read or write
# INCLUDE_DIRECTORIES_STR, so resetting it on both sides of the call was redundant.
get_property(INCLUDE_DIRECTORIES_LST
             DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
             PROPERTY INCLUDE_DIRECTORIES)
set(INCLUDE_DIRECTORIES_STR)
|
||||
|
@@ -11,12 +11,12 @@ if(CUDA_FOUND)
|
||||
set(BLAS_LIBS ${BLAS_LIBS} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
|
||||
endif()
|
||||
|
||||
#CLAMDBLAS
|
||||
find_package(CLAMDBLAS QUIET)
|
||||
if(CLAMDBLAS_FOUND)
|
||||
#CLBLAS
|
||||
find_package(CLBLAS QUIET)
|
||||
if(CLBLAS_FOUND)
|
||||
set(BLAS_DEF ${BLAS_DEF} "-DHAS_A_BLAS -DBENCH_CLBLAS")
|
||||
include_directories(${CLAMDBLAS_INCLUDE_DIR})
|
||||
set(BLAS_LIBS ${BLAS_LIBS} ${CLAMDBLAS_LIBRARIES} )
|
||||
include_directories(${CLBLAS_INCLUDE_DIR})
|
||||
set(BLAS_LIBS ${BLAS_LIBS} ${CLBLAS_LIBRARIES} )
|
||||
endif()
|
||||
|
||||
#CBLAS
|
||||
|
@@ -144,7 +144,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
||||
/* clblas */
|
||||
#ifdef BENCH_CLBLAS
|
||||
if(x.context().backend()==sc::driver::OPENCL)
|
||||
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event), 3*N*dtsize/t);
|
||||
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event), 3*N*dtsize/t);
|
||||
#endif
|
||||
/* BLAS */
|
||||
#ifdef BENCH_CBLAS
|
||||
|
64
include/external/cuda/builtin_types.h
vendored
Normal file
64
include/external/cuda/builtin_types.h
vendored
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "device_types.h"
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
#define EXCLUDE_FROM_RTC
|
||||
#include "driver_types.h"
|
||||
#undef EXCLUDE_FROM_RTC
|
||||
#endif /* !__CUDACC_RTC__ */
|
||||
#include "surface_types.h"
|
||||
#include "texture_types.h"
|
||||
#include "vector_types.h"
|
412
include/external/cuda/channel_descriptor.h
vendored
Normal file
412
include/external/cuda/channel_descriptor.h
vendored
Normal file
@@ -0,0 +1,412 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CHANNEL_DESCRIPTOR_H__)
|
||||
#define __CHANNEL_DESCRIPTOR_H__
|
||||
|
||||
#if defined(__cplusplus)
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "driver_types.h"
|
||||
#include "cuda_runtime_api.h"
|
||||
#include "host_defines.h"
|
||||
#include "vector_types.h"
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_HIGHLEVEL
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* \brief \hl Returns a channel descriptor using the specified format
|
||||
*
|
||||
* Returns a channel descriptor with format \p f and number of bits of each
|
||||
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
|
||||
* defined as:
|
||||
* \code
|
||||
struct cudaChannelFormatDesc {
|
||||
int x, y, z, w;
|
||||
enum cudaChannelFormatKind f;
|
||||
};
|
||||
* \endcode
|
||||
*
|
||||
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
|
||||
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
|
||||
*
|
||||
* \return
|
||||
* Channel descriptor with format \p f
|
||||
*
|
||||
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
|
||||
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
|
||||
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
|
||||
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
|
||||
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
|
||||
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
|
||||
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
|
||||
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
|
||||
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
|
||||
*/
|
||||
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
  /* Generic fallback: zero bits in x, y, z and w, kind None. */
  const int no_bits = 0;
  return cudaCreateChannelDesc(no_bits, no_bits, no_bits, no_bits, cudaChannelFormatKindNone);
}
|
||||
|
||||
/* Half: x component only, sizeof(unsigned short) * 8 bits, kind Float. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* Half1: x component only, sizeof(unsigned short) * 8 bits, kind Float. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* Half2: x and y components, sizeof(unsigned short) * 8 bits each, kind Float. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* Half4: x, y, z and w components, sizeof(unsigned short) * 8 bits each, kind Float. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* char: x component only, sizeof(char) * 8 bits. The kind is Unsigned when
 * _CHAR_UNSIGNED or __CHAR_UNSIGNED__ is defined, Signed otherwise. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
  const int bits = static_cast<int>(sizeof(char)) * 8;

#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
  const enum cudaChannelFormatKind kind = cudaChannelFormatKindUnsigned;
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
  const enum cudaChannelFormatKind kind = cudaChannelFormatKindSigned;
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */

  return cudaCreateChannelDesc(bits, 0, 0, 0, kind);
}
|
||||
|
||||
/* signed char: x component only, sizeof(signed char) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* unsigned char: x component only, sizeof(unsigned char) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* char1: x component only, sizeof(signed char) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* uchar1: x component only, sizeof(unsigned char) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* char2: x and y components, sizeof(signed char) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* uchar2: x and y components, sizeof(unsigned char) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* char4: x, y, z and w components, sizeof(signed char) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* uchar4: x, y, z and w components, sizeof(unsigned char) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* short: x component only, sizeof(short) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* unsigned short: x component only, sizeof(unsigned short) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* short1: x component only, sizeof(short) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* ushort1: x component only, sizeof(unsigned short) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* short2: x and y components, sizeof(short) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* ushort2: x and y components, sizeof(unsigned short) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* short4: x, y, z and w components, sizeof(short) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* ushort4: x, y, z and w components, sizeof(unsigned short) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* int: x component only, sizeof(int) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* unsigned int: x component only, sizeof(unsigned int) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* int1: x component only, sizeof(int) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* uint1: x component only, sizeof(unsigned int) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* int2: x and y components, sizeof(int) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* uint2: x and y components, sizeof(unsigned int) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* int4: x, y, z and w components, sizeof(int) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* uint4: x, y, z and w components, sizeof(unsigned int) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
#if !defined(__LP64__)
|
||||
|
||||
/* long: x component only, sizeof(long) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* unsigned long: x component only, sizeof(unsigned long) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* long1: x component only, sizeof(long) * 8 bits, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* ulong1: x component only, sizeof(unsigned long) * 8 bits, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* long2: x and y components, sizeof(long) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* ulong2: x and y components, sizeof(unsigned long) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
/* long4: x, y, z and w components, sizeof(long) * 8 bits each, kind Signed. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
|
||||
|
||||
/* ulong4: x, y, z and w components, sizeof(unsigned long) * 8 bits each, kind Unsigned. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
|
||||
|
||||
#endif /* !__LP64__ */
|
||||
|
||||
/* float: x component only, sizeof(float) * 8 bits, kind Float. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* float1: x component only, sizeof(float) * 8 bits, kind Float. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* float2: x and y components, sizeof(float) * 8 bits each, kind Float. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
}
|
||||
|
||||
/* float4: x, y, z and w components, sizeof(float) * 8 bits each, kind Float. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
}
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/** @} */
|
||||
/** @} */ /* END CUDART_TEXTURE_HL */
|
||||
|
||||
#endif /* !__CHANNEL_DESCRIPTOR_H__ */
|
338
include/external/cuda/cuComplex.h
vendored
Normal file
338
include/external/cuda/cuComplex.h
vendored
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(CU_COMPLEX_H_)
|
||||
#define CU_COMPLEX_H_
|
||||
|
||||
/* When trying to include C header file in C++ Code extern "C" is required
|
||||
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
|
||||
* extern "C" cannot be nested
|
||||
* Hence keep the header out of extern "C" block
|
||||
*/
|
||||
|
||||
#include <math.h> /* import fabsf, sqrt */
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#include "vector_types.h"
|
||||
|
||||
typedef float2 cuFloatComplex;
|
||||
|
||||
/* Real part of a single-precision complex value (its .x member). */
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
  const float real_part = x.x;
  return real_part;
}
|
||||
|
||||
/* Imaginary part of a single-precision complex value (its .y member). */
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
  const float imag_part = x.y;
  return imag_part;
}
|
||||
|
||||
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
|
||||
(float r, float i)
|
||||
{
|
||||
cuFloatComplex res;
|
||||
res.x = r;
|
||||
res.y = i;
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Complex conjugate: same real part, negated imaginary part. */
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
  const float re     = cuCrealf(x);
  const float neg_im = -cuCimagf(x);
  return make_cuFloatComplex (re, neg_im);
}
|
||||
/* Component-wise sum x + y. */
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
  const float re_sum = cuCrealf(x) + cuCrealf(y);
  const float im_sum = cuCimagf(x) + cuCimagf(y);
  return make_cuFloatComplex (re_sum, im_sum);
}
|
||||
|
||||
/* Component-wise difference x - y. */
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
  const float re_diff = cuCrealf(x) - cuCrealf(y);
  const float im_diff = cuCimagf(x) - cuCimagf(y);
  return make_cuFloatComplex (re_diff, im_diff);
}
|
||||
|
||||
/* This implementation could suffer from intermediate overflow even though
|
||||
* the final result would be in range. However, various implementations do
|
||||
* not guard against this (presumably to avoid losing performance), so we
|
||||
* don't do it either to stay competitive.
|
||||
*/
|
||||
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
  /* (a + bi)(c + di) = (ac - bd) + (ad + bc)i, with no rescaling of operands. */
  const float re = (cuCrealf(x) * cuCrealf(y)) - (cuCimagf(x) * cuCimagf(y));
  const float im = (cuCrealf(x) * cuCimagf(y)) + (cuCimagf(x) * cuCrealf(y));
  return make_cuFloatComplex (re, im);
}
|
||||
|
||||
/* This implementation guards against intermediate underflow and overflow
|
||||
* by scaling. Such guarded implementations are usually the default for
|
||||
* complex library implementations, with some also offering an unguarded,
|
||||
* faster version.
|
||||
*/
|
||||
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
  /* Scale both operands by 1 / (|Re y| + |Im y|) before forming the quotient. */
  const float norm1     = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
  const float inv_norm1 = 1.0f / norm1;
  const float xr = cuCrealf(x) * inv_norm1;
  const float xi = cuCimagf(x) * inv_norm1;
  const float yr = cuCrealf(y) * inv_norm1;
  const float yi = cuCimagf(y) * inv_norm1;

  /* Squared magnitude of the scaled divisor and its reciprocal. */
  const float denom     = (yr * yr) + (yi * yi);
  const float inv_denom = 1.0f / denom;

  return make_cuFloatComplex (((xr * yr) + (xi * yi)) * inv_denom,
                              ((xi * yr) - (xr * yi)) * inv_denom);
}
|
||||
|
||||
/*
|
||||
* We would like to call hypotf(), but it's not available on all platforms.
|
||||
* This discrete implementation guards against intermediate underflow and
|
||||
* overflow by scaling. Otherwise we would lose half the exponent range.
|
||||
* There are various ways of doing guarded computation. For now chose the
|
||||
* simplest and fastest solution, however this may suffer from inaccuracies
|
||||
* if sqrt and division are not IEEE compliant.
|
||||
*/
|
||||
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
  const float abs_re = fabsf(cuCrealf(x));
  const float abs_im = fabsf(cuCimagf(x));

  /* hi takes abs_re only when abs_re > abs_im; otherwise hi is abs_im. */
  const float hi = (abs_re > abs_im) ? abs_re : abs_im;
  const float lo = (abs_re > abs_im) ? abs_im : abs_re;

  /* hi * sqrt(1 + (lo / hi)^2), evaluated step by step. */
  float result = lo / hi;
  result = 1.0f + result * result;
  result = hi * sqrtf(result);

  /* When hi is zero or either magnitude exceeds 3.402823466e38f, return hi + lo instead. */
  if ((hi == 0.0f) || (hi > 3.402823466e38f) || (lo > 3.402823466e38f)) {
    result = hi + lo;
  }
  return result;
}
|
||||
|
||||
/* Double precision */
|
||||
typedef double2 cuDoubleComplex;
|
||||
|
||||
/* Real part of a double-precision complex value (its .x member). */
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
  const double real_part = x.x;
  return real_part;
}
|
||||
|
||||
/* Imaginary part of a double-precision complex value (its .y member). */
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
  const double imag_part = x.y;
  return imag_part;
}
|
||||
|
||||
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
|
||||
(double r, double i)
|
||||
{
|
||||
cuDoubleComplex res;
|
||||
res.x = r;
|
||||
res.y = i;
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Complex conjugate: same real part, negated imaginary part. */
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
  const double re     = cuCreal(x);
  const double neg_im = -cuCimag(x);
  return make_cuDoubleComplex (re, neg_im);
}
|
||||
|
||||
/* Component-wise sum x + y. */
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
  const double re_sum = cuCreal(x) + cuCreal(y);
  const double im_sum = cuCimag(x) + cuCimag(y);
  return make_cuDoubleComplex (re_sum, im_sum);
}
|
||||
|
||||
/* Component-wise difference x - y. */
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
  const double re_diff = cuCreal(x) - cuCreal(y);
  const double im_diff = cuCimag(x) - cuCimag(y);
  return make_cuDoubleComplex (re_diff, im_diff);
}
|
||||
|
||||
/* This implementation could suffer from intermediate overflow even though
|
||||
* the final result would be in range. However, various implementations do
|
||||
* not guard against this (presumably to avoid losing performance), so we
|
||||
* don't do it either to stay competitive.
|
||||
*/
|
||||
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
  /* (a + bi)(c + di) = (ac - bd) + (ad + bc)i, with no rescaling of operands. */
  const double re = (cuCreal(x) * cuCreal(y)) - (cuCimag(x) * cuCimag(y));
  const double im = (cuCreal(x) * cuCimag(y)) + (cuCimag(x) * cuCreal(y));
  return make_cuDoubleComplex (re, im);
}
|
||||
|
||||
/* This implementation guards against intermediate underflow and overflow
|
||||
* by scaling. Such guarded implementations are usually the default for
|
||||
* complex library implementations, with some also offering an unguarded,
|
||||
* faster version.
|
||||
*/
|
||||
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
  /* Scale both operands by 1 / (|Re y| + |Im y|) before forming the quotient. */
  const double norm1     = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
  const double inv_norm1 = 1.0 / norm1;
  const double xr = cuCreal(x) * inv_norm1;
  const double xi = cuCimag(x) * inv_norm1;
  const double yr = cuCreal(y) * inv_norm1;
  const double yi = cuCimag(y) * inv_norm1;

  /* Squared magnitude of the scaled divisor and its reciprocal. */
  const double denom     = (yr * yr) + (yi * yi);
  const double inv_denom = 1.0 / denom;

  return make_cuDoubleComplex (((xr * yr) + (xi * yi)) * inv_denom,
                               ((xi * yr) - (xr * yi)) * inv_denom);
}
|
||||
|
||||
/* This implementation guards against intermediate underflow and overflow
|
||||
* by scaling. Otherwise we would lose half the exponent range. There are
|
||||
* various ways of doing guarded computation. For now chose the simplest
|
||||
* and fastest solution, however this may suffer from inaccuracies if sqrt
|
||||
* and division are not IEEE compliant.
|
||||
*/
|
||||
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
  const double abs_re = fabs(cuCreal(x));
  const double abs_im = fabs(cuCimag(x));

  /* hi takes abs_re only when abs_re > abs_im; otherwise hi is abs_im. */
  const double hi = (abs_re > abs_im) ? abs_re : abs_im;
  const double lo = (abs_re > abs_im) ? abs_im : abs_re;

  /* hi * sqrt(1 + (lo / hi)^2), evaluated step by step. */
  double result = lo / hi;
  result = 1.0 + result * result;
  result = hi * sqrt(result);

  /* When hi is zero or either magnitude exceeds 1.79769313486231570e+308, return hi + lo instead. */
  if ((hi == 0.0) ||
      (hi > 1.79769313486231570e+308) || (lo > 1.79769313486231570e+308)) {
    result = hi + lo;
  }
  return result;
}
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/* aliases */
|
||||
typedef cuFloatComplex cuComplex;
|
||||
/* Construct a cuComplex from two floats by forwarding both arguments, in
 * order, to make_cuFloatComplex.
 */
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
                                                                float y)
{
    cuComplex value = make_cuFloatComplex (x, y);

    return value;
}
|
||||
|
||||
/* float-to-double promotion */
|
||||
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
|
||||
(cuFloatComplex c)
|
||||
{
|
||||
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
|
||||
}
|
||||
|
||||
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
|
||||
(cuDoubleComplex c)
|
||||
{
|
||||
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
|
||||
}
|
||||
|
||||
|
||||
/* Single-precision complex multiply-add: returns (x * y) + d.
 *
 * Each output component is accumulated in two multiply-then-add steps,
 * starting from the matching component of d. The steps are written as
 * separate "product + addend" expressions and must stay in this form:
 * regrouping them could change the rounding of the result.
 */
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
    const float xr = cuCrealf(x);
    const float xi = cuCimagf(x);
    const float yr = cuCrealf(y);
    const float yi = cuCimagf(y);
    float re_acc;
    float im_acc;

    /* Real part: re(d) + xr*yr, then minus xi*yi. */
    re_acc = (xr * yr) + cuCrealf(d);
    re_acc = -(xi * yi) + re_acc;

    /* Imaginary part: im(d) + xr*yi, then plus xi*yr. */
    im_acc = (xr * yi) + cuCimagf(d);
    im_acc = (xi * yr) + im_acc;

    return make_cuComplex(re_acc, im_acc);
}
|
||||
|
||||
/* Double-precision complex multiply-add: returns (x * y) + d.
 *
 * Each output component is accumulated in two multiply-then-add steps,
 * starting from the matching component of d. The steps are written as
 * separate "product + addend" expressions and must stay in this form:
 * regrouping them could change the rounding of the result.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
    const double xr = cuCreal(x);
    const double xi = cuCimag(x);
    const double yr = cuCreal(y);
    const double yi = cuCimag(y);
    double re_acc;
    double im_acc;

    /* Real part: re(d) + xr*yr, then minus xi*yi. */
    re_acc = (xr * yr) + cuCreal(d);
    re_acc = -(xi * yi) + re_acc;

    /* Imaginary part: im(d) + xr*yi, then plus xi*yr. */
    im_acc = (xr * yi) + cuCimag(d);
    im_acc = (xi * yr) + im_acc;

    return make_cuDoubleComplex(re_acc, im_acc);
}
|
||||
|
||||
#endif /* !defined(CU_COMPLEX_H_) */
|
565
include/external/cuda/cublas.h
vendored
Normal file
565
include/external/cuda/cublas.h
vendored
Normal file
@@ -0,0 +1,565 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This is the public header file for the CUBLAS library, defining the API
|
||||
*
|
||||
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
|
||||
* on top of the CUDA runtime.
|
||||
*/
|
||||
|
||||
#if !defined(CUBLAS_H_)
|
||||
#define CUBLAS_H_
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#ifndef CUBLASWINAPI
|
||||
#ifdef _WIN32
|
||||
#define CUBLASWINAPI __stdcall
|
||||
#else
|
||||
#define CUBLASWINAPI
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#undef CUBLASAPI
|
||||
#ifdef __CUDACC__
|
||||
#define CUBLASAPI __host__
|
||||
#else
|
||||
#define CUBLASAPI
|
||||
#endif
|
||||
|
||||
#include "cublas_api.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* CUBLAS data types */
|
||||
#define cublasStatus cublasStatus_t
|
||||
|
||||
cublasStatus CUBLASWINAPI cublasInit (void);
|
||||
cublasStatus CUBLASWINAPI cublasShutdown (void);
|
||||
cublasStatus CUBLASWINAPI cublasGetError (void);
|
||||
|
||||
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
|
||||
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
|
||||
|
||||
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
|
||||
|
||||
|
||||
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
|
||||
|
||||
|
||||
|
||||
/* ---------------- CUBLAS BLAS1 functions ---------------- */
|
||||
/* NRM2 */
|
||||
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
|
||||
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
|
||||
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
|
||||
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* DOT */
|
||||
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
|
||||
int incy);
|
||||
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
|
||||
int incy);
|
||||
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
|
||||
int incy);
|
||||
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
|
||||
int incy);
|
||||
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
|
||||
int incy);
|
||||
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
|
||||
int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SCAL */
|
||||
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
|
||||
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
|
||||
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
|
||||
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
|
||||
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* AXPY */
|
||||
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
|
||||
float *y, int incy);
|
||||
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
|
||||
int incx, double *y, int incy);
|
||||
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
|
||||
int incx, cuComplex *y, int incy);
|
||||
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
|
||||
int incx, cuDoubleComplex *y, int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* COPY */
|
||||
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
|
||||
int incy);
|
||||
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
|
||||
int incy);
|
||||
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
|
||||
int incy);
|
||||
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
|
||||
int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SWAP */
|
||||
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
|
||||
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
|
||||
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
|
||||
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* AMAX */
|
||||
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
|
||||
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
|
||||
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
|
||||
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* AMIN */
|
||||
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
|
||||
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
|
||||
|
||||
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
|
||||
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* ASUM */
|
||||
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
|
||||
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
|
||||
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
|
||||
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* ROT */
|
||||
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
|
||||
float sc, float ss);
|
||||
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
|
||||
double sc, double ss);
|
||||
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
|
||||
int incy, float c, cuComplex s);
|
||||
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
|
||||
cuDoubleComplex *y, int incy, double sc,
|
||||
cuDoubleComplex cs);
|
||||
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
|
||||
int incy, float c, float s);
|
||||
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
|
||||
cuDoubleComplex *y, int incy, double c, double s);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* ROTG */
|
||||
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
|
||||
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
|
||||
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
|
||||
cuComplex *cs);
|
||||
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
|
||||
cuDoubleComplex *cs);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* ROTM */
|
||||
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
|
||||
const float* sparam);
|
||||
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
|
||||
const double* sparam);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* ROTMG */
|
||||
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
|
||||
const float *sy1, float* sparam);
|
||||
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
|
||||
const double *sy1, double* sparam);
|
||||
|
||||
/* --------------- CUBLAS BLAS2 functions ---------------- */
|
||||
/* GEMV */
|
||||
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
|
||||
const float *A, int lda, const float *x, int incx,
|
||||
float beta, float *y, int incy);
|
||||
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
|
||||
const double *A, int lda, const double *x, int incx,
|
||||
double beta, double *y, int incy);
|
||||
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
|
||||
const cuComplex *A, int lda, const cuComplex *x, int incx,
|
||||
cuComplex beta, cuComplex *y, int incy);
|
||||
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
|
||||
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* GBMV */
|
||||
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
|
||||
float alpha, const float *A, int lda,
|
||||
const float *x, int incx, float beta, float *y,
|
||||
int incy);
|
||||
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
|
||||
double alpha, const double *A, int lda,
|
||||
const double *x, int incx, double beta, double *y,
|
||||
int incy);
|
||||
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
|
||||
cuComplex alpha, const cuComplex *A, int lda,
|
||||
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
|
||||
int incy);
|
||||
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
|
||||
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
|
||||
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
|
||||
int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TRMV */
|
||||
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
|
||||
const float *A, int lda, float *x, int incx);
|
||||
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
|
||||
const double *A, int lda, double *x, int incx);
|
||||
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
|
||||
const cuComplex *A, int lda, cuComplex *x, int incx);
|
||||
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
|
||||
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TBMV */
|
||||
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
|
||||
const float *A, int lda, float *x, int incx);
|
||||
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
|
||||
const double *A, int lda, double *x, int incx);
|
||||
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
|
||||
const cuComplex *A, int lda, cuComplex *x, int incx);
|
||||
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
|
||||
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TPMV */
|
||||
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TRSV */
|
||||
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
|
||||
cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TPSV */
|
||||
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
|
||||
float *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
|
||||
cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TBSV */
|
||||
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
|
||||
char diag, int n, int k, const float *A,
|
||||
int lda, float *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
|
||||
char diag, int n, int k, const double *A,
|
||||
int lda, double *x, int incx);
|
||||
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
|
||||
char diag, int n, int k, const cuComplex *A,
|
||||
int lda, cuComplex *x, int incx);
|
||||
|
||||
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
|
||||
char diag, int n, int k, const cuDoubleComplex *A,
|
||||
int lda, cuDoubleComplex *x, int incx);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SYMV/HEMV */
|
||||
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
|
||||
int lda, const float *x, int incx, float beta,
|
||||
float *y, int incy);
|
||||
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
|
||||
int lda, const double *x, int incx, double beta,
|
||||
double *y, int incy);
|
||||
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
|
||||
int lda, const cuComplex *x, int incx, cuComplex beta,
|
||||
cuComplex *y, int incy);
|
||||
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
|
||||
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
|
||||
cuDoubleComplex *y, int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SBMV/HBMV */
|
||||
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
|
||||
const float *A, int lda, const float *x, int incx,
|
||||
float beta, float *y, int incy);
|
||||
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
|
||||
const double *A, int lda, const double *x, int incx,
|
||||
double beta, double *y, int incy);
|
||||
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
|
||||
const cuComplex *A, int lda, const cuComplex *x, int incx,
|
||||
cuComplex beta, cuComplex *y, int incy);
|
||||
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
|
||||
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SPMV/HPMV */
|
||||
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
|
||||
const float *AP, const float *x,
|
||||
int incx, float beta, float *y, int incy);
|
||||
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
|
||||
const double *AP, const double *x,
|
||||
int incx, double beta, double *y, int incy);
|
||||
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
|
||||
const cuComplex *AP, const cuComplex *x,
|
||||
int incx, cuComplex beta, cuComplex *y, int incy);
|
||||
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *AP, const cuDoubleComplex *x,
|
||||
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
|
||||
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* GER */
|
||||
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
|
||||
const float *y, int incy, float *A, int lda);
|
||||
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
|
||||
const double *y, int incy, double *A, int lda);
|
||||
|
||||
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
|
||||
int incx, const cuComplex *y, int incy,
|
||||
cuComplex *A, int lda);
|
||||
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
|
||||
int incx, const cuComplex *y, int incy,
|
||||
cuComplex *A, int lda);
|
||||
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
|
||||
int incx, const cuDoubleComplex *y, int incy,
|
||||
cuDoubleComplex *A, int lda);
|
||||
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
|
||||
int incx, const cuDoubleComplex *y, int incy,
|
||||
cuDoubleComplex *A, int lda);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SYR/HER */
|
||||
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
|
||||
int incx, float *A, int lda);
|
||||
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
|
||||
int incx, double *A, int lda);
|
||||
|
||||
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
|
||||
const cuComplex *x, int incx, cuComplex *A, int lda);
|
||||
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
|
||||
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
|
||||
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SPR/HPR */
|
||||
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
|
||||
int incx, float *AP);
|
||||
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
|
||||
int incx, double *AP);
|
||||
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
|
||||
int incx, cuComplex *AP);
|
||||
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
|
||||
int incx, cuDoubleComplex *AP);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SYR2/HER2 */
|
||||
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
|
||||
int incx, const float *y, int incy, float *A,
|
||||
int lda);
|
||||
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
|
||||
int incx, const double *y, int incy, double *A,
|
||||
int lda);
|
||||
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
|
||||
int incx, const cuComplex *y, int incy, cuComplex *A,
|
||||
int lda);
|
||||
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
|
||||
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
|
||||
int lda);
|
||||
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SPR2/HPR2 */
|
||||
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
|
||||
int incx, const float *y, int incy, float *AP);
|
||||
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
|
||||
const double *x, int incx, const double *y,
|
||||
int incy, double *AP);
|
||||
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
|
||||
const cuComplex *x, int incx, const cuComplex *y,
|
||||
int incy, cuComplex *AP);
|
||||
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
|
||||
int incy, cuDoubleComplex *AP);
|
||||
/* ------------------------BLAS3 Functions ------------------------------- */
|
||||
/* GEMM */
|
||||
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
|
||||
float alpha, const float *A, int lda,
|
||||
const float *B, int ldb, float beta, float *C,
|
||||
int ldc);
|
||||
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
|
||||
double alpha, const double *A, int lda,
|
||||
const double *B, int ldb, double beta, double *C,
|
||||
int ldc);
|
||||
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
|
||||
cuComplex alpha, const cuComplex *A, int lda,
|
||||
const cuComplex *B, int ldb, cuComplex beta,
|
||||
cuComplex *C, int ldc);
|
||||
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
|
||||
int k, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda,
|
||||
const cuDoubleComplex *B, int ldb,
|
||||
cuDoubleComplex beta, cuDoubleComplex *C,
|
||||
int ldc);
|
||||
/* -------------------------------------------------------*/
|
||||
/* SYRK */
|
||||
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
|
||||
const float *A, int lda, float beta, float *C,
|
||||
int ldc);
|
||||
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
|
||||
double alpha, const double *A, int lda,
|
||||
double beta, double *C, int ldc);
|
||||
|
||||
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
|
||||
cuComplex alpha, const cuComplex *A, int lda,
|
||||
cuComplex beta, cuComplex *C, int ldc);
|
||||
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
|
||||
cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda,
|
||||
cuDoubleComplex beta,
|
||||
cuDoubleComplex *C, int ldc);
|
||||
/* ------------------------------------------------------- */
|
||||
/* HERK */
|
||||
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
|
||||
float alpha, const cuComplex *A, int lda,
|
||||
float beta, cuComplex *C, int ldc);
|
||||
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
|
||||
double alpha,
|
||||
const cuDoubleComplex *A, int lda,
|
||||
double beta,
|
||||
cuDoubleComplex *C, int ldc);
|
||||
/* ------------------------------------------------------- */
|
||||
/* SYR2K */
|
||||
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
|
||||
const float *A, int lda, const float *B, int ldb,
|
||||
float beta, float *C, int ldc);
|
||||
|
||||
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
|
||||
double alpha, const double *A, int lda,
|
||||
const double *B, int ldb, double beta,
|
||||
double *C, int ldc);
|
||||
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
|
||||
cuComplex alpha, const cuComplex *A, int lda,
|
||||
const cuComplex *B, int ldb, cuComplex beta,
|
||||
cuComplex *C, int ldc);
|
||||
|
||||
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
|
||||
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
|
||||
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
|
||||
cuDoubleComplex *C, int ldc);
|
||||
/* ------------------------------------------------------- */
|
||||
/* HER2K */
|
||||
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
|
||||
cuComplex alpha, const cuComplex *A, int lda,
|
||||
const cuComplex *B, int ldb, float beta,
|
||||
cuComplex *C, int ldc);
|
||||
|
||||
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
|
||||
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
|
||||
const cuDoubleComplex *B, int ldb, double beta,
|
||||
cuDoubleComplex *C, int ldc);
|
||||
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* SYMM*/
|
||||
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
|
||||
const float *A, int lda, const float *B, int ldb,
|
||||
float beta, float *C, int ldc);
|
||||
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
|
||||
const double *A, int lda, const double *B, int ldb,
|
||||
double beta, double *C, int ldc);
|
||||
|
||||
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
|
||||
const cuComplex *A, int lda, const cuComplex *B, int ldb,
|
||||
cuComplex beta, cuComplex *C, int ldc);
|
||||
|
||||
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
|
||||
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* HEMM*/
|
||||
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
|
||||
cuComplex alpha, const cuComplex *A, int lda,
|
||||
const cuComplex *B, int ldb, cuComplex beta,
|
||||
cuComplex *C, int ldc);
|
||||
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
|
||||
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
|
||||
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
|
||||
cuDoubleComplex *C, int ldc);
|
||||
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TRSM*/
|
||||
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
|
||||
int m, int n, float alpha, const float *A, int lda,
|
||||
float *B, int ldb);
|
||||
|
||||
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
|
||||
char diag, int m, int n, double alpha,
|
||||
const double *A, int lda, double *B,
|
||||
int ldb);
|
||||
|
||||
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
|
||||
int m, int n, cuComplex alpha, const cuComplex *A,
|
||||
int lda, cuComplex *B, int ldb);
|
||||
|
||||
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
|
||||
char diag, int m, int n, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda,
|
||||
cuDoubleComplex *B, int ldb);
|
||||
/*------------------------------------------------------------------------*/
|
||||
/* TRMM*/
|
||||
/*
 * TRMM entry points (BLAS naming: triangular matrix-matrix multiply -- see
 * the cuBLAS documentation for the exact operation).
 * One declaration per element type: float (S), double (D), cuComplex (C) and
 * cuDoubleComplex (Z).  The parameter layout is the same in all four and
 * mirrors the TRSM declarations: selectors side/uplo/transa/diag, sizes m and
 * n, alpha by value, read-only A (leading dimension lda) and writable B
 * (leading dimension ldb).  B is the only writable argument, so the product
 * is presumably stored in B -- confirm against the cuBLAS reference.
 */
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
|
||||
int m, int n, float alpha, const float *A, int lda,
|
||||
float *B, int ldb);
|
||||
/* double variant */
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
|
||||
char diag, int m, int n, double alpha,
|
||||
const double *A, int lda, double *B,
|
||||
int ldb);
|
||||
/* cuComplex variant */
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
|
||||
int m, int n, cuComplex alpha, const cuComplex *A,
|
||||
int lda, cuComplex *B, int ldb);
|
||||
/* cuDoubleComplex variant */
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
|
||||
char diag, int m, int n, cuDoubleComplex alpha,
|
||||
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
|
||||
int ldb);
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* !defined(CUBLAS_H_) */
|
2583
include/external/cuda/cublas_api.h
vendored
Normal file
2583
include/external/cuda/cublas_api.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
228
include/external/cuda/cuda_device_runtime_api.h
vendored
Normal file
228
include/external/cuda/cuda_device_runtime_api.h
vendored
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
|
||||
#define __CUDA_DEVICE_RUNTIME_API_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(__CUDABE__)
|
||||
|
||||
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
|
||||
struct cudaFuncAttributes;
|
||||
|
||||
/*
 * Weak (nv_weak) device-side stub for cudaMalloc: both arguments are ignored,
 * *p is never written, and the call always reports cudaErrorUnknown.
 * NOTE(review): presumably a fallback that a real device-runtime definition
 * overrides at link time -- confirm.
 */
__device__ __attribute__((nv_weak)) cudaError_t cudaMalloc(void **p, size_t s)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
/*
 * Weak (nv_weak) device-side stub for cudaFuncGetAttributes: neither the
 * attribute struct p nor the function pointer c is touched, and the call
 * always reports cudaErrorUnknown.
 * NOTE(review): presumably a fallback that a real device-runtime definition
 * overrides at link time -- confirm.
 */
__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
/*
 * Weak (nv_weak) device-side stub for cudaDeviceGetAttribute: all three
 * arguments are ignored, *value is never written, and the call always
 * reports cudaErrorUnknown.
 * NOTE(review): presumably a fallback that a real device-runtime definition
 * overrides at link time -- confirm.
 */
__device__ __attribute__((nv_weak)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
/*
 * Weak (nv_weak) device-side stub for cudaGetDevice: the argument is ignored,
 * *device is never written, and the call always reports cudaErrorUnknown.
 * NOTE(review): presumably a fallback that a real device-runtime definition
 * overrides at link time -- confirm.
 */
__device__ __attribute__((nv_weak)) cudaError_t cudaGetDevice(int *device)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
/*
 * Weak (nv_weak) device-side stub for
 * cudaOccupancyMaxActiveBlocksPerMultiprocessor: every argument is ignored,
 * *numBlocks is never written, and the call always reports cudaErrorUnknown.
 * NOTE(review): presumably a fallback that a real device-runtime definition
 * overrides at link time -- confirm.
 */
__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
/*
 * Weak (nv_weak) device-side stub for
 * cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags: every argument
 * (including flags) is ignored, *numBlocks is never written, and the call
 * always reports cudaErrorUnknown.
 * NOTE(review): presumably a fallback that a real device-runtime definition
 * overrides at link time -- confirm.
 */
__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
|
||||
{
|
||||
return cudaErrorUnknown;
|
||||
}
|
||||
|
||||
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
|
||||
|
||||
#else /* defined(__CUDABE__) */
|
||||
|
||||
#if defined(__cplusplus) && defined(__CUDACC__) // Visible to nvcc front-end only
|
||||
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
|
||||
|
||||
#include "driver_types.h"
|
||||
#include "host_defines.h"
|
||||
|
||||
extern "C"
|
||||
{
|
||||
/*
 * Device-callable subset of the CUDA runtime API.  Every entry in this run is
 * only declared (extern __device__ __cudart_builtin__, tagged CUDARTAPI); no
 * definitions appear here.  All return cudaError_t except
 * cudaGetErrorString / cudaGetErrorName, which return const char *.
 *
 * Names ending in _ptsz sit directly after an unsuffixed twin with an
 * identical parameter list.  NOTE(review): presumably these are the
 * per-thread-default-stream flavours (cf. the
 * CUDA_API_PER_THREAD_DEFAULT_STREAM switch used further down for
 * cudaLaunchDevice) -- confirm.
 */
/* --- device queries, error state --- */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
|
||||
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
|
||||
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
|
||||
/* --- streams and events --- */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
|
||||
/* --- function attributes, allocation --- */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
|
||||
/* --- asynchronous copies and fills (1D / 2D / 3D, each with a _ptsz twin) --- */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
|
||||
/* --- version query --- */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
|
||||
|
||||
/**
|
||||
* \ingroup CUDART_EXECUTION
|
||||
* \brief Obtains a parameter buffer
|
||||
*
|
||||
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
|
||||
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
|
||||
*
|
||||
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
|
||||
* CUDA user code should use <<< >>> to launch kernels.
|
||||
*
|
||||
* \param alignment - Specifies alignment requirement of the parameter buffer
|
||||
* \param size - Specifies size requirement in bytes
|
||||
*
|
||||
* \return
|
||||
* Returns pointer to the allocated parameterBuffer
|
||||
* \notefnerr
|
||||
*
|
||||
* \sa cudaLaunchDevice
|
||||
*/
|
||||
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
|
||||
|
||||
/**
|
||||
* \ingroup CUDART_EXECUTION
|
||||
* \brief Launches a specified kernel
|
||||
*
|
||||
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
|
||||
* by calling ::cudaGetParameterBuffer().
|
||||
*
|
||||
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
|
||||
* CUDA user code should use <<< >>> to launch the kernels.
|
||||
*
|
||||
* \param func - Pointer to the kernel to be launched
|
||||
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
|
||||
* \param gridDimension - Specifies grid dimensions
|
||||
* \param blockDimension - Specifies block dimensions
|
||||
* \param sharedMemSize - Specifies size of shared memory
|
||||
* \param stream - Specifies the stream to be used
|
||||
*
|
||||
* \return
|
||||
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
|
||||
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
|
||||
* \notefnerr
|
||||
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
|
||||
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
|
||||
*
|
||||
* \sa cudaGetParameterBuffer
|
||||
*/
|
||||
/*
 * NOTE(review): the Doxygen block immediately above documents a kernel
 * launch (func, parameterBuffer, gridDimension, blockDimension,
 * sharedMemSize, stream).  That parameter list matches cudaLaunchDevice /
 * cudaLaunchDevice_ptsz, not cudaGetParameterBufferV2, which takes neither a
 * parameterBuffer nor a stream and returns void *.  Doxygen will attach the
 * block to the next declaration, so treat it as describing the launch entry
 * points.
 */
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
|
||||
/* _ptsz launch entry points; the static inline wrappers further down forward to these. */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
|
||||
|
||||
/*
 * Chooses how cudaLaunchDevice / cudaLaunchDeviceV2 are provided:
 *  - device compilation with CUDA_API_PER_THREAD_DEFAULT_STREAM defined:
 *    static inline wrappers that forward every argument unchanged to the
 *    matching *_ptsz entry point and return its result;
 *  - otherwise: plain extern declarations of the unsuffixed names.
 * The signatures are identical in both branches, so callers are unaffected.
 */
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
|
||||
// When compiling for the device and per thread default stream is enabled, add
|
||||
// a static inline redirect to the per thread stream entry points.
|
||||
|
||||
/* Forwards to cudaLaunchDevice_ptsz with the same six arguments. */
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
|
||||
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
|
||||
{
|
||||
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
|
||||
}
|
||||
|
||||
/* Forwards to cudaLaunchDeviceV2_ptsz with the same two arguments. */
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
|
||||
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
|
||||
{
|
||||
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
|
||||
}
|
||||
#else
|
||||
/* No redirect: the unsuffixed names are external device functions. */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
|
||||
#endif
|
||||
|
||||
/*
 * Device-callable occupancy queries (declarations only).  The two differ
 * solely in the trailing `flags` parameter of the WithFlags variant.
 * numBlocks is the only non-const pointer, so the result is presumably
 * written through it -- confirm against the CUDA runtime reference.
 */
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
|
||||
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
||||
|
||||
}
|
||||
|
||||
/*
 * C++-only typed overloads of four entry points that are also declared with
 * void* / const void* parameters earlier in this header: the pointer or
 * function argument is a template type T instead.
 * They are only forward-declared here (static __inline__ __device__).
 * NOTE(review): the definitions are not visible in this section of the
 * header -- presumably supplied elsewhere in the toolkit headers; confirm.
 */
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
|
||||
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
|
||||
|
||||
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
|
||||
#endif // defined(__cplusplus) && defined(__CUDACC__)
|
||||
|
||||
#endif /* defined(__CUDABE__) */
|
||||
|
||||
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
|
1499
include/external/cuda/cuda_fp16.h
vendored
Normal file
1499
include/external/cuda/cuda_fp16.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1895
include/external/cuda/cuda_runtime.h
vendored
Normal file
1895
include/external/cuda/cuda_runtime.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
6520
include/external/cuda/cuda_runtime_api.h
vendored
Normal file
6520
include/external/cuda/cuda_runtime_api.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
69
include/external/cuda/device_types.h
vendored
Normal file
69
include/external/cuda/device_types.h
vendored
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__DEVICE_TYPES_H__)
|
||||
#define __DEVICE_TYPES_H__
|
||||
|
||||
#include "host_defines.h"
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Rounding-mode selector.  No explicit initialisers are given, so the
 * enumerators take the values 0, 1, 2, 3 in declaration order; reordering
 * them would change those values.
 * The per-enumerator notes below are read off the names only -- confirm the
 * exact semantics against the CUDA documentation.
 */
enum __device_builtin__ cudaRoundMode
|
||||
{
|
||||
cudaRoundNearest, /* 0 -- by name: round to nearest */
|
||||
cudaRoundZero, /* 1 -- by name: round toward zero */
|
||||
cudaRoundPosInf, /* 2 -- by name: round toward +infinity */
|
||||
cudaRoundMinInf /* 3 -- by name: round toward -infinity */
|
||||
};
|
||||
|
||||
#endif /* !__DEVICE_TYPES_H__ */
|
145
include/external/cuda/driver_functions.h
vendored
Normal file
145
include/external/cuda/driver_functions.h
vendored
Normal file
@@ -0,0 +1,145 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__DRIVER_FUNCTIONS_H__)
|
||||
#define __DRIVER_FUNCTIONS_H__
|
||||
|
||||
#include "builtin_types.h"
|
||||
#include "host_defines.h"
|
||||
#include "driver_types.h"
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_MEMORY
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* \brief Returns a cudaPitchedPtr based on input parameters
|
||||
*
|
||||
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
|
||||
* \p p, \p xsz, and \p ysz.
|
||||
*
|
||||
* \param d - Pointer to allocated memory
|
||||
* \param p - Pitch of allocated memory in bytes
|
||||
* \param xsz - Logical width of allocation in elements
|
||||
* \param ysz - Logical height of allocation in elements
|
||||
*
|
||||
* \return
|
||||
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
|
||||
*
|
||||
* \sa make_cudaExtent, make_cudaPos
|
||||
*/
|
||||
/*
 * Host-side helper: copies the four arguments into a cudaPitchedPtr and
 * returns it by value.  Nothing is allocated and no argument is validated;
 * d is stored as-is.
 */
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
|
||||
{
|
||||
struct cudaPitchedPtr s;
|
||||
|
||||
/* Members are assigned by name, so this does not depend on the order in
   which struct cudaPitchedPtr declares them. */
s.ptr = d;
|
||||
s.pitch = p;
|
||||
s.xsize = xsz;
|
||||
s.ysize = ysz;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Returns a cudaPos based on input parameters
|
||||
*
|
||||
* Returns a ::cudaPos based on the specified input parameters \p x,
|
||||
* \p y, and \p z.
|
||||
*
|
||||
* \param x - X position
|
||||
* \param y - Y position
|
||||
* \param z - Z position
|
||||
*
|
||||
* \return
|
||||
* ::cudaPos specified by \p x, \p y, and \p z
|
||||
*
|
||||
* \sa make_cudaExtent, make_cudaPitchedPtr
|
||||
*/
|
||||
/*
 * Host-side helper: copies x, y and z into the same-named members of a
 * cudaPos and returns it by value.  No validation is performed.
 */
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
|
||||
{
|
||||
struct cudaPos p;
|
||||
|
||||
/* Assigned member by member, independent of the struct's declaration order. */
p.x = x;
|
||||
p.y = y;
|
||||
p.z = z;
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Returns a cudaExtent based on input parameters
|
||||
*
|
||||
* Returns a ::cudaExtent based on the specified input parameters \p w,
|
||||
* \p h, and \p d.
|
||||
*
|
||||
* \param w - Width in bytes
|
||||
* \param h - Height in elements
|
||||
* \param d - Depth in elements
|
||||
*
|
||||
* \return
|
||||
* ::cudaExtent specified by \p w, \p h, and \p d
|
||||
*
|
||||
* \sa make_cudaPitchedPtr, make_cudaPos
|
||||
*/
|
||||
/*
 * Host-side helper: stores w, h and d in the width, height and depth members
 * of a cudaExtent and returns it by value.  No validation is performed.
 */
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
|
||||
{
|
||||
struct cudaExtent e;
|
||||
|
||||
/* Assigned member by member, independent of the struct's declaration order. */
e.width = w;
|
||||
e.height = h;
|
||||
e.depth = d;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
/** @} */ /* END CUDART_MEMORY */
|
||||
|
||||
#endif /* !__DRIVER_FUNCTIONS_H__ */
|
1450
include/external/cuda/driver_types.h
vendored
Normal file
1450
include/external/cuda/driver_types.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
201
include/external/cuda/host_config.h
vendored
Normal file
201
include/external/cuda/host_config.h
vendored
Normal file
@@ -0,0 +1,201 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__HOST_CONFIG_H__)
|
||||
#define __HOST_CONFIG_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
|
||||
#if defined(__CUDACC_RTC__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define __THROW
|
||||
|
||||
#else /* __CUDACC_RTC__ */
|
||||
|
||||
/* check for host compilers that are compatible with nvcc */
|
||||
#if !defined(__GNUC__) && !defined(_WIN32)
|
||||
|
||||
#error --- !!! UNSUPPORTED COMPILER !!! ---
|
||||
|
||||
#endif /* !__GNUC__ && !_WIN32 */
|
||||
|
||||
#if defined(__ICC)
|
||||
|
||||
#if __ICC != 1500 || !defined(__GNUC__) || !defined(__LP64__)
|
||||
|
||||
#error -- unsupported ICC configuration! Only ICC 15.0 on Linux x86_64 is supported!
|
||||
|
||||
#endif /* __ICC != 1500 || !__GNUC__ || !__LP64__ */
|
||||
|
||||
#endif /* __ICC */
|
||||
|
||||
#if defined(__PGIC__)
|
||||
|
||||
#if __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !defined(__GNUC__) || !defined(__LP64__)
|
||||
|
||||
#error -- unsupported pgc++ configuration! Only pgc++ 15.4 on Linux x86_64 is supported!
|
||||
|
||||
#endif /* __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !__GNUC__ || !__LP64__ */
|
||||
|
||||
#endif /* __PGIC__ */
|
||||
|
||||
#if defined(__powerpc__)
|
||||
|
||||
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
|
||||
|
||||
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
|
||||
|
||||
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
|
||||
|
||||
/*
 * Reject every IBM xlC release outside the 13.1 series, as the #error text
 * states.  A version is unsupported when it is below 0x0d010000 (13.1) OR at
 * or above 0x0d020000 (13.2), so the two bounds are joined with ||.  Joining
 * them with && could never be true (no value is both below the lower bound
 * and at or above the upper one), which left the check dead.
 */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
|
||||
|
||||
#error -- unsupported xlC version! only xlC 13.1 is supported
|
||||
|
||||
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
|
||||
|
||||
#endif /* __powerpc__ */
|
||||
|
||||
#if defined(__GNUC__)
|
||||
|
||||
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9)
|
||||
|
||||
#error -- unsupported GNU version! gcc versions later than 4.9 are not supported!
|
||||
|
||||
#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9) */
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
|
||||
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
|
||||
#endif /* __APPLE__ && __MACH__ && !__clang__ */
|
||||
|
||||
#endif /* __GNUC__ */
|
||||
|
||||
#if defined(_WIN32)
|
||||
|
||||
#if _MSC_VER < 1600 || _MSC_VER > 1800
|
||||
|
||||
#error -- unsupported Microsoft Visual Studio version! Only the versions 2010, 2012, and 2013 are supported!
|
||||
|
||||
#endif /* _MSC_VER < 1600 || _MSC_VER > 1800 */
|
||||
|
||||
#endif /* _WIN32 */
|
||||
|
||||
/* configure host compiler */
|
||||
#if defined(__APPLE__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define __THROW
|
||||
|
||||
#if defined(__BLOCKS__) /* nvcc does not support closures */
|
||||
|
||||
#undef __BLOCKS__
|
||||
|
||||
#endif /* __BLOCKS__ */
|
||||
|
||||
#elif defined(__ANDROID__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define __THROW
|
||||
|
||||
#elif defined(__QNX__)
|
||||
|
||||
#define _CRTIMP
|
||||
#define __THROW
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
|
||||
#define _CRTIMP
|
||||
|
||||
#include <features.h> /* for __THROW */
|
||||
|
||||
#elif defined(_WIN32)
|
||||
|
||||
#if _MSC_VER >= 1500
|
||||
|
||||
#undef _USE_DECLSPECS_FOR_SAL
|
||||
#define _USE_DECLSPECS_FOR_SAL \
|
||||
1
|
||||
|
||||
#endif /* _MSC_VER >= 1500 */
|
||||
|
||||
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
|
||||
|
||||
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
|
||||
|
||||
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
|
||||
|
||||
#if !defined(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
|
||||
|
||||
#endif /* !_CRT_SECURE_NO_WARNINGS */
|
||||
|
||||
#if !defined(NOMINMAX)
|
||||
|
||||
#define NOMINMAX /* min and max are part of cuda runtime */
|
||||
|
||||
#endif /* !NOMINMAX */
|
||||
|
||||
#include <crtdefs.h> /* for _CRTIMP */
|
||||
|
||||
#define __THROW
|
||||
|
||||
#endif /* __APPLE__ */
|
||||
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
#endif /* __CUDACC__ */
|
||||
|
||||
#endif /* !__HOST_CONFIG_H__ */
|
241
include/external/cuda/host_defines.h
vendored
Normal file
241
include/external/cuda/host_defines.h
vendored
Normal file
@@ -0,0 +1,241 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__HOST_DEFINES_H__)
|
||||
#define __HOST_DEFINES_H__
|
||||
|
||||
/*
 * Compiler-specific spellings of the attribute macros used by the CUDA headers.
 * GNU-style attributes are used for gcc-compatible compilers, for libdevice
 * builds and for the CUDA JIT mode (__CUDACC_RTC__); MSVC gets __declspec
 * equivalents; any other compiler must supply __align__ and CUDARTAPI itself.
 */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)

#ifdef __CUDACC_RTC__
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */

#define __no_return__ __attribute__((noreturn))

/*
 * __noinline__ is defined only while a CUDA compiler component processes the
 * file. gcc accepts the underscore spelling __attribute__((__noinline__)), so
 * a plain host source (e.g. a .cpp) that uses that spelling and includes this
 * header would fail to compile if the macro were defined unconditionally.
 */
#if defined(__CUDACC__) || defined(__CUDA_ARCH__)
#define __noinline__ __attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ */

#define __forceinline__ __inline__ __attribute__((always_inline))
#define __align__(n) __attribute__((aligned(n)))
#define __thread__ __thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) __attribute__((a))
#define __location__(a) __annotate__(a)
#define CUDARTAPI

#elif defined(_MSC_VER)

/* __restrict is only used from _MSC_VER 1400 on; older versions get nothing */
#if _MSC_VER < 1400
#define __restrict__
#else /* _MSC_VER >= 1400 */
#define __restrict__ __restrict
#endif /* _MSC_VER < 1400 */

#define __inline__ __inline
#define __no_return__ __declspec(noreturn)
#define __noinline__ __declspec(noinline)
#define __forceinline__ __forceinline
#define __align__(n) __declspec(align(n))
#define __thread__ __declspec(thread)
#define __import__ __declspec(dllimport)
#define __export__ __declspec(dllexport)
#define __annotate__(a) __declspec(a)
#define __location__(a) __annotate__(__##a##__)
#define CUDARTAPI __stdcall

#else /* neither GNU-style nor MSVC */

#define __inline__

/* An unknown compiler has to bring its own definitions of these two macros */
#ifndef __align__
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */

#ifndef CUDARTAPI
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */

#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
|
||||
|
||||
/*
 * __specialization_static expands to 'static' on gcc-compatible compilers
 * reporting a version below 4 (or 4.0-4.2 when not clang), on MSVC older than
 * 1900, and on compilers that are neither GNU-compatible nor MSVC.
 * Everywhere else it expands to nothing.
 */
#if !((defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
      (defined(_MSC_VER) && _MSC_VER < 1900) || \
      (!defined(__GNUC__) && !defined(_MSC_VER)))

#define __specialization_static

#else /* old gcc, old MSVC, or unknown compiler */

#define __specialization_static static

#endif /* recent gcc/clang or MSVC >= 1900 */
|
||||
|
||||
/*
 * With a CUDA compiler (__CUDACC__ or __CUDABE__), __launch_bounds__ forwards
 * its arguments to __annotate__. For every other compiler __annotate__ is
 * redefined to expand to nothing and __launch_bounds__ is not defined.
 */
#if defined(__CUDACC__) || defined(__CUDABE__)

#define __launch_bounds__(...) __annotate__(launch_bounds(__VA_ARGS__))

#else /* !__CUDACC__ && !__CUDABE__ */

#undef __annotate__
#define __annotate__(a)

#endif /* __CUDACC__ || __CUDABE__ */
|
||||
|
||||
/*
 * __builtin_align__(a) maps to __align__(a) for CUDA compilers, GNU-compatible
 * compilers and 64-bit Windows; on any other target it expands to nothing.
 */
#if !(defined(__CUDACC__) || defined(__CUDABE__) || defined(__GNUC__) || defined(_WIN64))

#define __builtin_align__(a)

#else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */

#define __builtin_align__(a) __align__(a)

#endif /* !(__CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64) */
|
||||
|
||||
/* Each qualifier keyword is a thin wrapper around __location__(<name>). */
#define __host__     __location__(host)
#define __device__   __location__(device)
#define __global__   __location__(global)
#define __shared__   __location__(shared)
#define __constant__ __location__(constant)
#define __managed__  __location__(managed)
|
||||
|
||||
/*
 * The builtin markers carry a __location__ annotation only when __CUDACC__ is
 * defined and the file is not being processed as a non-integrated __CUDABE__
 * pass. In every other case they expand to nothing, so the declarations that
 * use them read as plain C/C++.
 */
#if defined(__CUDACC__) && !(defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__))

#define __device_builtin__              __location__(device_builtin)
#define __device_builtin_texture_type__ __location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ __location__(device_builtin_surface_type)
#define __cudart_builtin__              __location__(cudart_builtin)

#else /* (__CUDABE__ && !__CUDACC_INTEGRATED__) || !__CUDACC__ */

#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__

#endif /* __CUDACC__ && !(__CUDABE__ && !__CUDACC_INTEGRATED__) */
|
||||
|
||||
/*
 * Clang-hosted CUDA compilation only: _Atomic(X) is replaced by the template
 * type __nv_clang_atomic_t<X>, which is declared here with constructors and
 * conversion operators but no definitions.
 * NOTE(review): the reason for replacing _Atomic is not visible in this
 * header -- confirm against the toolkit documentation before changing it.
 */
#if defined(__CUDACC__) && defined(__clang__)

#ifndef __has_feature
#error --- !!! The Clang version does not support __has_feature !!! ---
#endif /* !__has_feature */

#if defined(__cplusplus) && defined(__CUDACC__)

/* Use noexcept when the compiler has it, otherwise an empty throw() spec */
#if !__has_feature(cxx_noexcept)
#define NV_CLANG_ATOMIC_NOEXCEPT throw()
#define NV_CLANG_ATOMIC_NOEXCEPT_(x)
#else /* __has_feature(cxx_noexcept) */
#define NV_CLANG_ATOMIC_NOEXCEPT noexcept
#define NV_CLANG_ATOMIC_NOEXCEPT_(x) noexcept(x)
#endif /* !__has_feature(cxx_noexcept) */

template <typename T>
struct __nv_clang_atomic_t {
  __nv_clang_atomic_t() NV_CLANG_ATOMIC_NOEXCEPT;
  __nv_clang_atomic_t(const T &x) NV_CLANG_ATOMIC_NOEXCEPT;
  operator T() volatile NV_CLANG_ATOMIC_NOEXCEPT;
  operator T() NV_CLANG_ATOMIC_NOEXCEPT;
};

#define _Atomic(X) __nv_clang_atomic_t<X>

#endif /* defined(__cplusplus) && defined(__CUDACC__) */

#endif /* __CUDACC__ && __clang__ */
|
||||
|
||||
|
||||
#endif /* !__HOST_DEFINES_H__ */
|
119
include/external/cuda/surface_types.h
vendored
Normal file
119
include/external/cuda/surface_types.h
vendored
Normal file
@@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__SURFACE_TYPES_H__)
|
||||
#define __SURFACE_TYPES_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "driver_types.h"
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_TYPES
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
/* Numeric codes for the surface kinds named by each macro. */
#define cudaSurfaceType1D             0x01
#define cudaSurfaceType2D             0x02
#define cudaSurfaceType3D             0x03
#define cudaSurfaceTypeCubemap        0x0C
#define cudaSurfaceType1DLayered      0xF1
#define cudaSurfaceType2DLayered      0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
|
||||
|
||||
/**
|
||||
* CUDA Surface boundary modes
|
||||
*/
|
||||
enum __device_builtin__ cudaSurfaceBoundaryMode
|
||||
{
|
||||
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
|
||||
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
|
||||
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA Surface format modes
|
||||
*/
|
||||
enum __device_builtin__ cudaSurfaceFormatMode
|
||||
{
|
||||
cudaFormatModeForced = 0, /**< Forced format mode */
|
||||
cudaFormatModeAuto = 1 /**< Auto format mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA Surface reference
|
||||
*/
|
||||
struct __device_builtin__ surfaceReference
|
||||
{
|
||||
/**
|
||||
* Channel descriptor for surface reference
|
||||
*/
|
||||
struct cudaChannelFormatDesc channelDesc;
|
||||
};
|
||||
|
||||
/**
|
||||
* An opaque value that represents a CUDA Surface object
|
||||
*/
|
||||
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
|
||||
|
||||
/** @} */ /* END CUDART_TYPES */
|
||||
|
||||
#endif /* !__SURFACE_TYPES_H__ */
|
213
include/external/cuda/texture_types.h
vendored
Normal file
213
include/external/cuda/texture_types.h
vendored
Normal file
@@ -0,0 +1,213 @@
|
||||
/*
|
||||
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__TEXTURE_TYPES_H__)
|
||||
#define __TEXTURE_TYPES_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "driver_types.h"
|
||||
|
||||
/**
|
||||
* \addtogroup CUDART_TYPES
|
||||
*
|
||||
* @{
|
||||
*/
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
/* Numeric codes for the texture kinds named by each macro. */
#define cudaTextureType1D             0x01
#define cudaTextureType2D             0x02
#define cudaTextureType3D             0x03
#define cudaTextureTypeCubemap        0x0C
#define cudaTextureType1DLayered      0xF1
#define cudaTextureType2DLayered      0xF2
#define cudaTextureTypeCubemapLayered 0xFC
|
||||
|
||||
/**
|
||||
* CUDA texture address modes
|
||||
*/
|
||||
enum __device_builtin__ cudaTextureAddressMode
|
||||
{
|
||||
cudaAddressModeWrap = 0, /**< Wrapping address mode */
|
||||
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
|
||||
cudaAddressModeMirror = 2, /**< Mirror address mode */
|
||||
cudaAddressModeBorder = 3 /**< Border address mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture filter modes
|
||||
*/
|
||||
enum __device_builtin__ cudaTextureFilterMode
|
||||
{
|
||||
cudaFilterModePoint = 0, /**< Point filter mode */
|
||||
cudaFilterModeLinear = 1 /**< Linear filter mode */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture read modes
|
||||
*/
|
||||
enum __device_builtin__ cudaTextureReadMode
|
||||
{
|
||||
cudaReadModeElementType = 0, /**< Read texture as specified element type */
|
||||
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture reference
|
||||
*/
|
||||
struct __device_builtin__ textureReference
|
||||
{
|
||||
/**
|
||||
* Indicates whether texture reads are normalized or not
|
||||
*/
|
||||
int normalized;
|
||||
/**
|
||||
* Texture filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode filterMode;
|
||||
/**
|
||||
* Texture address mode for up to 3 dimensions
|
||||
*/
|
||||
enum cudaTextureAddressMode addressMode[3];
|
||||
/**
|
||||
* Channel descriptor for the texture reference
|
||||
*/
|
||||
struct cudaChannelFormatDesc channelDesc;
|
||||
/**
|
||||
* Perform sRGB->linear conversion during texture read
|
||||
*/
|
||||
int sRGB;
|
||||
/**
|
||||
* Limit to the anisotropy ratio
|
||||
*/
|
||||
unsigned int maxAnisotropy;
|
||||
/**
|
||||
* Mipmap filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode mipmapFilterMode;
|
||||
/**
|
||||
* Offset applied to the supplied mipmap level
|
||||
*/
|
||||
float mipmapLevelBias;
|
||||
/**
|
||||
* Lower end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float minMipmapLevelClamp;
|
||||
/**
|
||||
* Upper end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float maxMipmapLevelClamp;
|
||||
int __cudaReserved[15];
|
||||
};
|
||||
|
||||
/**
|
||||
* CUDA texture descriptor
|
||||
*/
|
||||
struct __device_builtin__ cudaTextureDesc
|
||||
{
|
||||
/**
|
||||
* Texture address mode for up to 3 dimensions
|
||||
*/
|
||||
enum cudaTextureAddressMode addressMode[3];
|
||||
/**
|
||||
* Texture filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode filterMode;
|
||||
/**
|
||||
* Texture read mode
|
||||
*/
|
||||
enum cudaTextureReadMode readMode;
|
||||
/**
|
||||
* Perform sRGB->linear conversion during texture read
|
||||
*/
|
||||
int sRGB;
|
||||
/**
|
||||
* Indicates whether texture reads are normalized or not
|
||||
*/
|
||||
int normalizedCoords;
|
||||
/**
|
||||
* Limit to the anisotropy ratio
|
||||
*/
|
||||
unsigned int maxAnisotropy;
|
||||
/**
|
||||
* Mipmap filter mode
|
||||
*/
|
||||
enum cudaTextureFilterMode mipmapFilterMode;
|
||||
/**
|
||||
* Offset applied to the supplied mipmap level
|
||||
*/
|
||||
float mipmapLevelBias;
|
||||
/**
|
||||
* Lower end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float minMipmapLevelClamp;
|
||||
/**
|
||||
* Upper end of the mipmap level range to clamp access to
|
||||
*/
|
||||
float maxMipmapLevelClamp;
|
||||
};
|
||||
|
||||
/**
|
||||
* An opaque value that represents a CUDA texture object
|
||||
*/
|
||||
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
|
||||
|
||||
/** @} */ /* END CUDART_TYPES */
|
||||
|
||||
#endif /* !__TEXTURE_TYPES_H__ */
|
177
include/external/cuda/vector_functions.h
vendored
Normal file
177
include/external/cuda/vector_functions.h
vendored
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__VECTOR_FUNCTIONS_H__)
|
||||
#define __VECTOR_FUNCTIONS_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "builtin_types.h"
|
||||
#include "host_defines.h"
|
||||
#include "vector_types.h"
|
||||
|
||||
/*
 * Declaration prefix for the make_* helpers: 'static __inline__' host/device
 * functions in a regular build, plain host/device functions in the CUDA JIT
 * mode (__CUDACC_RTC__).
 */
#ifndef __CUDACC_RTC__
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#else /* __CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#endif /* !__CUDACC_RTC__ */
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
|
||||
|
||||
#undef __VECTOR_FUNCTIONS_DECL__
|
||||
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
#include "vector_functions.hpp"
|
||||
#endif /* !__CUDACC_RTC__ */
|
||||
|
||||
#endif /* !__VECTOR_FUNCTIONS_H__ */
|
318
include/external/cuda/vector_functions.hpp
vendored
Normal file
318
include/external/cuda/vector_functions.hpp
vendored
Normal file
@@ -0,0 +1,318 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__VECTOR_FUNCTIONS_HPP__)
|
||||
#define __VECTOR_FUNCTIONS_HPP__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#include "builtin_types.h"
|
||||
#include "host_defines.h"
|
||||
#include "vector_types.h"
|
||||
|
||||
#if defined(__CUDACC_RTC__)
|
||||
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
|
||||
#else /* !__CUDACC_RTC__ */
|
||||
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
|
||||
#endif /* __CUDACC_RTC__ */
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
|
||||
{
|
||||
char1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
|
||||
{
|
||||
uchar1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
|
||||
{
|
||||
char2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
|
||||
{
|
||||
uchar2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
|
||||
{
|
||||
char3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
|
||||
{
|
||||
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
|
||||
{
|
||||
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
|
||||
{
|
||||
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
|
||||
{
|
||||
short1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
|
||||
{
|
||||
ushort1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
|
||||
{
|
||||
short2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
|
||||
{
|
||||
ushort2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
|
||||
{
|
||||
short3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
|
||||
{
|
||||
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
|
||||
{
|
||||
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
|
||||
{
|
||||
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
|
||||
{
|
||||
int1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
|
||||
{
|
||||
uint1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
|
||||
{
|
||||
int2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
|
||||
{
|
||||
uint2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
|
||||
{
|
||||
int3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
|
||||
{
|
||||
uint3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
|
||||
{
|
||||
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
|
||||
{
|
||||
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
|
||||
{
|
||||
long1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
|
||||
{
|
||||
ulong1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
|
||||
{
|
||||
long2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
|
||||
{
|
||||
ulong2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
|
||||
{
|
||||
long3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
|
||||
{
|
||||
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
|
||||
{
|
||||
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
|
||||
{
|
||||
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
|
||||
{
|
||||
float1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
|
||||
{
|
||||
float2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
|
||||
{
|
||||
float3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
|
||||
{
|
||||
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
|
||||
{
|
||||
longlong1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
|
||||
{
|
||||
ulonglong1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
|
||||
{
|
||||
longlong2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
|
||||
{
|
||||
ulonglong2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
|
||||
{
|
||||
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
|
||||
{
|
||||
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
|
||||
{
|
||||
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
|
||||
{
|
||||
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
|
||||
{
|
||||
double1 t; t.x = x; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
|
||||
{
|
||||
double2 t; t.x = x; t.y = y; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
|
||||
{
|
||||
double3 t; t.x = x; t.y = y; t.z = z; return t;
|
||||
}
|
||||
|
||||
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
|
||||
{
|
||||
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
|
||||
}
|
||||
|
||||
#undef __VECTOR_FUNCTIONS_DECL__
|
||||
|
||||
#endif /* !__VECTOR_FUNCTIONS_HPP__ */
|
||||
|
431
include/external/cuda/vector_types.h
vendored
Normal file
431
include/external/cuda/vector_types.h
vendored
Normal file
@@ -0,0 +1,431 @@
|
||||
/*
|
||||
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* NOTICE TO LICENSEE:
|
||||
*
|
||||
* This source code and/or documentation ("Licensed Deliverables") are
|
||||
* subject to NVIDIA intellectual property rights under U.S. and
|
||||
* international Copyright laws.
|
||||
*
|
||||
* These Licensed Deliverables contained herein is PROPRIETARY and
|
||||
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
||||
* conditions of a form of NVIDIA software license agreement by and
|
||||
* between NVIDIA and Licensee ("License Agreement") or electronically
|
||||
* accepted by Licensee. Notwithstanding any terms or conditions to
|
||||
* the contrary in the License Agreement, reproduction or disclosure
|
||||
* of the Licensed Deliverables to any third party without the express
|
||||
* written consent of NVIDIA is prohibited.
|
||||
*
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
||||
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
||||
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
||||
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
||||
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
||||
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
||||
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
||||
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
||||
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||||
* OF THESE LICENSED DELIVERABLES.
|
||||
*
|
||||
* U.S. Government End Users. These Licensed Deliverables are a
|
||||
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
||||
* 1995), consisting of "commercial computer software" and "commercial
|
||||
* computer software documentation" as such terms are used in 48
|
||||
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
||||
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
||||
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
||||
* U.S. Government End Users acquire the Licensed Deliverables with
|
||||
* only those rights set forth herein.
|
||||
*
|
||||
* Any use of the Licensed Deliverables in individual and commercial
|
||||
* software must include, in the user documentation and internal
|
||||
* comments to the code, the above Disclaimer and U.S. Government End
|
||||
* Users Notice.
|
||||
*/
|
||||
|
||||
#if !defined(__VECTOR_TYPES_H__)
|
||||
#define __VECTOR_TYPES_H__
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(__CUDA_LIBDEVICE__) && !defined(__CUDACC_RTC__)
|
||||
#define EXCLUDE_FROM_RTC
|
||||
#include "builtin_types.h"
|
||||
#undef EXCLUDE_FROM_RTC
|
||||
#endif /* !__CUDA_LIBDEVICE__ && !__CUDACC_RTC__ */
|
||||
#include "host_defines.h"
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDABE__) && \
|
||||
defined(_WIN32) && !defined(_WIN64)
|
||||
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4201 4408)
|
||||
|
||||
#define __cuda_builtin_vector_align8(tag, members) \
|
||||
struct __device_builtin__ tag \
|
||||
{ \
|
||||
union \
|
||||
{ \
|
||||
struct { members }; \
|
||||
struct { long long int :1,:0; }; \
|
||||
}; \
|
||||
}
|
||||
|
||||
#else /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
|
||||
|
||||
#define __cuda_builtin_vector_align8(tag, members) \
|
||||
struct __device_builtin__ __align__(8) tag \
|
||||
{ \
|
||||
members \
|
||||
}
|
||||
|
||||
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
|
||||
|
||||
struct __device_builtin__ char1
|
||||
{
|
||||
signed char x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uchar1
|
||||
{
|
||||
unsigned char x;
|
||||
};
|
||||
|
||||
|
||||
struct __device_builtin__ __align__(2) char2
|
||||
{
|
||||
signed char x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(2) uchar2
|
||||
{
|
||||
unsigned char x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ char3
|
||||
{
|
||||
signed char x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uchar3
|
||||
{
|
||||
unsigned char x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) char4
|
||||
{
|
||||
signed char x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) uchar4
|
||||
{
|
||||
unsigned char x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ short1
|
||||
{
|
||||
short x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ushort1
|
||||
{
|
||||
unsigned short x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) short2
|
||||
{
|
||||
short x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(4) ushort2
|
||||
{
|
||||
unsigned short x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ short3
|
||||
{
|
||||
short x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ushort3
|
||||
{
|
||||
unsigned short x, y, z;
|
||||
};
|
||||
|
||||
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
|
||||
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
|
||||
|
||||
struct __device_builtin__ int1
|
||||
{
|
||||
int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uint1
|
||||
{
|
||||
unsigned int x;
|
||||
};
|
||||
|
||||
__cuda_builtin_vector_align8(int2, int x; int y;);
|
||||
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
|
||||
|
||||
struct __device_builtin__ int3
|
||||
{
|
||||
int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ uint3
|
||||
{
|
||||
unsigned int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) int4
|
||||
{
|
||||
int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) uint4
|
||||
{
|
||||
unsigned int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ long1
|
||||
{
|
||||
long int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulong1
|
||||
{
|
||||
unsigned long x;
|
||||
};
|
||||
|
||||
#if defined(__CUDACC_RTC__) || defined(_WIN32)
|
||||
__cuda_builtin_vector_align8(long2, long int x; long int y;);
|
||||
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
|
||||
#else /* __CUDACC_RTC__ || _WIN32 */
|
||||
|
||||
struct __device_builtin__ __align__(2*sizeof(long int)) long2
|
||||
{
|
||||
long int x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
|
||||
{
|
||||
unsigned long int x, y;
|
||||
};
|
||||
|
||||
#endif /* __CUDACC_RTC__ || _WIN32 */
|
||||
|
||||
struct __device_builtin__ long3
|
||||
{
|
||||
long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulong3
|
||||
{
|
||||
unsigned long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) long4
|
||||
{
|
||||
long int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) ulong4
|
||||
{
|
||||
unsigned long int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ float1
|
||||
{
|
||||
float x;
|
||||
};
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(__arm__) && \
|
||||
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-pedantic"
|
||||
|
||||
struct __device_builtin__ __attribute__((aligned(8))) float2
|
||||
{
|
||||
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
|
||||
};
|
||||
|
||||
#pragma GCC poison __cuda_gnu_arm_ice_workaround
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
#else /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
|
||||
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
|
||||
|
||||
__cuda_builtin_vector_align8(float2, float x; float y;);
|
||||
|
||||
#endif /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
|
||||
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
|
||||
|
||||
struct __device_builtin__ float3
|
||||
{
|
||||
float x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) float4
|
||||
{
|
||||
float x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ longlong1
|
||||
{
|
||||
long long int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulonglong1
|
||||
{
|
||||
unsigned long long int x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) longlong2
|
||||
{
|
||||
long long int x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) ulonglong2
|
||||
{
|
||||
unsigned long long int x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ longlong3
|
||||
{
|
||||
long long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ ulonglong3
|
||||
{
|
||||
unsigned long long int x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) longlong4
|
||||
{
|
||||
long long int x, y, z ,w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) ulonglong4
|
||||
{
|
||||
unsigned long long int x, y, z, w;
|
||||
};
|
||||
|
||||
struct __device_builtin__ double1
|
||||
{
|
||||
double x;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) double2
|
||||
{
|
||||
double x, y;
|
||||
};
|
||||
|
||||
struct __device_builtin__ double3
|
||||
{
|
||||
double x, y, z;
|
||||
};
|
||||
|
||||
struct __device_builtin__ __builtin_align__(16) double4
|
||||
{
|
||||
double x, y, z, w;
|
||||
};
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \
|
||||
defined(_WIN32) && !defined(_WIN64)
|
||||
|
||||
#pragma warning(pop)
|
||||
|
||||
#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
typedef __device_builtin__ struct char1 char1;
|
||||
typedef __device_builtin__ struct uchar1 uchar1;
|
||||
typedef __device_builtin__ struct char2 char2;
|
||||
typedef __device_builtin__ struct uchar2 uchar2;
|
||||
typedef __device_builtin__ struct char3 char3;
|
||||
typedef __device_builtin__ struct uchar3 uchar3;
|
||||
typedef __device_builtin__ struct char4 char4;
|
||||
typedef __device_builtin__ struct uchar4 uchar4;
|
||||
typedef __device_builtin__ struct short1 short1;
|
||||
typedef __device_builtin__ struct ushort1 ushort1;
|
||||
typedef __device_builtin__ struct short2 short2;
|
||||
typedef __device_builtin__ struct ushort2 ushort2;
|
||||
typedef __device_builtin__ struct short3 short3;
|
||||
typedef __device_builtin__ struct ushort3 ushort3;
|
||||
typedef __device_builtin__ struct short4 short4;
|
||||
typedef __device_builtin__ struct ushort4 ushort4;
|
||||
typedef __device_builtin__ struct int1 int1;
|
||||
typedef __device_builtin__ struct uint1 uint1;
|
||||
typedef __device_builtin__ struct int2 int2;
|
||||
typedef __device_builtin__ struct uint2 uint2;
|
||||
typedef __device_builtin__ struct int3 int3;
|
||||
typedef __device_builtin__ struct uint3 uint3;
|
||||
typedef __device_builtin__ struct int4 int4;
|
||||
typedef __device_builtin__ struct uint4 uint4;
|
||||
typedef __device_builtin__ struct long1 long1;
|
||||
typedef __device_builtin__ struct ulong1 ulong1;
|
||||
typedef __device_builtin__ struct long2 long2;
|
||||
typedef __device_builtin__ struct ulong2 ulong2;
|
||||
typedef __device_builtin__ struct long3 long3;
|
||||
typedef __device_builtin__ struct ulong3 ulong3;
|
||||
typedef __device_builtin__ struct long4 long4;
|
||||
typedef __device_builtin__ struct ulong4 ulong4;
|
||||
typedef __device_builtin__ struct float1 float1;
|
||||
typedef __device_builtin__ struct float2 float2;
|
||||
typedef __device_builtin__ struct float3 float3;
|
||||
typedef __device_builtin__ struct float4 float4;
|
||||
typedef __device_builtin__ struct longlong1 longlong1;
|
||||
typedef __device_builtin__ struct ulonglong1 ulonglong1;
|
||||
typedef __device_builtin__ struct longlong2 longlong2;
|
||||
typedef __device_builtin__ struct ulonglong2 ulonglong2;
|
||||
typedef __device_builtin__ struct longlong3 longlong3;
|
||||
typedef __device_builtin__ struct ulonglong3 ulonglong3;
|
||||
typedef __device_builtin__ struct longlong4 longlong4;
|
||||
typedef __device_builtin__ struct ulonglong4 ulonglong4;
|
||||
typedef __device_builtin__ struct double1 double1;
|
||||
typedef __device_builtin__ struct double2 double2;
|
||||
typedef __device_builtin__ struct double3 double3;
|
||||
typedef __device_builtin__ struct double4 double4;
|
||||
|
||||
/*******************************************************************************
|
||||
* *
|
||||
* *
|
||||
* *
|
||||
*******************************************************************************/
|
||||
|
||||
struct __device_builtin__ dim3
|
||||
{
|
||||
unsigned int x, y, z;
|
||||
#if defined(__cplusplus)
|
||||
__host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
|
||||
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
|
||||
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
|
||||
#endif /* __cplusplus */
|
||||
};
|
||||
|
||||
typedef __device_builtin__ struct dim3 dim3;
|
||||
|
||||
#undef __cuda_builtin_vector_align8
|
||||
|
||||
#endif /* !__VECTOR_TYPES_H__ */
|
@@ -19,10 +19,10 @@ class ISAACAPI Buffer
|
||||
friend class CommandQueue;
|
||||
friend class Kernel;
|
||||
public:
|
||||
// Wraps an existing CUDA device pointer; `take_ownership` is forwarded to the handle.
// `h` deliberately has no default value: the cl_mem overload declared next already
// defaults all of its arguments, so a second fully-defaulted constructor would make
// a plain `Buffer()` ambiguous.
Buffer(CUdeviceptr h, bool take_ownership = true);
|
||||
Buffer(cl_mem Buffer = 0, bool take_ownership = true);
|
||||
Buffer(Context const & context, size_t size);
|
||||
Context const & context() const;
|
||||
size_t size() const;
|
||||
bool operator<(Buffer const &) const;
|
||||
bool operator==(Buffer const &) const;
|
||||
HANDLE_TYPE(cl_mem, CUdeviceptr)& handle();
|
||||
@@ -30,7 +30,6 @@ public:
|
||||
private:
|
||||
backend_type backend_;
|
||||
Context context_;
|
||||
size_t size_;
|
||||
HANDLE_TYPE(cl_mem, CUdeviceptr) h_;
|
||||
};
|
||||
|
||||
|
@@ -9,12 +9,17 @@ namespace isaac
|
||||
namespace driver
|
||||
{
|
||||
|
||||
Buffer::Buffer(cl_mem buffer, bool take_ownership) : backend_(OPENCL), context_(backend::contexts::import(ocl::info<CL_MEM_CONTEXT>(buffer))), size_(ocl::info<CL_MEM_SIZE>(buffer)), h_(backend_, take_ownership)
|
||||
Buffer::Buffer(CUdeviceptr h, bool take_ownership) : backend_(CUDA), context_(backend::contexts::get_default()), h_(backend_, take_ownership)
|
||||
{
|
||||
h_.cu() = h;
|
||||
}
|
||||
|
||||
Buffer::Buffer(cl_mem buffer, bool take_ownership) : backend_(OPENCL), context_(backend::contexts::import(ocl::info<CL_MEM_CONTEXT>(buffer))), h_(backend_, take_ownership)
|
||||
{
|
||||
h_.cl() = buffer;
|
||||
}
|
||||
|
||||
Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_), context_(context), size_(size), h_(backend_, true)
|
||||
Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_), context_(context), h_(backend_, true)
|
||||
{
|
||||
switch(backend_)
|
||||
{
|
||||
@@ -31,9 +36,6 @@ Buffer::Buffer(Context const & context, size_t size) : backend_(context.backend_
|
||||
}
|
||||
}
|
||||
|
||||
size_t Buffer::size() const
|
||||
{ return size_; }
|
||||
|
||||
Context const & Buffer::context() const
|
||||
{ return context_; }
|
||||
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "isaac/wrap/clBLAS.h"
|
||||
#include "isaac/array.h"
|
||||
#include "isaac/symbolic/execute.h"
|
||||
#include "clBLAS.h"
|
||||
|
||||
namespace sc = isaac;
|
||||
|
||||
|
128
lib/wrap/cublas.cpp
Normal file
128
lib/wrap/cublas.cpp
Normal file
@@ -0,0 +1,128 @@
|
||||
#include "isaac/array.h"
|
||||
#include "isaac/symbolic/execute.h"
|
||||
#include "cublas.h"
|
||||
|
||||
namespace sc = isaac;
|
||||
|
||||
extern "C"
|
||||
{
|
||||
|
||||
//*****************
|
||||
//BLAS1
|
||||
//*****************
|
||||
|
||||
//AXPY
|
||||
// Generates cublas<S|D>axpy: y <- alpha*x + y over n elements.
//   TYPE_CHAR  - letter spliced into the function name (S or D)
//   TYPE_ISAAC - isaac numeric type tag used for the wrapping arrays
//   TYPE_CU    - C scalar type of alpha, x and y
// x and y are wrapped in isaac buffers constructed with take_ownership = false.
// As in the reference BLAS, n <= 0 is a no-op.
#define MAKE_AXPY(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    void cublas ## TYPE_CHAR ## axpy (int n, TYPE_CU alpha, const TYPE_CU *x, int incx, TYPE_CU *y, int incy)\
    {\
        if(n <= 0) return; /* BLAS quick return: nothing to update */\
        sc::array dx((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
        sc::array dy((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)y,false), 0, incy); \
        sc::execute(sc::assign(dy, alpha*dx + dy));\
    }
|
||||
|
||||
MAKE_AXPY(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_AXPY(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
//COPY
|
||||
// Generates cublas<S|D>copy: y <- x over n elements.
//   TYPE_CHAR  - letter spliced into the function name (S or D)
//   TYPE_ISAAC - isaac numeric type tag used for the wrapping arrays
//   TYPE_CU    - C scalar type of x and y
// x and y are wrapped in isaac buffers constructed with take_ownership = false.
// As in the reference BLAS, n <= 0 is a no-op.
#define MAKE_COPY(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    void cublas ## TYPE_CHAR ## copy (int n, const TYPE_CU *x, int incx, TYPE_CU *y, int incy)\
    {\
        if(n <= 0) return; /* BLAS quick return: nothing to copy */\
        sc::array dx((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
        sc::array dy((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)y,false), 0, incy); \
        sc::execute(sc::assign(dy,dx));\
    }
|
||||
|
||||
MAKE_COPY(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_COPY(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
//SCAL
|
||||
// Generates cublas<S|D>scal: x <- alpha*x over n elements.
//   TYPE_CHAR  - letter spliced into the function name (S or D)
//   TYPE_ISAAC - isaac numeric type tag used for the wrapping array
//   TYPE_CU    - C scalar type of alpha and x
// x is wrapped in an isaac buffer constructed with take_ownership = false.
// As in the reference BLAS, n <= 0 is a no-op.
#define MAKE_SCAL(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    void cublas ## TYPE_CHAR ## scal (int n, TYPE_CU alpha, TYPE_CU *x, int incx)\
    {\
        if(n <= 0) return; /* BLAS quick return: nothing to scale */\
        sc::array dx((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
        sc::execute(sc::assign(dx,alpha*dx));\
    }
|
||||
|
||||
MAKE_SCAL(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_SCAL(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
//DOT
|
||||
// Generates cublas<S|D>dot: returns the dot product of x and y over n elements.
//   TYPE_CHAR  - letter spliced into the function name (S or D)
//   TYPE_ISAAC - isaac numeric type tag used for the wrapping arrays
//   TYPE_CU    - C scalar type of x, y and the returned value
// x and y are wrapped in isaac buffers constructed with take_ownership = false.
// As in the reference BLAS, n <= 0 yields 0 without touching the device.
#define MAKE_DOT(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    TYPE_CU cublas ## TYPE_CHAR ## dot (int n, const TYPE_CU *x, int incx, const TYPE_CU *y, int incy)\
    {\
        if(n <= 0) return 0; /* BLAS quick return: empty reduction */\
        sc::array dx((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
        sc::array dy((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)y,false), 0, incy); \
        return sc::value_scalar(sc::dot(dx,dy));\
    }
|
||||
|
||||
MAKE_DOT(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_DOT(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
//ASUM
|
||||
// Generates cublas<S|D>asum: returns the sum of absolute values of x over n elements.
//   TYPE_CHAR  - letter spliced into the function name (S or D)
//   TYPE_ISAAC - isaac numeric type tag used for the wrapping array
//   TYPE_CU    - C scalar type of x and the returned value
// x is wrapped in an isaac buffer constructed with take_ownership = false.
// As in the reference BLAS, n <= 0 yields 0 without touching the device.
#define MAKE_ASUM(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    TYPE_CU cublas ## TYPE_CHAR ## asum (int n, const TYPE_CU *x, int incx)\
    {\
        if(n <= 0) return 0; /* BLAS quick return: empty reduction */\
        sc::array dx((sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x,false), 0, incx); \
        return sc::value_scalar(sum(abs(dx)));\
    }
|
||||
|
||||
MAKE_ASUM(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_ASUM(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
//*****************
|
||||
//BLAS2
|
||||
//*****************
|
||||
|
||||
// Generates cublas<S|D>gemv: y <- alpha*op(A)*x + beta*y, where A is m x n with
// leading dimension lda and op(A) is A or its transpose depending on `trans`.
//   TYPE_CHAR  - letter spliced into the function name (S or D)
//   TYPE_ISAAC - isaac numeric type tag used for the wrapping arrays
//   TYPE_CU    - C scalar type of alpha, beta, A, x and y
// The transpose flag is matched case-insensitively, and C/c is treated like T/t
// (conjugation is a no-op for the real types instantiated here). Any other
// character selects the non-transposed product.
// A, x and y are wrapped in isaac buffers constructed with take_ownership = false.
#define MAKE_GEMV(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
    void cublas ## TYPE_CHAR ## gemv (char trans, int m, int n, TYPE_CU alpha,\
                                      const TYPE_CU *A, int lda, const TYPE_CU *x, int incx,\
                                      TYPE_CU beta, TYPE_CU *y, int incy)\
    {\
        /* decode the flag once so the sizes and the product can never disagree */\
        const bool transposed = (trans=='T' || trans=='t' || trans=='C' || trans=='c');\
        sc::array dA((sc::int_t)m, (sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)A, false), 0, (sc::int_t)lda);\
        \
        /* x has n entries and y has m for op(A)=A; the roles swap when transposed */\
        sc::int_t sx = (sc::int_t)n, sy = (sc::int_t)m;\
        if(transposed) std::swap(sx, sy);\
        sc::array dx(sx, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)x, false), 0, incx);\
        sc::array dy(sy, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)y, false), 0, incy);\
        \
        if(transposed)\
            sc::execute(sc::assign(dy, alpha*dot(dA.T, dx) + beta*dy));\
        else\
            sc::execute(sc::assign(dy, alpha*dot(dA, dx) + beta*dy));\
    }
|
||||
|
||||
MAKE_GEMV(S, sc::FLOAT_TYPE, float)
|
||||
MAKE_GEMV(D, sc::DOUBLE_TYPE, double)
|
||||
|
||||
//*****************
|
||||
//BLAS3
|
||||
//*****************
|
||||
|
||||
#define MAKE_GEMM(TYPE_CHAR, TYPE_ISAAC, TYPE_CU) \
|
||||
void cublas ## TYPE_CHAR ## gemm (char transa, char transb, int m, int n, int k,\
|
||||
TYPE_CU alpha, const TYPE_CU *A, int lda,\
|
||||
const TYPE_CU *B, int ldb, TYPE_CU beta, TYPE_CU *C,\
|
||||
int ldc)\
|
||||
{\
|
||||
sc::int_t As1 = (sc::int_t)m, As2 = (sc::int_t)k;\
|
||||
sc::int_t Bs1 = (sc::int_t)k, Bs2 = (sc::int_t)n;\
|
||||
if(transa=='T') std::swap(As1, As2);\
|
||||
if(transb=='T') std::swap(Bs1, Bs2);\
|
||||
/*Struct*/\
|
||||
sc::array dA(As1, As2, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)A, false), 0, (sc::int_t)lda);\
|
||||
sc::array dB(Bs1, Bs2, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)B, false), 0, (sc::int_t)ldb);\
|
||||
sc::array dC((sc::int_t)m, (sc::int_t)n, TYPE_ISAAC, sc::driver::Buffer((CUdeviceptr)C, false), 0, (sc::int_t)ldc);\
|
||||
/*Operation*/\
|
||||
if((transa=='T') && (transb=='T'))\
|
||||
sc::execute(sc::assign(dC, alpha*dot(dA.T, dB.T) + beta*dC));\
|
||||
else if((transa=='T') && (transb=='N'))\
|
||||
sc::execute(sc::assign(dC, alpha*dot(dA.T, dB) + beta*dC));\
|
||||
else if((transa=='N') && (transb=='T'))\
|
||||
sc::execute(sc::assign(dC, alpha*dot(dA, dB.T) + beta*dC));\
|
||||
else\
|
||||
sc::execute(sc::assign(dC, alpha*dot(dA, dB) + beta*dC));\
|
||||
}
|
||||
|
||||
MAKE_GEMM(S, sc::FLOAT_TYPE, cl_float)
|
||||
MAKE_GEMM(D, sc::DOUBLE_TYPE, cl_double)
|
||||
}
|
@@ -63,7 +63,7 @@ def main():
|
||||
|
||||
#Include directories
|
||||
numpy_include = os.path.join(find_module("numpy")[1], "core", "include")
|
||||
include =' src/include src/lib/external'.split() + ['external/boost/', 'external/boost/boost/', numpy_include]
|
||||
include =' src/include src/lib/external src/include/external src/include/external/cuda'.split() + ['external/boost/', 'external/boost/boost/', numpy_include]
|
||||
|
||||
#Android
|
||||
if for_android:
|
||||
@@ -73,7 +73,7 @@ def main():
|
||||
libraries += ['gnustl_shared']
|
||||
|
||||
#Source files
|
||||
src = 'src/lib/wrap/clBLAS.cpp src/lib/exception/operation_not_supported.cpp src/lib/exception/unknown_datatype.cpp src/lib/value_scalar.cpp src/lib/array.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/kernels/binder.cpp src/lib/kernels/keywords.cpp src/lib/kernels/parse.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/driver/dispatch.cpp src/lib/driver/kernel.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/check.cpp src/lib/driver/command_queue.cpp src/lib/driver/handle.cpp src/lib/driver/context.cpp src/lib/driver/program.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/profiles/profiles.cpp src/lib/profiles/presets.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
||||
src = 'src/lib/wrap/cublas.cpp src/lib/wrap/clBLAS.cpp src/lib/exception/operation_not_supported.cpp src/lib/exception/unknown_datatype.cpp src/lib/value_scalar.cpp src/lib/array.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/kernels/binder.cpp src/lib/kernels/keywords.cpp src/lib/kernels/parse.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/driver/dispatch.cpp src/lib/driver/kernel.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/check.cpp src/lib/driver/command_queue.cpp src/lib/driver/handle.cpp src/lib/driver/context.cpp src/lib/driver/program.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/profiles/profiles.cpp src/lib/profiles/presets.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
|
||||
boostsrc = 'external/boost/libs/'
|
||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
||||
|
@@ -1,2 +1,9 @@
|
||||
# add_isaac_test(<NAME>)
#
# Builds the executable target test-<NAME> from <NAME>.cpp (resolved relative to the
# directory of the calling CMakeLists.txt), links it against the isaac library and
# registers it with CTest under the name test-<NAME>.
#
# Example:
#   add_isaac_test(gemm)   # builds test-gemm from gemm.cpp
function(add_isaac_test NAME)
  add_executable(test-${NAME} ${NAME}.cpp)
  target_link_libraries(test-${NAME} isaac)
  # The NAME/COMMAND signature makes CMake substitute the real path of the built
  # executable target; the legacy positional form does not resolve target names.
  add_test(NAME test-${NAME} COMMAND test-${NAME})
endfunction()
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_subdirectory(operations)
|
||||
|
@@ -18,10 +18,12 @@ template<> struct BLAS<double> { template<class FT, class DT> static DT F(FT , D
|
||||
// Selects which API surface a test run exercises; the test bodies branch on this value.
enum interface_t
|
||||
{
|
||||
// Call the clblas* entry points (the tests also require an OpenCL backend for this path).
clBLAS,
|
||||
// Call the cublas* entry points (the tests also require a CUDA backend for this path).
cuBLAS,
|
||||
// Use isaac's own C++ expression interface (e.g. z = x + y, dot(x,y)).
CPP
|
||||
};
|
||||
|
||||
// Shorthand accessors used by the tests to build raw BLAS argument lists from an isaac array X.
// CHANDLE: the OpenCL handle of X's underlying buffer (X.data().handle().cl()).
#define CHANDLE(X) X.data().handle().cl()
|
||||
// CUHANDLE: the CUDA handle of X's underlying buffer (X.data().handle().cu()).
#define CUHANDLE(X) X.data().handle().cu()
|
||||
// OFF: X.start() -- passed as the buffer offset argument in the BLAS calls.
#define OFF(X) X.start()
|
||||
// INC: first stride component of X -- passed as the vector increment.
#define INC(X) X.stride()[0]
|
||||
// LD: second stride component of X -- passed as the matrix leading dimension.
#define LD(X) X.stride()[1]
|
||||
|
@@ -1,5 +1,3 @@
|
||||
foreach(PROG axpy dot ger gemv gemm)
|
||||
add_executable(test-${PROG} ${PROG}.cpp)
|
||||
add_test(test-${PROG} test-${PROG})
|
||||
target_link_libraries(test-${PROG} isaac)
|
||||
foreach(NAME axpy dot ger gemv gemm)
|
||||
add_isaac_test(${NAME})
|
||||
endforeach()
|
||||
|
@@ -2,8 +2,9 @@
|
||||
#include <iostream>
|
||||
#include "common.hpp"
|
||||
#include "isaac/array.h"
|
||||
#include "isaac/wrap/clBLAS.h"
|
||||
#include "isaac/driver/common.h"
|
||||
#include "clBLAS.h"
|
||||
#include "cublas.h"
|
||||
|
||||
namespace sc = isaac;
|
||||
typedef isaac::int_t int_t;
|
||||
@@ -20,7 +21,7 @@ void test_impl(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>& cy,
|
||||
sc::driver::CommandQueue queue = sc::driver::backend::queues::get(context,0);
|
||||
int_t N = cz.size();
|
||||
|
||||
T aa = static_cast<T>(-4.3);
|
||||
T aa = static_cast<T>(-4);
|
||||
T bb = static_cast<T>(3.5);
|
||||
isaac::value_scalar a(aa), b(bb);
|
||||
isaac::scalar da(a, context), db(b, context);
|
||||
@@ -49,65 +50,66 @@ void test_impl(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>& cy,
|
||||
if(queue.device().backend()==sc::driver::OPENCL && interf==clBLAS)
|
||||
{
|
||||
cl_command_queue clqueue = queue.handle().cl();
|
||||
RUN_TEST("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, CHANDLE(x), OFF(x), INC(x), CHANDLE(z), OFF(z), INC(z), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, CHANDLE(x), OFF(x), INC(x), CHANDLE(z), OFF(z), INC(z), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, CHANDLE(z), OFF(z), INC(z), 1, &clqueue, 0, NULL, NULL));
|
||||
}
|
||||
if(queue.device().backend()==sc::driver::CUDA && interf == cuBLAS)
|
||||
{
|
||||
RUN_TEST("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(cublasSaxpy, cublasDaxpy)(N, a, (T*)CUHANDLE(x) + OFF(x), INC(x), (T*)CUHANDLE(z) + OFF(z), INC(z)));
|
||||
RUN_TEST("COPY", cz[i] = cx[i], BLAS<T>::F(cublasScopy, cublasDcopy)(N, (T*)CUHANDLE(x) + OFF(x), INC(x), (T*)CUHANDLE(z) + OFF(z), INC(z)));
|
||||
RUN_TEST("SCAL", cz[i] = a*cz[i], BLAS<T>::F(cublasSscal, cublasDscal)(N, a, (T*)CUHANDLE(z) + OFF(z), INC(z)));
|
||||
}
|
||||
if(interf == CPP)
|
||||
{
|
||||
RUN_TEST("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, context))
|
||||
RUN_TEST("z = x", cz[i] = cx[i], z = x)
|
||||
RUN_TEST("z = -x", cz[i] = -cx[i], z = -x)
|
||||
|
||||
RUN_TEST("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, CHANDLE(x), OFF(x), INC(x),
|
||||
CHANDLE(z), OFF(z), INC(z),
|
||||
1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("z = x + y", cz[i] = cx[i] + cy[i], z = x + y)
|
||||
RUN_TEST("z = x - y", cz[i] = cx[i] - cy[i], z = x - y)
|
||||
RUN_TEST("z = x + y + z", cz[i] = cx[i] + cy[i] + cz[i], z = x + y + z)
|
||||
|
||||
RUN_TEST("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, CHANDLE(x), OFF(x), INC(x),
|
||||
CHANDLE(z), OFF(z), INC(z),
|
||||
1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("z = a*x", cz[i] = aa*cx[i], z = a*x)
|
||||
RUN_TEST("z = da*x", cz[i] = aa*cx[i], z = da*x)
|
||||
RUN_TEST("z = a*x + b*y", cz[i] = aa*cx[i] + bb*cy[i], z= a*x + b*y)
|
||||
RUN_TEST("z = da*x + b*y", cz[i] = aa*cx[i] + bb*cy[i], z= da*x + b*y)
|
||||
RUN_TEST("z = a*x + db*y", cz[i] = aa*cx[i] + bb*cy[i], z= a*x + db*y)
|
||||
RUN_TEST("z = da*x + db*y", cz[i] = aa*cx[i] + bb*cy[i], z= da*x + db*y)
|
||||
|
||||
RUN_TEST("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, CHANDLE(z), OFF(z), INC(z),
|
||||
1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("z = exp(x)", cz[i] = exp(cx[i]), z= exp(x))
|
||||
RUN_TEST("z = abs(x)", cz[i] = abs(cx[i]), z= abs(x))
|
||||
RUN_TEST("z = acos(x)", cz[i] = acos(cx[i]), z= acos(x))
|
||||
RUN_TEST("z = asin(x)", cz[i] = asin(cx[i]), z= asin(x))
|
||||
RUN_TEST("z = atan(x)", cz[i] = atan(cx[i]), z= atan(x))
|
||||
RUN_TEST("z = ceil(x)", cz[i] = ceil(cx[i]), z= ceil(x))
|
||||
RUN_TEST("z = cos(x)", cz[i] = cos(cx[i]), z= cos(x))
|
||||
RUN_TEST("z = cosh(x)", cz[i] = cosh(cx[i]), z= cosh(x))
|
||||
RUN_TEST("z = floor(x)", cz[i] = floor(cx[i]), z= floor(x))
|
||||
RUN_TEST("z = log(x)", cz[i] = log(cx[i]), z= log(x))
|
||||
RUN_TEST("z = log10(x)", cz[i] = log10(cx[i]), z= log10(x))
|
||||
RUN_TEST("z = sin(x)", cz[i] = sin(cx[i]), z= sin(x))
|
||||
RUN_TEST("z = sinh(x)", cz[i] = sinh(cx[i]), z= sinh(x))
|
||||
RUN_TEST("z = sqrt(x)", cz[i] = sqrt(cx[i]), z= sqrt(x))
|
||||
RUN_TEST("z = tan(x)", cz[i] = tan(cx[i]), z= tan(x))
|
||||
RUN_TEST("z = tanh(x)", cz[i] = tanh(cx[i]), z= tanh(x))
|
||||
|
||||
RUN_TEST("z = x.*y", cz[i] = cx[i]*cy[i], z= x*y)
|
||||
RUN_TEST("z = x./y", cz[i] = cx[i]/cy[i], z= x/y)
|
||||
|
||||
RUN_TEST("z = pow(x,y)", cz[i] = pow(cx[i], cy[i]), z= pow(x,y))
|
||||
|
||||
#undef CONVERT
|
||||
#define CONVERT for(int_t i = 0 ; i < N ; ++i) {cz[i] = !!cz[i] ; buffer[i] = !!buffer[i];}
|
||||
RUN_TEST("z = x==y", cz[i] = cx[i]==cy[i], z= cast(x==y, dtype))
|
||||
RUN_TEST("z = x>=y", cz[i] = cx[i]>=cy[i], z= cast(x>=y, dtype))
|
||||
RUN_TEST("z = x>y", cz[i] = cx[i]>cy[i], z= cast(x>y, dtype))
|
||||
RUN_TEST("z = x<=y", cz[i] = cx[i]<=cy[i], z= cast(x<=y, dtype))
|
||||
RUN_TEST("z = x<y", cz[i] = cx[i]<cy[i], z= cast(x<y, dtype))
|
||||
RUN_TEST("z = x!=y", cz[i] = cx[i]!=cy[i], z= cast(x!=y, dtype))
|
||||
#undef RUN_TEST
|
||||
}
|
||||
|
||||
RUN_TEST("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, context))
|
||||
RUN_TEST("z = x", cz[i] = cx[i], z = x)
|
||||
RUN_TEST("z = -x", cz[i] = -cx[i], z = -x)
|
||||
|
||||
RUN_TEST("z = x + y", cz[i] = cx[i] + cy[i], z = x + y)
|
||||
RUN_TEST("z = x - y", cz[i] = cx[i] - cy[i], z = x - y)
|
||||
RUN_TEST("z = x + y + z", cz[i] = cx[i] + cy[i] + cz[i], z = x + y + z)
|
||||
|
||||
RUN_TEST("z = a*x", cz[i] = aa*cx[i], z = a*x)
|
||||
RUN_TEST("z = da*x", cz[i] = aa*cx[i], z = da*x)
|
||||
RUN_TEST("z = a*x + b*y", cz[i] = aa*cx[i] + bb*cy[i], z= a*x + b*y)
|
||||
RUN_TEST("z = da*x + b*y", cz[i] = aa*cx[i] + bb*cy[i], z= da*x + b*y)
|
||||
RUN_TEST("z = a*x + db*y", cz[i] = aa*cx[i] + bb*cy[i], z= a*x + db*y)
|
||||
RUN_TEST("z = da*x + db*y", cz[i] = aa*cx[i] + bb*cy[i], z= da*x + db*y)
|
||||
|
||||
RUN_TEST("z = exp(x)", cz[i] = exp(cx[i]), z= exp(x))
|
||||
RUN_TEST("z = abs(x)", cz[i] = abs(cx[i]), z= abs(x))
|
||||
RUN_TEST("z = acos(x)", cz[i] = acos(cx[i]), z= acos(x))
|
||||
RUN_TEST("z = asin(x)", cz[i] = asin(cx[i]), z= asin(x))
|
||||
RUN_TEST("z = atan(x)", cz[i] = atan(cx[i]), z= atan(x))
|
||||
RUN_TEST("z = ceil(x)", cz[i] = ceil(cx[i]), z= ceil(x))
|
||||
RUN_TEST("z = cos(x)", cz[i] = cos(cx[i]), z= cos(x))
|
||||
RUN_TEST("z = cosh(x)", cz[i] = cosh(cx[i]), z= cosh(x))
|
||||
RUN_TEST("z = floor(x)", cz[i] = floor(cx[i]), z= floor(x))
|
||||
RUN_TEST("z = log(x)", cz[i] = log(cx[i]), z= log(x))
|
||||
RUN_TEST("z = log10(x)", cz[i] = log10(cx[i]), z= log10(x))
|
||||
RUN_TEST("z = sin(x)", cz[i] = sin(cx[i]), z= sin(x))
|
||||
RUN_TEST("z = sinh(x)", cz[i] = sinh(cx[i]), z= sinh(x))
|
||||
RUN_TEST("z = sqrt(x)", cz[i] = sqrt(cx[i]), z= sqrt(x))
|
||||
RUN_TEST("z = tan(x)", cz[i] = tan(cx[i]), z= tan(x))
|
||||
RUN_TEST("z = tanh(x)", cz[i] = tanh(cx[i]), z= tanh(x))
|
||||
|
||||
RUN_TEST("z = x.*y", cz[i] = cx[i]*cy[i], z= x*y)
|
||||
RUN_TEST("z = x./y", cz[i] = cx[i]/cy[i], z= x/y)
|
||||
|
||||
RUN_TEST("z = pow(x,y)", cz[i] = pow(cx[i], cy[i]), z= pow(x,y))
|
||||
|
||||
#undef CONVERT
|
||||
#define CONVERT for(int_t i = 0 ; i < N ; ++i) {cz[i] = !!cz[i] ; buffer[i] = !!buffer[i];}
|
||||
RUN_TEST("z = x==y", cz[i] = cx[i]==cy[i], z= cast(x==y, dtype))
|
||||
RUN_TEST("z = x>=y", cz[i] = cx[i]>=cy[i], z= cast(x>=y, dtype))
|
||||
RUN_TEST("z = x>y", cz[i] = cx[i]>cy[i], z= cast(x>y, dtype))
|
||||
RUN_TEST("z = x<=y", cz[i] = cx[i]<=cy[i], z= cast(x<=y, dtype))
|
||||
RUN_TEST("z = x<y", cz[i] = cx[i]<cy[i], z= cast(x<y, dtype))
|
||||
RUN_TEST("z = x!=y", cz[i] = cx[i]!=cy[i], z= cast(x!=y, dtype))
|
||||
#undef RUN_TEST
|
||||
|
||||
|
||||
if(failure_count > 0)
|
||||
@@ -126,9 +128,11 @@ void test(T epsilon, sc::driver::Context const & ctx)
|
||||
|
||||
std::cout << "> standard..." << std::endl;
|
||||
test_impl(epsilon, cx, cy, cz, x, y, z, clBLAS);
|
||||
test_impl(epsilon, cx, cy, cz, x, y, z, cuBLAS);
|
||||
test_impl(epsilon, cx, cy, cz, x, y, z, CPP);
|
||||
std::cout << "> slice..." << std::endl;
|
||||
test_impl(epsilon, cx_s, cy_s, cz_s, x_s, y_s, z_s, clBLAS);
|
||||
test_impl(epsilon, cx_s, cy_s, cz_s, x_s, y_s, z_s, cuBLAS);
|
||||
test_impl(epsilon, cx_s, cy_s, cz_s, x_s, y_s, z_s, CPP);
|
||||
}
|
||||
|
||||
|
@@ -3,7 +3,8 @@
|
||||
|
||||
#include "common.hpp"
|
||||
#include "isaac/array.h"
|
||||
#include "isaac/wrap/clBLAS.h"
|
||||
#include "clBLAS.h"
|
||||
#include "cublas.h"
|
||||
|
||||
namespace sc = isaac;
|
||||
typedef sc::int_t int_t;
|
||||
@@ -47,21 +48,25 @@ void test_impl(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & c
|
||||
if(ctx.backend()==sc::driver::OPENCL && interf==clBLAS)
|
||||
{
|
||||
cl_command_queue clqueue = queue.handle().cl();
|
||||
|
||||
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, CHANDLE(ds), 0, CHANDLE(x), OFF(x), INC(x),
|
||||
CHANDLE(y), OFF(y), INC(y),
|
||||
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, CHANDLE(ds), 0, CHANDLE(x), OFF(x), INC(x),
|
||||
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, CHANDLE(ds), 0, CHANDLE(x), OFF(x), INC(x), CHANDLE(y), OFF(y), INC(y), CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, CHANDLE(ds), 0, CHANDLE(x), OFF(x), INC(x), CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||
}
|
||||
|
||||
if(ctx.backend()==sc::driver::CUDA && interf==cuBLAS)
|
||||
{
|
||||
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, ds = BLAS<T>::F(cublasSdot, cublasDdot)(N, (T*)CUHANDLE(x) + OFF(x), INC(x), (T*)CUHANDLE(y) + OFF(y), INC(y)));
|
||||
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, ds = BLAS<T>::F(cublasSasum, cublasDasum)(N, (T*)CUHANDLE(x) + OFF(x), INC(x)));
|
||||
}
|
||||
|
||||
RUN_TEST("s = x'.y", cs+=cx[i]*cy[i], 0, cs, ds = dot(x,y));
|
||||
RUN_TEST("s = exp(x'.y)", cs += cx[i]*cy[i], 0, std::exp(cs), ds = exp(dot(x,y)));
|
||||
RUN_TEST("s = 1 + x'.y", cs += cx[i]*cy[i], 0, 1 + cs, ds = 1 + dot(x,y));
|
||||
RUN_TEST("s = x'.y + y'.y", cs+= cx[i]*cy[i] + cy[i]*cy[i], 0, cs, ds = dot(x,y) + dot(y,y));
|
||||
RUN_TEST("s = max(x)", cs = std::max(cs, cx[i]), std::numeric_limits<T>::min(), cs, ds = max(x));
|
||||
RUN_TEST("s = min(x)", cs = std::min(cs, cx[i]), std::numeric_limits<T>::max(), cs, ds = min(x));
|
||||
if(interf==CPP)
|
||||
{
|
||||
RUN_TEST("s = x'.y", cs+=cx[i]*cy[i], 0, cs, ds = dot(x,y));
|
||||
RUN_TEST("s = exp(x'.y)", cs += cx[i]*cy[i], 0, std::exp(cs), ds = exp(dot(x,y)));
|
||||
RUN_TEST("s = 1 + x'.y", cs += cx[i]*cy[i], 0, 1 + cs, ds = 1 + dot(x,y));
|
||||
RUN_TEST("s = x'.y + y'.y", cs+= cx[i]*cy[i] + cy[i]*cy[i], 0, cs, ds = dot(x,y) + dot(y,y));
|
||||
RUN_TEST("s = max(x)", cs = std::max(cs, cx[i]), std::numeric_limits<T>::min(), cs, ds = max(x));
|
||||
RUN_TEST("s = min(x)", cs = std::min(cs, cx[i]), std::numeric_limits<T>::max(), cs, ds = min(x));
|
||||
}
|
||||
|
||||
#undef RUN_TEST
|
||||
|
||||
@@ -80,9 +85,11 @@ void test(T epsilon, sc::driver::Context const & ctx)
|
||||
|
||||
std::cout << "> standard..." << std::endl;
|
||||
test_impl(epsilon, cx, cy, x, y, clBLAS);
|
||||
test_impl(epsilon, cx, cy, x, y, cuBLAS);
|
||||
test_impl(epsilon, cx, cy, x, y, CPP);
|
||||
std::cout << "> slice..." << std::endl;
|
||||
test_impl(epsilon, cx_s, cy_s, x_s, y_s, clBLAS);
|
||||
test_impl(epsilon, cx_s, cy_s, x_s, y_s, cuBLAS);
|
||||
test_impl(epsilon, cx_s, cy_s, x_s, y_s, CPP);
|
||||
}
|
||||
|
||||
|
@@ -1,7 +1,8 @@
|
||||
#include <cmath>
|
||||
#include "common.hpp"
|
||||
#include "isaac/array.h"
|
||||
#include "isaac/wrap/clBLAS.h"
|
||||
#include "clBLAS.h"
|
||||
#include "cublas.h"
|
||||
|
||||
namespace sc = isaac;
|
||||
|
||||
@@ -56,7 +57,7 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
|
||||
{
|
||||
cl_command_queue clqueue = queue.handle().cl();
|
||||
|
||||
//// //Row-major
|
||||
//Row-major
|
||||
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
||||
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("GEMM(ROW, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
||||
@@ -78,10 +79,21 @@ void test(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const & c
|
||||
RUN_TEST("GEMM(COL, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
||||
CHANDLE(BT), OFF(BT), LD(BT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
|
||||
|
||||
}
|
||||
else
|
||||
|
||||
if(C.context().backend()==sc::driver::CUDA && interf==cuBLAS)
|
||||
{
|
||||
RUN_TEST("GEMM-NN", BLAS<T>::F(cublasSgemm,cublasDgemm)('N', 'N', M, N, K, alpha, (T*)CUHANDLE(A) + OFF(A), LD(A),
|
||||
(T*)CUHANDLE(B) + OFF(B), LD(B), beta, (T*)CUHANDLE(C) + OFF(C), LD(C)));
|
||||
RUN_TEST("GEMM-NT", BLAS<T>::F(cublasSgemm,cublasDgemm)('N', 'T', M, N, K, alpha, (T*)CUHANDLE(A) + OFF(A), LD(A),
|
||||
(T*)CUHANDLE(BT) + OFF(BT), LD(BT), beta, (T*)CUHANDLE(C) + OFF(C), LD(C)));
|
||||
RUN_TEST("GEMM-TN", BLAS<T>::F(cublasSgemm,cublasDgemm)('T', 'N', M, N, K, alpha, (T*)CUHANDLE(AT) + OFF(AT), LD(AT),
|
||||
(T*)CUHANDLE(B) + OFF(B), LD(B), beta, (T*)CUHANDLE(C) + OFF(C), LD(C)));
|
||||
RUN_TEST("GEMM-TT", BLAS<T>::F(cublasSgemm,cublasDgemm)('T', 'T', M, N, K, alpha, (T*)CUHANDLE(AT) + OFF(AT), LD(AT),
|
||||
(T*)CUHANDLE(BT) + OFF(BT), LD(BT), beta, (T*)CUHANDLE(C) + OFF(C), LD(C)));
|
||||
}
|
||||
|
||||
if(interf==CPP)
|
||||
{
|
||||
RUN_TEST("C = A * B", C = dot(A,B))
|
||||
RUN_TEST("C = A' * B", C = dot(trans(AT),B))
|
||||
@@ -108,8 +120,10 @@ void test(T epsilon, sc::driver::Context const & ctx)
|
||||
INIT_MATRIX(M, SUBM, 5, 1, N, SUBN, 7, 1, cC, C, ctx);
|
||||
INIT_MATRIX(M, SUBM, 8, 1, K, SUBK, 4, 1, cA, A, ctx);
|
||||
INIT_MATRIX(K, SUBK, 9, 1, N, SUBN, 6, 1, cB, B, ctx);
|
||||
test(epsilon, cC, cA, cB, C, A, AT, B, BT, clBLAS, "BLAS, FULL");
|
||||
test(epsilon, cC_s, cA_s, cB_s, C_s, A_s, AT_s, B_s, BT_s, clBLAS, "BLAS, SUB");
|
||||
test(epsilon, cC, cA, cB, C, A, AT, B, BT, clBLAS, "clBLAS, FULL");
|
||||
test(epsilon, cC, cA, cB, C, A, AT, B, BT, cuBLAS, "cuBLAS, FULL");
|
||||
test(epsilon, cC_s, cA_s, cB_s, C_s, A_s, AT_s, B_s, BT_s, clBLAS, "clBLAS, SUB");
|
||||
test(epsilon, cC_s, cA_s, cB_s, C_s, A_s, AT_s, B_s, BT_s, cuBLAS, "cuBLAS, SUB");
|
||||
}
|
||||
|
||||
{
|
||||
|
@@ -3,13 +3,15 @@
|
||||
|
||||
#include "common.hpp"
|
||||
#include "isaac/array.h"
|
||||
#include "isaac/wrap/clBLAS.h"
|
||||
#include "clBLAS.h"
|
||||
#include "cublas.h"
|
||||
|
||||
|
||||
namespace sc = isaac;
|
||||
|
||||
template<typename T>
|
||||
void test_impl(T epsilon, simple_vector_base<T> & cy, simple_matrix_base<T> const & cA, simple_vector_base<T> & cx,
|
||||
sc::array_base & y, sc::array_base const & A, sc::array_base & x, interface_t interf, const char * prefix)
|
||||
sc::array_base & y, sc::array_base const & A, sc::array_base & x, interface_t interf)
|
||||
{
|
||||
int failure_count = 0;
|
||||
|
||||
@@ -26,7 +28,7 @@ void test_impl(T epsilon, simple_vector_base<T> & cy, simple_matrix_base<T> cons
|
||||
|
||||
T yi = 0, xi = 0;
|
||||
#define RUN_TEST(NAME, SIZE1, SIZE2, NEUTRAL, REDUCTION, ASSIGNMENT, GPU_REDUCTION, RES, BUF, CRES)\
|
||||
std::cout << "[" << prefix << "] \t" << NAME "..." << std::flush;\
|
||||
std::cout << NAME "..." << std::flush;\
|
||||
for(int i = 0 ; i < SIZE1 ; ++i)\
|
||||
{\
|
||||
yi = NEUTRAL;\
|
||||
@@ -53,36 +55,38 @@ void test_impl(T epsilon, simple_vector_base<T> & cy, simple_matrix_base<T> cons
|
||||
|
||||
|
||||
RUN_TEST("GEMV(ROW, NoTrans)", M, N, 0, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(x), OFF(x), INC(x), beta, CHANDLE(y), OFF(y), INC(y),
|
||||
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A), CHANDLE(x), OFF(x), INC(x), beta, CHANDLE(y), OFF(y), INC(y), 1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||
|
||||
RUN_TEST("GEMV(ROW, Trans)", N, M, 0, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(y), OFF(y), INC(y), beta, CHANDLE(x), OFF(x), INC(x),
|
||||
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A), CHANDLE(y), OFF(y), INC(y), beta, CHANDLE(x), OFF(x), INC(x), 1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||
|
||||
RUN_TEST("GEMV(COL, NoTrans)", M, N, 0, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(x), OFF(x), INC(x), beta, CHANDLE(y), OFF(y), INC(y),
|
||||
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A), CHANDLE(x), OFF(x), INC(x), beta, CHANDLE(y), OFF(y), INC(y), 1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||
|
||||
RUN_TEST("GEMV(COL, Trans)", N, M, 0, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(y), OFF(y), INC(y), beta, CHANDLE(x), OFF(x), INC(x),
|
||||
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A), CHANDLE(y), OFF(y), INC(y), beta, CHANDLE(x), OFF(x), INC(x), 1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||
}
|
||||
else
|
||||
{
|
||||
RUN_TEST("x = dot(A.T, y)", N, M, 0, xi+=cA(j,i)*cy[j], cx[i] = xi, x = dot(trans(A),y), x, bufx, cx);
|
||||
RUN_TEST("x = sum(A, 0)", N, M, 0, xi+=cA(j,i), cx[i] = xi, x = sum(A,0), x, bufx, cx);
|
||||
RUN_TEST("x = max(A, 0)", N, M, std::numeric_limits<T>::min(), xi=std::max(xi,cA(j,i)), cx[i] = xi, x = max(A,0), x, bufx, cx);
|
||||
RUN_TEST("x = min(A, 0)", N, M, std::numeric_limits<T>::max(), xi=std::min(xi,cA(j,i)), cx[i] = xi, x = min(A,0), x, bufx, cx);
|
||||
|
||||
RUN_TEST("y = dot(A, x)", M, N, 0, yi+=cA(i,j)*cx[j], cy[i] = yi, y = dot(A,x), y, bufy, cy);
|
||||
RUN_TEST("y = sum(A, 1)", M, N, 0, yi+=cA(i,j), cy[i] = yi, y = sum(A,1), y, bufy, cy);
|
||||
RUN_TEST("y = max(A, 1)", M, N, std::numeric_limits<T>::min(), yi=std::max(yi,cA(i,j)), cy[i] = yi, y = max(A,1), y, bufy, cy);
|
||||
RUN_TEST("y = min(A, 1)", M, N, std::numeric_limits<T>::max(), yi=std::min(yi,cA(i,j)), cy[i] = yi, y = min(A,1), y, bufy, cy);
|
||||
if(y.context().backend()==sc::driver::CUDA && interf==cuBLAS)
|
||||
{
|
||||
RUN_TEST("GEMV-N", M, N, 0, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||
BLAS<T>::F(cublasSgemv, cublasDgemv)('N', M, N, alpha, (T*)CUHANDLE(A) + OFF(A), LD(A), (T*)CUHANDLE(x) + OFF(x), INC(x), beta, (T*)CUHANDLE(y) + OFF(y), INC(y)), y, bufy, cy);
|
||||
|
||||
RUN_TEST("GEMV-T", N, M, 0, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||
BLAS<T>::F(cublasSgemv, cublasDgemv)('T', M, N, alpha, (T*)CUHANDLE(A) + OFF(A), LD(A), (T*)CUHANDLE(y) + OFF(y), INC(y), beta, (T*)CUHANDLE(x) + OFF(x), INC(x)), x, bufx, cx);
|
||||
}
|
||||
|
||||
if(interf==CPP)
|
||||
{
|
||||
// RUN_TEST("x = dot(A.T, y)", N, M, 0, xi+=cA(j,i)*cy[j], cx[i] = xi, x = dot(trans(A),y), x, bufx, cx);
|
||||
// RUN_TEST("x = sum(A, 0)", N, M, 0, xi+=cA(j,i), cx[i] = xi, x = sum(A,0), x, bufx, cx);
|
||||
// RUN_TEST("x = max(A, 0)", N, M, std::numeric_limits<T>::min(), xi=std::max(xi,cA(j,i)), cx[i] = xi, x = max(A,0), x, bufx, cx);
|
||||
// RUN_TEST("x = min(A, 0)", N, M, std::numeric_limits<T>::max(), xi=std::min(xi,cA(j,i)), cx[i] = xi, x = min(A,0), x, bufx, cx);
|
||||
|
||||
// RUN_TEST("y = dot(A, x)", M, N, 0, yi+=cA(i,j)*cx[j], cy[i] = yi, y = dot(A,x), y, bufy, cy);
|
||||
// RUN_TEST("y = sum(A, 1)", M, N, 0, yi+=cA(i,j), cy[i] = yi, y = sum(A,1), y, bufy, cy);
|
||||
// RUN_TEST("y = max(A, 1)", M, N, std::numeric_limits<T>::min(), yi=std::max(yi,cA(i,j)), cy[i] = yi, y = max(A,1), y, bufy, cy);
|
||||
// RUN_TEST("y = min(A, 1)", M, N, std::numeric_limits<T>::max(), yi=std::min(yi,cA(i,j)), cy[i] = yi, y = min(A,1), y, bufy, cy);
|
||||
}
|
||||
|
||||
if(failure_count>0)
|
||||
@@ -102,13 +106,15 @@ void test(T epsilon, sc::driver::Context const & ctx)
|
||||
|
||||
{
|
||||
INIT_MATRIX(M, SUBM, 9, 1, N, SUBN, 8, 1, cA, A, ctx);
|
||||
test_impl(epsilon, cy, cA, cx, y, A, x, clBLAS, "BLAS, FULL");
|
||||
test_impl(epsilon, cy_s, cA_s, cx_s, y_s, A_s, x_s, clBLAS, "BLAS, SUB");
|
||||
test_impl(epsilon, cy, cA, cx, y, A, x, clBLAS);
|
||||
test_impl(epsilon, cy, cA, cx, y, A, x, cuBLAS);
|
||||
test_impl(epsilon, cy_s, cA_s, cx_s, y_s, A_s, x_s, clBLAS);
|
||||
test_impl(epsilon, cy_s, cA_s, cx_s, y_s, A_s, x_s, cuBLAS);
|
||||
}
|
||||
{
|
||||
INIT_MATRIX(M, SUBM, 9, 5, N, SUBN, 8, 4, cA, A, ctx);
|
||||
test_impl(epsilon, cy, cA, cx, y, A, x, CPP, "C++, FULL");
|
||||
test_impl(epsilon, cy_s, cA_s, cx_s, y_s, A_s, x_s, CPP, "C++, SUB");
|
||||
test_impl(epsilon, cy, cA, cx, y, A, x, CPP);
|
||||
test_impl(epsilon, cy_s, cA_s, cx_s, y_s, A_s, x_s, CPP);
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user