GEMM: Added skeleton for cuBLAS GEMM calls

This commit is contained in:
Philippe Tillet
2016-10-03 21:26:05 -04:00
parent 889b4cffdf
commit f1a636f83f
27 changed files with 18566 additions and 667 deletions

View File

@@ -31,6 +31,7 @@
//CUDA Backend //CUDA Backend
#include "isaac/driver/external/CUDA/cuda.h" #include "isaac/driver/external/CUDA/cuda.h"
#include "isaac/driver/external/CUDA/nvrtc.h" #include "isaac/driver/external/CUDA/nvrtc.h"
#include "isaac/driver/external/CUDA/cublas.h"
#include <iostream> #include <iostream>
@@ -48,9 +49,7 @@ private:
template <class R, class... A> template <class R, class... A>
struct return_type<R (*)(A...)> struct return_type<R (*)(A...)>
{ { typedef R type; };
typedef R type;
};
typedef bool (*f_init_t)(); typedef bool (*f_init_t)();
@@ -65,10 +64,13 @@ private:
return (*fptr)(args...); return (*fptr)(args...);
} }
static void cublasCreate(cublasHandle_t* h);
public: public:
static bool clinit(); static bool clinit();
static bool cuinit(); static bool cublasinit();
static bool nvrtcinit(); static bool nvrtcinit();
static bool cuinit();
static void release(); static void release();
@@ -144,10 +146,17 @@ public:
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames); static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static void cublasGetStream(cudaStream_t *streamId);
static void cublasSetStream(cudaStream_t streamId);
static void cublasSgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
static void cublasDgemm (cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
private: private:
static void* opencl_; static void* opencl_;
static void* cuda_; static void* cuda_;
static void* nvrtc_; static void* nvrtc_;
static void* cublas_;
static cublasHandle_t cublas_handle_;
//OpenCL //OpenCL
static void* clBuildProgram_; static void* clBuildProgram_;
@@ -220,6 +229,12 @@ private:
static void* nvrtcGetPTXSize_; static void* nvrtcGetPTXSize_;
static void* nvrtcCreateProgram_; static void* nvrtcCreateProgram_;
static void* nvrtcGetProgramLog_; static void* nvrtcGetProgramLog_;
static void* cublasCreate_;
static void* cublasGetStream_;
static void* cublasSetStream_;
static void* cublasSgemm_;
static void* cublasDgemm_;
}; };
} }

View File

@@ -0,0 +1,64 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"

View File

@@ -0,0 +1,412 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
#include "cuda_runtime_api.h"
#include "host_defines.h"
#include "vector_types.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
int e = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */

View File

@@ -0,0 +1,338 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
/* When trying to include C header file in C++ Code extern "C" is required
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
* extern "C" cannot be nested
* Hence keep the header out of extern "C" block
*/
#include <math.h> /* import fabsf, sqrt */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
typedef float2 cuFloatComplex;
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
cuCimagf(x) + cuCimagf(y));
}
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex prod;
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
(cuCimagf(x) * cuCimagf(y)),
(cuCrealf(x) * cuCimagf(y)) +
(cuCimagf(x) * cuCrealf(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex quot;
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
float oos = 1.0f / s;
float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now chose the
* simplest and fastest solution, however this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
/* Double precision */
typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
(double r, double i)
{
cuDoubleComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
cuCimag(x) + cuCimag(y));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex prod;
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
(cuCimag(x) * cuCimag(y)),
(cuCreal(x) * cuCimag(y)) +
(cuCimag(x) * cuCreal(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex quot;
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
double oos = 1.0 / s;
double ars = cuCreal(x) * oos;
double ais = cuCimag(x) * oos;
double brs = cuCreal(y) * oos;
double bis = cuCimag(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0 / s;
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now chose the simplest
* and fastest solution, however this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
double a = cuCreal(x);
double b = cuCimag(x);
double v, w, t;
a = fabs(a);
b = fabs(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0 + t * t;
t = v * sqrt(t);
if ((v == 0.0) ||
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
t = v + w;
}
return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
(cuFloatComplex c)
{
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */

View File

@@ -0,0 +1,565 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
cublasStatus CUBLASWINAPI cublasInit (void);
cublasStatus CUBLASWINAPI cublasShutdown (void);
cublasStatus CUBLASWINAPI cublasGetError (void);
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
int incy);
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
float *y, int incy);
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
int incx, double *y, int incy);
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
int incy);
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
int incy);
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss);
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss);
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, cuComplex s);
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, float s);
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
cuComplex *cs);
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
cuDoubleComplex *cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
const float *sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
const double *sy1, double* sparam);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
float alpha, const float *A, int lda,
const float *x, int incx, float beta, float *y,
int incy);
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
double alpha, const double *A, int lda,
const double *x, int incx, double beta, double *y,
int incy);
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
float *x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
char diag, int n, int k, const float *A,
int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
char diag, int n, int k, const double *A,
int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
char diag, int n, int k, const cuComplex *A,
int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
char diag, int n, int k, const cuDoubleComplex *A,
int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
int lda, const float *x, int incx, float beta,
float *y, int incy);
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
int lda, const double *x, int incx, double beta,
double *y, int incy);
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
int lda, const cuComplex *x, int incx, cuComplex beta,
cuComplex *y, int incy);
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
const float *AP, const float *x,
int incx, float beta, float *y, int incy);
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
const double *AP, const double *x,
int incx, double beta, double *y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
const cuComplex *AP, const cuComplex *x,
int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *AP, const cuDoubleComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
const float *y, int incy, float *A, int lda);
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
const double *y, int incy, double *A, int lda);
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
int incx, float *A, int lda);
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
int incx, double *A, int lda);
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
const cuComplex *x, int incx, cuComplex *A, int lda);
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
int incx, float *AP);
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
int incx, double *AP);
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
int incx, cuComplex *AP);
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *A,
int lda);
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
int incx, const double *y, int incy, double *A,
int lda);
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy, cuComplex *A,
int lda);
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *AP);
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *AP);
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
const cuComplex *x, int incx, const cuComplex *y,
int incy, cuComplex *AP);
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy, cuDoubleComplex *AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc);
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
float alpha, const cuComplex *A, int lda,
float beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc);
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, double beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *B, int ldb,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,274 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the new CUBLAS library API, it mapped the generic
* Cublas name functions to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas3 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */

View File

@@ -0,0 +1,228 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDABE__)
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
struct cudaFuncAttributes;
__device__ __attribute__((nv_weak)) cudaError_t cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#else /* defined(__CUDABE__) */
#if defined(__cplusplus) && defined(__CUDACC__) // Visible to nvcc front-end only
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif // defined(__cplusplus) && defined(__CUDACC__)
#endif /* defined(__CUDABE__) */
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,69 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DEVICE_TYPES_H__)
#define __DEVICE_TYPES_H__
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
enum __device_builtin__ cudaRoundMode
{
cudaRoundNearest,
cudaRoundZero,
cudaRoundPosInf,
cudaRoundMinInf
};
#endif /* !__DEVICE_TYPES_H__ */

View File

@@ -0,0 +1,145 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__
#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"
/**
* \addtogroup CUDART_MEMORY
*
* @{
*/
/**
* \brief Returns a cudaPitchedPtr based on input parameters
*
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
* \p p, \p xsz, and \p ysz.
*
* \param d - Pointer to allocated memory
* \param p - Pitch of allocated memory in bytes
* \param xsz - Logical width of allocation in elements
* \param ysz - Logical height of allocation in elements
*
* \return
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
*
* \sa make_cudaExtent, make_cudaPos
*/
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
{
struct cudaPitchedPtr s;
s.ptr = d;
s.pitch = p;
s.xsize = xsz;
s.ysize = ysz;
return s;
}
/**
* \brief Returns a cudaPos based on input parameters
*
* Returns a ::cudaPos based on the specified input parameters \p x,
* \p y, and \p z.
*
* \param x - X position
* \param y - Y position
* \param z - Z position
*
* \return
* ::cudaPos specified by \p x, \p y, and \p z
*
* \sa make_cudaExtent, make_cudaPitchedPtr
*/
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
{
struct cudaPos p;
p.x = x;
p.y = y;
p.z = z;
return p;
}
/**
* \brief Returns a cudaExtent based on input parameters
*
* Returns a ::cudaExtent based on the specified input parameters \p w,
* \p h, and \p d.
*
* \param w - Width in bytes
* \param h - Height in elements
* \param d - Depth in elements
*
* \return
* ::cudaExtent specified by \p w, \p h, and \p d
*
* \sa make_cudaPitchedPtr, make_cudaPos
*/
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
{
struct cudaExtent e;
e.width = w;
e.height = h;
e.depth = d;
return e;
}
/** @} */ /* END CUDART_MEMORY */
#endif /* !__DRIVER_FUNCTIONS_H__ */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,201 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
#if defined(__ICC)
#if __ICC != 1500 || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0 on Linux x86_64 is supported!
#endif /* __ICC != 1500 || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__PGIC__)
#if __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported pgc++ configuration! Only pgc++ 15.4 on Linux x86_64 is supported!
#endif /* __PGIC__ != 15 || __PGIC_MINOR != 4 || !__GNUC__ || !__LP64__ */
#endif /* __PGIC__ */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 && __ibmxl_vrm__ >= 0x0d020000)
#error -- unsupported xlC version! only xlC 13.1 is supported
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 && __ibmxl_vrm__ >= 0x0d020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9)
#error -- unsupported GNU version! gcc versions later than 4.9 are not supported!
#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9) */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1600 || _MSC_VER > 1800
#error -- unsupported Microsoft Visual Studio version! Only the versions 2010, 2012, and 2013 are supported!
#endif /* _MSC_VER < 1600 || _MSC_VER > 1800 */
#endif /* _WIN32 */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */

View File

@@ -0,0 +1,241 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#if !defined(__CUDACC__) && !defined(__CUDABE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDABE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDABE__ */
#if defined(__CUDACC__) || defined(__CUDABE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __host__ \
__location__(host)
#define __device__ \
__location__(device)
#define __global__ \
__location__(global)
#define __shared__ \
__location__(shared)
#define __constant__ \
__location__(constant)
#define __managed__ \
__location__(managed)
#if (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !__CUDACC__ */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !__CUDACC__ */
#if defined(__CUDACC__) && defined(__clang__)
#if !defined(__has_feature)
#error --- !!! The Clang version does not support __has_feature !!! ---
#endif /* !__has_feature */
#if defined(__cplusplus) && defined(__CUDACC__)
#if (__has_feature(cxx_noexcept))
#define NV_CLANG_ATOMIC_NOEXCEPT noexcept
#define NV_CLANG_ATOMIC_NOEXCEPT_(x) noexcept(x)
#else /* !__has_feature(cxx_noexcept) */
#define NV_CLANG_ATOMIC_NOEXCEPT throw()
#define NV_CLANG_ATOMIC_NOEXCEPT_(x)
#endif /* __has_feature(cxx_noexcept) */
template <typename T> struct __nv_clang_atomic_t {
__nv_clang_atomic_t() NV_CLANG_ATOMIC_NOEXCEPT;
__nv_clang_atomic_t(const T &x) NV_CLANG_ATOMIC_NOEXCEPT;
operator T() volatile NV_CLANG_ATOMIC_NOEXCEPT;
operator T() NV_CLANG_ATOMIC_NOEXCEPT;
};
#define _Atomic(X) __nv_clang_atomic_t<X>
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#endif /* __CUDACC__ && __clang__ */
#endif /* !__HOST_DEFINES_H__ */

View File

@@ -0,0 +1,119 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__SURFACE_TYPES_H__)
#define __SURFACE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaSurfaceType1D 0x01
#define cudaSurfaceType2D 0x02
#define cudaSurfaceType3D 0x03
#define cudaSurfaceTypeCubemap 0x0C
#define cudaSurfaceType1DLayered 0xF1
#define cudaSurfaceType2DLayered 0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
/**
* CUDA Surface boundary modes
*/
enum __device_builtin__ cudaSurfaceBoundaryMode
{
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
};
/**
* CUDA Surface format modes
*/
enum __device_builtin__ cudaSurfaceFormatMode
{
cudaFormatModeForced = 0, /**< Forced format mode */
cudaFormatModeAuto = 1 /**< Auto format mode */
};
/**
* CUDA Surface reference
*/
struct __device_builtin__ surfaceReference
{
/**
* Channel descriptor for surface reference
*/
struct cudaChannelFormatDesc channelDesc;
};
/**
* An opaque value that represents a CUDA Surface object
*/
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__SURFACE_TYPES_H__ */

View File

@@ -0,0 +1,213 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaTextureType1D 0x01
#define cudaTextureType2D 0x02
#define cudaTextureType3D 0x03
#define cudaTextureTypeCubemap 0x0C
#define cudaTextureType1DLayered 0xF1
#define cudaTextureType2DLayered 0xF2
#define cudaTextureTypeCubemapLayered 0xFC
/**
* CUDA texture address modes
*/
enum __device_builtin__ cudaTextureAddressMode
{
cudaAddressModeWrap = 0, /**< Wrapping address mode */
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
cudaAddressModeMirror = 2, /**< Mirror address mode */
cudaAddressModeBorder = 3 /**< Border address mode */
};
/**
* CUDA texture filter modes
*/
enum __device_builtin__ cudaTextureFilterMode
{
cudaFilterModePoint = 0, /**< Point filter mode */
cudaFilterModeLinear = 1 /**< Linear filter mode */
};
/**
* CUDA texture read modes
*/
enum __device_builtin__ cudaTextureReadMode
{
cudaReadModeElementType = 0, /**< Read texture as specified element type */
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
};
/**
* CUDA texture reference
*/
struct __device_builtin__ textureReference
{
/**
* Indicates whether texture reads are normalized or not
*/
int normalized;
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Channel descriptor for the texture reference
*/
struct cudaChannelFormatDesc channelDesc;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
int __cudaReserved[15];
};
/**
* CUDA texture descriptor
*/
struct __device_builtin__ cudaTextureDesc
{
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture read mode
*/
enum cudaTextureReadMode readMode;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Indicates whether texture reads are normalized or not
*/
int normalizedCoords;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
};
/**
* An opaque value that represents a CUDA texture object
*/
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__TEXTURE_TYPES_H__ */

View File

@@ -0,0 +1,177 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
#undef __VECTOR_FUNCTIONS_DECL__
#if !defined(__CUDACC_RTC__)
#include "vector_functions.hpp"
#endif /* !__CUDACC_RTC__ */
#endif /* !__VECTOR_FUNCTIONS_H__ */

View File

@@ -0,0 +1,318 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_HPP__)
#define __VECTOR_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
{
char1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
{
uchar1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
{
char2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
{
uchar2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
{
char3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
{
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
{
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
{
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
{
short1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
{
ushort1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
{
short2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
{
ushort2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
{
short3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
{
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
{
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
{
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
{
int1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
{
uint1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
{
int2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
{
uint2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
{
int3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
{
uint3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
{
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
{
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
{
long1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
{
ulong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
{
long2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{
ulong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
{
long3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
{
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
{
float1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
{
float2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
{
float3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
{
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
{
longlong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
{
ulonglong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
{
longlong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
{
ulonglong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
{
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
{
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
{
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
{
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
{
double1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
{
double2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
{
double3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
{
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#undef __VECTOR_FUNCTIONS_DECL__
#endif /* !__VECTOR_FUNCTIONS_HPP__ */

View File

@@ -0,0 +1,431 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDA_LIBDEVICE__) && !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "builtin_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDA_LIBDEVICE__ && !__CUDACC_RTC__ */
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDABE__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(push)
#pragma warning(disable: 4201 4408)
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ tag \
{ \
union \
{ \
struct { members }; \
struct { long long int :1,:0; }; \
}; \
}
#else /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ __align__(8) tag \
{ \
members \
}
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
struct __device_builtin__ char1
{
signed char x;
};
struct __device_builtin__ uchar1
{
unsigned char x;
};
struct __device_builtin__ __align__(2) char2
{
signed char x, y;
};
struct __device_builtin__ __align__(2) uchar2
{
unsigned char x, y;
};
struct __device_builtin__ char3
{
signed char x, y, z;
};
struct __device_builtin__ uchar3
{
unsigned char x, y, z;
};
struct __device_builtin__ __align__(4) char4
{
signed char x, y, z, w;
};
struct __device_builtin__ __align__(4) uchar4
{
unsigned char x, y, z, w;
};
struct __device_builtin__ short1
{
short x;
};
struct __device_builtin__ ushort1
{
unsigned short x;
};
struct __device_builtin__ __align__(4) short2
{
short x, y;
};
struct __device_builtin__ __align__(4) ushort2
{
unsigned short x, y;
};
struct __device_builtin__ short3
{
short x, y, z;
};
struct __device_builtin__ ushort3
{
unsigned short x, y, z;
};
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
struct __device_builtin__ int1
{
int x;
};
struct __device_builtin__ uint1
{
unsigned int x;
};
__cuda_builtin_vector_align8(int2, int x; int y;);
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
struct __device_builtin__ int3
{
int x, y, z;
};
struct __device_builtin__ uint3
{
unsigned int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) int4
{
int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) uint4
{
unsigned int x, y, z, w;
};
struct __device_builtin__ long1
{
long int x;
};
struct __device_builtin__ ulong1
{
unsigned long x;
};
#if defined(__CUDACC_RTC__) || defined(_WIN32)
__cuda_builtin_vector_align8(long2, long int x; long int y;);
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* __CUDACC_RTC__ || _WIN32 */
struct __device_builtin__ __align__(2*sizeof(long int)) long2
{
long int x, y;
};
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
{
unsigned long int x, y;
};
#endif /* __CUDACC_RTC__ || _WIN32 */
struct __device_builtin__ long3
{
long int x, y, z;
};
struct __device_builtin__ ulong3
{
unsigned long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) long4
{
long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
struct __device_builtin__ float1
{
float x;
};
#if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(__arm__) && \
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-pedantic"
struct __device_builtin__ __attribute__((aligned(8))) float2
{
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
};
#pragma GCC poison __cuda_gnu_arm_ice_workaround
#pragma GCC diagnostic pop
#else /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
__cuda_builtin_vector_align8(float2, float x; float y;);
#endif /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
struct __device_builtin__ float3
{
float x, y, z;
};
struct __device_builtin__ __builtin_align__(16) float4
{
float x, y, z, w;
};
struct __device_builtin__ longlong1
{
long long int x;
};
struct __device_builtin__ ulonglong1
{
unsigned long long int x;
};
struct __device_builtin__ __builtin_align__(16) longlong2
{
long long int x, y;
};
struct __device_builtin__ __builtin_align__(16) ulonglong2
{
unsigned long long int x, y;
};
struct __device_builtin__ longlong3
{
long long int x, y, z;
};
struct __device_builtin__ ulonglong3
{
unsigned long long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) longlong4
{
long long int x, y, z ,w;
};
struct __device_builtin__ __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
};
struct __device_builtin__ double1
{
double x;
};
struct __device_builtin__ __builtin_align__(16) double2
{
double x, y;
};
struct __device_builtin__ double3
{
double x, y, z;
};
struct __device_builtin__ __builtin_align__(16) double4
{
double x, y, z, w;
};
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(pop)
#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
typedef __device_builtin__ struct char1 char1;
typedef __device_builtin__ struct uchar1 uchar1;
typedef __device_builtin__ struct char2 char2;
typedef __device_builtin__ struct uchar2 uchar2;
typedef __device_builtin__ struct char3 char3;
typedef __device_builtin__ struct uchar3 uchar3;
typedef __device_builtin__ struct char4 char4;
typedef __device_builtin__ struct uchar4 uchar4;
typedef __device_builtin__ struct short1 short1;
typedef __device_builtin__ struct ushort1 ushort1;
typedef __device_builtin__ struct short2 short2;
typedef __device_builtin__ struct ushort2 ushort2;
typedef __device_builtin__ struct short3 short3;
typedef __device_builtin__ struct ushort3 ushort3;
typedef __device_builtin__ struct short4 short4;
typedef __device_builtin__ struct ushort4 ushort4;
typedef __device_builtin__ struct int1 int1;
typedef __device_builtin__ struct uint1 uint1;
typedef __device_builtin__ struct int2 int2;
typedef __device_builtin__ struct uint2 uint2;
typedef __device_builtin__ struct int3 int3;
typedef __device_builtin__ struct uint3 uint3;
typedef __device_builtin__ struct int4 int4;
typedef __device_builtin__ struct uint4 uint4;
typedef __device_builtin__ struct long1 long1;
typedef __device_builtin__ struct ulong1 ulong1;
typedef __device_builtin__ struct long2 long2;
typedef __device_builtin__ struct ulong2 ulong2;
typedef __device_builtin__ struct long3 long3;
typedef __device_builtin__ struct ulong3 ulong3;
typedef __device_builtin__ struct long4 long4;
typedef __device_builtin__ struct ulong4 ulong4;
typedef __device_builtin__ struct float1 float1;
typedef __device_builtin__ struct float2 float2;
typedef __device_builtin__ struct float3 float3;
typedef __device_builtin__ struct float4 float4;
typedef __device_builtin__ struct longlong1 longlong1;
typedef __device_builtin__ struct ulonglong1 ulonglong1;
typedef __device_builtin__ struct longlong2 longlong2;
typedef __device_builtin__ struct ulonglong2 ulonglong2;
typedef __device_builtin__ struct longlong3 longlong3;
typedef __device_builtin__ struct ulonglong3 ulonglong3;
typedef __device_builtin__ struct longlong4 longlong4;
typedef __device_builtin__ struct ulonglong4 ulonglong4;
typedef __device_builtin__ struct double1 double1;
typedef __device_builtin__ struct double2 double2;
typedef __device_builtin__ struct double3 double3;
typedef __device_builtin__ struct double4 double4;
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
struct __device_builtin__ dim3
{
unsigned int x, y, z;
#if defined(__cplusplus)
__host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus */
};
typedef __device_builtin__ struct dim3 dim3;
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */

View File

@@ -85,11 +85,20 @@ public:
virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0; virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0;
virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0; virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0;
std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device); std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device);
std::shared_ptr<base> getptr() { std::shared_ptr<base> getptr();
return shared_from_this();
}
}; };
class external_base: public base
{
private:
virtual std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping);
public:
external_base();
virtual unsigned int temporary_workspace(expression_tree const &) const;
virtual unsigned int lmem_usage(expression_tree const &) const;
virtual unsigned int registers_usage(expression_tree const &) const;
virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
};
class parameterized_base : public base class parameterized_base : public base
{ {

View File

@@ -31,6 +31,21 @@ namespace isaac
namespace templates namespace templates
{ {
class cublas_gemm : public external_base
{
bool init();
public:
cublas_gemm(char A_trans, char B_trans);
int is_invalid(expression_tree const &, driver::Device const &) const;
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
private:
const char A_trans_;
const char B_trans_;
bool init_;
};
class gemm : public parameterized_base class gemm : public parameterized_base
{ {
private: private:
@@ -41,16 +56,16 @@ private:
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const &) const; std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const &) const;
void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, const expression_tree::node &A, const expression_tree::node &B, const expression_tree::node &C, void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, const expression_tree::node &A, const expression_tree::node &B, const expression_tree::node &C,
value_scalar const &alpha, value_scalar const &beta, driver::Program const & program, std::string const & suffix, runtime::execution_options_type const & options); value_scalar const &alpha, value_scalar const &beta, driver::Program const & program, std::string const & suffix, runtime::execution_options_type const & options);
std::vector<int_t> infos(expression_tree const & expressions, isaac::symbolic::preset::gemm::args &arguments) const;
public: public:
gemm(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D gemm(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch , int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch
, int_t lf0, int_t lf1, char A_trans, char B_trans); , int_t lf0, int_t lf1, char A_trans, char B_trans);
std::vector<int_t> input_sizes(expression_tree const & expressions) const; std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &ctr); void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & h);
private: private:
//Parameters //Parameters
unsigned int mL_; unsigned int mL_;
unsigned int kL_; unsigned int kL_;
unsigned int nL_; unsigned int nL_;

View File

@@ -60,6 +60,10 @@ namespace driver
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\ #define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); } {return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
//Specialized helpers for OpenCL //Specialized helpers for OpenCL
#define OCL_DEFINE1(ret, fname, t1) DEFINE1(clinit, opencl_, ret, fname, t1) #define OCL_DEFINE1(ret, fname, t1) DEFINE1(clinit, opencl_, ret, fname, t1)
#define OCL_DEFINE2(ret, fname, t1, t2) DEFINE2(clinit, opencl_, ret, fname, t1, t2) #define OCL_DEFINE2(ret, fname, t1, t2) DEFINE2(clinit, opencl_, ret, fname, t1, t2)
@@ -96,6 +100,8 @@ namespace driver
#define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) #define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) #define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1)
#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
bool dispatch::clinit() bool dispatch::clinit()
{ {
@@ -118,6 +124,16 @@ bool dispatch::nvrtcinit()
return nvrtc_ != nullptr; return nvrtc_ != nullptr;
} }
bool dispatch::cublasinit()
{
if(cublas_==nullptr){
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
if(cublas_!=nullptr)
cublasCreate(&cublas_handle_);
}
return cublas_ != nullptr;
}
//OpenCL //OpenCL
@@ -196,6 +212,20 @@ NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **) NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *) NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
CUBLAS_DEFINE1(void, cublasCreate, cublasHandle_t*)
void dispatch::cublasGetStream(cudaStream_t *a)
{ f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_, "cublasGetStream_v2", cublas_handle_, a); }
void dispatch::cublasSetStream(cudaStream_t a)
{ f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_, "cublasSetStream_v2", cublas_handle_, a); }
void dispatch::cublasSgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
{ f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_, "cublasSgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
void dispatch::cublasDgemm(cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
{ f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_, "cublasDgemm_v2", cublas_handle_, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
void dispatch::release() void dispatch::release()
{ {
if(opencl_){ if(opencl_){
@@ -210,11 +240,17 @@ void dispatch::release()
dlclose(nvrtc_); dlclose(nvrtc_);
nvrtc_ = nullptr; nvrtc_ = nullptr;
} }
if(cublas_){
dlclose(cublas_);
cublas_ = nullptr;
}
} }
void * dispatch::opencl_; void * dispatch::opencl_;
void * dispatch::cuda_; void * dispatch::cuda_;
void * dispatch::nvrtc_; void * dispatch::nvrtc_;
void * dispatch::cublas_;
cublasHandle_t dispatch::cublas_handle_;
//OpenCL //OpenCL
void* dispatch::clBuildProgram_; void* dispatch::clBuildProgram_;
@@ -288,5 +324,11 @@ void* dispatch::nvrtcGetPTXSize_;
void* dispatch::nvrtcCreateProgram_; void* dispatch::nvrtcCreateProgram_;
void* dispatch::nvrtcGetProgramLog_; void* dispatch::nvrtcGetProgramLog_;
void* dispatch::cublasCreate_;
void* dispatch::cublasGetStream_;
void* dispatch::cublasSetStream_;
void* dispatch::cublasSgemm_;
void* dispatch::cublasDgemm_;
} }
} }

View File

@@ -66,6 +66,24 @@ std::string base::generate(std::string const & suffix, expression_tree const &
return generate_impl(suffix, expression, device, mapping); return generate_impl(suffix, expression, device, mapping);
} }
/* External base */
external_base::external_base()
{ }
std::string external_base::generate_impl(std::string const &, expression_tree const &, driver::Device const &, symbolic::symbols_table const &)
{ return ""; }
unsigned int external_base::temporary_workspace(expression_tree const &) const
{ return 0; }
unsigned int external_base::lmem_usage(expression_tree const &) const
{ return 0; }
unsigned int external_base::registers_usage(expression_tree const &) const
{ return 0; }
/* Parameterized base */
int parameterized_base::is_invalid_impl(driver::Device const &, expression_tree const &) const int parameterized_base::is_invalid_impl(driver::Device const &, expression_tree const &) const
{ return TEMPLATE_VALID; } { return TEMPLATE_VALID; }
@@ -104,5 +122,8 @@ int parameterized_base::is_invalid(expression_tree const & expressions, driver:
return is_invalid_impl(device, expressions); return is_invalid_impl(device, expressions);
} }
std::shared_ptr<base> base::getptr()
{ return shared_from_this(); }
} }
} }

View File

@@ -1,4 +1,4 @@
/* /*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved. * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
* *
* This file is part of ISAAC. * This file is part of ISAAC.
@@ -20,6 +20,7 @@
*/ */
#include "isaac/array.h" #include "isaac/array.h"
#include "isaac/driver/dispatch.h"
#include "isaac/jit/syntax/expression/preset.h" #include "isaac/jit/syntax/expression/preset.h"
#include "isaac/jit/syntax/engine/process.h" #include "isaac/jit/syntax/engine/process.h"
#include "isaac/jit/generation/gemm.h" #include "isaac/jit/generation/gemm.h"
@@ -37,6 +38,71 @@ namespace isaac
namespace templates namespace templates
{ {
std::vector<int_t> infos(expression_tree const & tree, symbolic::preset::gemm::args& arguments, char A_trans)
{
expression_tree::data_type const & array = tree.data();
std::size_t root = tree.root();
arguments = symbolic::preset::gemm::check(array, root);
int_t M = arguments.C->shape[0];
int_t N = arguments.C->shape[1];
int_t K = (A_trans=='T')?arguments.A->shape[0]:arguments.A->shape[1];
return {M, N, K};
}
/* ------------------ CUBLAS ------------------ */
bool cublas_gemm::init()
{
return driver::dispatch::cublasinit();
}
cublas_gemm::cublas_gemm(char A_trans, char B_trans): A_trans_(A_trans), B_trans_(B_trans), init_(driver::dispatch::cublasinit())
{ }
int cublas_gemm::is_invalid(expression_tree const &, driver::Device const & device) const
{ return init_ && device.backend()==driver::CUDA; }
std::vector<int_t> cublas_gemm::input_sizes(expression_tree const & expressions) const
{
symbolic::preset::gemm::args dummy;
return infos((expression_tree&)expressions, dummy, A_trans_);
}
void cublas_gemm::enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & control)
{
namespace drv = driver;;
//Get GEMM info
symbolic::preset::gemm::args args;
std::vector<int_t> MNK = infos(control.x(), args, A_trans_);
int_t M = MNK[0], N = MNK[1], K = MNK[2];
CUdeviceptr cuA = args.A->array.handle.cu;
CUdeviceptr cuB = args.B->array.handle.cu;
CUdeviceptr cuC = args.C->array.handle.cu;
runtime::execution_options_type const & opt = control.execution_options();
auto cuT = [](char xt) { return xt=='N'?CUBLAS_OP_N:CUBLAS_OP_T; };
//Set new stream
cudaStream_t bkp;
drv::Event event(drv::CUDA);
drv::dispatch::cublasGetStream(&bkp);
drv::dispatch::cublasSetStream((cudaStream_t)queue.handle().cu());
values_holder alpha = args.alpha.values();
values_holder beta = args.beta.values();
if(opt.events)
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().first, queue.handle().cu()));
if(args.C->dtype==FLOAT_TYPE)
drv::dispatch::cublasSgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float32, (float*)cuA, args.A->ld[1], (float*)cuB, args.B->ld[1], &beta.float32, (float*)cuC, args.C->ld[1]);
else
drv::dispatch::cublasDgemm(cuT(A_trans_), cuT(B_trans_), M, N, K, &alpha.float64, (double*)cuA, args.A->ld[1], (double*)cuB, args.B->ld[1], &beta.float64, (double*)cuC, args.C->ld[1]);
if(opt.events){
drv::check(drv::dispatch::cuEventRecord(event.handle().cu().second, queue.handle().cu()));
opt.events->push_back(event);
}
//Revert old stream
drv::dispatch::cublasSetStream(bkp);
}
/* -------------------------------------------- */
unsigned int gemm::lmem_usage(expression_tree const & expression) const unsigned int gemm::lmem_usage(expression_tree const & expression) const
{ {
unsigned int N = 0; unsigned int N = 0;
@@ -124,7 +190,7 @@ namespace templates
#define VSTORE_LDSB(value, offset, ptr) vstore(vwidth_, sdtype, value, offset, ptr, "1", backend, lldb%vwidth_==0) #define VSTORE_LDSB(value, offset, ptr) vstore(vwidth_, sdtype, value, offset, ptr, "1", backend, lldb%vwidth_==0)
symbolic::preset::gemm::args args; symbolic::preset::gemm::args args;
infos(tree, args); infos(tree, args, A_trans_);
std::string ASTRIDE1 = (args.A->ld[0] > 1)?"*Astride1":""; std::string ASTRIDE1 = (args.A->ld[0] > 1)?"*Astride1":"";
std::string BSTRIDE1 = (args.B->ld[0] > 1)?"*Bstride1":""; std::string BSTRIDE1 = (args.B->ld[0] > 1)?"*Bstride1":"";
std::string CSTRIDE1 = (args.C->ld[0] > 1)?"*Cstride1":""; std::string CSTRIDE1 = (args.C->ld[0] > 1)?"*Cstride1":"";
@@ -667,17 +733,6 @@ namespace templates
} }
std::vector<int_t> gemm::infos(expression_tree const & tree, symbolic::preset::gemm::args& arguments) const
{
expression_tree::data_type const & array = tree.data();
std::size_t root = tree.root();
arguments = symbolic::preset::gemm::check(array, root);
int_t M = arguments.C->shape[0];
int_t N = arguments.C->shape[1];
int_t K = (A_trans_=='T')?arguments.A->shape[0]:arguments.A->shape[1];
return {M, N, K};
}
gemm::gemm(unsigned int vwidth gemm::gemm(unsigned int vwidth
,int_t ls0, int_t kL, int_t ls1, int_t D ,int_t ls0, int_t kL, int_t ls1, int_t D
,int_t ms, int_t ks, int_t ns ,int_t ms, int_t ks, int_t ns
@@ -696,14 +751,14 @@ namespace templates
std::vector<int_t> gemm::input_sizes(expression_tree const & expressions) const std::vector<int_t> gemm::input_sizes(expression_tree const & expressions) const
{ {
symbolic::preset::gemm::args dummy; symbolic::preset::gemm::args dummy;
return infos((expression_tree&)expressions, dummy); return infos((expression_tree&)expressions, dummy, A_trans_);
} }
void gemm::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & control) void gemm::enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & control)
{ {
expression_tree const & expressions = control.x(); expression_tree const & expressions = control.x();
symbolic::preset::gemm::args args; symbolic::preset::gemm::args args;
std::vector<int_t> MNK = infos(expressions, args); std::vector<int_t> MNK = infos(expressions, args, A_trans_);
int_t M = MNK[0]; int_t M = MNK[0];
int_t N = MNK[1]; int_t N = MNK[1];
int_t K = MNK[2]; int_t K = MNK[2];

View File

@@ -73,7 +73,7 @@ def main():
libraries += ['gnustl_shared'] libraries += ['gnustl_shared']
#Source files #Source files
src = 'src/lib/runtime/predictors/random_forest.cpp src/lib/runtime/profiles.cpp src/lib/runtime/database.cpp src/lib/runtime/execute.cpp src/lib/exception/driver.cpp src/lib/exception/api.cpp src/lib/random/rand.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/gemm.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/object.cpp src/lib/value_scalar.cpp src/lib/array.cpp src/lib/api/blas/cublas.cpp src/lib/api/blas/clBLAS.cpp src/lib/driver/dispatch.cpp src/lib/driver/kernel.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/check.cpp src/lib/driver/command_queue.cpp src/lib/driver/handle.cpp src/lib/driver/context.cpp src/lib/driver/program.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']] src = 'src/lib/exception/api.cpp src/lib/exception/driver.cpp src/lib/value_scalar.cpp src/lib/random/rand.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/jit/generation/engine/stream.cpp src/lib/jit/generation/engine/keywords.cpp src/lib/jit/generation/reduce_1d.cpp src/lib/jit/generation/elementwise_1d.cpp src/lib/jit/generation/base.cpp src/lib/jit/generation/elementwise_2d.cpp src/lib/jit/generation/reduce_2d.cpp src/lib/jit/generation/gemm.cpp src/lib/jit/syntax/engine/object.cpp src/lib/jit/syntax/engine/macro.cpp src/lib/jit/syntax/engine/process.cpp src/lib/jit/syntax/engine/binder.cpp src/lib/jit/syntax/expression/operations.cpp src/lib/jit/syntax/expression/expression.cpp src/lib/jit/syntax/expression/preset.cpp src/lib/api/blas/clBLAS.cpp src/lib/api/blas/cublas.cpp src/lib/runtime/execute.cpp src/lib/runtime/predictors/random_forest.cpp src/lib/runtime/profiles.cpp src/lib/runtime/database.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/' boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']: for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x] src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]