[driver] added driver source code from isaac repository

Philippe Tillet
2019-02-08 14:47:56 -05:00
parent 937bc464a3
commit a9d219cdf5
56 changed files with 51329 additions and 0 deletions

include/driver/backend.h Executable file

@@ -0,0 +1,116 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_BACKEND_H
#define TDL_INCLUDE_DRIVER_BACKEND_H
#include <map>
#include <list>
#include <vector>
namespace tdl
{
namespace driver
{
class Buffer;
class Stream;
class Device;
class Context;
class Platform;
class Module;
class Kernel;
struct backend
{
class modules
{
friend class backend;
public:
static void release();
static Module& get(Stream const & stream, std::string const & name, std::string const &src);
private:
static std::map<std::tuple<Stream, std::string>, Module * > cache_;
};
class kernels
{
friend class backend;
public:
static void release();
static Kernel & get(Module const & program, std::string const & name);
private:
static std::map<std::tuple<Module, std::string>, Kernel * > cache_;
};
class contexts
{
friend class backend;
private:
static void init(std::vector<Platform> const &);
static void release();
public:
static Context const & get_default();
template<class T>
static Context const & import(T context)
{
for(driver::Context const * x: cache_)
if((T)*x==context)
return *x;
cache_.emplace_back(new Context(context, false));
return *cache_.back();
}
static void get(std::list<Context const *> &);
private:
static std::list<Context const *> cache_;
};
class streams
{
friend class backend;
private:
static void init(std::list<Context const *> const &);
static void release();
public:
static void get(Context const &, std::vector<Stream *> &streams);
static Stream & get(Context const &, unsigned int id = 0);
static Stream & get_default();
private:
static std::map< Context, std::vector<Stream*> > cache_;
};
static void init();
static void release();
static std::vector<Device> devices();
static std::vector<Platform> platforms();
static void synchronize(Context const &);
static unsigned int default_device;
};
}
}
#endif
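
A minimal usage sketch for the backend caches declared above, assuming the driver sources from this commit are compiled and linked and a CUDA device is present; the module name and kernel source are placeholders.

#include <iostream>
#include <string>
#include "driver/backend.h"
#include "driver/stream.h"
#include "driver/kernel.h"

int main(){
  namespace drv = tdl::driver;
  // Enumerate the devices known to the backend.
  for(drv::Device const & dev: drv::backend::devices())
    std::cout << dev.name() << std::endl;
  // Fetch the default context and a stream from the static caches.
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get(ctx, 0);
  // Modules are cached by (stream, name); kernels by (module, name).
  std::string src = "extern \"C\" __global__ void noop(){ }";
  drv::Module & module = drv::backend::modules::get(stream, "noop_module", src);
  drv::Kernel & kernel = drv::backend::kernels::get(module, "noop");
  (void)kernel;
  drv::backend::release();
}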

include/driver/buffer.h Executable file

@@ -0,0 +1,54 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_BUFFER_H
#define TDL_INCLUDE_DRIVER_BUFFER_H
#include "driver/handle.h"
#include "driver/context.h"
namespace tdl
{
namespace driver
{
class Stream;
// Buffer
class Buffer: public HandleInterface<Buffer, CUdeviceptr>
{
public:
Buffer(Context const & context, size_t size);
Buffer(Context const & context, CUdeviceptr cu, bool take_ownership);
void set_zero(Stream const & queue, size_t size);
Handle<CUdeviceptr> const & cu() const;
Handle<CUdeviceptr> & cu();
private:
Context context_;
Handle<CUdeviceptr> cu_;
};
}
}
#endif
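
A minimal sketch of Buffer usage, assuming the driver sources are linked and a default CUDA context is available through the backend.

#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/stream.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();
  // Allocate 1 MB of device memory tied to this context.
  drv::Buffer buf(ctx, 1024*1024);
  // Zero the first 1024 bytes on the given stream.
  buf.set_zero(stream, 1024);
  stream.synchronize();
}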

include/driver/context.h Executable file

@@ -0,0 +1,66 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_CONTEXT_H
#define TDL_INCLUDE_DRIVER_CONTEXT_H
#include "driver/device.h"
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
class Context: public HandleInterface<Context, CUcontext>
{
private:
static std::string get_cache_path();
static CUdevice device(CUcontext);
public:
//Constructors
explicit Context(CUcontext context, bool take_ownership = true);
explicit Context(Device const & device);
//Accessors
Device const & device() const;
std::string const & cache_path() const;
Handle<CUcontext> const & cu() const;
private:
Handle<CUcontext> cu_;
Device device_;
std::string cache_path_;
};
class ContextSwitcher{
public:
ContextSwitcher(Context const & ctx);
~ContextSwitcher();
private:
Context const & ctx_;
};
}
}
#endif
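
A sketch of constructing a Context from a Device and scoping it with the RAII ContextSwitcher, assuming the driver sources are linked and at least one CUDA device is visible.

#include <iostream>
#include "driver/backend.h"
#include "driver/context.h"

int main(){
  namespace drv = tdl::driver;
  drv::Device dev = drv::backend::devices().at(0);
  drv::Context ctx(dev);                  // creates and owns a new CUcontext
  {
    drv::ContextSwitcher guard(ctx);      // makes ctx current for this scope
    std::cout << ctx.device().name() << " -> " << ctx.cache_path() << std::endl;
  }                                       // guard's destructor restores the previous context
}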

include/driver/cublas.h Executable file

@@ -0,0 +1,229 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_CUBLAS_H
#define TDL_INCLUDE_DRIVER_CUBLAS_H
#include "isaac/templates/common.hpp"
#include "driver/dispatch.h"
#include "driver/buffer.h"
#include "driver/stream.h"
#include "driver/backend.h"
#include "driver/error.h"
#include "tools/bench.hpp"
#include "tools/collections.hpp"
namespace tdl
{
namespace driver
{
enum cublasStrategy_t{
CUBLAS_PREFER_FASTEST,
CUBLAS_HEURISTICS
};
static const std::vector<cublasGemmAlgo_t> cublasAlgorithms = {
CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3,
CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7
};
static const std::map<DType, cudaDataType> cudtype = {{FLOAT_TYPE, CUDA_R_32F}, {DOUBLE_TYPE,CUDA_R_64F}};
static const std::map<char, cublasOperation_t> cuop = {{'N', CUBLAS_OP_N}, {'T', CUBLAS_OP_T}};
inline cublasGemmAlgo_t cublasGemmFastest(Stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc){
typedef std::tuple<cudaDataType_t, cublasOperation_t, cublasOperation_t, int32_t, int32_t, int32_t> key_t;
// Benchmark fastest algorithm in cublasGemmEx
auto benchmark_fastest = [&](key_t const &){
std::vector<double> times;
for(cublasGemmAlgo_t a: cublasAlgorithms){
try{
times.push_back(bench([&](){ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, a); },
[&](){ stream.synchronize(); },
stream.context().device()));
}catch(driver::exception::cublas::base const &){
times.push_back(INFINITY);
}
}
size_t argmin = std::min_element(times.begin(), times.end()) - times.begin();
return cublasAlgorithms[argmin];
};
// Cache result
static cpp::CachedMap<key_t, cublasGemmAlgo_t> cache(benchmark_fastest);
return cache.get(std::make_tuple(cudt, AT, BT, M, N, K));
}
/* Wrapper for cublasGemmEx */
inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo)
{ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, algo); }
/* Simplified API for default GEMM */
inline void cublasGemm(DType dtype, Stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){
ContextSwitcher ctx_switch(stream.context());
cublasHandle_t handle = dispatch::cublasHandle(stream.context());
dispatch::cublasSetStream_v2(handle, (CUstream)stream);
if(fastest)
*fastest = cublasGemmFastest(stream, handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
else
cublasGemmEx(handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc, algo);
}
inline cudnnDataType_t cudnnDtype(DType dtype){
switch(dtype){
case INT8X4_TYPE: return CUDNN_DATA_INT8x4;
case INT32_TYPE: return CUDNN_DATA_INT32;
case FLOAT_TYPE: return CUDNN_DATA_FLOAT;
case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE;
}
throw;
}
inline cudnnTensorFormat_t format(cudnnDataType_t cutype){
switch(cutype){
case CUDNN_DATA_INT8x4: return CUDNN_TENSOR_NCHW_VECT_C;
default: return CUDNN_TENSOR_NCHW;
}
}
inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
std::vector<int> stride = {stride_d, stride_h, stride_w};
std::vector<int> upscale = {1, 1, 1};
std::vector<int> Oshapes = {N, K, M, P, Q};
std::vector<int> Fshapes = {K, C, T, R, S};
std::vector<int> Ishapes = {N, C, D, H, W};
if(M == 1 && T == 1 && D == 1){
pad.erase(pad.begin());
stride.erase(stride.begin());
upscale.erase(upscale.begin());
Oshapes.erase(Oshapes.begin() + 2);
Ishapes.erase(Ishapes.begin() + 2);
Fshapes.erase(Fshapes.begin() + 2);
}
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t in_cutype = cudnnDtype(dtype);
cudnnDataType_t conv_cutype = (dtype == INT8X4_TYPE)?CUDNN_DATA_INT32:in_cutype;
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
cudnnFilterDescriptor_t tF;
cudnnConvolutionDescriptor_t conv;
cudnnConvolutionFwdAlgo_t algo;
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnCreateFilterDescriptor(&tF);
dispatch::cudnnSetTensorNdDescriptorEx(tO, format(in_cutype), in_cutype, Oshapes.size(), Oshapes.data());
dispatch::cudnnSetFilterNdDescriptor(tF, in_cutype, format(in_cutype), Fshapes.size(), Fshapes.data());
dispatch::cudnnSetTensorNdDescriptorEx(tI, format(in_cutype), in_cutype, Ishapes.size(), Ishapes.data());
dispatch::cudnnCreateConvolutionDescriptor(&conv);
dispatch::cudnnSetConvolutionNdDescriptor(conv, pad.size(), pad.data(), stride.data(), upscale.data(), CUDNN_CROSS_CORRELATION, conv_cutype);
dispatch::cudnnGetConvolutionForwardAlgorithm(handle, tI, tF, conv, tO, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 1024*1024*64, &algo);
size_t workspace_size;
dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size);
static Buffer work(ctx, 1024*1024*64);
CUdeviceptr twork = work;
CUdeviceptr pI = I, pF = F, pO = O;
dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO);
}
inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
std::vector<int> stride = {stride_d, stride_h, stride_w};
std::vector<int> upscale = {1, 1, 1};
std::vector<int> Oshapes = {N, K, M, P, Q};
std::vector<int> Ishapes = {N, K, D, H, W};
std::vector<int> window = {T, R, S};
if(M == 1 && T == 1 && D == 1){
window.erase(window.begin());
pad.erase(pad.begin());
stride.erase(stride.begin());
upscale.erase(upscale.begin());
Oshapes.erase(Oshapes.begin() + 2);
Ishapes.erase(Ishapes.begin() + 2);
}
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t cutype = cudnnDtype(dtype);
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
cudnnPoolingDescriptor_t desc;
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnSetTensorNdDescriptorEx(tO, CUDNN_TENSOR_NCHW, cutype, Oshapes.size(), Oshapes.data());
dispatch::cudnnSetTensorNdDescriptorEx(tI, CUDNN_TENSOR_NCHW, cutype, Ishapes.size(), Ishapes.data());
dispatch::cudnnCreatePoolingDescriptor(&desc);
dispatch::cudnnSetPoolingNdDescriptor(desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, window.size(), window.data(), pad.data(), stride.data());
CUdeviceptr pI = I, pO = O;
dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
inline void cudnnTransformTensor(driver::Stream & stream,
DType in_dtype, DType out_dtype,
cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout,
int32_t N, int32_t C, int32_t D, int32_t H, int32_t W,
scalar alpha, driver::Buffer const & I, scalar beta, driver::Buffer& O)
{
cudnnHandle_t handle = dispatch::cudnnHandle(stream.context());
dispatch::cudnnSetStream(handle, (CUstream)stream);
cudnnTensorDescriptor_t tO, tI;
std::vector<int> shapes = {N, C, D, H, W};
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnSetTensorNdDescriptorEx(tI, in_layout, cudnnDtype(in_dtype), shapes.size(), shapes.data());
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnSetTensorNdDescriptorEx(tO, out_layout, cudnnDtype(out_dtype), shapes.size(), shapes.data());
CUdeviceptr pI = I, pO = O;
dispatch::cudnnTransformTensor(handle, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
}
}
#endif
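
A sketch of the cuBLAS wrappers above: pick the fastest GEMM algorithm for a shape with cublasGemmFastest, then run it through the thin cublasGemmEx wrapper. It assumes the driver sources are linked, cuBLAS is installed, and single-precision data; the matrix sizes are placeholders and the buffers are left uninitialized for brevity.

#include <cstdint>
#include "driver/backend.h"
#include "driver/cublas.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get(ctx);
  drv::ContextSwitcher scope(ctx);
  int32_t M = 512, N = 512, K = 512;
  drv::Buffer A(ctx, M*K*sizeof(float)), B(ctx, K*N*sizeof(float)), C(ctx, M*N*sizeof(float));
  float alpha = 1.f, beta = 0.f;
  cublasHandle_t handle = drv::dispatch::cublasHandle(ctx);
  drv::dispatch::cublasSetStream_v2(handle, (CUstream)stream);
  // Times every cublasGemmAlgo_t once and caches the winner for this key.
  cublasGemmAlgo_t algo = drv::cublasGemmFastest(stream, handle, CUDA_R_32F,
                                                 CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                                                 &alpha, A, M, B, K, &beta, C, M);
  // Re-run the GEMM with the selected algorithm.
  drv::cublasGemmEx(handle, CUDA_R_32F, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                    &alpha, A, M, B, K, &beta, C, M, algo);
  stream.synchronize();
}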

include/driver/device.h Executable file

@@ -0,0 +1,98 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_DEVICE_H
#define TDL_INCLUDE_DRIVER_DEVICE_H
#include "driver/platform.h"
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
// Device
class Device: public HandleInterface<Device, CUdevice>
{
public:
//Supported architectures
enum class Architecture{
//NVidia
SM_2_0,
SM_2_1,
SM_3_0,
SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
SM_7_0,
UNKNOWN
};
private:
//Metaprogramming helper to get CUDA info from attribute
template<CUdevice_attribute attr>
int cuGetInfo() const;
inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
inline nvmlDevice_t nvml_device() const;
public:
Device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){}
//Accessors
Architecture architecture() const;
Handle<CUdevice> const & cu() const;
//Information
std::string infos() const;
size_t address_bits() const;
driver::Platform platform() const;
std::vector<size_t> max_block_dim() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t warp_size() const;
//Compute Capability
void interpret_as(std::pair<size_t, size_t> cc);
std::pair<size_t, size_t> compute_capability() const;
//Identifier
std::string name() const;
std::string pci_bus_id() const;
//Clocks
size_t current_sm_clock() const;
size_t current_mem_clock() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
private:
Handle<CUdevice> cu_;
std::shared_ptr<std::pair<size_t, size_t>> interpreted_as_;
};
}
}
#endif
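
A sketch that prints a few of the Device properties declared above, assuming the driver sources are linked and at least one CUDA device (plus NVML for the clock queries) is available.

#include <iostream>
#include "driver/backend.h"
#include "driver/device.h"

int main(){
  namespace drv = tdl::driver;
  for(drv::Device const & dev: drv::backend::devices()){
    std::cout << dev.name() << " (" << dev.pci_bus_id() << ")\n"
              << "  max threads per block: " << dev.max_threads_per_block() << "\n"
              << "  shared memory:         " << dev.max_shared_memory() << " bytes\n"
              << "  warp size:             " << dev.warp_size() << "\n"
              << "  max SM clock:          " << dev.max_sm_clock() << std::endl;
  }
}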

include/driver/dispatch.h Executable file

@@ -0,0 +1,258 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_DISPATCHER_H
#define TDL_INCLUDE_DRIVER_DISPATCHER_H
#include <type_traits>
#include <dlfcn.h>
//CUDA Backend
#include "external/CUDA/cuda.h"
#include "external/CUDA/nvrtc.h"
#include "external/CUDA/cublas_v2.h"
#include "external/CUDA/cudnn.h"
#include "external/CUDA/nvml.h"
//Exceptions
#include <iostream>
#include <stdexcept>
namespace tdl
{
namespace driver
{
class Context;
template<class T> void check(T){}
void check(nvrtcResult err);
void check(CUresult err);
void check(cublasStatus_t err);
void check(cudnnStatus_t err);
class dispatch
{
private:
template <class F>
struct return_type;
template <class R, class... A>
struct return_type<R (*)(A...)>
{ typedef R type; };
typedef bool (*f_init_t)();
template<f_init_t initializer, typename FunPtrT, typename... Args>
static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
{
initializer();
if(cache == nullptr){
cache = dlsym(lib_h, name);
if(cache == 0)
throw std::runtime_error("dlsym unable to load function");
}
FunPtrT fptr;
*reinterpret_cast<void **>(&fptr) = cache;
typename return_type<FunPtrT>::type res = (*fptr)(args...);
check(res);
return res;
}
public:
static bool nvrtcinit();
static bool nvmlinit();
static bool cuinit();
static bool cublasinit();
static bool cudnninit();
static void release();
//CUDA
static CUresult cuCtxGetCurrent(CUcontext *pctx);
static CUresult cuCtxSetCurrent(CUcontext ctx);
static CUresult cuCtxDestroy_v2(CUcontext ctx);
static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
static CUresult cuDeviceGet(CUdevice *device, int ordinal);
static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
static CUresult cuMemFree_v2(CUdeviceptr dptr);
static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
static CUresult cuDriverGetVersion(int *driverVersion);
static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
static CUresult cuModuleLoad(CUmodule *module, const char *fname);
static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
static CUresult cuModuleUnload(CUmodule hmod);
static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
static CUresult cuDeviceGetCount(int *count);
static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
static CUresult cuInit(unsigned int Flags);
static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
static CUresult cuStreamSynchronize(CUstream hStream);
static CUresult cuStreamDestroy_v2(CUstream hStream);
static CUresult cuEventDestroy_v2(CUevent hEvent);
static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
static CUresult cuCtxGetDevice(CUdevice* result);
static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static cublasHandle_t cublasHandle(Context const & ctx);
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc);
static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo);
static cudnnHandle_t cudnnHandle(Context const & ctx);
static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc);
static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w);
static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w);
static cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int dimA[]);
static cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int filterDimA[]);
static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode);
static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType);
static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]);
static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo);
static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes);
static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
private:
static void* cuda_;
static void* nvrtc_;
static void* nvml_;
static void* cublas_;
static void* cudnn_;
//CUDA
static void* cuCtxGetCurrent_;
static void* cuCtxSetCurrent_;
static void* cuCtxDestroy_v2_;
static void* cuEventCreate_;
static void* cuDeviceGet_;
static void* cuMemcpyDtoH_v2_;
static void* cuStreamCreate_;
static void* cuEventElapsedTime_;
static void* cuMemFree_v2_;
static void* cuMemcpyDtoHAsync_v2_;
static void* cuDriverGetVersion_;
static void* cuDeviceGetName_;
static void* cuDeviceGetPCIBusId_;
static void* cuModuleGetGlobal_v2_;
static void* cuMemcpyHtoDAsync_v2_;
static void* cuModuleLoad_;
static void* cuLaunchKernel_;
static void* cuModuleUnload_;
static void* cuModuleLoadDataEx_;
static void* cuDeviceGetAttribute_;
static void* cuDeviceGetCount_;
static void* cuMemcpyHtoD_v2_;
static void* cuInit_;
static void* cuEventRecord_;
static void* cuCtxCreate_v2_;
static void* cuModuleGetFunction_;
static void* cuStreamSynchronize_;
static void* cuStreamDestroy_v2_;
static void* cuEventDestroy_v2_;
static void* cuMemAlloc_v2_;
static void* cuPointerGetAttribute_;
static void* cuCtxGetDevice_;
static void* cuMemsetD8Async_;
static void* cuCtxPushCurrent_v2_;
static void* cuCtxPopCurrent_v2_;
static void* nvmlInit_v2_;
static void* nvmlDeviceGetHandleByPciBusId_v2_;
static void* nvmlDeviceGetClockInfo_;
static void* nvmlDeviceGetMaxClockInfo_;
static void* nvrtcCompileProgram_;
static void* nvrtcGetProgramLogSize_;
static void* nvrtcGetPTX_;
static void* nvrtcGetPTXSize_;
static void* nvrtcCreateProgram_;
static void* nvrtcGetProgramLog_;
static void* cublasCreate_v2_;
static void* cublasGetStream_v2_;
static void* cublasSetStream_v2_;
static void* cublasHgemm_;
static void* cublasSgemm_v2_;
static void* cublasDgemm_v2_;
static void* cublasGemmEx_;
static void* cudnnCreateConvolutionDescriptor_;
static void* cudnnCreatePoolingDescriptor_;
static void* cudnnCreateTensorDescriptor_;
static void* cudnnCreateFilterDescriptor_;
static void* cudnnCreate_;
static void* cudnnSetTensor4dDescriptor_;
static void* cudnnSetFilter4dDescriptor_;
static void* cudnnSetTensorNdDescriptorEx_;
static void* cudnnSetFilterNdDescriptor_;
static void* cudnnSetConvolution2dDescriptor_;
static void* cudnnSetConvolutionNdDescriptor_;
static void* cudnnSetPoolingNdDescriptor_;
static void* cudnnGetConvolutionForwardAlgorithm_;
static void* cudnnGetConvolutionForwardWorkspaceSize_;
static void* cudnnConvolutionForward_;
static void* cudnnPoolingForward_;
static void* cudnnSetStream_;
static void* cudnnTransformTensor_;
};
}
}
#endif
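
A sketch of calling the CUDA driver API through the lazy dispatch table above; each static method resolves its entry point with dlsym on first use, so none of the CUDA libraries are needed at link time. It assumes the driver sources are linked and libcuda is installed on the system.

#include <iostream>
#include "driver/dispatch.h"

int main(){
  namespace drv = tdl::driver;
  int version = 0, count = 0;
  drv::dispatch::cuInit(0);                     // loads and initializes the driver lazily
  drv::dispatch::cuDriverGetVersion(&version);
  drv::dispatch::cuDeviceGetCount(&count);
  std::cout << "driver " << version << ", " << count << " device(s)" << std::endl;
}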

include/driver/error.h Executable file

@@ -0,0 +1,228 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_ERROR_H
#define TDL_INCLUDE_DRIVER_ERROR_H
#include <exception>
#include "driver/dispatch.h"
namespace tdl
{
namespace driver
{
namespace exception
{
namespace nvrtc
{
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option ,"invalid option");
ISAAC_CREATE_NVRTC_EXCEPTION(compilation ,"compilation");
ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure ,"builtin operation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
#undef ISAAC_CREATE_NVRTC_EXCEPTION
}
namespace cuda
{
class base: public std::exception{};
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }
ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_CUDA_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(deinitialized ,"deinitialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled ,"profiler disabled");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized ,"profiler not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started ,"profiler already started");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped ,"profiler already stopped");
ISAAC_CREATE_CUDA_EXCEPTION(no_device ,"no device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_device ,"invalid device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_image ,"invalid image");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_context ,"invalid context");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_current ,"context already current");
ISAAC_CREATE_CUDA_EXCEPTION(map_failed ,"map failed");
ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed ,"unmap failed");
ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped ,"array is mapped");
ISAAC_CREATE_CUDA_EXCEPTION(already_mapped ,"already mapped");
ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu ,"no binary for gpu");
ISAAC_CREATE_CUDA_EXCEPTION(already_acquired ,"already acquired");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped ,"not mapped");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array ,"not mapped as array");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer");
ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable");
ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit ,"unsupported limit");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use ,"context already in use");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported ,"peer access unsupported");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx ,"invalid ptx");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context ,"invalid graphics context");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_source ,"invalid source");
ISAAC_CREATE_CUDA_EXCEPTION(file_not_found ,"file not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed ,"shared object init failed");
ISAAC_CREATE_CUDA_EXCEPTION(operating_system ,"operating system");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle ,"invalid handle");
ISAAC_CREATE_CUDA_EXCEPTION(not_found ,"not found");
ISAAC_CREATE_CUDA_EXCEPTION(not_ready ,"not ready");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_address ,"illegal address");
ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources ,"launch out of resources");
ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout ,"launch timeout");
ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled ,"peer access already enabled");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled ,"peer access not enabled");
ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active ,"primary context active");
ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed ,"context is destroyed");
ISAAC_CREATE_CUDA_EXCEPTION(assert_error ,"assert");
ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers ,"too many peers");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered ,"host memory already registered");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered ,"host memory not registered");
ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error ,"hardware stack error");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction ,"illegal instruction");
ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address ,"misaligned address");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space ,"invalid address space");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc ,"invalid pc");
ISAAC_CREATE_CUDA_EXCEPTION(launch_failed ,"launch failed");
ISAAC_CREATE_CUDA_EXCEPTION(not_permitted ,"not permitted");
ISAAC_CREATE_CUDA_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUDA_EXCEPTION
}
namespace cublas
{
class base: public std::exception{};
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
}
namespace cudnn
{
#define ISAAC_CREATE_CUDNN_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "CUDNN: Error- " msg; } }
ISAAC_CREATE_CUDNN_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUDNN_EXCEPTION(alloc_failed ,"allocation failed");
ISAAC_CREATE_CUDNN_EXCEPTION(bad_param ,"bad param");
ISAAC_CREATE_CUDNN_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUDNN_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUDNN_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUDNN_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUDNN_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUDNN_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUDNN_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_in_progress ,"runtime in progress");
ISAAC_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
#undef ISAAC_CREATE_CUDNN_EXCEPTION
}
namespace ocl
{
class base: public std::exception{};
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
ISAAC_CREATE_CL_EXCEPTION(device_not_found, "device not found");
ISAAC_CREATE_CL_EXCEPTION(device_not_available, "device not available");
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available, "compiler not available");
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure, "object allocation failure");
ISAAC_CREATE_CL_EXCEPTION(out_of_resources, "launch out of resources");
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory, "out of host memory");
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available, "profiling info not available");
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap, "mem copy overlap");
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch, "image format mismatch");
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported, "image format not supported");
ISAAC_CREATE_CL_EXCEPTION(build_program_failure, "build program failure");
ISAAC_CREATE_CL_EXCEPTION(map_failure, "map failure");
ISAAC_CREATE_CL_EXCEPTION(invalid_value, "invalid value");
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type, "invalid device type");
ISAAC_CREATE_CL_EXCEPTION(invalid_platform, "invalid platform");
ISAAC_CREATE_CL_EXCEPTION(invalid_device, "invalid device");
ISAAC_CREATE_CL_EXCEPTION(invalid_context, "invalid context");
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties, "invalid queue properties");
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue, "invalid command queue");
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr, "invalid host pointer");
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object, "invalid mem object");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size, "invalid image size");
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler, "invalid sampler");
ISAAC_CREATE_CL_EXCEPTION(invalid_binary, "invalid binary");
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options, "invalid build options");
ISAAC_CREATE_CL_EXCEPTION(invalid_program, "invalid program");
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable, "invalid program executable");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name, "invalid kernel name");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition, "invalid kernel definition");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel, "invalid kernel");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index, "invalid arg index");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value, "invalid arg value");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size, "invalid arg size");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args, "invalid kernel args");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension, "invalid work dimension");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size, "invalid work group size");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size, "invalid work item size");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset, "invalid global offset");
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list, "invalid event wait list");
ISAAC_CREATE_CL_EXCEPTION(invalid_event, "invalid event");
ISAAC_CREATE_CL_EXCEPTION(invalid_operation, "invalid operation");
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object, "invalid GL object");
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size, "invalid buffer size");
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level, "invalid MIP level");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size, "invalid global work size");
#ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
#endif
#undef ISAAC_CREATE_CL_EXCEPTION
}
}
}
}
#endif
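
A sketch of the exception hierarchy above: CUDA, cuBLAS and OpenCL errors each derive from a per-library base class, so a caller can catch a whole family at once. The throw below stands in for a failing driver call.

#include <iostream>
#include "driver/error.h"

int main(){
  namespace exc = tdl::driver::exception;
  try{
    throw exc::cuda::out_of_memory();           // stand-in for a failed allocation
  }catch(exc::cuda::base const & e){
    std::cerr << "CUDA failure: " << e.what() << std::endl;
  }catch(exc::cublas::base const & e){
    std::cerr << "cuBLAS failure: " << e.what() << std::endl;
  }
}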

include/driver/event.h Executable file

@@ -0,0 +1,49 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_EVENT_H
#define TDL_INCLUDE_DRIVER_EVENT_H
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
// Event
class Event: public HandleInterface<Event, cu_event_t>
{
public:
float elapsed_time() const;
Handle<cu_event_t> const & cu() const;
private:
Handle<cu_event_t> cu_;
};
}
}
#endif

include/driver/handle.h Executable file

@@ -0,0 +1,82 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_HANDLE_H
#define TDL_INCLUDE_DRIVER_HANDLE_H
#include <memory>
#include <iostream>
#include <functional>
#include <type_traits>
#include "driver/dispatch.h"
namespace tdl
{
namespace driver
{
struct cu_event_t{
operator bool() const { return first && second; }
CUevent first;
CUevent second;
};
struct cu_platform{
cu_platform() : status_(dispatch::cuInit(0)) { }
operator bool() const { return status_; }
private:
CUresult status_;
};
template<class T, class CUType>
class HandleInterface{
public:
//Accessors
operator CUType() const { return *(((T*)this)->cu().h_); }
//Comparison
bool operator==(HandleInterface const & y) { return (CUType)(*this) == (CUType)(y); }
bool operator!=(HandleInterface const & y) { return (CUType)(*this) != (CUType)(y); }
bool operator<(HandleInterface const & y) { return (CUType)(*this) < (CUType)(y); }
};
template<class CUType>
class Handle{
public:
template<class, class> friend class HandleInterface;
public:
//Constructors
Handle(CUType cu = CUType(), bool take_ownership = true);
~Handle();
CUType& operator*() { return *h_; }
CUType const & operator*() const { return *h_; }
CUType* operator->() const { return h_.get(); }
protected:
std::shared_ptr<CUType> h_;
bool has_ownership_;
};
}
}
#endif
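
A sketch of the CRTP pattern that Handle and HandleInterface implement: a wrapper exposes a shared, optionally-owning handle through cu() and inherits comparisons plus an implicit conversion to the raw CUDA type. The MyStream class is hypothetical; it assumes Handle<CUstream> is instantiated in the accompanying sources (it already backs driver::Stream).

#include "driver/handle.h"

// Hypothetical wrapper, for illustration only.
class MyStream: public tdl::driver::HandleInterface<MyStream, CUstream>{
public:
  MyStream(CUstream s, bool take_ownership): cu_(s, take_ownership){}
  tdl::driver::Handle<CUstream> const & cu() const { return cu_; }  // accessor required by HandleInterface
private:
  tdl::driver::Handle<CUstream> cu_;        // shared_ptr-backed, reference-counted handle
};

int main(){
  CUstream raw = nullptr;
  MyStream a(raw, false), b(raw, false);    // two non-owning views of the same raw handle
  return (a == b) ? 0 : 1;                  // compares the underlying CUstream values
}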

include/driver/kernel.h Executable file

@@ -0,0 +1,68 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_KERNEL_H
#define TDL_INCLUDE_DRIVER_KERNEL_H
#include "driver/module.h"
#include "driver/handle.h"
#include <memory>
namespace tdl
{
namespace driver
{
class Buffer;
// Kernel
class Kernel: public HandleInterface<Kernel, CUfunction>
{
public:
//Constructors
Kernel(Module const & program, const char * name);
//Accessors
Handle<CUfunction> const & cu() const;
Module const & module() const;
//Arguments setters
void setArg(unsigned int index, std::size_t size, void* ptr);
void setArg(unsigned int index, Buffer const &);
template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
//Arguments getters
void* const* cu_params() const;
private:
Handle<CUfunction> cu_;
Module program_;
unsigned int address_bits_;
std::vector<std::shared_ptr<void> > cu_params_store_;
std::vector<void*> cu_params_;
};
}
}
#endif
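
A sketch of binding kernel arguments with setArg: scalars go through the template overload (copied by value), buffers through the Buffer overload. The kernel source is a placeholder and compilation is assumed to happen at runtime; the driver sources must be linked.

#include <string>
#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/kernel.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  std::string src =
    "extern \"C\" __global__ void scale(int n, float a, float* x)"
    "{ if(threadIdx.x < n) x[threadIdx.x] *= a; }";
  drv::Module module(ctx, src);
  drv::Kernel kernel(module, "scale");
  drv::Buffer x(ctx, 1024*sizeof(float));
  kernel.setArg(0, 1024);    // int scalar
  kernel.setArg(1, 2.0f);    // float scalar
  kernel.setArg(2, x);       // device buffer
}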

include/driver/module.h Executable file

@@ -0,0 +1,61 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_MODULE_H
#define TDL_INCLUDE_DRIVER_MODULE_H
#include <map>
#include "driver/handle.h"
#include "driver/context.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
class Context;
class Device;
class Module: public HandleInterface<Module, CUmodule>
{
static std::string header(Device const & device);
public:
Module(Context const & context, std::string const & source);
Context const & context() const;
Handle<CUmodule> const & cu() const;
Buffer symbol(const char * name) const;
private:
Handle<CUmodule> cu_;
Context context_;
std::string source_;
};
}
}
#endif
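
A sketch of compiling a Module on the default context and looking up a module-level __device__ variable with symbol(). The source string is a placeholder; compilation is assumed to go through NVRTC at runtime via the dispatch table, and the symbol name is assumed to be unmangled.

#include <string>
#include "driver/backend.h"
#include "driver/module.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  std::string src = "__device__ int counter; extern \"C\" __global__ void noop(){ }";
  drv::Module module(ctx, src);
  // symbol() returns a Buffer wrapping the device address of the global.
  drv::Buffer counter = module.symbol("counter");
  (void)counter;
}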

include/driver/platform.h Executable file

@@ -0,0 +1,54 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_PLATFORM_H
#define TDL_INCLUDE_DRIVER_PLATFORM_H
#include <vector>
#include <string>
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
class Device;
class Platform
{
public:
//Accessors
std::string name() const;
std::string version() const;
std::vector<Device> devices() const;
private:
Handle<cu_platform> cu_;
};
}
}
#endif
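
A sketch that lists the available platforms and their devices, assuming the driver sources are linked.

#include <iostream>
#include "driver/backend.h"
#include "driver/device.h"
#include "driver/platform.h"

int main(){
  namespace drv = tdl::driver;
  for(drv::Platform const & pf: drv::backend::platforms()){
    std::cout << pf.name() << " (" << pf.version() << ")" << std::endl;
    for(drv::Device const & dev: pf.devices())
      std::cout << "  " << dev.name() << std::endl;
  }
}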

include/driver/stream.h Executable file

@@ -0,0 +1,82 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef TDL_INCLUDE_DRIVER_STREAM_H
#define TDL_INCLUDE_DRIVER_STREAM_H
#include <map>
#include "driver/context.h"
#include "driver/device.h"
#include "driver/handle.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
class Kernel;
class Event;
class Range;
class Buffer;
// Command Queue
class Stream: public HandleInterface<Stream, CUstream>
{
public:
//Constructors
Stream(CUstream stream, bool take_ownership);
Stream(Context const & context);
//Accessors
Handle<CUstream> const & cu() const;
Context const & context() const;
//Synchronize
void synchronize();
//Enqueue
void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
// Write
void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
template<class T> void write(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T> const & x)
{ write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
// Read
void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
template<class T> void read(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T>& x)
{ read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
private:
Context context_;
Handle<CUstream> cu_;
};
}
}
#endif
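
An end-to-end sketch tying the classes above together: compile a module, bind a kernel argument, launch it on a Stream, and read the result back into host memory. The kernel source is a placeholder; it assumes the driver sources are linked and a CUDA device plus runtime compilation are available.

#include <iostream>
#include <string>
#include <vector>
#include "driver/backend.h"
#include "driver/kernel.h"
#include "driver/stream.h"

int main(){
  namespace drv = tdl::driver;
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream stream(ctx);
  // Kernel that writes its thread index into a buffer (placeholder source).
  std::string src =
    "extern \"C\" __global__ void iota(int* x){ x[threadIdx.x] = threadIdx.x; }";
  drv::Module module(ctx, src);
  drv::Kernel kernel(module, "iota");
  size_t n = 128;
  drv::Buffer x(ctx, n*sizeof(int));
  kernel.setArg(0, x);
  stream.enqueue(kernel, {1, 1, 1}, {n, 1, 1});   // grid of 1 block, n threads per block
  std::vector<int> host(n);
  stream.read(x, true, 0, host);                  // blocking read of the whole buffer
  std::cout << host.back() << std::endl;          // expected output: 127
}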

include/external/CUDA/builtin_types.h vendored Executable file

@@ -0,0 +1,64 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"

412
include/external/CUDA/channel_descriptor.h vendored Executable file
View File

@@ -0,0 +1,412 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
#include "cuda_runtime_api.h"
#include "host_defines.h"
#include "vector_types.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
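/* Illustrative usage sketch (not part of the original header): the templated
 * form above, together with the specializations below, lets the element type
 * drive the descriptor. For example, with the CUDA runtime (array name and
 * dimensions are hypothetical):
 *
 *   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
 *   // desc.x == desc.y == desc.z == desc.w == 32, desc.f == cudaChannelFormatKindFloat
 *   cudaArray_t arr;
 *   cudaMallocArray(&arr, &desc, width, height);
 */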
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
int e = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
int e = (int)sizeof(signed char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
int e = (int)sizeof(unsigned char) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
int e = (int)sizeof(short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
int e = (int)sizeof(int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
int e = (int)sizeof(unsigned int) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
int e = (int)sizeof(long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
int e = (int)sizeof(unsigned long) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
int e = (int)sizeof(float) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */

266
include/external/CUDA/crt/host_config.h vendored Normal file
View File

@@ -0,0 +1,266 @@
/*
* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
#if defined(__ICC)
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, and ICC 17.0 on Linux x86_64 are supported!
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__PGIC__)
#if (!(__PGIC__ == 17) && \
!(__PGIC__ == 99 && __PGIC_MINOR__ == 99)) || \
!defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported pgc++ configuration! Only pgc++ 17 on Linux x86_64 is supported!
#endif /* (!(__PGIC__ == 17) &&
!(__PGIC__ == 99 && __PGIC_MINOR__ == 99 )) ||
!__GNUC__ || !__LP64__ */
#endif /* __PGIC__ */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
#error -- unsupported xlC version! only xlC 13.1 is supported
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 6
#error -- unsupported GNU version! gcc versions later than 6 are not supported!
#endif /* __GNUC__ > 6 */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1600 || _MSC_VER > 1911
#error -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!
#elif _MSC_VER == 1600 /* _MSC_VER == 1600 */
#pragma message("support for Microsoft Visual Studio 2010 has been deprecated!")
#endif /* _MSC_VER < 1600 || _MSC_VER > 1911 || _MSC_VER == 1600 */
#endif /* _WIN32 */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__HORIZON__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#define _ACRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#if _MSC_VER >= 1900
#include <corecrt.h> /* for _ACRTIMP */
#endif /* _MSC_VER >= 1900 */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
#if __CUDACC_RTC__
typedef char *va_list;
#else /* !__CUDACC_RTC__ */
#include <cstdarg>
#endif /* __CUDACC_RTC__ */
#undef va_start
#undef va_end
#undef va_arg
#ifdef __PGIC__
#undef __builtin_va_end
#define va_start(v,l) __builtin_alt_va_start(v,l)
#define va_end(v) __builtin_va_end(v)
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
#if (__cplusplus >= 201103L)
#undef va_copy
#define va_copy(d,s) __builtin_va_copy(d,s)
#endif
#else /* !__PGIC__ */
#define va_start(ap, x) (__cu_va_start(&ap, x))
#define va_end(ap) (__cu_va_end(&ap))
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
#undef va_copy
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
#endif /* __PGIC__ */
#endif /* defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */

216
include/external/CUDA/crt/host_defines.h vendored Normal file
View File

@@ -0,0 +1,216 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
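/* Illustrative note (not part of the original header): under a GNU-style host
 * compiler with __CUDACC__ defined, the qualifiers defined above expand to
 * plain GCC attributes, e.g.
 *
 *   __noinline__    void f();   // __attribute__((noinline)) void f();
 *   __forceinline__ int  g();   // __inline__ __attribute__((always_inline)) int g();
 *   struct __align__(16) vec4 { float x, y, z, w; };   // __attribute__((aligned(16)))
 */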
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __host__ \
__location__(host)
#define __device__ \
__location__(device)
#define __global__ \
__location__(global)
#define __shared__ \
__location__(shared)
#define __constant__ \
__location__(constant)
#define __managed__ \
__location__(managed)
#if !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* defined(__CUDACC__) */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* !defined(__CUDACC__) */
#endif /* !__HOST_DEFINES_H__ */

338
include/external/CUDA/cuComplex.h vendored Executable file
View File

@@ -0,0 +1,338 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
/* When trying to include C header file in C++ Code extern "C" is required
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
* extern "C" cannot be nested
* Hence keep the header out of extern "C" block
*/
#include <math.h> /* import fabsf, sqrt */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
typedef float2 cuFloatComplex;
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
(float r, float i)
{
cuFloatComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
cuCimagf(x) + cuCimagf(y));
}
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
cuFloatComplex y)
{
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex prod;
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
(cuCimagf(x) * cuCimagf(y)),
(cuCrealf(x) * cuCimagf(y)) +
(cuCimagf(x) * cuCrealf(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
cuFloatComplex y)
{
cuFloatComplex quot;
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
float oos = 1.0f / s;
float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now we choose the
* simplest and fastest solution; however, this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
float a = cuCrealf(x);
float b = cuCimagf(x);
float v, w, t;
a = fabsf(a);
b = fabsf(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0f + t * t;
t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w;
}
return t;
}
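/* Illustrative usage sketch (not part of the original header): the scaling in
 * cuCdivf and cuCabsf above keeps intermediates in range even when a naive
 * formula would overflow in single precision, e.g. on the host:
 *
 *   cuFloatComplex a = make_cuFloatComplex(3.0e19f, 4.0e19f);
 *   float m = cuCabsf(a);          // 5.0e19f; naive sqrtf(x*x + y*y) would overflow
 *   cuFloatComplex q = cuCdivf(a, make_cuFloatComplex(2.0e19f, 0.0f));
 *                                  // (1.5f, 2.0f); naive c*c + d*d would overflow
 */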
/* Double precision */
typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
return x.x;
}
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
return x.y;
}
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
(double r, double i)
{
cuDoubleComplex res;
res.x = r;
res.y = i;
return res;
}
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
cuCimag(x) + cuCimag(y));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
cuDoubleComplex y)
{
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex prod;
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
(cuCimag(x) * cuCimag(y)),
(cuCreal(x) * cuCimag(y)) +
(cuCimag(x) * cuCreal(y)));
return prod;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
cuDoubleComplex y)
{
cuDoubleComplex quot;
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
double oos = 1.0 / s;
double ars = cuCreal(x) * oos;
double ais = cuCimag(x) * oos;
double brs = cuCreal(y) * oos;
double bis = cuCimag(y) * oos;
s = (brs * brs) + (bis * bis);
oos = 1.0 / s;
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos);
return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now we choose the simplest
* and fastest solution; however, this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
double a = cuCreal(x);
double b = cuCimag(x);
double v, w, t;
a = fabs(a);
b = fabs(b);
if (a > b) {
v = a;
w = b;
} else {
v = b;
w = a;
}
t = w / v;
t = 1.0 + t * t;
t = v * sqrt(t);
if ((v == 0.0) ||
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
t = v + w;
}
return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
float y)
{
return make_cuFloatComplex (x, y);
}
/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
(cuFloatComplex c)
{
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */

565
include/external/CUDA/cublas.h vendored Executable file
View File

@@ -0,0 +1,565 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subprograms)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
cublasStatus CUBLASWINAPI cublasInit (void);
cublasStatus CUBLASWINAPI cublasShutdown (void);
cublasStatus CUBLASWINAPI cublasGetError (void);
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
int incy);
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
float *y, int incy);
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
int incx, double *y, int incy);
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
int incy);
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
int incy);
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss);
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss);
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, cuComplex s);
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, float s);
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
cuComplex *cs);
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
cuDoubleComplex *cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
const float *sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
const double *sy1, double* sparam);
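/* Illustrative usage sketch (not part of the original header): a minimal
 * host-side SAXPY (y = alpha*x + y) using the legacy BLAS1 API declared above;
 * the host arrays h_x/h_y and the size n are hypothetical, and error checking
 * is omitted.
 *
 *   cublasInit();
 *   float *d_x, *d_y;
 *   cublasAlloc(n, sizeof(float), (void**)&d_x);
 *   cublasAlloc(n, sizeof(float), (void**)&d_y);
 *   cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
 *   cudaMemcpy(d_y, h_y, n * sizeof(float), cudaMemcpyHostToDevice);
 *   cublasSaxpy(n, 2.0f, d_x, 1, d_y, 1);        // y <- 2*x + y
 *   cudaMemcpy(h_y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
 *   cublasFree(d_x); cublasFree(d_y);
 *   cublasShutdown();
 */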
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* GEMV */
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
float alpha, const float *A, int lda,
const float *x, int incx, float beta, float *y,
int incy);
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
double alpha, const double *A, int lda,
const double *x, int incx, double beta, double *y,
int incy);
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
float *x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
char diag, int n, int k, const float *A,
int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
char diag, int n, int k, const double *A,
int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
char diag, int n, int k, const cuComplex *A,
int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
char diag, int n, int k, const cuDoubleComplex *A,
int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
int lda, const float *x, int incx, float beta,
float *y, int incy);
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
int lda, const double *x, int incx, double beta,
double *y, int incy);
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
int lda, const cuComplex *x, int incx, cuComplex beta,
cuComplex *y, int incy);
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
const float *AP, const float *x,
int incx, float beta, float *y, int incy);
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
const double *AP, const double *x,
int incx, double beta, double *y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
const cuComplex *AP, const cuComplex *x,
int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *AP, const cuDoubleComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
const float *y, int incy, float *A, int lda);
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
const double *y, int incy, double *A, int lda);
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
int incx, float *A, int lda);
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
int incx, double *A, int lda);
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
const cuComplex *x, int incx, cuComplex *A, int lda);
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
int incx, float *AP);
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
int incx, double *AP);
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
int incx, cuComplex *AP);
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *A,
int lda);
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
int incx, const double *y, int incy, double *A,
int lda);
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy, cuComplex *A,
int lda);
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *AP);
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *AP);
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
const cuComplex *x, int incx, const cuComplex *y,
int incy, cuComplex *AP);
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy, cuDoubleComplex *AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc);
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
float alpha, const cuComplex *A, int lda,
float beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc);
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, double beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *B, int ldb,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */
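A minimal host-side sketch of how the legacy, handle-free BLAS3 entry points declared above are typically driven. This is an editor's illustration, not part of the vendored header: the sizes and data are made up, and the cublasInit/cublasAlloc/cublasSetMatrix/cublasGetMatrix/cublasFree/cublasShutdown helpers are assumed from the earlier sections of this header.

/* Illustration only -- not part of cublas.h. Computes C = A * B with the
 * legacy cublasSgemm declared above ('N' = no transpose, column-major). */
#include <vector>
#include "cublas.h"

void legacy_sgemm_example() {
  const int n = 4;                                    // illustrative matrix size
  std::vector<float> A(n * n, 1.0f), B(n * n, 2.0f), C(n * n, 0.0f);
  float *dA = 0, *dB = 0, *dC = 0;
  cublasInit();                                       // legacy API: one implicit context
  cublasAlloc(n * n, sizeof(float), (void **)&dA);
  cublasAlloc(n * n, sizeof(float), (void **)&dB);
  cublasAlloc(n * n, sizeof(float), (void **)&dC);
  cublasSetMatrix(n, n, sizeof(float), A.data(), n, dA, n);
  cublasSetMatrix(n, n, sizeof(float), B.data(), n, dB, n);
  cublasSgemm('N', 'N', n, n, n, 1.0f, dA, n, dB, n, 0.0f, dC, n);
  cublasGetMatrix(n, n, sizeof(float), dC, n, C.data(), n);
  cublasFree(dA); cublasFree(dB); cublasFree(dC);
  cublasShutdown();
}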

2977
include/external/CUDA/cublas_api.h vendored Executable file

File diff suppressed because it is too large

274
include/external/CUDA/cublas_v2.h vendored Normal file
View File

@@ -0,0 +1,274 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
 * This is the public header file for the new CUBLAS library API; it maps the generic
 * cuBLAS function names to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas1 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */
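For comparison with the legacy header above, a sketch of the same SGEMM call through the _v2 interface these macros redirect to: handle-based, enums instead of transpose characters, and alpha/beta passed by pointer. Editor's illustration only; the device pointers are assumed to have been set up as in the previous sketch.

/* Illustration only -- not part of cublas_v2.h. */
#include "cublas_v2.h"

void v2_sgemm_example(const float *dA, const float *dB, float *dC, int n) {
  cublasHandle_t handle;
  cublasCreate(&handle);                    // expands to cublasCreate_v2 via the macros above
  const float alpha = 1.0f, beta = 0.0f;    // v2 takes scalars by pointer
  cublasSgemm(handle,                       // expands to cublasSgemm_v2
              CUBLAS_OP_N, CUBLAS_OP_N,     // enums replace the 'N'/'T' characters
              n, n, n,
              &alpha, dA, n, dB, n,
              &beta, dC, n);
  cublasDestroy(handle);                    // expands to cublasDestroy_v2
}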

12185
include/external/CUDA/cuda.h vendored Executable file

File diff suppressed because it is too large

248
include/external/CUDA/cuda_device_runtime_api.h vendored Executable file
View File

@@ -0,0 +1,248 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
#if defined(__cplusplus)
extern "C" {
#endif
struct cudaFuncAttributes;
#if defined(_WIN32)
#define __NV_WEAK__ __declspec(nv_weak)
#else
#define __NV_WEAK__ __attribute__((nv_weak))
#endif
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#undef __NV_WEAK__
#if defined(__cplusplus)
}
#endif
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#endif /* !defined(__CUDACC_RTC__) */
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
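The device-side declarations above (cudaGetParameterBuffer, cudaLaunchDevice, the device-visible cudaDeviceSynchronize and friends) are what CUDA dynamic parallelism lowers to. Below is a sketch of how user code normally reaches them through the <<< >>> syntax the comments recommend; this is an editor's illustration with made-up kernel names, compiled along the lines of nvcc -arch=sm_35 -rdc=true example.cu -lcudadevrt.

// Illustration only -- not part of cuda_device_runtime_api.h.
#include <cuda_runtime.h>

__global__ void child(int *data) {
  data[threadIdx.x] += 1;                   // trivial per-element work
}

__global__ void parent(int *data) {
  if (threadIdx.x == 0) {
    child<<<1, 32>>>(data);                 // device-side launch; the compiler lowers this
                                            // to cudaGetParameterBuffer + cudaLaunchDevice
    cudaDeviceSynchronize();                // device-side variant declared above
  }
}

int main() {
  int *data;
  cudaMalloc(&data, 32 * sizeof(int));
  cudaMemset(data, 0, 32 * sizeof(int));
  parent<<<1, 32>>>(data);
  cudaDeviceSynchronize();                  // host-side wait for parent and child
  cudaFree(data);
  return 0;
}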

1969
include/external/CUDA/cuda_fp16.h vendored Executable file

File diff suppressed because it is too large

1797
include/external/CUDA/cuda_fp16.hpp vendored Executable file

File diff suppressed because it is too large

2040
include/external/CUDA/cuda_runtime.h vendored Executable file

File diff suppressed because it is too large

7422
include/external/CUDA/cuda_runtime_api.h vendored Executable file

File diff suppressed because it is too large

1805
include/external/CUDA/cudnn.h vendored Executable file

File diff suppressed because it is too large

6257
include/external/CUDA/cusparse.h vendored Normal file

File diff suppressed because it is too large

69
include/external/CUDA/device_types.h vendored Executable file
View File

@@ -0,0 +1,69 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DEVICE_TYPES_H__)
#define __DEVICE_TYPES_H__
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
enum __device_builtin__ cudaRoundMode
{
cudaRoundNearest,
cudaRoundZero,
cudaRoundPosInf,
cudaRoundMinInf
};
#endif /* !__DEVICE_TYPES_H__ */

145
include/external/CUDA/driver_functions.h vendored Executable file
View File

@@ -0,0 +1,145 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__
#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"
/**
* \addtogroup CUDART_MEMORY
*
* @{
*/
/**
* \brief Returns a cudaPitchedPtr based on input parameters
*
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
* \p p, \p xsz, and \p ysz.
*
* \param d - Pointer to allocated memory
* \param p - Pitch of allocated memory in bytes
* \param xsz - Logical width of allocation in elements
* \param ysz - Logical height of allocation in elements
*
* \return
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
*
* \sa make_cudaExtent, make_cudaPos
*/
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
{
struct cudaPitchedPtr s;
s.ptr = d;
s.pitch = p;
s.xsize = xsz;
s.ysize = ysz;
return s;
}
/**
* \brief Returns a cudaPos based on input parameters
*
* Returns a ::cudaPos based on the specified input parameters \p x,
* \p y, and \p z.
*
* \param x - X position
* \param y - Y position
* \param z - Z position
*
* \return
* ::cudaPos specified by \p x, \p y, and \p z
*
* \sa make_cudaExtent, make_cudaPitchedPtr
*/
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
{
struct cudaPos p;
p.x = x;
p.y = y;
p.z = z;
return p;
}
/**
* \brief Returns a cudaExtent based on input parameters
*
* Returns a ::cudaExtent based on the specified input parameters \p w,
* \p h, and \p d.
*
* \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
* \param h - Height in elements
* \param d - Depth in elements
*
* \return
* ::cudaExtent specified by \p w, \p h, and \p d
*
* \sa make_cudaPitchedPtr, make_cudaPos
*/
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
{
struct cudaExtent e;
e.width = w;
e.height = h;
e.depth = d;
return e;
}
/** @} */ /* END CUDART_MEMORY */
#endif /* !__DRIVER_FUNCTIONS_H__ */
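A short sketch of how the three helpers above are usually paired with the 3D allocation and memset entry points of the runtime (cudaMalloc3D / cudaMemset3D, declared in cuda_runtime_api.h). Editor's illustration; the extents are made up, and note that the width of a cudaExtent is given in bytes for linear/pitched memory, as the make_cudaExtent comment states.

/* Illustration only -- not part of driver_functions.h. */
#include <cuda_runtime.h>

void alloc_3d_example() {
  const size_t w = 64, h = 32, d = 16;                        // illustrative volume size
  struct cudaExtent extent = make_cudaExtent(w * sizeof(float), h, d);
  struct cudaPitchedPtr vol;
  cudaMalloc3D(&vol, extent);                                 // pitched 3D allocation
  cudaMemset3D(vol, 0, extent);                               // clear the whole volume
  // make_cudaPitchedPtr wraps an existing linear buffer in the same struct:
  float *linear = 0;
  cudaMalloc((void **)&linear, w * sizeof(float) * h * d);
  struct cudaPitchedPtr view = make_cudaPitchedPtr(linear, w * sizeof(float), w, h);
  (void)view;                                                 // would feed e.g. a cudaMemcpy3D
  cudaFree(linear);
  cudaFree(vol.ptr);
}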

1610
include/external/CUDA/driver_types.h vendored Executable file

File diff suppressed because it is too large

50
include/external/CUDA/host_config.h vendored Executable file
View File

@@ -0,0 +1,50 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_config.h"

50
include/external/CUDA/host_defines.h vendored Executable file
View File

@@ -0,0 +1,50 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_defines.h"

80
include/external/CUDA/library_types.h vendored Executable file
View File

@@ -0,0 +1,80 @@
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__LIBRARY_TYPES_H__)
#define __LIBRARY_TYPES_H__
typedef enum cudaDataType_t
{
CUDA_R_16F= 2, /* real as a half */
CUDA_C_16F= 6, /* complex as a pair of half numbers */
CUDA_R_32F= 0, /* real as a float */
CUDA_C_32F= 4, /* complex as a pair of float numbers */
CUDA_R_64F= 1, /* real as a double */
CUDA_C_64F= 5, /* complex as a pair of double numbers */
CUDA_R_8I = 3, /* real as a signed char */
CUDA_C_8I = 7, /* complex as a pair of signed char numbers */
CUDA_R_8U = 8, /* real as an unsigned char */
CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */
CUDA_R_32I= 10, /* real as a signed int */
CUDA_C_32I= 11, /* complex as a pair of signed int numbers */
CUDA_R_32U= 12, /* real as an unsigned int */
CUDA_C_32U= 13 /* complex as a pair of unsigned int numbers */
} cudaDataType;
typedef enum libraryPropertyType_t
{
MAJOR_VERSION,
MINOR_VERSION,
PATCH_LEVEL
} libraryPropertyType;
#endif /* !__LIBRARY_TYPES_H__ */
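The per-value comments above encode both the element width and the real/complex layout. Below is a small helper of the kind one might write when sizing buffers for mixed-precision library calls; it is an editor's illustration, not an API provided by any CUDA header.

/* Illustration only -- storage size in bytes of one element of each cudaDataType. */
#include <stddef.h>
#include "library_types.h"

static size_t cuda_data_type_size(cudaDataType t) {
  switch (t) {
    case CUDA_R_8I:  case CUDA_R_8U:                    return 1;
    case CUDA_C_8I:  case CUDA_C_8U:                    return 2;  /* pair of 8-bit values */
    case CUDA_R_16F:                                    return 2;
    case CUDA_C_16F:                                    return 4;  /* pair of halves */
    case CUDA_R_32F: case CUDA_R_32I: case CUDA_R_32U:  return 4;
    case CUDA_C_32F: case CUDA_C_32I: case CUDA_C_32U:  return 8;
    case CUDA_R_64F:                                    return 8;
    case CUDA_C_64F:                                    return 16; /* pair of doubles */
    default:                                            return 0;  /* unrecognized value */
  }
}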

5628
include/external/CUDA/nvml.h vendored Executable file

File diff suppressed because it is too large

525
include/external/CUDA/nvrtc.h vendored Executable file
View File

@@ -0,0 +1,525 @@
//
// NVIDIA_COPYRIGHT_BEGIN
//
// Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
// NVIDIA_COPYRIGHT_END
//
#ifndef __NVRTC_H__
#define __NVRTC_H__
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
/*************************************************************************//**
*
* \defgroup error Error Handling
*
* NVRTC defines the following enumeration type and function for API call
* error handling.
*
****************************************************************************/
/**
* \ingroup error
* \brief The enumerated type nvrtcResult defines API call result codes.
* NVRTC API functions return nvrtcResult to indicate the call
* result.
*/
typedef enum {
NVRTC_SUCCESS = 0,
NVRTC_ERROR_OUT_OF_MEMORY = 1,
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
NVRTC_ERROR_INVALID_INPUT = 3,
NVRTC_ERROR_INVALID_PROGRAM = 4,
NVRTC_ERROR_INVALID_OPTION = 5,
NVRTC_ERROR_COMPILATION = 6,
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
NVRTC_ERROR_INTERNAL_ERROR = 11
} nvrtcResult;
/**
* \ingroup error
* \brief nvrtcGetErrorString is a helper function that returns a string
* describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
* \c "NVRTC_SUCCESS".
* For unrecognized enumeration values, it returns
* \c "NVRTC_ERROR unknown".
*
* \param [in] result CUDA Runtime Compilation API result code.
* \return Message string for the given #nvrtcResult code.
*/
const char *nvrtcGetErrorString(nvrtcResult result);
/*************************************************************************//**
*
* \defgroup query General Information Query
*
* NVRTC defines the following function for general information query.
*
****************************************************************************/
/**
* \ingroup query
* \brief nvrtcVersion sets the output parameters \p major and \p minor
* with the CUDA Runtime Compilation version number.
*
* \param [out] major CUDA Runtime Compilation major version number.
* \param [out] minor CUDA Runtime Compilation minor version number.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
*
*/
nvrtcResult nvrtcVersion(int *major, int *minor);
/*************************************************************************//**
*
* \defgroup compilation Compilation
*
* NVRTC defines the following type and functions for actual compilation.
*
****************************************************************************/
/**
* \ingroup compilation
* \brief nvrtcProgram is the unit of compilation, and an opaque handle for
* a program.
*
* To compile a CUDA program string, an instance of nvrtcProgram must be
* created first with ::nvrtcCreateProgram, then compiled with
* ::nvrtcCompileProgram.
*/
typedef struct _nvrtcProgram *nvrtcProgram;
/**
* \ingroup compilation
* \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
* given input parameters, and sets the output parameter \p prog with
* it.
*
* \param [out] prog CUDA Runtime Compilation program.
* \param [in] src CUDA program source.
* \param [in] name CUDA program name.\n
* \p name can be \c NULL; \c "default_program" is
* used when \p name is \c NULL.
* \param [in] numHeaders Number of headers used.\n
* \p numHeaders must be greater than or equal to 0.
* \param [in] headers Sources of the headers.\n
* \p headers can be \c NULL when \p numHeaders is
* 0.
* \param [in] includeNames Name of each header by which they can be
* included in the CUDA program source.\n
* \p includeNames can be \c NULL when \p numHeaders
* is 0.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcDestroyProgram
*/
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
const char *src,
const char *name,
int numHeaders,
const char * const *headers,
const char * const *includeNames);
/**
* \ingroup compilation
* \brief nvrtcDestroyProgram destroys the given program.
*
* \param [in] prog CUDA Runtime Compilation program.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcCreateProgram
*/
nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
/**
* \ingroup compilation
* \brief nvrtcCompileProgram compiles the given program.
*
* It supports compile options listed in \ref options.
*/
nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
int numOptions, const char * const *options);
/**
* \ingroup compilation
* \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
* generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptxSizeRet Size of the generated PTX (including the trailing
* \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTX
*/
nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetPTX stores the PTX generated by the previous compilation
* of \p prog in the memory pointed by \p ptx.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptx Compiled result.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTXSize
*/
nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
* log generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
 * Note that a compilation log may be generated with warnings and informative
 * messages, even when the compilation of \p prog succeeds.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] logSizeRet Size of the compilation log
* (including the trailing \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLog
*/
nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLog stores the log generated by the previous
* compilation of \p prog in the memory pointed by \p log.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] log Compilation log.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLogSize
*/
nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
/**
* \ingroup compilation
* \brief nvrtcAddNameExpression notes the given name expression
* denoting a __global__ function or function template
* instantiation.
*
* The identical name expression string must be provided on a subsequent
* call to nvrtcGetLoweredName to extract the lowered name.
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
*
* \see ::nvrtcGetLoweredName
*/
nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
const char * const name_expression);
/**
* \ingroup compilation
* \brief nvrtcGetLoweredName extracts the lowered (mangled) name
* for a __global__ function or function template instantiation,
* and updates *lowered_name to point to it. The memory containing
* the name is released when the NVRTC program is destroyed by
* nvrtcDestroyProgram.
* The identical name expression must have been previously
* provided to nvrtcAddNameExpression.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \param [out] lowered_name initialized by the function to point to a
* C string containing the lowered (mangled)
* name corresponding to the provided name expression.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
*
* \see ::nvrtcAddNameExpression
*/
nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
const char *const name_expression,
const char** lowered_name);
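/*
 * Illustrative usage sketch (not part of the original header): the typical
 * NVRTC flow with the functions declared above. Error checking is omitted;
 * `kernel_src` and "my_kernel<float>" are placeholders, not symbols defined
 * by this header.
 *
 *   nvrtcProgram prog;
 *   nvrtcCreateProgram(&prog, kernel_src, "prog.cu", 0, NULL, NULL);
 *   nvrtcAddNameExpression(prog, "my_kernel<float>");        // before compiling
 *   const char *opts[] = {"--gpu-architecture=compute_30", "--std=c++11"};
 *   nvrtcCompileProgram(prog, 2, opts);
 *   size_t log_size, ptx_size;
 *   nvrtcGetProgramLogSize(prog, &log_size);
 *   char *log = (char *)malloc(log_size);
 *   nvrtcGetProgramLog(prog, log);                            // warnings, if any
 *   nvrtcGetPTXSize(prog, &ptx_size);
 *   char *ptx = (char *)malloc(ptx_size);
 *   nvrtcGetPTX(prog, ptx);                                   // PTX to load with the CUDA driver API
 *   const char *lowered;
 *   nvrtcGetLoweredName(prog, "my_kernel<float>", &lowered);  // valid until destroy
 *   nvrtcDestroyProgram(&prog);
 */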
/**
* \defgroup options Supported Compile Options
*
* NVRTC supports the compile options below.
 * Option names with two preceding dashes (\c --) are long option names and
* option names with one preceding dash (\c -) are short option names.
* Short option names can be used instead of long option names.
* When a compile option takes an argument, an assignment operator (\c =)
* is used to separate the compile option argument from the compile option
* name, e.g., \c "--gpu-architecture=compute_30".
* Alternatively, the compile option name and the argument can be specified in
 * separate strings without an assignment operator, e.g.,
* \c "--gpu-architecture" \c "compute_30".
* Single-character short option names, such as \c -D, \c -U, and \c -I, do
* not require an assignment operator, and the compile option name and the
* argument can be present in the same string with or without spaces between
* them.
* For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
* supported.
*
* The valid compiler options are:
*
* - Compilation targets
* - \c --gpu-architecture=\<arch\> (\c -arch)\n
* Specify the name of the class of GPU architectures for which the
* input must be compiled.\n
* - Valid <c>\<arch\></c>s:
* - \c compute_30
* - \c compute_32
* - \c compute_35
* - \c compute_37
* - \c compute_50
* - \c compute_52
* - \c compute_53
* - \c compute_60
* - \c compute_61
* - \c compute_62
* - \c compute_70
* - \c compute_72
* - Default: \c compute_30
* - Separate compilation / whole-program compilation
* - \c --device-c (\c -dc)\n
* Generate relocatable code that can be linked with other relocatable
* device code. It is equivalent to --relocatable-device-code=true.
* - \c --device-w (\c -dw)\n
* Generate non-relocatable code. It is equivalent to
* \c --relocatable-device-code=false.
* - \c --relocatable-device-code={true|false} (\c -rdc)\n
* Enable (disable) the generation of relocatable device code.
* - Default: \c false
* - Debugging support
* - \c --device-debug (\c -G)\n
* Generate debug information.
* - \c --generate-line-info (\c -lineinfo)\n
* Generate line-number information.
* - Code generation
* - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
* Specify the maximum amount of registers that GPU functions can use.
 *     Up to a function-specific limit, a higher value will generally
* increase the performance of individual GPU threads that execute this
* function. However, because thread registers are allocated from a
* global register pool on each GPU, a higher value of this option will
* also reduce the maximum thread block size, thereby reducing the amount
* of thread parallelism. Hence, a good maxrregcount value is the result
* of a trade-off. If this option is not specified, then no maximum is
 *     assumed. A value less than the minimum registers required by the ABI
 *     will be bumped up by the compiler to the ABI minimum limit.
* - \c --ftz={true|false} (\c -ftz)\n
* When performing single-precision floating-point operations, flush
* denormal values to zero or preserve denormal values.
* \c --use_fast_math implies \c --ftz=true.
* - Default: \c false
* - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
* For single-precision floating-point square root, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-sqrt=false.
* - Default: \c true
* - \c --prec-div={true|false} (\c -prec-div)\n
* For single-precision floating-point division and reciprocals, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-div=false.
* - Default: \c true
* - \c --fmad={true|false} (\c -fmad)\n
* Enables (disables) the contraction of floating-point multiplies and
* adds/subtracts into floating-point multiply-add operations (FMAD,
* FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
* - Default: \c true
* - \c --use_fast_math (\c -use_fast_math)\n
* Make use of fast math operations.
* \c --use_fast_math implies \c --ftz=true \c --prec-div=false
* \c --prec-sqrt=false \c --fmad=true.
* - Preprocessing
* - \c --define-macro=\<def\> (\c -D)\n
 *     \c \<def\> can be either \c \<name\> or \c \<name\>=\<definition\>.
* - \c \<name\> \n
* Predefine \c \<name\> as a macro with definition \c 1.
* - \c \<name\>=\<definition\> \n
* The contents of \c \<definition\> are tokenized and preprocessed
* as if they appeared during translation phase three in a \c \#define
* directive. In particular, the definition will be truncated by
* embedded new line characters.
* - \c --undefine-macro=\<def\> (\c -U)\n
* Cancel any previous definition of \c \<def\>.
* - \c --include-path=\<dir\> (\c -I)\n
* Add the directory \c \<dir\> to the list of directories to be
* searched for headers. These paths are searched after the list of
* headers given to ::nvrtcCreateProgram.
* - \c --pre-include=\<header\> (\c -include)\n
* Preinclude \c \<header\> during preprocessing.
* - Language Dialect
* - \c --std={c++11|c++14} (\c -std={c++11|c++14})\n
* Set language dialect to C++11 or C++14.
* - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
* Provide builtin definitions of \c std::move and \c std::forward,
* when C++11 language dialect is selected.
* - Default: \c true
* - \c --builtin-initializer-list={true|false}
* (\c -builtin-initializer-list)\n
* Provide builtin definitions of \c std::initializer_list class and
* member functions when C++11 language dialect is selected.
* - Default: \c true
* - Misc.
* - \c --disable-warnings (\c -w)\n
* Inhibit all warning messages.
* - \c --restrict (\c -restrict)\n
* Programmer assertion that all kernel pointer parameters are restrict
* pointers.
* - \c --device-as-default-execution-space
* (\c -default-device)\n
* Treat entities with no execution space annotation as \c __device__
* entities.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] numOptions Number of compiler options passed.
* \param [in] options Compiler options in the form of C string array.\n
* \p options can be \c NULL when \p numOptions is 0.
*
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
* - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
/* The utility function 'nvrtcGetTypeName' is not available by default. Define
the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
*/
#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
#include <cxxabi.h>
#include <cstdlib>
#elif defined(_WIN32)
#include <Windows.h>
#include <DbgHelp.h>
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
#include <string>
#include <typeinfo>
/*************************************************************************//**
*
* \defgroup hosthelper Host Helper
*
* NVRTC defines the following functions for easier interaction with host code.
*
****************************************************************************/
/**
* \ingroup hosthelper
* \brief nvrtcGetTypeName stores the source level name of the template type argument
* T in the given std::string location.
*
* This function is only provided when the macro NVRTC_GET_TYPE_NAME is
* defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
* function calls to extract the type name, when using gcc/clang or cl.exe compilers,
* respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
* otherwise *result is initialized with the extracted name.
*
* \param [in] result: pointer to std::string in which to store the type name.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
*
*/
template <typename T>
nvrtcResult nvrtcGetTypeName(std::string *result)
{
const char *name = typeid(T).name();
#if USE_CXXABI || __clang__ || __GNUC__
int status;
char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
if (status == 0) {
*result = undecorated_name;
free(undecorated_name);
return NVRTC_SUCCESS;
}
#elif defined(_WIN32)
char undecorated_name[4096];
if(UnDecorateSymbolName(name, undecorated_name,
sizeof(undecorated_name) / sizeof(*undecorated_name),
UNDNAME_COMPLETE) ) {
*result = undecorated_name;
return NVRTC_SUCCESS;
}
#endif /* USE_CXXABI || __clang__ || __GNUC__ */
return NVRTC_ERROR_INTERNAL_ERROR;
}
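/*
 * Illustrative usage sketch (assumes NVRTC_GET_TYPE_NAME was defined to a
 * non-zero value before including this header; `MyType` is a placeholder):
 *
 *   std::string name;
 *   nvrtcResult res = nvrtcGetTypeName<MyType>(&name);
 *   // On NVRTC_SUCCESS, `name` holds the demangled type name, e.g. for
 *   // building a name expression passed to nvrtcAddNameExpression.
 */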
#endif /* NVRTC_GET_TYPE_NAME */
#endif /* __NVRTC_H__ */

119
include/external/CUDA/surface_types.h vendored Executable file
View File

@@ -0,0 +1,119 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__SURFACE_TYPES_H__)
#define __SURFACE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaSurfaceType1D 0x01
#define cudaSurfaceType2D 0x02
#define cudaSurfaceType3D 0x03
#define cudaSurfaceTypeCubemap 0x0C
#define cudaSurfaceType1DLayered 0xF1
#define cudaSurfaceType2DLayered 0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
/**
* CUDA Surface boundary modes
*/
enum __device_builtin__ cudaSurfaceBoundaryMode
{
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
};
/**
* CUDA Surface format modes
*/
enum __device_builtin__ cudaSurfaceFormatMode
{
cudaFormatModeForced = 0, /**< Forced format mode */
cudaFormatModeAuto = 1 /**< Auto format mode */
};
/**
* CUDA Surface reference
*/
struct __device_builtin__ surfaceReference
{
/**
* Channel descriptor for surface reference
*/
struct cudaChannelFormatDesc channelDesc;
};
/**
* An opaque value that represents a CUDA Surface object
*/
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__SURFACE_TYPES_H__ */

217
include/external/CUDA/texture_types.h vendored Executable file
View File

@@ -0,0 +1,217 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaTextureType1D 0x01
#define cudaTextureType2D 0x02
#define cudaTextureType3D 0x03
#define cudaTextureTypeCubemap 0x0C
#define cudaTextureType1DLayered 0xF1
#define cudaTextureType2DLayered 0xF2
#define cudaTextureTypeCubemapLayered 0xFC
/**
* CUDA texture address modes
*/
enum __device_builtin__ cudaTextureAddressMode
{
cudaAddressModeWrap = 0, /**< Wrapping address mode */
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
cudaAddressModeMirror = 2, /**< Mirror address mode */
cudaAddressModeBorder = 3 /**< Border address mode */
};
/**
* CUDA texture filter modes
*/
enum __device_builtin__ cudaTextureFilterMode
{
cudaFilterModePoint = 0, /**< Point filter mode */
cudaFilterModeLinear = 1 /**< Linear filter mode */
};
/**
* CUDA texture read modes
*/
enum __device_builtin__ cudaTextureReadMode
{
cudaReadModeElementType = 0, /**< Read texture as specified element type */
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
};
/**
* CUDA texture reference
*/
struct __device_builtin__ textureReference
{
/**
* Indicates whether texture reads are normalized or not
*/
int normalized;
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Channel descriptor for the texture reference
*/
struct cudaChannelFormatDesc channelDesc;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
int __cudaReserved[15];
};
/**
* CUDA texture descriptor
*/
struct __device_builtin__ cudaTextureDesc
{
/**
* Texture address mode for up to 3 dimensions
*/
enum cudaTextureAddressMode addressMode[3];
/**
* Texture filter mode
*/
enum cudaTextureFilterMode filterMode;
/**
* Texture read mode
*/
enum cudaTextureReadMode readMode;
/**
* Perform sRGB->linear conversion during texture read
*/
int sRGB;
/**
* Texture Border Color
*/
float borderColor[4];
/**
* Indicates whether texture reads are normalized or not
*/
int normalizedCoords;
/**
* Limit to the anisotropy ratio
*/
unsigned int maxAnisotropy;
/**
* Mipmap filter mode
*/
enum cudaTextureFilterMode mipmapFilterMode;
/**
* Offset applied to the supplied mipmap level
*/
float mipmapLevelBias;
/**
* Lower end of the mipmap level range to clamp access to
*/
float minMipmapLevelClamp;
/**
* Upper end of the mipmap level range to clamp access to
*/
float maxMipmapLevelClamp;
};
/**
* An opaque value that represents a CUDA texture object
*/
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__TEXTURE_TYPES_H__ */

177
include/external/CUDA/vector_functions.h vendored Executable file
View File

@@ -0,0 +1,177 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z);
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
#undef __VECTOR_FUNCTIONS_DECL__
#if !defined(__CUDACC_RTC__)
#include "vector_functions.hpp"
#endif /* !__CUDACC_RTC__ */
#endif /* !__VECTOR_FUNCTIONS_H__ */

318
include/external/CUDA/vector_functions.hpp vendored Executable file
View File

@@ -0,0 +1,318 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_HPP__)
#define __VECTOR_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
{
char1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
{
uchar1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
{
char2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
{
uchar2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
{
char3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
{
uchar3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
{
char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
{
uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
{
short1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
{
ushort1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
{
short2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
{
ushort2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z)
{
short3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
{
ushort3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
{
short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
{
ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
{
int1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
{
uint1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
{
int2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
{
uint2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
{
int3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
{
uint3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
{
int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
{
uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
{
long1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
{
ulong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
{
long2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{
ulong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
{
long3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{
ulong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
{
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
{
float1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
{
float2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
{
float3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
{
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
{
longlong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
{
ulonglong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
{
longlong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
{
ulonglong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
{
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
{
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
{
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
{
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
{
double1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
{
double2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
{
double3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
{
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#undef __VECTOR_FUNCTIONS_DECL__
#endif /* !__VECTOR_FUNCTIONS_HPP__ */

425
include/external/CUDA/vector_types.h vendored Executable file
View File

@@ -0,0 +1,425 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(push)
#pragma warning(disable: 4201 4408)
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ tag \
{ \
union \
{ \
struct { members }; \
struct { long long int :1,:0; }; \
}; \
}
#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ __align__(8) tag \
{ \
members \
}
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
struct __device_builtin__ char1
{
signed char x;
};
struct __device_builtin__ uchar1
{
unsigned char x;
};
struct __device_builtin__ __align__(2) char2
{
signed char x, y;
};
struct __device_builtin__ __align__(2) uchar2
{
unsigned char x, y;
};
struct __device_builtin__ char3
{
signed char x, y, z;
};
struct __device_builtin__ uchar3
{
unsigned char x, y, z;
};
struct __device_builtin__ __align__(4) char4
{
signed char x, y, z, w;
};
struct __device_builtin__ __align__(4) uchar4
{
unsigned char x, y, z, w;
};
struct __device_builtin__ short1
{
short x;
};
struct __device_builtin__ ushort1
{
unsigned short x;
};
struct __device_builtin__ __align__(4) short2
{
short x, y;
};
struct __device_builtin__ __align__(4) ushort2
{
unsigned short x, y;
};
struct __device_builtin__ short3
{
short x, y, z;
};
struct __device_builtin__ ushort3
{
unsigned short x, y, z;
};
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
struct __device_builtin__ int1
{
int x;
};
struct __device_builtin__ uint1
{
unsigned int x;
};
__cuda_builtin_vector_align8(int2, int x; int y;);
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
struct __device_builtin__ int3
{
int x, y, z;
};
struct __device_builtin__ uint3
{
unsigned int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) int4
{
int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) uint4
{
unsigned int x, y, z, w;
};
struct __device_builtin__ long1
{
long int x;
};
struct __device_builtin__ ulong1
{
unsigned long x;
};
#if defined(_WIN32)
__cuda_builtin_vector_align8(long2, long int x; long int y;);
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* !_WIN32 */
struct __device_builtin__ __align__(2*sizeof(long int)) long2
{
long int x, y;
};
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
{
unsigned long int x, y;
};
#endif /* _WIN32 */
struct __device_builtin__ long3
{
long int x, y, z;
};
struct __device_builtin__ ulong3
{
unsigned long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) long4
{
long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
struct __device_builtin__ float1
{
float x;
};
#if !defined(__CUDACC__) && defined(__arm__) && \
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-pedantic"
struct __device_builtin__ __attribute__((aligned(8))) float2
{
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
};
#pragma GCC poison __cuda_gnu_arm_ice_workaround
#pragma GCC diagnostic pop
#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
__cuda_builtin_vector_align8(float2, float x; float y;);
#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
struct __device_builtin__ float3
{
float x, y, z;
};
struct __device_builtin__ __builtin_align__(16) float4
{
float x, y, z, w;
};
struct __device_builtin__ longlong1
{
long long int x;
};
struct __device_builtin__ ulonglong1
{
unsigned long long int x;
};
struct __device_builtin__ __builtin_align__(16) longlong2
{
long long int x, y;
};
struct __device_builtin__ __builtin_align__(16) ulonglong2
{
unsigned long long int x, y;
};
struct __device_builtin__ longlong3
{
long long int x, y, z;
};
struct __device_builtin__ ulonglong3
{
unsigned long long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) longlong4
{
    long long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
};
struct __device_builtin__ double1
{
double x;
};
struct __device_builtin__ __builtin_align__(16) double2
{
double x, y;
};
struct __device_builtin__ double3
{
double x, y, z;
};
struct __device_builtin__ __builtin_align__(16) double4
{
double x, y, z, w;
};
#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
#pragma warning(pop)
#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
typedef __device_builtin__ struct char1 char1;
typedef __device_builtin__ struct uchar1 uchar1;
typedef __device_builtin__ struct char2 char2;
typedef __device_builtin__ struct uchar2 uchar2;
typedef __device_builtin__ struct char3 char3;
typedef __device_builtin__ struct uchar3 uchar3;
typedef __device_builtin__ struct char4 char4;
typedef __device_builtin__ struct uchar4 uchar4;
typedef __device_builtin__ struct short1 short1;
typedef __device_builtin__ struct ushort1 ushort1;
typedef __device_builtin__ struct short2 short2;
typedef __device_builtin__ struct ushort2 ushort2;
typedef __device_builtin__ struct short3 short3;
typedef __device_builtin__ struct ushort3 ushort3;
typedef __device_builtin__ struct short4 short4;
typedef __device_builtin__ struct ushort4 ushort4;
typedef __device_builtin__ struct int1 int1;
typedef __device_builtin__ struct uint1 uint1;
typedef __device_builtin__ struct int2 int2;
typedef __device_builtin__ struct uint2 uint2;
typedef __device_builtin__ struct int3 int3;
typedef __device_builtin__ struct uint3 uint3;
typedef __device_builtin__ struct int4 int4;
typedef __device_builtin__ struct uint4 uint4;
typedef __device_builtin__ struct long1 long1;
typedef __device_builtin__ struct ulong1 ulong1;
typedef __device_builtin__ struct long2 long2;
typedef __device_builtin__ struct ulong2 ulong2;
typedef __device_builtin__ struct long3 long3;
typedef __device_builtin__ struct ulong3 ulong3;
typedef __device_builtin__ struct long4 long4;
typedef __device_builtin__ struct ulong4 ulong4;
typedef __device_builtin__ struct float1 float1;
typedef __device_builtin__ struct float2 float2;
typedef __device_builtin__ struct float3 float3;
typedef __device_builtin__ struct float4 float4;
typedef __device_builtin__ struct longlong1 longlong1;
typedef __device_builtin__ struct ulonglong1 ulonglong1;
typedef __device_builtin__ struct longlong2 longlong2;
typedef __device_builtin__ struct ulonglong2 ulonglong2;
typedef __device_builtin__ struct longlong3 longlong3;
typedef __device_builtin__ struct ulonglong3 ulonglong3;
typedef __device_builtin__ struct longlong4 longlong4;
typedef __device_builtin__ struct ulonglong4 ulonglong4;
typedef __device_builtin__ struct double1 double1;
typedef __device_builtin__ struct double2 double2;
typedef __device_builtin__ struct double3 double3;
typedef __device_builtin__ struct double4 double4;
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
struct __device_builtin__ dim3
{
unsigned int x, y, z;
#if defined(__cplusplus)
__host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus */
};
typedef __device_builtin__ struct dim3 dim3;
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */

56
include/tools/sys/getenv.hpp Executable file
View File

@@ -0,0 +1,56 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef TDL_TOOLS_SYS_GETENV_HPP
#define TDL_TOOLS_SYS_GETENV_HPP
#include <string>
#include <cstdlib>
namespace tdl
{
namespace tools
{
inline std::string getenv(const char * name)
{
#ifdef _MSC_VER
char* cache_path = 0;
std::size_t sz = 0;
_dupenv_s(&cache_path, &sz, name);
#else
const char * cache_path = std::getenv(name);
#endif
if(!cache_path)
return "";
std::string result(cache_path);
#ifdef _MSC_VER
free(cache_path);
#endif
return result;
}
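// Illustrative usage (sketch): reads an environment variable portably on both
// MSVC and POSIX toolchains, returning "" when the variable is unset.
//   std::string home = tdl::tools::getenv("HOME");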
}
}
#endif

68
include/tools/sys/mkdir.hpp Executable file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef TDL_TOOLS_SYS_MKDIR_HPP
#define TDL_TOOLS_SYS_MKDIR_HPP
#include <cstring>
#include <string>
#include <cstdlib>
#include <sys/stat.h>
#include <errno.h>
#if defined(_WIN32)
#include <direct.h>
#endif
namespace tdl
{
namespace tools
{
inline int mkdir(std::string const & path)
{
#if defined(_WIN32)
return _mkdir(path.c_str());
#else
return ::mkdir(path.c_str(), 0777);
#endif
}
inline int mkpath(std::string const & path)
{
int status = 0;
size_t pp = 0;
size_t sp;
while ((sp = path.find('/', pp)) != std::string::npos)
{
if (sp != pp){
status = mkdir(path.substr(0, sp));
}
pp = sp + 1;
}
return (status==0 || errno==EEXIST)?0:-1;
}
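// Illustrative usage (sketch): creates every missing directory along the path
// prefix. Only components followed by a '/' are created, so end the string
// with '/' if the last component should be created as well. The path below is
// hypothetical.
//   tdl::tools::mkpath("/tmp/isaac_cache/kernels/");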
}
}
#endif

196
lib/driver/backend.cpp Executable file
View File

@@ -0,0 +1,196 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/dispatch.h"
#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/context.h"
#include "driver/stream.h"
#include "driver/kernel.h"
#include <assert.h>
#include <stdexcept>
#include <vector>
namespace tdl
{
namespace driver
{
/*-----------------------------------*/
//---------- Modules ----------------*/
/*-----------------------------------*/
void backend::modules::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
Module& backend::modules::get(Stream const & stream, std::string const & name, std::string const & src){
std::tuple<Stream, std::string> key(stream, name);
if(cache_.find(key)==cache_.end())
return *cache_.insert(std::make_pair(key, new Module(stream.context(), src))).first->second;
return *cache_.at(key);
}
std::map<std::tuple<Stream, std::string>, Module * > backend::modules::cache_;
/*-----------------------------------*/
//----------- Kernels --------------*/
/*-----------------------------------*/
void backend::kernels::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
Kernel & backend::kernels::get(Module const & program, std::string const & name){
std::tuple<Module, std::string> key(program, name);
if(cache_.find(key)==cache_.end())
return *cache_.insert(std::make_pair(key, new Kernel(program, name.c_str()))).first->second;
return *cache_.at(key);
}
std::map<std::tuple<Module, std::string>, Kernel * > backend::kernels::cache_;
/*-----------------------------------*/
//------------ Queues --------------*/
/*-----------------------------------*/
void backend::streams::init(std::list<const Context *> const & contexts){
for(Context const * ctx : contexts)
if(cache_.find(*ctx)==cache_.end())
cache_.insert(std::make_pair(*ctx, std::vector<Stream*>{new Stream(*ctx)}));
}
void backend::streams::release(){
for(auto & x: cache_)
for(auto & y: x.second)
delete y;
cache_.clear();
}
Stream & backend::streams::get_default()
{ return get(contexts::get_default(), 0); }
Stream & backend::streams::get(Context const & context, unsigned int id){
init(std::list<Context const *>(1,&context));
for(auto & x : cache_)
if(x.first==context)
return *x.second[id];
throw;
}
void backend::streams::get(Context const & context, std::vector<Stream*> & queues){
init(std::list<Context const *>(1,&context));
queues = cache_.at(context);
}
std::map<Context, std::vector<Stream*> > backend::streams::cache_;
/*-----------------------------------*/
//------------ Contexts ------------*/
/*-----------------------------------*/
void backend::contexts::init(std::vector<Platform> const & platforms){
for(Platform const & platform: platforms){
for(Device const & device: platform.devices())
cache_.push_back(new Context(device));
}
}
void backend::contexts::release(){
for(auto & x: cache_)
delete x;
cache_.clear();
}
Context const & backend::contexts::get_default(){
backend::init();
std::list<Context const *>::const_iterator it = cache_.begin();
std::advance(it, default_device);
return **it;
}
void backend::contexts::get(std::list<Context const *> & contexts){
backend::init();
contexts = cache_;
}
std::list<Context const *> backend::contexts::cache_;
/*-----------------------------------*/
//------------ General -------------*/
/*-----------------------------------*/
std::vector<Device> backend::devices(){
std::vector<Platform> platforms = backend::platforms();
std::vector<Device> result;
for(Platform const & platform: platforms){
auto devices = platform.devices();
result.insert(result.end(), devices.begin(), devices.end());
}
return result;
}
std::vector<Platform> backend::platforms(){
std::vector<Platform> platforms;
//if CUDA is here
if(dispatch::cuinit())
platforms.push_back(Platform());
if(platforms.empty())
throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path");
return platforms;
}
void backend::synchronize(Context const & context){
for(Stream * queue: streams::cache_.at(context))
queue->synchronize();
}
void backend::release(){
backend::kernels::release();
backend::modules::release();
backend::streams::release();
backend::contexts::release();
}
void backend::init(){
if(!contexts::cache_.empty())
return;
std::vector<Platform> platforms = backend::platforms();
contexts::init(platforms);
streams::init(contexts::cache_);
}
unsigned int backend::default_device = 0;
}
}
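As a rough usage sketch (not part of this commit), the backend singletons above could be driven as follows; it assumes the declarations in include/driver/backend.h from this same change and a working CUDA driver:

#include "driver/backend.h"
#include "driver/context.h"
#include "driver/stream.h"

int main(){
  namespace drv = tdl::driver;
  drv::backend::init();                                         // one context per visible device
  drv::Context const & ctx = drv::backend::contexts::get_default();
  drv::Stream & stream = drv::backend::streams::get_default();  // lazily created for ctx
  /* ... enqueue work on `stream` ... */
  drv::backend::synchronize(ctx);                               // waits on every cached stream of ctx
  drv::backend::release();
  return 0;
}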

60
lib/driver/buffer.cpp Executable file
View File

@@ -0,0 +1,60 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include "driver/stream.h"
#include "driver/buffer.h"
#include "driver/context.h"
#include "driver/dispatch.h"
namespace tdl
{
namespace driver
{
Buffer::Buffer(Context const & context, size_t size) : context_(context)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuMemAlloc(&*cu_, size);
}
Buffer::Buffer(Context const & context, CUdeviceptr cu, bool take_ownership):
context_(context), cu_(cu, take_ownership)
{ }
void Buffer::set_zero(Stream const & queue, size_t size)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
}
Handle<CUdeviceptr> const & Buffer::cu() const
{ return cu_; }
Handle<CUdeviceptr> & Buffer::cu()
{ return cu_; }
}
}
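A minimal sketch of the Buffer interface above in use (hypothetical helper, arbitrary size; assumes include/driver/buffer.h):

#include "driver/backend.h"
#include "driver/buffer.h"
#include "driver/stream.h"

void zero_scratch(){
  using namespace tdl::driver;
  Stream & stream = backend::streams::get_default();
  Buffer scratch(stream.context(), 1024);  // cuMemAlloc of 1024 bytes in the stream's context
  scratch.set_zero(stream, 1024);          // cuMemsetD8Async over the whole allocation
  stream.synchronize();
}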

99
lib/driver/context.cpp Executable file
View File

@@ -0,0 +1,99 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cassert>
#include "driver/context.h"
#include "driver/module.h"
#include "tools/sys/getenv.hpp"
#include "tools/sys/mkdir.hpp"
namespace tdl
{
namespace driver
{
std::string Context::get_cache_path(){
//user-specified cache path
std::string result = tools::getenv("ISAAC_CACHE_PATH");
if(!result.empty()){
if(tools::mkpath(result)==0)
return result;
}
//create in home
result = tools::getenv("HOME");
if(!result.empty())
{
result = result + "/.isaac/cache/";
if(tools::mkpath(result)==0)
return result;
}
//couldn't find a directory
return "";
}
CUdevice Context::device(CUcontext context){
dispatch::cuCtxPushCurrent_v2(context);
CUdevice res;
dispatch::cuCtxGetDevice(&res);
dispatch::cuCtxPopCurrent_v2(NULL);
return res;
}
Context::Context(CUcontext context, bool take_ownership): cu_(context, take_ownership), device_(device(context), false), cache_path_(get_cache_path())
{ }
Context::Context(Device const & device): device_(device), cache_path_(get_cache_path())
{
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device);
dispatch::cuCtxPopCurrent_v2(NULL);
}
Device const & Context::device() const
{ return device_; }
std::string const & Context::cache_path() const
{ return cache_path_; }
Handle<CUcontext> const & Context::cu() const
{ return cu_; }
/* Context Switcher */
ContextSwitcher::ContextSwitcher(Context const & ctx): ctx_(ctx)
{
dispatch::cuCtxPushCurrent_v2(ctx_);
}
ContextSwitcher::~ContextSwitcher()
{
CUcontext tmp;
dispatch::cuCtxPopCurrent_v2(&tmp);
assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!");
}
}
}
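For illustration only: a context can be built from a device and made current through the RAII switcher defined above (a sketch assuming include/driver/context.h and include/driver/device.h):

#include "driver/backend.h"
#include "driver/context.h"
#include "driver/device.h"

void with_context(){
  using namespace tdl::driver;
  Device device = backend::devices().front();  // first CUDA device found
  Context ctx(device);                         // cuCtxCreate then pop: nothing stays current
  {
    ContextSwitcher guard(ctx);                // pushes ctx for the lifetime of `guard`
    /* ... driver calls that require a current context ... */
  }                                            // destructor pops and asserts it was still ctx
}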

197
lib/driver/device.cpp Executable file
View File

@@ -0,0 +1,197 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include <algorithm>
#include <sstream>
#include <cstring>
#include <memory>
#include "driver/device.h"
namespace tdl
{
namespace driver
{
/* Architecture [NVidia] */
Device::Architecture Device::nv_arch(std::pair<unsigned int, unsigned int> sm) const{
switch(sm.first)
{
case 7:
switch(sm.second)
{
case 0: return Architecture::SM_7_0;
default: return Architecture::UNKNOWN;
}
case 6:
switch(sm.second)
{
case 0: return Architecture::SM_6_0;
case 1: return Architecture::SM_6_1;
default: return Architecture::UNKNOWN;
}
case 5:
switch(sm.second)
{
case 0: return Architecture::SM_5_0;
case 2: return Architecture::SM_5_2;
default: return Architecture::UNKNOWN;
}
case 3:
switch(sm.second)
{
case 0: return Architecture::SM_3_0;
case 5: return Architecture::SM_3_5;
case 7: return Architecture::SM_3_7;
default: return Architecture::UNKNOWN;
}
case 2:
switch(sm.second)
{
case 0: return Architecture::SM_2_0;
case 1: return Architecture::SM_2_1;
default: return Architecture::UNKNOWN;
}
default: return Architecture::UNKNOWN;
}
}
template<CUdevice_attribute attr>
int Device::cuGetInfo() const{
int res;
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
return res;
}
nvmlDevice_t Device::nvml_device() const{
//cache handles across calls: NVML lookups by PCI bus id are comparatively expensive
static std::map<std::string, nvmlDevice_t> map;
std::string key = pci_bus_id();
if(map.find(key)==map.end()){
nvmlDevice_t device;
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
return map.insert(std::make_pair(key, device)).first->second;
}
return map.at(key);
}
/* Architecture */
Device::Architecture Device::architecture() const
{ return nv_arch(compute_capability()); }
/* Attributes */
size_t Device::address_bits() const
{ return sizeof(size_t)*8; }
driver::Platform Device::platform() const
{ return Platform(); }
std::string Device::name() const{
char tmp[128];
dispatch::cuDeviceGetName(tmp, 128, *cu_);
return std::string(tmp);
}
std::string Device::pci_bus_id() const{
char tmp[128];
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
return std::string(tmp);
}
void Device::interpret_as(std::pair<size_t, size_t> cc){
interpreted_as_ = std::make_shared<std::pair<size_t, size_t>>(cc);
}
std::pair<size_t, size_t> Device::compute_capability() const{
if(interpreted_as_)
return *interpreted_as_;
size_t _major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
size_t _minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
return std::make_pair(_major, _minor);
}
size_t Device::max_threads_per_block() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>(); }
size_t Device::max_shared_memory() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK>(); }
size_t Device::warp_size() const
{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>(); }
std::vector<size_t> Device::max_block_dim() const{
std::vector<size_t> result(3);
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
return result;
}
size_t Device::current_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
size_t Device::max_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
size_t Device::current_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
size_t Device::max_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
/* Infos */
std::string Device::infos() const{
std::ostringstream oss;
std::vector<size_t> max_wi_sizes = max_block_dim();
oss << "Platform: " << platform().name() << std::endl;
oss << "Name: " << name() << std::endl;
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
oss << "Local memory size: " << max_shared_memory() << std::endl;
return oss.str();
}
Handle<CUdevice> const & Device::cu() const
{ return cu_; }
}
}
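A hypothetical query loop over the attributes exposed above; the printed fields mirror what Device::infos() reports:

#include <iostream>
#include "driver/backend.h"
#include "driver/device.h"

void print_devices(){
  for(tdl::driver::Device const & device: tdl::driver::backend::devices()){
    auto cc = device.compute_capability();
    std::cout << device.name() << " (sm_" << cc.first << cc.second << ")\n";
    std::cout << "  max threads/block: " << device.max_threads_per_block() << "\n";
    std::cout << "  shared memory:     " << device.max_shared_memory() << " bytes\n";
    std::cout << "  warp size:         " << device.warp_size() << std::endl;
  }
}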

363
lib/driver/dispatch.cpp Executable file
View File

@@ -0,0 +1,363 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include "driver/dispatch.h"
#include "driver/context.h"
namespace tdl
{
namespace driver
{
//Helpers for function definition
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
//Specialized helpers for CUDA
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVRTC_DEFINE1(ret, fname, t1) DEFINE1(nvrtcinit, nvrtc_, ret, fname, t1)
#define NVRTC_DEFINE2(ret, fname, t1, t2) DEFINE2(nvrtcinit, nvrtc_, ret, fname, t1, t2)
#define NVRTC_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3)
#define NVRTC_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4)
#define NVRTC_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5)
#define NVRTC_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6)
#define NVRTC_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define NVRTC_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define NVRTC_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1)
#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
#define CUBLAS_DEFINE19(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) DEFINE19(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)
#define CUDNN_DEFINE1(ret, fname, t1) DEFINE1(cudnninit, cudnn_, ret, fname, t1)
#define CUDNN_DEFINE2(ret, fname, t1, t2) DEFINE2(cudnninit, cudnn_, ret, fname, t1, t2)
#define CUDNN_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cudnninit, cudnn_, ret, fname, t1, t2, t3)
#define CUDNN_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5)
#define CUDNN_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDNN_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDNN_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDNN_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
bool dispatch::cuinit(){
if(cuda_==nullptr)
cuda_ = dlopen("libcuda.so", RTLD_LAZY);
if(cuda_==nullptr)
return false;
CUresult (*fptr)(unsigned int);
cuInit_ = dlsym(cuda_, "cuInit");
*reinterpret_cast<void **>(&fptr) = cuInit_;
CUresult res = (*fptr)(0);
check(res);
return true;
}
bool dispatch::nvrtcinit(){
if(nvrtc_==nullptr)
nvrtc_ = dlopen("libnvrtc.so", RTLD_LAZY);
return nvrtc_ != nullptr;
}
bool dispatch::nvmlinit(){
if(nvml_==nullptr)
nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
if(nvml_==nullptr)
return false;
nvmlReturn_t (*fptr)();
nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
*reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
nvmlReturn_t res = (*fptr)();
check(res);
return true;
}
bool dispatch::cublasinit(){
if(cublas_==nullptr)
cublas_ = dlopen("libcublas.so", RTLD_LAZY);
return cublas_ != nullptr;
}
bool dispatch::cudnninit(){
if(cudnn_==nullptr)
cudnn_ = dlopen("libcudnn.so", RTLD_LAZY);
return cudnn_ != nullptr;
}
//CUDA
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
NVRTC_DEFINE3(nvrtcResult, nvrtcCompileProgram, nvrtcProgram, int, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLogSize, nvrtcProgram, size_t *)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTX, nvrtcProgram, char *)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **)
NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *)
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
cublasHandle_t dispatch::cublasHandle(Context const & ctx){
static std::map<Context, cublasHandle_t> handles;
auto pr = handles.insert({ctx, cublasHandle_t()});
if(pr.second)
cublasCreate_v2(&pr.first->second);
return pr.first->second;
}
cudnnHandle_t dispatch::cudnnHandle(Context const & ctx){
static std::map<Context, cudnnHandle_t> handles;
auto pr = handles.insert({ctx, cudnnHandle_t()});
if(pr.second)
cudnnCreate(&pr.first->second);
return pr.first->second;
}
CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*)
cublasStatus_t dispatch::cublasGetStream_v2(cublasHandle_t h, cudaStream_t *a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasGetStream_v2, cublasGetStream_v2_, "cublasGetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSetStream_v2(cublasHandle_t h, cudaStream_t a)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSetStream_v2, cublasSetStream_v2_, "cublasSetStream_v2", h, a); }
cublasStatus_t dispatch::cublasSgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasSgemm_v2, cublasSgemm_v2_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasDgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasDgemm_v2, cublasDgemm_v2_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
cublasStatus_t dispatch::cublasHgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc)
{ return f_impl<dispatch::cublasinit>(cublas_, cublasHgemm, cublasHgemm_, "cublasHgemm", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);}
CUBLAS_DEFINE19(cublasStatus_t, cublasGemmEx, cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType, int, const void*, cudaDataType, int, const void*, void*, cudaDataType, int, cudaDataType, cublasGemmAlgo_t)
//cuDNN
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateConvolutionDescriptor, cudnnConvolutionDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreate, cudnnHandle_t*)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int)
CUDNN_DEFINE5(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int*)
CUDNN_DEFINE5(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int*)
CUDNN_DEFINE1(cudnnStatus_t, cudnnCreatePoolingDescriptor, cudnnPoolingDescriptor_t*)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int*, const int*, const int*)
CUDNN_DEFINE8(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, const cudnnPoolingDescriptor_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*)
CUDNN_DEFINE8(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t)
CUDNN_DEFINE7(cudnnStatus_t, cudnnSetConvolutionNdDescriptor, cudnnConvolutionDescriptor_t, int, const int*, const int*, const int*, cudnnConvolutionMode_t, cudnnDataType_t)
CUDNN_DEFINE8(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithm, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *)
CUDNN_DEFINE7(cudnnStatus_t, cudnnGetConvolutionForwardWorkspaceSize, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t*)
CUDNN_DEFINE13(cudnnStatus_t, cudnnConvolutionForward, cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *)
CUDNN_DEFINE2(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, cudaStream_t)
CUDNN_DEFINE7(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*)
void dispatch::release(){
if(cuda_){
dlclose(cuda_);
cuda_ = nullptr;
}
if(nvrtc_){
dlclose(nvrtc_);
nvrtc_ = nullptr;
}
if(cublas_){
dlclose(cublas_);
cublas_ = nullptr;
}
if(cudnn_){
dlclose(cudnn_);
cudnn_ = nullptr;
}
}
void* dispatch::cuda_;
void* dispatch::nvrtc_;
void* dispatch::nvml_;
void* dispatch::cublas_;
void* dispatch::cudnn_;
//CUDA
void* dispatch::cuCtxGetCurrent_;
void* dispatch::cuCtxSetCurrent_;
void* dispatch::cuCtxDestroy_v2_;
void* dispatch::cuEventCreate_;
void* dispatch::cuDeviceGet_;
void* dispatch::cuMemcpyDtoH_v2_;
void* dispatch::cuStreamCreate_;
void* dispatch::cuEventElapsedTime_;
void* dispatch::cuMemFree_v2_;
void* dispatch::cuMemcpyDtoHAsync_v2_;
void* dispatch::cuDriverGetVersion_;
void* dispatch::cuDeviceGetName_;
void* dispatch::cuDeviceGetPCIBusId_;
void* dispatch::cuModuleGetGlobal_v2_;
void* dispatch::cuMemcpyHtoDAsync_v2_;
void* dispatch::cuModuleLoad_;
void* dispatch::cuLaunchKernel_;
void* dispatch::cuModuleUnload_;
void* dispatch::cuModuleLoadDataEx_;
void* dispatch::cuDeviceGetAttribute_;
void* dispatch::cuDeviceGetCount_;
void* dispatch::cuMemcpyHtoD_v2_;
void* dispatch::cuInit_;
void* dispatch::cuEventRecord_;
void* dispatch::cuCtxCreate_v2_;
void* dispatch::cuModuleGetFunction_;
void* dispatch::cuStreamSynchronize_;
void* dispatch::cuStreamDestroy_v2_;
void* dispatch::cuEventDestroy_v2_;
void* dispatch::cuMemAlloc_v2_;
void* dispatch::cuPointerGetAttribute_;
void* dispatch::cuCtxGetDevice_;
void* dispatch::cuMemsetD8Async_;
void* dispatch::cuCtxPushCurrent_v2_;
void* dispatch::cuCtxPopCurrent_v2_;
void* dispatch::nvrtcCompileProgram_;
void* dispatch::nvrtcGetProgramLogSize_;
void* dispatch::nvrtcGetPTX_;
void* dispatch::nvrtcGetPTXSize_;
void* dispatch::nvrtcCreateProgram_;
void* dispatch::nvrtcGetProgramLog_;
void* dispatch::nvmlInit_v2_;
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
void* dispatch::nvmlDeviceGetClockInfo_;
void* dispatch::nvmlDeviceGetMaxClockInfo_;
void* dispatch::cublasCreate_v2_;
void* dispatch::cublasGetStream_v2_;
void* dispatch::cublasSetStream_v2_;
void* dispatch::cublasHgemm_;
void* dispatch::cublasSgemm_v2_;
void* dispatch::cublasDgemm_v2_;
void* dispatch::cublasGemmEx_;
void* dispatch::cudnnCreateConvolutionDescriptor_;
void* dispatch::cudnnCreatePoolingDescriptor_;
void* dispatch::cudnnCreateTensorDescriptor_;
void* dispatch::cudnnCreateFilterDescriptor_;
void* dispatch::cudnnCreate_;
void* dispatch::cudnnSetTensor4dDescriptor_;
void* dispatch::cudnnSetFilter4dDescriptor_;
void* dispatch::cudnnSetTensorNdDescriptorEx_;
void* dispatch::cudnnSetFilterNdDescriptor_;
void* dispatch::cudnnSetPoolingNdDescriptor_;
void* dispatch::cudnnSetConvolution2dDescriptor_;
void* dispatch::cudnnSetConvolutionNdDescriptor_;
void* dispatch::cudnnGetConvolutionForwardAlgorithm_;
void* dispatch::cudnnGetConvolutionForwardWorkspaceSize_;
void* dispatch::cudnnConvolutionForward_;
void* dispatch::cudnnPoolingForward_;
void* dispatch::cudnnSetStream_;
void* dispatch::cudnnTransformTensor_;
}
}
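The wrappers above resolve their symbols lazily: each library is dlopen'd on first use and the function pointer is cached in the static table below the definitions. A sketch of probing for CUDA before touching any other entry point (assumes include/driver/dispatch.h):

#include <iostream>
#include "driver/dispatch.h"

bool have_cuda(){
  if(!tdl::driver::dispatch::cuinit())          // dlopen("libcuda.so") + cuInit(0)
    return false;
  int n = 0;
  tdl::driver::dispatch::cuDeviceGetCount(&n);  // resolved through the same lazy table
  std::cout << n << " CUDA device(s) visible" << std::endl;
  return n > 0;
}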

155
lib/driver/error.cpp Executable file
View File

@@ -0,0 +1,155 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/error.h"
namespace tdl
{
namespace driver
{
void check(CUresult err)
{
using namespace exception::cuda;
switch(err)
{
case CUDA_SUCCESS : break;
case CUDA_ERROR_INVALID_VALUE : throw invalid_value();
case CUDA_ERROR_OUT_OF_MEMORY : throw out_of_memory();
case CUDA_ERROR_NOT_INITIALIZED : throw not_initialized();
case CUDA_ERROR_DEINITIALIZED : throw deinitialized();
case CUDA_ERROR_PROFILER_DISABLED : throw profiler_disabled();
case CUDA_ERROR_PROFILER_NOT_INITIALIZED : throw profiler_not_initialized();
case CUDA_ERROR_PROFILER_ALREADY_STARTED : throw profiler_already_started();
case CUDA_ERROR_PROFILER_ALREADY_STOPPED : throw profiler_already_stopped();
case CUDA_ERROR_NO_DEVICE : throw no_device();
case CUDA_ERROR_INVALID_DEVICE : throw invalid_device();
case CUDA_ERROR_INVALID_IMAGE : throw invalid_image();
case CUDA_ERROR_INVALID_CONTEXT : throw invalid_context();
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT : throw context_already_current();
case CUDA_ERROR_MAP_FAILED : throw map_failed();
case CUDA_ERROR_UNMAP_FAILED : throw unmap_failed();
case CUDA_ERROR_ARRAY_IS_MAPPED : throw array_is_mapped();
case CUDA_ERROR_ALREADY_MAPPED : throw already_mapped();
case CUDA_ERROR_NO_BINARY_FOR_GPU : throw no_binary_for_gpu();
case CUDA_ERROR_ALREADY_ACQUIRED : throw already_acquired();
case CUDA_ERROR_NOT_MAPPED : throw not_mapped();
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY : throw not_mapped_as_array();
case CUDA_ERROR_NOT_MAPPED_AS_POINTER : throw not_mapped_as_pointer();
case CUDA_ERROR_ECC_UNCORRECTABLE : throw ecc_uncorrectable();
case CUDA_ERROR_UNSUPPORTED_LIMIT : throw unsupported_limit();
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE : throw context_already_in_use();
case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED : throw peer_access_unsupported();
case CUDA_ERROR_INVALID_PTX : throw invalid_ptx();
case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT : throw invalid_graphics_context();
case CUDA_ERROR_INVALID_SOURCE : throw invalid_source();
case CUDA_ERROR_FILE_NOT_FOUND : throw file_not_found();
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND : throw shared_object_symbol_not_found();
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED : throw shared_object_init_failed();
case CUDA_ERROR_OPERATING_SYSTEM : throw operating_system();
case CUDA_ERROR_INVALID_HANDLE : throw invalid_handle();
case CUDA_ERROR_NOT_FOUND : throw not_found();
case CUDA_ERROR_NOT_READY : throw not_ready();
case CUDA_ERROR_ILLEGAL_ADDRESS : throw illegal_address();
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES : throw launch_out_of_resources();
case CUDA_ERROR_LAUNCH_TIMEOUT : throw launch_timeout();
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING : throw launch_incompatible_texturing();
case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED : throw peer_access_already_enabled();
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED : throw peer_access_not_enabled();
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE : throw primary_context_active();
case CUDA_ERROR_CONTEXT_IS_DESTROYED : throw context_is_destroyed();
case CUDA_ERROR_ASSERT : throw assert_error();
case CUDA_ERROR_TOO_MANY_PEERS : throw too_many_peers();
case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED : throw host_memory_already_registered();
case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED : throw host_memory_not_registered();
case CUDA_ERROR_HARDWARE_STACK_ERROR : throw hardware_stack_error();
case CUDA_ERROR_ILLEGAL_INSTRUCTION : throw illegal_instruction();
case CUDA_ERROR_MISALIGNED_ADDRESS : throw misaligned_address();
case CUDA_ERROR_INVALID_ADDRESS_SPACE : throw invalid_address_space();
case CUDA_ERROR_INVALID_PC : throw invalid_pc();
case CUDA_ERROR_LAUNCH_FAILED : throw launch_failed();
case CUDA_ERROR_NOT_PERMITTED : throw not_permitted();
case CUDA_ERROR_NOT_SUPPORTED : throw not_supported();
case CUDA_ERROR_UNKNOWN : throw unknown();
default : throw unknown();
}
}
void check(nvrtcResult err){
using namespace exception::nvrtc;
switch(err)
{
case NVRTC_SUCCESS: break;
case NVRTC_ERROR_OUT_OF_MEMORY: throw out_of_memory();
case NVRTC_ERROR_PROGRAM_CREATION_FAILURE: throw program_creation_failure();
case NVRTC_ERROR_INVALID_INPUT: throw invalid_input();
case NVRTC_ERROR_INVALID_PROGRAM: throw invalid_program();
case NVRTC_ERROR_INVALID_OPTION: throw invalid_option();
case NVRTC_ERROR_COMPILATION: throw compilation();
case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: throw builtin_operation_failure();
default: throw unknown_error();
}
}
void check(cublasStatus_t err){
using namespace exception::cublas;
switch(err)
{
case CUBLAS_STATUS_SUCCESS : break;
case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized();
case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed();
case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value();
case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch();
case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error();
case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error();
case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported();
case CUBLAS_STATUS_LICENSE_ERROR : throw license_error();
default : throw unknown();
}
}
void check(cudnnStatus_t err){
using namespace exception::cudnn;
switch(err)
{
case CUDNN_STATUS_SUCCESS: break;
case CUDNN_STATUS_NOT_INITIALIZED: throw not_initialized();
case CUDNN_STATUS_ALLOC_FAILED: throw alloc_failed();
case CUDNN_STATUS_BAD_PARAM: throw bad_param();
case CUDNN_STATUS_INTERNAL_ERROR: throw internal_error();
case CUDNN_STATUS_INVALID_VALUE: throw invalid_value();
case CUDNN_STATUS_ARCH_MISMATCH: throw arch_mismatch();
case CUDNN_STATUS_MAPPING_ERROR: throw mapping_error();
case CUDNN_STATUS_EXECUTION_FAILED: throw execution_failed();
case CUDNN_STATUS_NOT_SUPPORTED: throw not_supported();
case CUDNN_STATUS_LICENSE_ERROR: throw license_error();
case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: throw runtime_prerequisite_missing();
case CUDNN_STATUS_RUNTIME_IN_PROGRESS: throw runtime_in_progress();
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: throw runtime_fp_overflow();
}
}
}
}
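These checkers map raw status codes to typed exceptions, so callers can catch a per-API base class instead of inspecting integers. A sketch assuming the hierarchy declared in include/driver/error.h:

#include <iostream>
#include <string>
#include "driver/context.h"
#include "driver/module.h"
#include "driver/error.h"

void try_compile(tdl::driver::Context const & ctx, std::string const & src){
  try{
    tdl::driver::Module module(ctx, src);  // may throw exception::cuda::* on invalid PTX
    (void)module;
  }
  catch(tdl::driver::exception::cuda::base const &){
    std::cerr << "CUDA driver call failed while building the module" << std::endl;
  }
}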

40
lib/driver/event.cpp Executable file
View File

@@ -0,0 +1,40 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/event.h"
namespace tdl
{
namespace driver
{
float Event::elapsed_time() const{
float time;
dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second);
return time;
}
Handle<cu_event_t> const & Event::cu() const
{ return cu_; }
}
}

66
lib/driver/handle.cpp Executable file
View File

@@ -0,0 +1,66 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include <memory>
#include "driver/handle.h"
namespace tdl
{
namespace driver
{
//CUDA
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
inline void _delete(CUdevice) { }
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
inline void _delete(CUfunction) { }
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
inline void _delete(cu_platform){}
//Constructor
template<class CUType>
Handle<CUType>::Handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership)
{ }
template<class CUType>
Handle<CUType>::~Handle(){
if(has_ownership_ && h_ && h_.unique() && *h_)
_delete(*h_);
}
template class Handle<CUdeviceptr>;
template class Handle<CUstream>;
template class Handle<CUcontext>;
template class Handle<CUdevice>;
template class Handle<cu_event_t>;
template class Handle<CUfunction>;
template class Handle<CUmodule>;
template class Handle<cu_platform>;
}
}

67
lib/driver/kernel.cpp Executable file
View File

@@ -0,0 +1,67 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cstring>
#include "driver/kernel.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
Kernel::Kernel(Module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){
cu_params_store_.reserve(64);
cu_params_.reserve(64);
dispatch::cuModuleGetFunction(&*cu_, program, name);
}
void Kernel::setArg(unsigned int index, std::size_t size, void* ptr){
if(index + 1> cu_params_store_.size()){
cu_params_store_.resize(index+1);
cu_params_.resize(index+1);
}
cu_params_store_[index].reset(malloc(size), free);
memcpy(cu_params_store_[index].get(), ptr, size);
cu_params_[index] = cu_params_store_[index].get();
}
void Kernel::setArg(unsigned int index, Buffer const & data)
{ return setArg(index, (CUdeviceptr)data);}
void* const* Kernel::cu_params() const
{ return cu_params_.data(); }
Handle<CUfunction> const & Kernel::cu() const
{ return cu_; }
Module const & Kernel::module() const
{ return program_; }
}
}
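Arguments are staged host-side in cu_params_store_ and handed to cuLaunchKernel as an array of pointers. A small sketch of filling them for a hypothetical kernel taking (float*, int):

#include "driver/kernel.h"
#include "driver/buffer.h"

void set_args(tdl::driver::Kernel & kernel, tdl::driver::Buffer const & x, int n){
  kernel.setArg(0, x);                // device pointer, forwarded as a CUdeviceptr
  kernel.setArg(1, sizeof(int), &n);  // plain value, copied into the staging store
}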

118
lib/driver/module.cpp Executable file
View File

@@ -0,0 +1,118 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <stdexcept>
#include "driver/module.h"
#include "driver/context.h"
#include "driver/error.h"
#include "tools/sys/getenv.hpp"
namespace tdl
{
namespace driver
{
CUjit_target_enum cutarget(Device::Architecture arch){
switch(arch){
case Device::Architecture::SM_2_0: return CU_TARGET_COMPUTE_20;
case Device::Architecture::SM_2_1: return CU_TARGET_COMPUTE_21;
case Device::Architecture::SM_3_0: return CU_TARGET_COMPUTE_30;
case Device::Architecture::SM_3_5: return CU_TARGET_COMPUTE_35;
case Device::Architecture::SM_3_7: return CU_TARGET_COMPUTE_37;
case Device::Architecture::SM_5_0: return CU_TARGET_COMPUTE_50;
case Device::Architecture::SM_5_2: return CU_TARGET_COMPUTE_52;
case Device::Architecture::SM_6_0: return CU_TARGET_COMPUTE_60;
case Device::Architecture::SM_6_1: return CU_TARGET_COMPUTE_61;
default: throw std::runtime_error("ISAAC: unsupported target architecture");
}
}
inline std::pair<int, int> ptx(std::pair<int, int> sm){
if(sm.first == 7) return {6, 0};
if(sm.first == 6) return {5, 0};
if(sm.first == 5) return {4, 3};
throw std::runtime_error("ISAAC: no PTX ISA known for this compute capability");
}
std::string Module::header(Device const & device){
auto cc = device.compute_capability();
auto vptx = ptx(cc);
std::string header;
header += ".version " + std::to_string(vptx.first) + "." + std::to_string(vptx.second) + "\n";
header += ".target sm_" + std::to_string(cc.first) + std::to_string(cc.second) + "\n";
header += ".address_size 64\n";
return header;
}
Module::Module(Context const & context, std::string const & source) : context_(context), source_(header(context.device()) + source){
ContextSwitcher ctx_switch(context_);
//Path to custom PTX compiler
std::string compiler = tools::getenv("ISAAC_PTXAS");
if(compiler.size()){
auto cc = context.device().compute_capability();
std::string out = context.cache_path() + "tmp.o";
std::string opt = " --gpu-name sm_" + std::to_string(cc.first) + std::to_string(cc.second)
+ " -o " + out
+ " -ias \"" + source_ + "\"";
std::string cmd = compiler + opt;
if(std::system(cmd.c_str()) != 0)
throw std::runtime_error("ISAAC: external ptxas invocation failed");
dispatch::cuModuleLoad(&*cu_, out.c_str());
}
//JIT Compilation
else{
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
unsigned int errbufsize = 8096;
std::string errbuf(errbufsize, 0);
//CUjit_target_enum target = cutarget(context.device().architecture());
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()};
try{
dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval);
}catch(exception::cuda::base const &){
std::cerr << "Compilation Failed! Log: " << std::endl;
std::cerr << errbuf << std::endl;
throw;
}
}
}
Context const & Module::context() const
{ return context_; }
Handle<CUmodule> const & Module::cu() const
{ return cu_; }
Buffer Module::symbol(const char *name) const{
CUdeviceptr handle;
size_t size;
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
return Buffer(context_, handle, false);
}
}
}
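Module prepends the PTX header (.version/.target/.address_size) to the user source, then either shells out to a user-specified ptxas (ISAAC_PTXAS) or JIT-compiles through cuModuleLoadDataEx. A sketch of compiling a trivial kernel; the PTX body is illustrative only:

#include <string>
#include "driver/backend.h"
#include "driver/context.h"
#include "driver/module.h"
#include "driver/kernel.h"

void compile_demo(){
  using namespace tdl::driver;
  Context const & ctx = backend::contexts::get_default();
  std::string src = ".visible .entry noop(){ ret; }";  // Module adds the PTX header itself
  Module module(ctx, src);                             // JIT-compiled unless ISAAC_PTXAS is set
  Kernel kernel(module, "noop");
  (void)kernel;
}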

56
lib/driver/platform.cpp Executable file
View File

@@ -0,0 +1,56 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "driver/platform.h"
#include "driver/device.h"
#include <string>
namespace tdl
{
namespace driver
{
std::string Platform::version() const{
int version;
dispatch::cuDriverGetVersion(&version);
return std::to_string(version);
}
std::string Platform::name() const
{ return (std::string)"CUDA"; }
std::vector<Device> Platform::devices() const{
std::vector<Device> devices;
int N;
dispatch::cuDeviceGetCount(&N);
for(int i = 0 ; i < N ; ++i){
CUdevice device;
dispatch::cuDeviceGet(&device, i);
devices.push_back(Device(device));
}
return devices;
}
}
}
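Only a CUDA platform is modeled; devices() enumerates through cuDeviceGetCount/cuDeviceGet. A small illustrative loop:

#include <iostream>
#include "driver/platform.h"
#include "driver/device.h"

void list_platform(){
  tdl::driver::Platform platform;
  std::cout << platform.name() << " " << platform.version() << std::endl;
  for(tdl::driver::Device const & device: platform.devices())
    std::cout << "  " << device.name() << std::endl;
}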

95
lib/driver/stream.cpp Executable file
View File

@@ -0,0 +1,95 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <iostream>
#include <cassert>
#include <array>
#include "driver/backend.h"
#include "driver/stream.h"
#include "driver/context.h"
#include "driver/device.h"
#include "driver/event.h"
#include "driver/kernel.h"
#include "driver/buffer.h"
namespace tdl
{
namespace driver
{
inline CUcontext cucontext(){
CUcontext result;
dispatch::cuCtxGetCurrent(&result);
return result;
}
Stream::Stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership)
{}
Stream::Stream(Context const & context): context_(context), cu_(CUstream(), true)
{
ContextSwitcher ctx_switch(context_);
dispatch::cuStreamCreate(&*cu_, 0);
}
void Stream::synchronize()
{
ContextSwitcher ctx_switch(context_);
dispatch::cuStreamSynchronize(*cu_);
}
Context const & Stream::context() const
{ return context_; }
void Stream::enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const *, Event* event){
ContextSwitcher ctx_switch(context_);
if(event)
dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_);
dispatch::cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL);
if(event)
dispatch::cuEventRecord(((cu_event_t)*event).second, *cu_);
}
void Stream::write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){
ContextSwitcher ctx_switch(context_);
if(blocking)
dispatch::cuMemcpyHtoD(buffer + offset, ptr, size);
else
dispatch::cuMemcpyHtoDAsync(buffer + offset, ptr, size, *cu_);
}
void Stream::read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){
ContextSwitcher ctx_switch(context_);
if(blocking)
dispatch::cuMemcpyDtoH(ptr, buffer + offset, size);
else
dispatch::cuMemcpyDtoHAsync(ptr, buffer + offset, size, *cu_);
}
Handle<CUstream> const & Stream::cu() const
{ return cu_; }
}
}
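Putting the pieces together, a stream can launch a cached kernel and copy the result back. A hedged end-to-end sketch; the module/kernel names, PTX source and launch sizes are placeholders, not part of this commit:

#include <string>
#include <vector>
#include "driver/backend.h"
#include "driver/stream.h"
#include "driver/module.h"
#include "driver/kernel.h"
#include "driver/buffer.h"

void run(std::string const & ptx_src, size_t n){
  using namespace tdl::driver;
  Stream & stream = backend::streams::get_default();
  Buffer out(stream.context(), n * sizeof(float));
  Module & module = backend::modules::get(stream, "demo", ptx_src);  // cached per (stream, name)
  Kernel & kernel = backend::kernels::get(module, "demo_kernel");    // cached per (module, name)
  kernel.setArg(0, out);
  kernel.setArg(1, sizeof(n), &n);
  stream.enqueue(kernel, {(n + 127) / 128, 1, 1}, {128, 1, 1}, nullptr, nullptr);
  std::vector<float> host(n);
  stream.read(out, true, 0, n * sizeof(float), host.data());         // blocking cuMemcpyDtoH
}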