ISAAC-V2.0: INITIAL COMMIT

This commit is contained in:
philippe
2017-05-07 16:51:51 -07:00
parent 911f1fdb71
commit e99759d3b3
2882 changed files with 73975 additions and 1087661 deletions

View File

@@ -1,84 +1,28 @@
cmake_minimum_required(VERSION 2.8.7) cmake_minimum_required(VERSION 2.8.7)
project(isaac-research)
include(CTest)
#QtCreator: add visibility of headers
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.cpp *.hpp *.h)
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
#Default build type
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Default build type: Release") message(STATUS "Default build type: Release")
set(CMAKE_BUILD_TYPE "Release") set(CMAKE_BUILD_TYPE "Release")
endif() endif()
if(WIN32)
SET(CMAKE_FIND_LIBRARY_PREFIXES "")
SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib" ".dll")
endif()
# Add visibility of headers
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.cpp *.hpp *.h)
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
#Modules
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/lib/external/)
#Compiler flags #Compiler flags
add_definitions(${BACKEND_DEFINES}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
if(WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic")
add_definitions("-DNOMINMAX")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic")
endif()
#Includes #Source
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/lib/tools/ ${CMAKE_CURRENT_SOURCE_DIR}/include/external/ ${CMAKE_CURRENT_SOURCE_DIR}/include/external/cuda)
#Binary to convert .cu files to const char *
if(NOT ANDROID)
add_executable(bin2cpp ${CMAKE_MODULE_PATH}/helpers/bin2cpp.cpp)
include("${CMAKE_MODULE_PATH}/helpers/CodeToH.cmake")
endif()
#Source files
file(GLOB_RECURSE LIBISAAC_SRC lib/*.cpp) file(GLOB_RECURSE LIBISAAC_SRC lib/*.cpp)
add_library(isaac SHARED ${LIBISAAC_SRC})
target_link_libraries(isaac "dl")
#Python wrapper #Examples
set(SETUP_PY_IN "${CMAKE_MODULE_PATH}/python/setup.py")
set(SETUP_PY "${CMAKE_SOURCE_DIR}/python/setup.py")
set(LIBISAAC_SRC_STR)
foreach(FILE ${LIBISAAC_SRC})
string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}" "src" _TMP ${FILE})
set(LIBISAAC_SRC_STR "${_TMP} ${LIBISAAC_SRC_STR}")
endforeach()
#Include directories
set(INCLUDE_DIRECTORIES_STR)
get_property(INCLUDE_DIRECTORIES_LST DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
set(INCLUDE_DIRECTORIES_STR)
foreach(FILE ${INCLUDE_DIRECTORIES_LST})
string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}" "src" _TMP ${FILE})
set(INCLUDE_DIRECTORIES_STR "${INCLUDE_DIRECTORIES_STR} ${_TMP}")
endforeach()
configure_file(${SETUP_PY_IN} ${SETUP_PY})
add_custom_command(OUTPUT "${CMAKE_BINARY_DIR}/build/timestamp"
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/python ${CMAKE_BINARY_DIR}/python
COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_BINARY_DIR}/python/src/lib/CMakeLists.txt
COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/python/build
COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/python/src/lib
COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_BINARY_DIR}/python/src/include
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/lib ${CMAKE_BINARY_DIR}/python/src/lib
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/include ${CMAKE_BINARY_DIR}/python/src/include
COMMAND ${CMAKE_COMMAND} -E tar czf isaac-1.0.tar.gz ${CMAKE_BINARY_DIR}/python
)
add_custom_target(package-python DEPENDS "${CMAKE_BINARY_DIR}/build/timestamp")
#Isaac
include(CTest)
add_subdirectory(lib)
add_subdirectory(tests)
add_subdirectory(bench)
add_subdirectory(examples) add_subdirectory(examples)
#Tests
add_subdirectory(tests)

View File

@@ -19,3 +19,4 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */

View File

@@ -1,77 +1,40 @@
# ISAAC # ISAAC
This is the developer repository for ISAAC, a library that uses machine learning to find input-aware kernels for element-wise operations, 1D/2D reductions and GEMM. It works with both cuBLAS and clBLAS, and falls back on those when appropriate (typically large square matrices). This is the development branch for ISAAC v2.0. This is a major rewrite more targeted at compute-bound applications, with major performance gains at the expense of portability.
### License ### License
ISAAC is distributed under the MIT License. ISAAC is distributed under the MIT/X11 license.
### Installation ### Installation
ISAAC is dependency-free, and will load either OpenCL and/or CUDA 7.0+ _dynamically_ depending on which GPUs are detected at runtime. ISAAC only requires an NVIDIA GPU with compute-capability > 5.0 and the corresponding proprietary driver.
You only need CMake 2.8.7+ and a C++11 compliant compiler:
The CUDA SDK is *not* required.
``` ```
git clone https://github.com/ptillet/isaac.git git clone https://github.com/ptillet/isaac.git
mkdir -p isaac/build && cd isaac/build mkdir -p isaac/build && cd isaac/build
cmake ../ && make -j4 cmake ../ && make -j8
./examples/bench
``` ```
Link against libisaac.so instead of libcublas.so or libclblas.so, and you're good to go! ### Benchmarks
Below is the TFLOPS you get for sGEMM on a Pascal Titan X vs cuBLAS 8.0.
![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/GEMM.png)
The C++ and Python API does some kernel fusion, but is not entirely stable. It works well to compose element-wise operations, though. Below is the TFLOPS you get for FCONV on a Pascal Titan X vs cuDNN v6.
![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/CONV.png)
There's still room for improvement.
### Benchmark ### APIs
``` ISAAC implements both GEMM and FCONV for fp16x2, fp32, and fp64. Half-precision with 32-bit accumulation and complex data-types are not yet supported.
Usage : blas-bench [--op {axpy, dot, gemv, gemm}] [--dtype {float32, float64}] [--device DEVICE_IDX] [--help]
--op: operation to benchmark (default = gemm)
--dtype: data-type to benchmark (default = float32)
--device: index of isaac device in [0, ..., ndevices - 1] (default = 0)
--help: display this message
```
It detects clBLAS or cuBLAS and compares it against ISAAC for e.g., DeepBench, Covariance, LAPACK (packed rank1 updates), etc.
Below is the TFLOPS you get for GEMM on a Pascal Titan X (cuBLAS 8.0). Numbers in bold represent speed-ups greater than 5%. ### Future Plans
![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/bench-cuBLAS.png)
For AMD Fury (clBLAS-2.10-Fiji): Future plans include (but are not limited to):
![alt tag](https://github.com/ptillet/isaac/raw/master/documentation/bench/bench-clBLAS.png) * Transparent use over cuBLAS/cuDNN using LD_PRELOAD
* Backward Convolution
Same trend on Intel Broadwell iGPU * Complex data-types for GEMM
### BLAS routines supported
Currently supported functions are:
| BLAS1 | BLAS2 | BLAS3 |
| --------------| --------------| --------------|
| xAXPY | xGEMV | xGEMM |
| xCOPY | xGER | |
| xSCAL | | |
| xDOT | | |
| xASUM | | |
For x in {S, D}
### Contributing
You can contribute to further tuning ISAAC if you have one of the following architectures:
- NVidia: SM 2.x ; SM 3.5 ; SM 5.0
If you have one of the following architectures you can contribute by running:
```
git clone https://github.com/ptillet/isaac.git
cd isaac/python ;
python setup.py build;
cd ../tune
PYTHONPATH=../python/build/lib.linux-x86_64-2.7/ python main.py --float64 --float32 --elementwise_1d --elementwise_2d --reduce_1d --reduce_2d_rows --reduce_2d_cols --gemm_nn --gemm_nt --gemm_tn --gemm_tt
```
This will output a .json file that you can submit for integration.
Bug reports are more than welcome!

View File

@@ -1,44 +0,0 @@
# Build script for the BLAS benchmark executables.
#
# Everything configured in this directory is forced to the Release build type.
set(CMAKE_BUILD_TYPE Release)

# Compile flags and link libraries contributed by each vendor BLAS that is found.
set(BLAS_DEF)
set(BLAS_LIBS)

# cuBLAS (optional): enables the BENCH_CUBLAS code paths.
find_package(CUDA QUIET)
if(CUDA_FOUND)
  list(APPEND BLAS_DEF "-DHAS_A_BLAS -DBENCH_CUBLAS")
  include_directories(${CUDA_INCLUDE_DIRS})
  list(APPEND BLAS_LIBS ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
endif()

# clBLAS (optional): enables the BENCH_CLBLAS code paths.
find_package(CLBLAS QUIET)
if(CLBLAS_FOUND)
  list(APPEND BLAS_DEF "-DHAS_A_BLAS -DBENCH_CLBLAS")
  include_directories(${CLBLAS_INCLUDE_DIR})
  list(APPEND BLAS_LIBS ${CLBLAS_LIBRARIES} OpenCL pthread)
endif()

# CPU BLAS comparison (MKL via -DBENCH_MKL, otherwise OpenBLAS via -DBENCH_CBLAS)
# is currently disabled; no find_package(MKL)/find_package(OpenBlas) is performed.

# COMPILE_FLAGS expects one space-separated string rather than a ;-list.
string(REPLACE ";" " " BLAS_DEF_STR "${BLAS_DEF}")

include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/tests/common)

# One executable bench-<name> per <name>.cpp listed here.
foreach(bench_name blas)
  add_executable(bench-${bench_name} ${bench_name}.cpp)
  set_target_properties(bench-${bench_name} PROPERTIES COMPILE_FLAGS "${BLAS_DEF_STR}")
  target_link_libraries(bench-${bench_name} ${BLAS_LIBS} isaac)
endforeach()

View File

@@ -1,412 +0,0 @@
#include "isaac/array.h"
#include "isaac/runtime/execute.h"
#ifdef BENCH_CLBLAS
#include "clBLAS.h"
#endif
#ifdef BENCH_MKL
#include "mkl_cblas.h"
#elif defined(BENCH_CBLAS)
#include "cblas.h"
#endif
#ifdef BENCH_CUBLAS
#include <cublas.h>
#endif
#include <iomanip>
#include <stdlib.h>
#include <cmath>
#include <numeric>
#include <regex>
#include <string>
#include "common.hpp"
#include "half.hpp"
typedef sc::int_t int_t;
Timer tmr;
/* C++ wrapper for BLAS */
/*
 * Each wrapper below takes an unnamed first argument of type float or double
 * that is used purely for overload selection: callers pass T() and the
 * remaining arguments are forwarded (by value) to the single- or
 * double-precision vendor routine. Return values of the vendor routines are
 * discarded.
 */
// clBLAS (only compiled when BENCH_CLBLAS is defined)
#ifdef BENCH_CLBLAS
template<typename... Args> void clblasAxpy(float, Args... args){ clblasSaxpy(args...); }
template<typename... Args> void clblasAxpy(double, Args... args){ clblasDaxpy(args...); }
template<typename... Args> void clblasDot(float, Args... args){ clblasSdot(args...); }
template<typename... Args> void clblasDot(double, Args... args){ clblasDdot(args...); }
template<typename... Args> void clblasGemv(float, Args... args){ clblasSgemv(args...); }
template<typename... Args> void clblasGemv(double, Args... args){ clblasDgemv(args...); }
template<typename... Args> void clblasGemm(float, Args... args){ clblasSgemm(args...); }
template<typename... Args> void clblasGemm(double, Args... args){ clblasDgemm(args...); }
#endif
// CBLAS (only compiled when BENCH_CBLAS is defined)
#ifdef BENCH_CBLAS
template<typename... Args> void cblasAxpy(float, Args... args){ cblas_saxpy(args...); }
template<typename... Args> void cblasAxpy(double, Args... args){ cblas_daxpy(args...); }
template<typename... Args> void cblasDot(float, Args... args){ cblas_sdot(args...); }
template<typename... Args> void cblasDot(double, Args... args){ cblas_ddot(args...); }
template<typename... Args> void cblasGemv(float, Args... args){ cblas_sgemv(args...); }
template<typename... Args> void cblasGemv(double, Args... args){ cblas_dgemv(args...); }
template<typename... Args> void cblasGemm(float, Args... args){ cblas_sgemm(args...); }
template<typename... Args> void cblasGemm(double, Args... args){ cblas_dgemm(args...); }
#endif
// cuBLAS (only compiled when BENCH_CUBLAS is defined)
#ifdef BENCH_CUBLAS
template<typename... Args> void cublasAxpy(float, Args... args){ cublasSaxpy(args...); }
template<typename... Args> void cublasAxpy(double, Args... args){ cublasDaxpy(args...); }
template<typename... Args> void cublasDot(float, Args... args){ cublasSdot(args...); }
template<typename... Args> void cublasDot(double, Args... args){ cublasDdot(args...); }
template<typename... Args> void cublasGemv(float, Args... args){ cublasSgemv(args...); }
template<typename... Args> void cublasGemv(double, Args... args){ cublasDgemv(args...); }
template<typename... Args> void cublasGemm(float, Args... args){ cublasSgemm(args...); }
template<typename... Args> void cublasGemm(double, Args... args){ cublasDgemm(args...); }
#endif
//
/*
 * Times `op` followed by `sync` repeatedly and returns the fastest sample.
 *
 * One untimed warm-up call is made first. Timed iterations then accumulate
 * until their summed duration reaches 0.2 s (samples are nanosecond counts
 * read from the global timer `tmr`); at least one timed iteration always runs.
 *
 * op   : callable launching the work to measure.
 * sync : callable that blocks until that work has completed.
 * Returns the smallest recorded sample, in nanoseconds, as a double.
 */
template<class OP, class SYNC>
double bench(OP const & op, SYNC const & sync)
{
  // Warm-up, excluded from the measurements.
  op();
  sync();

  std::vector<long> samples;
  double elapsed_ns = 0;
  do{
    tmr.start();
    op();
    sync();
    samples.push_back(tmr.get().count());
    elapsed_ns += samples.back();
  }while(elapsed_ns*1e-9 < 2e-1);

  return *std::min_element(samples.begin(), samples.end());
}
/*
 * Prints the bold/italic, tab-separated header row of a result table:
 * the given section titles, then "ISAAC", then one column per vendor BLAS
 * that is both compiled in and active for the current backend.
 *
 * sections : titles of the leading columns.
 * on_cl    : whether the clBLAS column applies (only read with BENCH_CLBLAS).
 * on_cu    : whether the cuBLAS column applies (only read with BENCH_CUBLAS).
 */
void print_results_header(std::vector<std::string> sections, bool on_cl, bool on_cu)
{
  // Either flag may be unused depending on which BENCH_* macros are defined.
  (void)on_cl;
  (void)on_cu;

  std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
  for(std::string const & title: sections)
    std::cout << title << "\t";
  std::cout << "ISAAC";
#ifdef BENCH_CLBLAS
  if(on_cl)
    std::cout << "\tclBLAS";
#endif
#ifdef BENCH_CBLAS
  std::cout << "\tBLAS";
#endif
#ifdef BENCH_CUBLAS
  if(on_cu)
    std::cout << "\tcuBLAS";
#endif
  std::cout << color_stream(RESET) << std::endl;
}
/*
 * Prints one tab-separated result row.
 *
 * times  : one timing per benchmarked implementation, in column order.
 * prefix : leading cells describing the problem (sizes, layout, ...).
 * fn     : converts a timing into the performance figure that is printed.
 *
 * A figure is highlighted when it is at least 5% above the second-best
 * figure of the row. With fewer than two figures there is no runner-up to
 * compare against, so nothing is highlighted (the previous code read
 * fastest[1] out of bounds in that case).
 */
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<double(double)> fn){
  std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
  std::vector<double> perf;
  std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);

  // Second-best figure of the row, when there is one.
  bool const has_runner_up = perf.size() > 1;
  double runner_up = 0;
  if(has_runner_up){
    auto fastest = perf;
    std::sort(fastest.begin(), fastest.end(), std::greater<double>());
    runner_up = fastest[1];
  }

  for(auto x: perf){
    if(has_runner_up && x/runner_up >= 1.05)
      std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
    else
      std::cout << x;
    std::cout << "\t";
  }
  std::cout << std::endl;
}
/* Shorthand for std::to_string: returns the decimal text form of x. */
template<class T>
std::string str(T const & x)
{
  std::string text = std::to_string(x);
  return text;
}
/*
 * Runs the benchmark suite selected by `operation` on the default ISAAC queue
 * and prints one result row per problem size.
 *
 * T         : host scalar type; T() selects the vendor BLAS overload and T* is
 *             the pointer type device buffers are cast to for cuBLAS.
 * dtype     : ISAAC numeric type used to allocate the arrays.
 * operation : "axpy", "dot", or any string starting with "gemv" / "gemm".
 *             Other values run nothing.
 *
 * Timings come from bench(op, sync), i.e. nanoseconds. Printed figures are
 * MB*1e6/t for axpy/dot, (M*N + M + N)*dtsize/t for gemv (bytes per
 * nanosecond) and 2*M*N*K/t*1e-3 for gemm.
 */
template<class T>
void bench(sc::numeric_type dtype, std::string operation)
{
  using std::get;
  using std::make_tuple;

  sc::driver::CommandQueue & queue = sc::driver::backend::queues::get(sc::driver::backend::contexts::get_default(),0);
  auto sync = [&](){ queue.synchronize(); };
#ifdef BENCH_CUBLAS
  auto cusync = [&](){ cudaDeviceSynchronize(); };
#endif
  // Vendor libraries are only benchmarked on the backend they belong to.
  bool on_cl = queue.backend()==sc::driver::OPENCL;
  bool on_cu = queue.backend()==sc::driver::CUDA;
  size_t dtsize = sc::size_of(dtype);

  /*---------*/
  /*--BLAS1--*/
  /*---------*/
  if(operation=="axpy")
  {
    float alpha = 1;
    // The first column holds the footprint in MB (str(MB) below), as for "dot".
    print_results_header({"MB"}, on_cl, on_cu);
    for(int_t MB: std::vector<int_t>{1, 10, 100, 1000})
    {
      // Vector length such that MB*1e6 bytes correspond to three vector transfers.
      int_t N = MB*1e6/dtsize/3;
      std::vector<double> times;
      sc::array x(N, dtype), y(N, dtype);
      //Bench
      times.push_back(bench([&](){y = x + alpha*y;}, sync));
#ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasAxpy(T(), N, alpha, cl(x), 0, 1, cl(y), 0, 1, 1, &cl(queue), 0, nullptr, nullptr);}, sync));
#endif
#ifdef BENCH_CBLAS
      // NOTE(review): host buffers are float regardless of T -- confirm this path for T = double.
      std::vector<float> cx(N), cy(N);
      sc::copy(x, cx);
      sc::copy(y, cy);
      times.push_back(bench([&](){cblasAxpy(T(), N, alpha, cx.data(), 1, cy.data(), 1);}, sync));
#endif
#ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasAxpy(T(), N, alpha, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
#endif
      print_results(times, {str(MB)}, [&](double t){return MB*1e6/t;});
    }
  }

  if(operation=="dot")
  {
    print_results_header({"MB"}, on_cl, on_cu);
    for(int_t MB: std::vector<int_t>{1, 10, 100, 1000})
    {
      // Vector length such that MB*1e6 bytes correspond to two vector reads.
      int_t N = MB*1e6/dtsize/2;
      std::vector<double> times;
      sc::array x(N, dtype), y(N, dtype);
      sc::array scratch(N, dtype);
      sc::scalar s(dtype);
      //Bench
      times.push_back(bench([&](){s = dot(x,y);}, sync));
#ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasDot(T(), N, cl(s), 0, cl(x), 0, 1, cl(y), 0, 1, cl(scratch), 1, &cl(queue), 0, nullptr, nullptr);}, sync));
#endif
#ifdef BENCH_CBLAS
      // NOTE(review): host buffers are float regardless of T -- confirm this path for T = double.
      std::vector<float> cx(N), cy(N);
      sc::copy(x, cx);
      sc::copy(y, cy);
      times.push_back(bench([&](){cblasDot(T(), N, cx.data(), 1, cy.data(), 1);}, sync));
#endif
#ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasDot(T(), N, (T*)cu(x), 1, (T*)cu(y), 1);}, cusync));
#endif
      print_results(times, {str(MB)}, [&](double t){return MB*1e6/t;});
    }
  }

  if(operation.substr(0, 4)=="gemv")
  {
    // (benchmark name, "N"/"T" layout of A, M, N)
    std::vector<std::tuple<std::string, std::string,int_t, int_t> > MNs;
    //Linear System
    MNs.push_back(make_tuple("Square", "N",153,153));
    MNs.push_back(make_tuple("Square", "N",1024, 1024));
    MNs.push_back(make_tuple("Square", "N",2867,2867));
    MNs.push_back(make_tuple("Square", "T",153,153));
    MNs.push_back(make_tuple("Square", "T",1024,1024));
    MNs.push_back(make_tuple("Square", "T",2867,2867));
    //Normalization
    MNs.push_back(make_tuple("Short", "N", 64, 60000));
    MNs.push_back(make_tuple("Short", "N", 256, 60000));
    MNs.push_back(make_tuple("Short", "N", 1024, 60000));
    MNs.push_back(make_tuple("Short", "T", 64, 60000));
    MNs.push_back(make_tuple("Short", "T", 256, 60000));
    MNs.push_back(make_tuple("Short", "T", 1024, 60000));
    //Householder
    MNs.push_back(make_tuple("Tall", "N", 10, 60000));
    MNs.push_back(make_tuple("Tall", "N", 30, 60000));
    MNs.push_back(make_tuple("Tall", "T", 10, 60000));
    MNs.push_back(make_tuple("Tall", "T", 30, 60000));

    /*---------*/
    /*--BLAS2--*/
    /*---------*/
    print_results_header({"BENCH", "M", "N", "AT"}, on_cl, on_cu);
    for(auto MN: MNs)
    {
      std::vector<double> times;
      std::string name = get<0>(MN);
      std::string cAT = get<1>(MN);
      int_t M = get<2>(MN);
      int_t N = get<3>(MN);
      // Storage shape of A: swapped when A is used transposed.
      int_t As1 = M, As2 = N;
      bool AT = (cAT == "T");
      if(AT) std::swap(As1, As2);
      sc::array A(As1, As2, dtype), y(M, dtype), x(N, dtype);
#ifdef HAS_A_BLAS
      int_t lda = A.stride()[1];
#endif
      //Bench
      times.push_back(bench([&](){y = AT?dot(A.T,x):dot(A,x);}, sync));
#ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasGemv(T(), clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, cl(A), 0, lda, cl(x), 0, 1, 0, cl(y), 0, 1, 1, &cl(queue),0, nullptr, nullptr);}, sync));
#endif
#ifdef BENCH_CBLAS
      // NOTE(review): host buffers are float regardless of T -- confirm this path for T = double.
      std::vector<float> cA(M*N), cx(N), cy(M);
      sc::copy(x, cx);
      sc::copy(y, cy);
      sc::copy(A, cA);
      times.push_back(bench([&](){cblasGemv(T(), CblasColMajor, AT?CblasTrans:CblasNoTrans, As1, As2, 1, cA.data(), lda, cx.data(), 1, 0, cy.data(), 1);}, sync));
#endif
#ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasGemv(T(), AT?'t':'n', As1, As2, 1, (T*)cu(A), lda, (T*)cu(x), 1, 0, (T*)cu(y), 1);}, cusync));
#endif
      print_results(times, {name, str(M), str(N), cAT}, [&](double t){ return (M*N + M + N)*dtsize/t;});
    }
  }

  if(operation.substr(0,4)=="gemm")
  {
    // (benchmark name, M, N, K, "N"/"T" layout of A, "N"/"T" layout of B)
    std::vector<std::tuple<std::string, int_t, int_t, int_t, std::string, std::string> > MNKs;
    //DeepBench
    for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
      for(size_t N: std::vector<size_t>{16, 32, 64, 128, 7000})
        MNKs.push_back(make_tuple("Deep", MK, N, MK, "N", "N"));
    for(size_t MK: std::vector<size_t>{1760, 2048, 2560})
      for(size_t N: std::vector<size_t>{16, 32, 64, 128, 7000})
        MNKs.push_back(make_tuple("Deep", MK, N, MK, "T", "N"));
    for(size_t MK: std::vector<size_t>{1760, 4096})
      MNKs.push_back(make_tuple("Deep", MK, 7133, MK, "N", "T"));
    //Covariance (e.g., ICA, 10minutes/100Hz)
    MNKs.push_back(make_tuple("Cov",32,32,60000,"N","T"));
    MNKs.push_back(make_tuple("Cov",256,256,60000,"N","T"));
    //Bi-diagonalization
    MNKs.push_back(make_tuple("Lapack",4096,4096,32,"N","T"));
    MNKs.push_back(make_tuple("Lapack",3456,3456,32,"N","T"));
    MNKs.push_back(make_tuple("Lapack",896,896,32,"N","T"));

    print_results_header({"BENCH", "M", "N", "K", "AT", "BT"}, on_cl, on_cu);
    /*---------*/
    /*--BLAS3--*/
    /*---------*/
    for(auto MNK: MNKs)
    {
      std::vector<double> times;
      std::string name = get<0>(MNK);
      int_t M = get<1>(MNK);
      int_t N = get<2>(MNK);
      int_t K = get<3>(MNK);
      std::string cAT = get<4>(MNK);
      std::string cBT = get<5>(MNK);
      bool AT = cAT=="T";
      bool BT = cBT=="T";
      // Storage shapes of A and B: swapped when the operand is used transposed.
      int_t As1 = M, As2 = K;
      if(AT) std::swap(As1, As2);
      int_t Bs1 = K, Bs2 = N;
      if(BT) std::swap(Bs1, Bs2);
      sc::array C(M, N, dtype), A(As1, As2, dtype), B(Bs1, Bs2, dtype);
#ifdef HAS_A_BLAS
      int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
#endif
      //bench
      times.push_back(bench([&](){C = AT?(BT?dot(A.T,B.T)
                                          :dot(A.T,B))
                                       :(BT?dot(A,B.T)
                                          :dot(A,B));}, sync));
#ifdef BENCH_CLBLAS
      if(on_cl)
        times.push_back(bench([&]() {clblasGemm(T(), clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans,
                                                M, N, K, 1, cl(A), 0, lda, cl(B), 0, ldb,
                                                0, cl(C), 0, ldc, 1, &cl(queue),0, nullptr, nullptr);}, sync));
#endif
#ifdef BENCH_CBLAS
      // NOTE(review): host buffers are float regardless of T, and beta is 1 here
      // while the clBLAS/cuBLAS calls use beta = 0 -- confirm both are intended.
      std::vector<float> cC(M*N), cA(M*K), cB(N*K);
      sc::copy(C, cC);
      sc::copy(A, cA);
      sc::copy(B, cB);
      times.push_back(bench([&](){cblasGemm(T(), CblasColMajor, AT?CblasTrans:CblasNoTrans, BT?CblasTrans:CblasNoTrans, M, N, K, 1, cA.data(), lda, cB.data(), ldb, 1, cC.data(), ldc);}, sync));
#endif
#ifdef BENCH_CUBLAS
      if(on_cu)
        times.push_back(bench([&](){cublasGemm(T(), AT?'t':'n', BT?'t':'n', M, N, K, 1, (T*)cu(A), lda, (T*)cu(B), ldb, 0, (T*)cu(C), ldc);}, cusync));
#endif
      print_results(times, {name, str(M), str(N), str(K), cAT, cBT}, [&](double t){ return 2*M*N*K/t*1e-3;});
    }
  }
}
/*
 * Writes the command-line usage text to standard error, one flushed line at
 * a time, and terminates the process with EXIT_FAILURE. Never returns.
 */
void handle_misusage(){
  static const char * const usage[] = {
    "Usage : blas-bench [--op {axpy, dot, gemv, gemm}] [--dtype {float32, float64}] [--device DEVICE_IDX] [--help]",
    "--op: operation to benchmark (default = gemm)",
    "--dtype: data-type to benchmark (default = float32)",
    "--device: index of isaac device in [0, ..., ndevices - 1] (default = 0)",
    "--help: display this message"
  };
  for(const char * line: usage)
    std::cerr << line << std::endl;
  exit(EXIT_FAILURE);
}
/*
 * Returns the value following `key` in `args`.
 *
 * args : full argument list.
 * key  : option name to look for (e.g. "--op").
 * set  : accepted values; an empty set accepts anything.
 * dft  : value returned when `key` is absent. An empty default makes the
 *        option mandatory.
 *
 * handle_misusage() is called (and exits) when a mandatory option is absent,
 * when `key` is the last argument or is followed by another "--" option, or
 * when the value is not in a non-empty `set`.
 */
std::string getopt(std::vector<std::string> const & args,
                   std::string const & key,
                   std::vector<std::string> const & set = {},
                   std::string dft = "")
{
  auto const pos = std::find(args.begin(), args.end(), key);

  // Option not given: fall back on the default, if there is one.
  if(pos==args.end()){
    if(dft.empty())
      handle_misusage();
    return dft;
  }

  auto const value = pos + 1;
  bool const missing = value==args.end() || value->compare(0, 2, "--")==0;
  if(missing)
    handle_misusage();

  bool const allowed = set.empty() || std::find(set.begin(), set.end(), *value)!=set.end();
  if(!allowed)
    handle_misusage();

  return *value;
}
/*
 * Entry point of the BLAS benchmark.
 *
 * Parses --op, --dtype, --device and --help, selects the default ISAAC
 * device, lists every device of every platform (marking the selected one
 * with "x"), then runs bench<T>() for the requested data type.
 *
 * "float16" is not accepted for --dtype: no benchmark is instantiated for
 * it, so accepting it made the program list the devices and then silently
 * do nothing. It is now reported through the usage message, which already
 * advertises only float32 and float64.
 */
int main(int argc, char* argv[])
{
  std::vector<std::string> args(argv, argv + argc);
#ifdef BENCH_CLBLAS
  clblasSetup();
#endif
  sc::driver::backend::default_queue_properties = CL_QUEUE_PROFILING_ENABLE;

  if(std::find(args.begin(), args.end(), "--help") != args.end())
    handle_misusage();

  std::string operation = getopt(args, "--op", {"axpy", "dot", "gemv", "gemm"}, "gemm");
  std::string dtype = getopt(args, "--dtype", {"float32", "float64"}, "float32");
  int device = 0;
  try{
    device = std::stoi(getopt(args, "--device", {}, "0"));
  }catch(...){ handle_misusage(); }
  sc::driver::backend::default_device = device;

  /* List devices */
  std::cout << "Devices available:" << std::endl;
  std::cout << "------------------" << std::endl;
  size_t i = 0;
  std::vector<sc::driver::Platform> platforms;
  sc::driver::backend::platforms(platforms);
  for(sc::driver::Platform const & pf: platforms){
    std::vector<sc::driver::Device> devices;
    pf.devices(devices);
    // `dev` rather than `device`, to avoid shadowing the selected index above.
    for(sc::driver::Device const & dev: devices)
      std::cout << "[" << (i++==sc::driver::backend::default_device?"x":" ") << "]"
                << " - " << dev.name()
                << " on " << pf.name() << std::endl;
  }
  std::cout << "------------------" << std::endl;

  std::cout << std::fixed << std::setprecision(2);
  if(dtype=="float32")
    bench<float>(sc::FLOAT_TYPE, operation);
  if(dtype=="float64")
    bench<double>(sc::DOUBLE_TYPE, operation);
#ifdef BENCH_CLBLAS
  clblasTeardown();
#endif
}

View File

@@ -1,152 +0,0 @@
#ifndef ISAAC_BENCH_COMMON_HPP_
#define ISAAC_BENCH_COMMON_HPP_
#include <chrono>
#include <algorithm>
#include "isaac/array.h"
namespace sc = isaac;
/* Empty tag type carrying a compile-time count; drives the recursion below. */
template<std::size_t> struct int_{};

/*
 * Recursive step: writes the element located Pos positions before the end of
 * the tuple, then a comma, then the remaining Pos-1 elements.
 */
template <class Tuple, size_t Pos>
std::ostream& print_tuple(std::ostream& out, const Tuple& t, int_<Pos> )
{
  const std::size_t index = std::tuple_size<Tuple>::value - Pos;
  out << std::get<index>(t);
  out << ',';
  return print_tuple(out, t, int_<Pos-1>());
}

/* Final step: writes the last element, without a trailing comma. */
template <class Tuple>
std::ostream& print_tuple(std::ostream& out, const Tuple& t, int_<1> )
{
  out << std::get<std::tuple_size<Tuple>::value-1>(t);
  return out;
}

/* Streams a std::tuple as its elements separated by commas, e.g. "1,2,3". */
template <class... Args>
std::ostream& operator<<(std::ostream& out, const std::tuple<Args...>& t)
{
  typedef int_<sizeof...(Args)> count_tag;
  print_tuple(out, t, count_tag());
  return out;
}
/*
 * The three helpers below are defined in a header, so they are marked
 * `inline`: without it, including this header from more than one translation
 * unit produces multiple-definition link errors.
 */

/* Rounds N up to the next multiple of pad (N itself when already a multiple). */
inline int ceil(int N, int pad)
{
  return (N%pad==0)?N:(N+pad-1)/pad*pad;
}

/*
 * Returns N values spaced logarithmically from min towards max, each rounded
 * up to a multiple of pad. The i-th value is exp(log(min) + (log(max) -
 * log(min))*i/N) truncated to int, so `max` itself is not part of the result.
 */
inline std::vector<int> create_log_range(int min, int max, int N, int pad)
{
  // Loop invariants, kept in the same types the expression used inline.
  const double log_min = std::log(min);
  const float log_span = (float)(std::log(max) - std::log(min));

  std::vector<int> res(N);
  for(int i = 0 ; i < N ; ++i)
  {
    res[i] = static_cast<int>(std::exp(log_min + log_span*i/N));
    res[i] = ceil(res[i], pad);
  }
  return res;
}

/*
 * Returns every multiple of pad in [ceil(min, pad), ceil(max, pad)),
 * in increasing order.
 */
inline std::vector<int> create_full_range(int min, int max, int pad)
{
  const int first = ceil(min, pad);
  const int last = ceil(max, pad);

  std::vector<int> N;
  for(int i = first ; i < last ; i+=pad)
    N.push_back(i);
  return N;
}
/*
 * Returns the median of x. The argument is taken by value because it is
 * sorted in place. For an even number of elements the two middle values are
 * averaged using T's own arithmetic (integer division for integer T).
 */
template<class T>
T median(std::vector<T> x)
{
  std::sort(x.begin(), x.end());
  const size_t mid = x.size() / 2;
  if (x.size() % 2 != 0)
    return x[mid];
  return (x[mid - 1] + x[mid]) / 2;
}
/* Returns the smallest element of x (compared with operator<). x must not be empty. */
template<class T>
T min(std::vector<T> x)
{
  T best = x.front();
  for(const T & value: x)
    if(value < best)
      best = value;
  return best;
}
/* Returns the largest element of x (compared with operator<). x must not be empty. */
template<class T>
T max(std::vector<T> x)
{
  T best = x.front();
  for(const T & value: x)
    if(best < value)
      best = value;
  return best;
}
/*
 * Returns the arithmetic mean of x: the elements are summed in order into a
 * T and the sum is divided by the element count (as an int), so the result
 * is truncated for integer T.
 */
template<class T>
T mean(std::vector<T> x)
{
  const int count = x.size();
  T total = 0;
  for(const T & value: x)
    total += value;
  return total/count;
}
/*
 * Minimal stopwatch built on std::chrono::high_resolution_clock.
 * start() records the current time; get() returns the time elapsed since the
 * last start() as nanoseconds. A timer constructed with run = false holds a
 * default-constructed start point until start() is called.
 */
class Timer
{
  using clock = std::chrono::high_resolution_clock;

public:
  /* Starts the stopwatch immediately when `run` is true. */
  explicit Timer(bool run = false)
  {
    if (run)
      start();
  }

  /* (Re)sets the reference point to now. */
  void start()
  {
    _start = clock::now();
  }

  /* Time elapsed since the reference point. */
  std::chrono::nanoseconds get() const
  {
    const clock::duration elapsed = clock::now() - _start;
    return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed);
  }

private:
  clock::time_point _start;
};
/*
 * Shorthand accessors for the native backend handles held by ISAAC objects.
 * cl(...) returns the OpenCL member of the handle, cu(...) the CUDA member;
 * both return a reference into the object that was passed in.
 *
 * They are marked `inline` because this is a header: non-inline definitions
 * would be emitted in every translation unit that includes it and clash at
 * link time.
 */
inline cl_mem& cl(sc::array& x)
{ return x.data().handle().cl(); }

inline cl_mem& cl(sc::scalar& x)
{ return x.data().handle().cl(); }

inline cl_command_queue& cl(sc::driver::CommandQueue& x)
{ return x.handle().cl(); }

inline CUdeviceptr& cu(sc::array& x)
{ return x.data().handle().cu(); }

inline CUdeviceptr& cu(sc::scalar& x)
{ return x.data().handle().cu(); }

inline CUstream& cu(sc::driver::CommandQueue& x)
{ return x.handle().cu(); }
/*
 * Numeric codes that color_stream writes between "\033[" and "m" to change
 * the terminal text attributes.
 */
enum Code {
  // Attributes
  RESET            = 0,
  BOLD             = 1,
  ITALIC           = 3,
  // Foreground colors
  FG_RED           = 31,
  FG_GREEN         = 32,
  FG_YELLOW        = 33,
  FG_BLUE          = 34,
  FG_MAGENTA       = 35,
  FG_CYAN          = 36,
  FG_LIGHT_GRAY    = 37,
  FG_DARK_GRAY     = 90,
  FG_LIGHT_RED     = 91,
  FG_LIGHT_GREEN   = 92,
  FG_LIGHT_YELLOW  = 93,
  FG_LIGHT_BLUE    = 94,
  FG_LIGHT_MAGENTA = 95,
  FG_LIGHT_CYAN    = 96,
  FG_WHITE         = 97
};

/*
 * Stream manipulator: inserting color_stream(c) into an ostream writes the
 * escape sequence "\033[<c>m", where <c> is the decimal value of the code.
 */
class color_stream {
  Code code;

public:
  color_stream(Code selected) : code(selected) {}

  friend std::ostream&
  operator<<(std::ostream& os, const color_stream& cs) {
    os << "\033[" << cs.code << "m";
    return os;
  }
};
#endif

View File

@@ -1,54 +0,0 @@
#include "isaac/array.h"
#include <vector>
namespace sc = isaac;
#ifdef BENCH_CUBLAS
__global__ void dummy(){}
#endif
/*
 * Measures the launch overhead of an empty kernel on every context known to
 * ISAAC.
 *
 * For each (context, queues) entry, an empty OpenCL kernel is built, enqueued
 * on the first queue with a profiling event, and the difference between the
 * event's COMMAND_END and COMMAND_START timestamps is printed. When
 * BENCH_CUBLAS is defined, an empty CUDA kernel is also timed with a pair of
 * CUDA events.
 *
 * The CUDA events are now destroyed at the end of each iteration; they were
 * previously created on every pass of the loop and never released.
 */
int main()
{
  for(sc::driver::backend::data_type::const_iterator it = sc::driver::queues.data().begin() ; it != sc::driver::queues.data().end() ; ++it)
  {
    cl::CommandQueue queue = it->second[0];
    cl::Context context = it->first;
    cl::Device device = queue.getInfo<CL_QUEUE_DEVICE>();
    cl::Program program(context,"__kernel void dummy(){}");
    program.build();
    cl::Kernel kernel(program, "dummy");
    // Single work-item launch.
    cl::NDRange offset = cl::NullRange;
    cl::NDRange global(1);
    cl::NDRange local(1);
    cl::Event event;
    std::cout << "Device: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
    std::cout << "-------------------------" << std::endl;
    queue.enqueueNDRangeKernel(kernel, offset, global, local, NULL, &event);
    queue.flush();
    queue.finish();
    {
      long time = event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
      std::cout << "Kernel launch overhead: " << time << std::endl;
    }
#ifdef BENCH_CUBLAS
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    dummy<<<1, 1>>>();
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "CUDA Kernel launch overhead: " << time << std::endl;
    // Release the per-iteration events.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
#endif
    std::cout << "-------------------------" << std::endl;
  }
}

View File

@@ -1,15 +0,0 @@
# Find module for clBLAS.
#
# Result variables:
#   CLBLAS_FOUND        - set by find_package_handle_standard_args
#   CLBLAS_INCLUDE_DIR  - directory containing clBLAS.h
#   CLBLAS_LIBRARIES    - the clBLAS library
#
# Installations matching /opt/clBLAS* are used as search hints.
file(GLOB CLBLAS_ROOT /opt/clBLAS*)

# The glob may match several installations. Build one hint per match: pasting
# the whole list into a single "${CLBLAS_ROOT}/include" string would append
# the suffix to the last entry only.
set(CLBLAS_INCLUDE_HINTS)
set(CLBLAS_LIBRARIES_HINTS)
foreach(clblas_root ${CLBLAS_ROOT})
  list(APPEND CLBLAS_INCLUDE_HINTS "${clblas_root}/include")
  list(APPEND CLBLAS_LIBRARIES_HINTS "${clblas_root}/lib64")
endforeach()

find_path(CLBLAS_INCLUDE_DIR clBLAS.h HINTS ${CLBLAS_INCLUDE_HINTS})
find_library(CLBLAS_LIBRARIES NAMES clBLAS HINTS ${CLBLAS_LIBRARIES_HINTS})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(CLBLAS DEFAULT_MSG CLBLAS_LIBRARIES CLBLAS_INCLUDE_DIR)

# Hide the cache entries created above (no variable named "CLBLAS" exists).
mark_as_advanced(CLBLAS_INCLUDE_DIR CLBLAS_LIBRARIES)

View File

@@ -1,19 +0,0 @@
# Find module for Intel MKL.
#
# Result variables:
#   MKL_FOUND        - set by find_package_handle_standard_args
#   MKL_INCLUDE_DIR  - directory containing mkl_blas.h
#   MKL_LIBRARIES    - MKL link line, including the OpenMP runtime and pthread
#
# Intel installations under /opt/intel are used as search hints.
file(GLOB SYSTEM_STUDIO_ROOT /opt/intel/ /opt/intel/composerxe* /opt/intel/system_studio_*)

# The glob usually matches several directories. Build one hint per match:
# "${SYSTEM_STUDIO_ROOT}/mkl/include/" would append the suffix to the last
# entry only.
set(MKL_INCLUDE_HINTS)
set(MKL_LIBRARY_HINTS)
set(ICC_LIBRARY_HINTS)
foreach(intel_root ${SYSTEM_STUDIO_ROOT})
  list(APPEND MKL_INCLUDE_HINTS "${intel_root}/mkl/include/")
  list(APPEND MKL_LIBRARY_HINTS "${intel_root}/mkl/lib/intel64/")
  list(APPEND ICC_LIBRARY_HINTS "${intel_root}/compiler/lib/intel64/")
endforeach()

find_path(MKL_INCLUDE_DIR mkl_blas.h HINTS ${MKL_INCLUDE_HINTS})
find_library(MKL_LIBRARIES NAMES mkl_core HINTS ${MKL_LIBRARY_HINTS})
find_library(ICC_LIBRARIES NAMES iomp5 HINTS ${ICC_LIBRARY_HINTS})

# OpenMP runtime: Intel's iomp5 when found, otherwise gomp.
if(ICC_LIBRARIES)
  set(OMP_LIBRARIES ${ICC_LIBRARIES})
else()
  set(OMP_LIBRARIES gomp)
endif()

# Expand the single library that was found into the full link line.
if(MKL_LIBRARIES AND OMP_LIBRARIES)
  set(MKL_LIBRARIES -lmkl_mc3 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core ${MKL_LIBRARIES} ${OMP_LIBRARIES} pthread)
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARIES MKL_INCLUDE_DIR)

# Hide the cache entries created above (no variable named "MKL" exists).
mark_as_advanced(MKL_INCLUDE_DIR MKL_LIBRARIES ICC_LIBRARIES)

View File

@@ -1,10 +0,0 @@
# Find module for OpenBLAS.
#
# Result variables:
#   OPENBLAS_INCLUDE_DIR  - directory containing cblas.h
#   OPENBLAS_LIBRARIES    - the openblas library
# The found flag is set by find_package_handle_standard_args.
#
# $ENV{OPENBLAS_HOME}/lib is searched in addition to the usual system
# locations and /opt/OpenBLAS/lib.
find_path(OPENBLAS_INCLUDE_DIR cblas.h)
find_library(OPENBLAS_LIBRARIES
  NAMES openblas
  PATHS /lib/ /lib64/ /usr/lib /usr/lib64 /usr/local/lib /usr/local/lib64 /opt/OpenBLAS/lib $ENV{OPENBLAS_HOME}/lib)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OpenBlas DEFAULT_MSG OPENBLAS_LIBRARIES OPENBLAS_INCLUDE_DIR)

# Hide the cache entries created above (no variable named "OpenBlas" exists).
mark_as_advanced(OPENBLAS_INCLUDE_DIR OPENBLAS_LIBRARIES)

View File

@@ -1,30 +0,0 @@
# Find module for the OpenCL library.
#
# Result variables:
#   OPENCL_LIBRARIES  - the OpenCL library
# The found flag is set by find_package_handle_standard_args.

# Intel OpenCL SDK, matching the pointer size of the target.
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
  set(L_HINTS $ENV{INTELOCLSDKROOT}/lib/x64)
else()
  set(L_HINTS $ENV{INTELOCLSDKROOT}/lib/x86)
endif()

# Glob patterns for vendor driver/SDK locations.
set(ANDROID_CL_GLOB_HINTS /opt/adreno-driver*/lib)
set(X86_CL_GLOB_HINTS /opt/AMDAPPSDK*/lib/x86_64)

if(ANDROID)
  file(GLOB _TMP ${ANDROID_CL_GLOB_HINTS})
  list(APPEND L_HINTS ${_TMP})
  # NO_CMAKE_FIND_ROOT_PATH: the hints above must not be re-rooted.
  find_library(OPENCL_LIBRARIES NAMES OpenCL NO_CMAKE_FIND_ROOT_PATH HINTS ${L_HINTS} )
else()
  file(GLOB _TMP ${X86_CL_GLOB_HINTS})
  list(APPEND L_HINTS ${_TMP})
  # Also look inside the CUDA toolkit.
  list(APPEND L_HINTS ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/)
  find_library(OPENCL_LIBRARIES NAMES OpenCL HINTS ${L_HINTS} )
endif()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(OpenCL DEFAULT_MSG OPENCL_LIBRARIES)

# Hide the cache entry created above (no variable named "OpenCL" exists).
mark_as_advanced(OPENCL_LIBRARIES)

View File

@@ -1,138 +0,0 @@
#*********************************************************#
#* File: Apk.cmake *
#* Android apk tools
#*
#* Copyright (C) 2002-2013 The PixelLight Team (http://www.pixellight.org/)
#*
#* This file is part of PixelLight.
#*
#* Permission is hereby granted, free of charge, to any person obtaining a copy of this software
#* and associated documentation files (the "Software"), to deal in the Software without
#* restriction, including without limitation the rights to use, copy, modify, merge, publish,
#* distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
#* Software is furnished to do so, subject to the following conditions:
#*
#* The above copyright notice and this permission notice shall be included in all copies or
#* substantial portions of the Software.
#*
#* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
#* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
#* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
#* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#*********************************************************#
##################################################
## Options
##################################################
# Cache entries read by android_create_apk(); each can be overridden with -D<name>=<value>.
set(ANDROID_APK_API_LEVEL "10" CACHE STRING "Android APK API level")
set(ANDROID_APK_INSTALL "0" CACHE BOOL "Install created apk file on the device automatically?")
set(ANDROID_APK_RUN "0" CACHE BOOL "Run created apk file on the device automatically? (installs it automatically as well, \"ANDROID_APK_INSTALL\"-option is ignored)")
# Signing settings; they are only used on the Release path of android_create_apk().
set(ANDROID_APK_SIGNER_KEYSTORE "~/my-release-key.keystore" CACHE STRING "Keystore for signing the apk file (only required for release apk)")
set(ANDROID_APK_SIGNER_ALIAS "myalias" CACHE STRING "Alias for signing the apk file (only required for release apk)")
##################################################
## Variables
##################################################
set(ANDROID_THIS_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) # Directory this CMake file is in
##################################################
## MACRO: android_create_apk
##
## Create/copy Android apk related files
##
## @param name
##   Name of the project (e.g. "MyProject"), this will also be the name of the created apk file.
##   Must be the name of an existing target: all commands are attached to it.
## @param apk_package_name
##   Package name of the application
## @param apk_directory
##   Directory where to construct the apk file in (e.g. "${CMAKE_BINARY_DIR}/apk")
## @param libs_directory
##   Directory where the built android libraries will be POST_BUILD, e.g ${CMAKE_SOURCE_DIR}/libs
##   (accepted for compatibility; not referenced by the macro body)
## @param android_directory
##   Directory providing "AndroidManifest.xml" and the "res" directory copied into the apk tree
## @param assets_directory
##   Directory where the assets for the application are located
##   (accepted for compatibility; the body copies "${CMAKE_SOURCE_DIR}/assets" instead)
##
## @remarks
##   Requires the following tools to be found automatically
##   - "android" (part of the Android SDK)
##   - "adb" (part of the Android SDK)
##   - "ant" (type e.g. "sudo apt-get install ant" on your Linux system to install Ant)
##   - "jarsigner" (part of the JDK)
##   - "zipalign" (part of the Android SDK)
##################################################
macro(android_create_apk name apk_package_name apk_directory libs_directory android_directory assets_directory)
  set(ANDROID_NAME ${name})
  set(ANDROID_APK_PACKAGE ${apk_package_name})
  # Create the directory for the libraries
  add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E remove_directory "${apk_directory}/libs")
  add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory "${apk_directory}/libs/armeabi-v7a/")
  # Copy the built library into the apk tree. $<TARGET_FILE:...> is resolved at
  # generate time and replaces the deprecated LOCATION target property.
  add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:${name}>" "${apk_directory}/libs/armeabi-v7a/")
  # Create "build.xml", "default.properties", "local.properties" and "proguard.cfg" files
  if(CMAKE_BUILD_TYPE MATCHES Release)
    set(ANDROID_APK_DEBUGGABLE "false")
  else()
    set(ANDROID_APK_DEBUGGABLE "true")
  endif()
  add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory "${apk_directory}/res")
  add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory "${android_directory}/res" "${apk_directory}/res/")
  # The manifest is configured after the ANDROID_* variables above have been set.
  configure_file("${android_directory}/AndroidManifest.xml" "${apk_directory}/AndroidManifest.xml")
  add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD COMMAND android update project -t android-${ANDROID_APK_API_LEVEL} --name ${ANDROID_NAME} --path "${apk_directory}")
  # Copy assets
  add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E remove_directory "${apk_directory}/assets")
  add_custom_command(TARGET ${ANDROID_NAME} PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory "${apk_directory}/assets/")
  add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_SOURCE_DIR}/assets" "${apk_directory}/assets/")
  # Build the apk file
  if(CMAKE_BUILD_TYPE MATCHES Release)
    # Let Ant create the unsigned apk file
    add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
      COMMAND ant release
      WORKING_DIRECTORY "${apk_directory}")
    # Sign the apk file
    add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
      COMMAND jarsigner -verbose -keystore ${ANDROID_APK_SIGNER_KEYSTORE} bin/${ANDROID_NAME}-unsigned.apk ${ANDROID_APK_SIGNER_ALIAS}
      WORKING_DIRECTORY "${apk_directory}")
    # Align the apk file
    add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
      COMMAND zipalign -v -f 4 bin/${ANDROID_NAME}-unsigned.apk bin/${ANDROID_NAME}.apk
      WORKING_DIRECTORY "${apk_directory}")
    # Install current version on the device/emulator
    if(ANDROID_APK_INSTALL OR ANDROID_APK_RUN)
      add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
        COMMAND adb install -r bin/${ANDROID_NAME}.apk
        WORKING_DIRECTORY "${apk_directory}")
    endif()
  else()
    # Let Ant create the unsigned apk file
    add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
      COMMAND ant debug
      WORKING_DIRECTORY "${apk_directory}")
    # Install current version on the device/emulator
    if(ANDROID_APK_INSTALL OR ANDROID_APK_RUN)
      add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
        COMMAND adb install -r bin/${ANDROID_NAME}-debug.apk
        WORKING_DIRECTORY "${apk_directory}")
    endif()
  endif()
  # Start the application
  if(ANDROID_APK_RUN)
    add_custom_command(TARGET ${ANDROID_NAME} POST_BUILD
      COMMAND adb shell am start -n ${ANDROID_APK_PACKAGE}/android.app.NativeActivity)
  endif()
endmacro()

View File

@@ -1 +0,0 @@
# Configure an Android build from a build directory one level below the source tree.
# The ANDROID_ABI value contains spaces, so the whole -D argument must be quoted.
cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchain/android.cmake -DANDROID_NDK=/opt/android-ndk-r10d/ "-DANDROID_ABI=armeabi-v7a with NEON" -DANDROID_NATIVE_API_LEVEL=19 -DANDROID_APK_API_LEVEL=19 -DANDROID_APK_RUN=1 ../

View File

@@ -1,61 +0,0 @@
#Copyright (c) 2014, ArrayFire
#All rights reserved.
# Function to turn an OpenCL source file into a C string within a source file,
# using the bin2cpp helper program.
# (This file was adapted from an xxd-based helper. xxd uses its input's
# filename to name the string and its length, so the names had to depend only
# on the output path, not on the input path. Otherwise, builds in different
# relative locations would put the source into different variable names, and
# everything would fall over.)
# The actual name will be the file name without directory and extension, and
# the length is stored in name_len.
#
# Usage example:
#
# set(KERNELS a.cl b/c.cl)
# CODE_TO_H(
# SOURCES ${KERNELS}
# EXTENSION hpp
# OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/kernels
# TARGET generate_kernels
# NAMESPACE ns
# EOF 1
# )
#
# The namespace they are placed in is taken from the NAMESPACE argument.
#
# For example, if the input file is kernel.cl, the two variables will be
# static const char ns::kernel[];
# static const std::size_t ns::kernel_len;
#
# where ns is the value passed as NAMESPACE.
include(CMakeParseArguments)
set(BIN2CPP_PROGRAM "bin2cpp")
# CODE_TO_H(SOURCES <file>... EXTENSION <ext> OUTPUT_DIR <dir> TARGET <target>
#           NAMESPACE <ns> EOF <flag> [VARNAME <ignored>])
#
# For each file in SOURCES, adds a build rule that runs ${BIN2CPP_PROGRAM} to
# produce "<OUTPUT_DIR>/<base name>.<EXTENSION>". It then creates the custom
# target <target> (part of ALL) that depends on every generated file.
# VARNAME is parsed but not used by this function.
function(CODE_TO_H)
  cmake_parse_arguments(ARGS "" "VARNAME;EXTENSION;OUTPUT_DIR;TARGET;NAMESPACE;EOF" "SOURCES" ${ARGN})
  set(_output_files "")
  foreach(_input_file ${ARGS_SOURCES})
    get_filename_component(_path "${_input_file}" PATH)
    get_filename_component(_name "${_input_file}" NAME)
    get_filename_component(_name_we "${_input_file}" NAME_WE)
    set(_namespace "${ARGS_NAMESPACE}")
    # The input is quoted so that an empty base name is still passed as an
    # argument instead of making string(REPLACE) fail on a missing operand.
    string(REPLACE "." "_" var_name "${_name_we}")
    set(_output_path "${ARGS_OUTPUT_DIR}")
    set(_output_file "${_output_path}/${_name_we}.${ARGS_EXTENSION}")
    # NOTE(review): the echo step appends an #include line to the output file and
    # bin2cpp is then run with --output on that same file - confirm the line is
    # meant to survive that second step.
    add_custom_command(
      OUTPUT ${_output_file}
      DEPENDS ${_input_file} ${BIN2CPP_PROGRAM}
      COMMAND ${CMAKE_COMMAND} -E make_directory "${_output_path}"
      COMMAND ${CMAKE_COMMAND} -E echo "\\#include \\<${_path}/${_name_we}.hpp\\>" >>"${_output_file}"
      COMMAND ${BIN2CPP_PROGRAM} --file ${_name} --namespace ${_namespace} --output ${_output_file} --name ${var_name} --eof ${ARGS_EOF} --extension ${ARGS_EXTENSION}
      WORKING_DIRECTORY "${_path}"
      COMMENT "Compiling ${_input_file} to C++ source"
    )
    list(APPEND _output_files ${_output_file})
  endforeach()
  add_custom_target(${ARGS_TARGET} ALL DEPENDS ${_output_files})
endfunction()

View File

@@ -1,194 +0,0 @@
// Copyright (c) 2014, ArrayFire
// All rights reserved.
// Umar Arshad
// Copyright 2014
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <memory>
#include <algorithm>
using namespace std;
typedef map<string, string> opt_t;
// Prints the built-in help text to cout and terminates the process with
// exit status 0; this function never returns to its caller.
// The table below lists fewer options than parse_options() accepts
// (--extension, --eof and --verbose are not mentioned).
static
void print_usage() {
cout << R"delimiter(BIN2CPP
Converts files from a binary file to C++ headers. It is similar to bin2c and
xxd but adds support for namespaces.
| --name | name of the variable (default: var) |
| --file | input file |
| --output | output file (If no output is specified then it prints to stdout |
| --type | Type of variable (default: char) |
| --namespace | A space seperated list of namespaces |
| --formatted | Tabs for formatting |
| --version | Prints my name |
| --help | Prints usage info |
Example
-------
Command:
./bin2cpp --file blah.txt --namespace blah detail --formatted --name blah_var
Will produce:
#pragma once
#include <cstddef>
namespace blah {
namespace detail {
static const char blah_var[] = {
0x2f, 0x2f, 0x20, 0x62, 0x6c, 0x61, 0x68, 0x2e, 0x74, 0x78,
0x74, 0xa, 0x62, 0x6c, 0x61, 0x68, 0x20, 0x62, 0x6c, 0x61,
0x68, 0x20, 0x62, 0x6c, 0x61, 0x68, 0xa, };
static const size_t blah_var_len = 27;
}
})delimiter";
exit(0);
}
// Set to true when the --formatted flag is seen; enables add_tabs() output.
static bool formatted;

// Writes `level` tab characters to cout when formatting is enabled.
// Writes nothing when formatting is disabled or `level` is not positive.
static
void add_tabs(const int level ){
    if(!formatted) {
        return;
    }
    int remaining = level;
    while(remaining-- > 0) {
        cout << "\t";
    }
}
// Parses the command line into a map from option name to value.
//
// Value options (--name, --type, --file, --output, --extension, --namespace,
// --eof) all start out as "". Every word that follows a value option is
// appended to that option's value, separated by single spaces, until the next
// recognised option. Words seen before any value option (argv[0] included)
// are ignored.
//
// Flags: --verbose dumps the parsed map to cout, --formatted sets the global
// `formatted`, --version prints the program name, --help calls print_usage().
static
opt_t
parse_options(const vector<string>& args) {
    static const char* const value_options[] = {
        "--name", "--type", "--file", "--output",
        "--extension", "--namespace", "--eof"
    };
    opt_t options;
    for(const char* key : value_options) {
        options[key] = "";
    }

    //Parse Arguments
    string active;      // value option currently collecting words
    bool dump = false;  // set by --verbose
    for(const string& word : args) {
        if(word == "--verbose") {
            dump = true;
            continue;
        }
        if(word == "--formatted") {
            formatted = true;
            continue;
        }
        if(word == "--version") {
            cout << args[0] << " By Umar Arshad" << endl;
            continue;
        }
        if(word == "--help") {
            print_usage();
            continue;
        }
        if(options.count(word) != 0) {
            active = word;
            continue;
        }
        if(active.empty()) {
            // Stray word with no option to attach it to: silently ignored.
            continue;
        }
        string& value = options[active];
        if(!value.empty()) {
            value += " ";
        }
        value += word;
    }

    if(dump) {
        for(const auto& entry : options) {
            cout << entry.first << " " << entry.second << endl;
        }
    }
    return options;
}
// Entry point: converts the bytes of --file into a C++ array definition,
// written to --output when given, otherwise to stdout.
//
// Returns EXIT_FAILURE when a file named by --file or --output cannot be
// opened. Without --file an empty array is emitted.
int main(int argc, const char * const * const argv)
{
    vector<string> args(argv, argv+argc);
    opt_t options = parse_options(args);

    // Open the input before touching the output, so that a bad --file does not
    // leave a truncated output file behind.
    ifstream input(options["--file"]);
    if(options["--file"] != "" && !input.is_open()) {
        cerr << "bin2cpp: cannot open input file '" << options["--file"] << "'" << endl;
        return EXIT_FAILURE;
    }

    //Save default cout buffer. Need this to prevent crash.
    auto bak = cout.rdbuf();
    unique_ptr<ofstream> outfile;

    // Set defaults
    if(options["--name"] == "") { options["--name"] = "var"; }
    if(options["--output"] != "") {
        //redirect stream if output file is specified
        outfile.reset(new ofstream(options["--output"]));
        if(!outfile->is_open()) {
            cerr << "bin2cpp: cannot open output file '" << options["--output"] << "'" << endl;
            return EXIT_FAILURE;
        }
        cout.rdbuf(outfile->rdbuf());
    }

    // The include guard is skipped when a .cpp file is being generated.
    if(options["--extension"] != "cpp")
        cout << "#pragma once\n";
    cout << "\n";
    cout << "#include <cstddef>\n"; // defines size_t
    cout << "\n";

    // Open one namespace per whitespace-separated word of --namespace.
    int ns_cnt = 0;
    int level = 0;
    if(options["--namespace"] != "") {
        std::stringstream namespaces(options["--namespace"]);
        string name;
        namespaces >> name;
        do {
            add_tabs(level++);
            cout << "namespace " << name << "\n";
            cout << "{\n";
            ns_cnt++;
            namespaces >> name;
        } while(!namespaces.fail());
    }

    if(options["--type"] == "") {
        options["--type"] = "char";
    }
    add_tabs(level);
    cout << "\n";
    cout << "static const " << options["--type"] << " " << options["--name"] << "[] = {\n";

    size_t char_cnt = 0;
    add_tabs(++level);
    for(char i; input.get(i);) {
        // Convert through unsigned char: where char is signed, a byte >= 0x80
        // would otherwise be sign-extended and printed as 0xffffff80..0xffffffff.
        cout << "0x" << std::hex << static_cast<int>(static_cast<unsigned char>(i)) << ",\t";
        char_cnt++;
        if(!(char_cnt % 10)) {
            cout << endl;
            add_tabs(level);
        }
    }
    if (options["--eof"].c_str()[0] == '1') {
        // Add end of file character
        cout << "0x0";
        char_cnt++;
    }
    cout << "};\n";
    add_tabs(--level);
    cout << "\n";
    cout << "static const std::size_t " << options["--name"] << "_len" << " = " << std::dec << char_cnt << ";\n";
    cout << "\n";

    // Close the namespaces opened above.
    while(ns_cnt--) {
        add_tabs(--level);
        cout << "}\n";
    }

    // Restore cout before `outfile` (and its buffer) is destroyed.
    cout.rdbuf(bak);
}

View File

@@ -1,130 +0,0 @@
#Thanks to Andreas Knoeckler for providing stand-alone boost.python
#through PyOpenCL and PyCUDA
import os, sys
from distutils.ccompiler import show_compilers,new_compiler
from distutils.command.build_ext import build_ext
from distutils.command.build_py import build_py
from distutils.core import setup, Extension
from distutils.sysconfig import get_python_inc
from distutils import sysconfig
from imp import find_module
from glob import glob
from os.path import dirname
# Per-compiler overrides, keyed by the distutils compiler type string and read
# by build_ext_subclass.build_extensions(). All three are empty by default.
platform_cflags = {}   # compiler type -> extra compile args
platform_ldflags = {}  # compiler type -> extra link args
platform_libs = {}     # compiler type -> extra libraries to link
class build_ext_subclass(build_ext):
    """build_ext variant that applies per-compiler flags and libraries.

    Before delegating to the stock ``build_ext.build_extensions``, every
    extension receives the compile args, link args and extra libraries
    registered for the active compiler type in ``platform_cflags``,
    ``platform_ldflags`` and ``platform_libs``.
    """

    def build_extensions(self):
        c = self.compiler.compiler_type
        if c in platform_cflags:
            for e in self.extensions:
                e.extra_compile_args = platform_cflags[c]
        if c in platform_ldflags:
            for e in self.extensions:
                e.extra_link_args = platform_ldflags[c]
        if c in platform_libs:
            for e in self.extensions:
                try:
                    e.libraries += platform_libs[c]
                except (AttributeError, TypeError):
                    # No list to extend (attribute missing or not a list):
                    # start from the platform list instead.
                    e.libraries = platform_libs[c]
        build_ext.build_extensions(self)
def main():
    """Adjust the distutils build flags, then build/install the ``isaac`` package.

    The ``${...}`` placeholders in this file are substituted before it is run.
    """

    def recursive_glob(rootdir='.', suffix=''):
        """Return the path of every file below ``rootdir`` whose name ends with ``suffix``."""
        return [os.path.join(looproot, filename)
                for looproot, _, filenames in os.walk(rootdir)
                for filename in filenames if filename.endswith(suffix)]

    def remove_prefixes(optlist, bad_prefixes):
        """Remove, for each bad prefix, the first flag starting with it.

        ``optlist`` is modified in place and also returned.
        """
        for bad_prefix in bad_prefixes:
            for i, flag in enumerate(optlist):
                if flag.startswith(bad_prefix):
                    optlist.pop(i)
                    break
        return optlist

    #Tweaks warning, because boost-numpy and boost-python won't compile cleanly without these changes
    cvars = sysconfig.get_config_vars()
    cvars['OPT'] = ' '.join(remove_prefixes(cvars['OPT'].split(), ['-g', '-Wstrict-prototypes']))
    cvars["CFLAGS"] = cvars["BASECFLAGS"] + ' ' + cvars['OPT']
    cvars["LDFLAGS"] = '-Wl,--no-as-needed ' + cvars["LDFLAGS"]

    #Check Android
    # A missing PY_CFLAGS entry is treated as "not Android" instead of raising KeyError.
    for_android = '-mandroid' in (cvars.get('PY_CFLAGS') or '')

    #Dynamic load for backend switching
    libraries = ['dl']
    library_dirs = []

    #Include directories
    numpy_include = os.path.join(find_module("numpy")[1], "core", "include")
    include ='${INCLUDE_DIRECTORIES_STR}'.split() + ['external/boost/', 'external/boost/boost/', numpy_include]

    #Android
    if for_android:
        ANDROID_ROOT = os.environ['ANDROIDNDK'] + '/sources/cxx-stl/gnu-libstdc++/' + os.environ['TOOLCHAIN_VERSION']
        library_dirs += [ANDROID_ROOT + '/libs/armeabi']
        include += [ANDROID_ROOT + '/include/', ANDROID_ROOT + '/libs/armeabi/include/']
        libraries += ['gnustl_shared']

    #Source files
    src = '${LIBISAAC_SRC_STR}'.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
    # Bundled boost sources; win32- and pthread-specific files are skipped.
    boostsrc = 'external/boost/libs/'
    for s in ['numpy','python','smart_ptr','system','thread']:
        src = src + [x for x in recursive_glob(boostsrc + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]

    extensions = []

    #isaac
    extensions += [Extension(
        '_isaac',src,
        extra_compile_args= ['-std=c++11', '-Wno-unused-function', '-Wno-unused-local-typedefs', '-Wno-sign-compare', '-Wno-attributes', '-DBOOST_PYTHON_SOURCE '],
        extra_link_args=['-Wl,-soname=_isaac.so'],
        undef_macros=[],
        include_dirs=include,
        library_dirs=library_dirs,
        libraries=libraries)]

    #External
    extensions += [Extension('external.sklearn._tree',
                             ['external/sklearn/_tree.c'],
                             include_dirs = [numpy_include])]

    #Setup
    # NOTE(review): `license` says MPL 2.0 while the classifiers list the MIT
    # License - confirm which one is intended.
    setup(
        name='isaac',
        version='1.0',
        description="Input-specific architecture-aware computations",
        author='Philippe Tillet',
        author_email='ptillet@g.harvard.edu',
        license='MPL 2.0',
        packages=['isaac', 'isaac.external', 'isaac.external.sklearn'],
        ext_package="isaac",
        ext_modules=extensions,
        cmdclass={'build_py': build_py, 'build_ext': build_ext_subclass},
        classifiers=[
            'Environment :: Console',
            'Development Status :: 1 - Experimental',
            'Intended Audience :: Developers',
            'Intended Audience :: Other Audience',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: MIT License',
            'Natural Language :: English',
            'Programming Language :: C++',
            'Programming Language :: Python',
            'Programming Language :: Python :: 3',
            'Topic :: Scientific/Engineering',
            'Topic :: Scientific/Engineering :: Mathematics',
            'Topic :: Scientific/Engineering :: Physics',
            'Topic :: Scientific/Engineering :: Machine Learning',
        ]
    )


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,12 +0,0 @@
# Cross-compilation toolchain: Windows target built with the i686 MinGW-w64 tools.
set(CMAKE_SYSTEM_NAME Windows)

# C, C++ and resource compilers
set(CMAKE_C_COMPILER /usr/bin/i686-w64-mingw32-gcc)
set(CMAKE_CXX_COMPILER /usr/bin/i686-w64-mingw32-g++)
set(CMAKE_RC_COMPILER /usr/bin/i686-w64-mingw32-windres)

# Roots of the target environment. Headers and libraries are searched only
# there; programs are never searched there (host programs are used).
set(CMAKE_FIND_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/crossdeps" /usr/i686-w64-mingw32)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

View File

@@ -1,12 +0,0 @@
# Cross-compilation toolchain: Windows target built with the x86_64 MinGW-w64 tools.
set(CMAKE_SYSTEM_NAME Windows)

# C, C++ and resource compilers
set(CMAKE_C_COMPILER /usr/bin/x86_64-w64-mingw32-gcc)
set(CMAKE_CXX_COMPILER /usr/bin/x86_64-w64-mingw32-g++)
set(CMAKE_RC_COMPILER /usr/bin/x86_64-w64-mingw32-windres)

# Roots of the target environment. Headers and libraries are searched only
# there; programs are never searched there (host programs are used).
set(CMAKE_FIND_ROOT_PATH "${CMAKE_CURRENT_LIST_DIR}/crossdeps" /usr/x86_64-w64-mingw32)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

View File

@@ -2,6 +2,6 @@ for i in $(find ../lib/ ../include/isaac/ ../python/src/bind -name '*.cpp' -or -
do do
if ! grep -q Copyright $i if ! grep -q Copyright $i
then then
cat license-header.txt $i >$i.new && mv $i.new $i cat ../LICENSE $i >$i.new && mv $i.new $i
fi fi
done done

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.2 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 40 KiB

View File

@@ -1,69 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np
def add_line(ax, xpos, ypos, height=.1):
    """Draw a black vertical segment on ``ax`` in axes coordinates.

    The segment sits at ``xpos`` and runs from ``ypos + height`` to ``ypos``.
    Clipping is turned off for it.
    """
    xs = [xpos, xpos]
    ys = [ypos + height, ypos]
    separator = plt.Line2D(xs, ys, transform=ax.transAxes, color='black')
    separator.set_clip_on(False)
    ax.add_line(separator)
# Benchmark groups: (title printed under the group, second caption line).
bench = [('DeepBench-Forward\nM=K=1760', 'N'),
         ('DeepBench-Backward\nM=K=2560', 'N'),
         ('Covariance\nK=60000', 'M=N'),
         ('Blocked SVD\nK=32', 'M=N')]
# Tick labels of each group, in the same order as `bench`.
labels = [[16, 32, 64, 128, 7000],
          [16, 32, 64, 128, 7000],
          [32, 256],
          [896, 3456, 4096]]
# Per-device results plotted on the TFLOPS axis: one value per tick label,
# with the groups concatenated in `labels` order (one source line per group).
#   'lib'     - name of the reference library (legend entry and file name)
#   'libperf' - results of the reference library
#   'libcol'  - bar colour used for the reference library
#   'scperf'  - results shown under the 'ISAAC' legend entry
configs = {
    'Pascal Titan X': {'lib': 'cuBLAS',
                       'libperf': [1.65, 1.88, 2.58, 4.83, 11.5,
                                   0.72, 1.72, 2.39, 2.86, 7.77,
                                   0.80, 3.61,
                                   1.37, 2.50, 2.57],
                       'libcol': 'green',
                       'scperf': [1.15, 2.43, 3.83, 5.53, 11.5,
                                  1.78, 3.06, 4.37, 5.52, 8.67,
                                  1.44, 6.43,
                                  1.14, 4.53, 4.91]},
    'R9 Fury': {'lib': 'clBLAS',
                'libperf': [0.22, 0.65, 1.35, 1.92, 3.35,
                            0.28, 0.64, 1.36, 1.91, 3.32,
                            0.02, 0.87,
                            0.43, 0.98, 1.95],
                'libcol': '#d30034',
                'scperf': [0.67, 0.94, 1.18, 2.12, 4.66,
                           0.63, 1.15, 1.43, 1.82, 4.22,
                           0.19, 2.82,
                           0.35, 1.82, 1.80]}
}
# One grouped bar chart per device, saved as bench-<lib>.png.
# dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for device, conf in configs.items():
    width = 0.5
    sep = 1.3
    # Bar positions: evenly spaced, with an extra gap of `sep` before each group.
    xx = sep*np.arange(len(conf['scperf'])) + width
    groups = [0] + [len(_) for _ in labels]
    for i in np.cumsum(groups)[:-1]:
        xx[i:] += sep
    xmax = xx[-1] + width + sep
    figure, ax = plt.subplots(figsize=(12,8))
    sc = ax.bar(xx - width, conf['scperf'], width, color='purple')
    cu = ax.bar(xx, conf['libperf'], width, color=conf['libcol'])
    # Group boundaries, used for the separator lines and to centre the captions.
    linex = [(xx[i] - sep) for i in np.cumsum(groups)[1:-1]]
    linex = [0] + linex + [xmax]
    for i in range(len(linex)-1):
        group, sublabel = bench[i]
        add_line(ax, linex[i]/xmax, 0, -10)
        ax.text(.5*(linex[i] + linex[i+1])/xmax, -.12, group, ha='center', transform=ax.transAxes, fontsize = 10, color='darkblue')
        ax.text(.5*(linex[i] + linex[i+1])/xmax, -.07, sublabel, ha='center', transform=ax.transAxes, fontsize = 10)
    ax.set_xlim((0,xmax))
    ax.set_xticks(xx)
    ax.set_xticklabels([x for _ in labels for x in _ ], rotation=30, fontsize=10)
    ax.set_ylabel('TFLOPS')
    ax.legend((sc, cu), ('ISAAC', conf['lib']))
    ax.set_title('sGEMM - {}'.format(device))
    plt.savefig('bench-{}.png'.format(conf['lib']))
plt.show()

View File

@@ -1,21 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

View File

@@ -1,4 +1,6 @@
foreach(PROG indexing) foreach(PROG bench)
add_executable(example-${PROG} ${PROG}.cpp) add_executable(${PROG} ${PROG}.cpp)
target_link_libraries(example-${PROG} isaac) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG})
include_directories(/usr/local/cuda/include/)
target_link_libraries(${PROG} PRIVATE isaac)
endforeach(PROG) endforeach(PROG)

181
examples/bench.cpp Normal file
View File

@@ -0,0 +1,181 @@
#include <tuple>
#include "isaac/driver/backend.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/context.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/tools/bench.hpp"
#include "isaac/api.h"
namespace sc = isaac;
namespace drv = sc::driver;
using sc::param_t;
using std::make_tuple;
// Geometric mean of `data`, computed as exp of the average of the logarithms.
double geometric_mean(std::vector<double> const&data){
  double logsum = 0;
  for(double x: data)
    logsum = logsum + std::log(x);
  return std::exp(logsum/data.size());
}
// Prints the table header in italic bold: every entry of `sections` followed
// by a tab, then the two fixed result columns "ISAAC" and "cuDNN".
void print_results_header(std::vector<std::string> sections){
  std::cout << color_stream(ITALIC) << color_stream(BOLD);
  for(std::string const & title: sections)
    std::cout << title << "\t";
  std::cout << "ISAAC\tcuDNN" << color_stream(RESET) << std::endl;
}
// Prints one table row: the `prefix` columns, then fn(t) for every entry t of
// `times`, all tab-separated.
// A value is coloured when it is at least 5% above the second-highest value of
// the row. With fewer than two values there is no second-highest one, so
// nothing is coloured.
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<double(double)> fn){
  std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
  std::vector<double> perf;
  std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
  // Only rank the values when a runner-up exists; this avoids reading past the
  // end of the vector for rows with zero or one value.
  bool const has_runner_up = perf.size() >= 2;
  double runner_up = 0;
  if(has_runner_up){
    auto fastest = perf;
    std::sort(fastest.begin(), fastest.end(), std::greater<double>());
    runner_up = fastest[1];
  }
  for(auto x: perf){
    if(has_runner_up && x/runner_up >= 1.05)
      std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
    else
      std::cout << x;
    std::cout << "\t";
  }
  std::cout << std::endl;
}
// Benchmark driver: times sc::CONV against cudnnConv and sc::GEMM against
// cublasGemm over fixed lists of problem shapes. It prints one row per shape
// and the geometric mean of the per-shape time ratios.
int main(){
  std::cout << std::fixed << std::setprecision(2);
  auto ctx = drv::backend::contexts::get_default();
  drv::Stream stream(ctx);
  sc::DType dtype = sc::FLOAT_TYPE;
  int32_t dtsize = sc::size_of(dtype);
  drv::Device const & device = drv::backend::contexts::get_default().device();
  //CONV
  {
    // Tuple layout (see the std::tie below):
    // W, H, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w
    typedef std::tuple<param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_tuple;
    std::vector<conv_tuple> shapes;
    //Cluster 1
    for(size_t N: std::vector<size_t>{4, 8, 16, 32})
      shapes.push_back(std::make_tuple(700, 161, 1, N, 32, 5, 20, 0, 0, 2, 2));
    //Cluster 2
    for(size_t N: std::vector<size_t>{4, 8, 16, 32})
      shapes.push_back(std::make_tuple(341, 79, 32, N, 32, 5, 10, 0, 0, 2, 2));
    //Cluster 3
    shapes.push_back(std::make_tuple(480, 48, 1, 16, 16, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(240, 24, 16, 16, 32, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(120, 12, 32, 16, 64, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(60, 6, 64, 16, 128, 3, 3, 1, 1, 1, 1));
    //Cluster 4
    shapes.push_back(std::make_tuple(108, 108, 3, 8, 64, 3, 3, 1, 1, 2, 2));
    shapes.push_back(std::make_tuple(54, 54, 64, 8, 64, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(27, 27, 128, 8, 128, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(14, 14, 128, 8, 256, 3, 3, 1, 1, 1, 1));
    shapes.push_back(std::make_tuple(7, 7, 256, 8, 512, 3, 3, 1, 1, 1, 1));
    //Cluster 5-6
    for(size_t N: std::vector<size_t>{8, 16}){
      shapes.push_back(std::make_tuple(224, 224, 3, N, 64, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(112, 112, 64, N, 128, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(56, 56, 128, N, 256, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(28, 28, 256, N, 512, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(14, 14, 512, N, 512, 3, 3, 1, 1, 1, 1));
      shapes.push_back(std::make_tuple(7, 7, 512, N, 512, 3, 3, 1, 1, 1, 1));
    }
    //Cluster 7
    shapes.push_back(std::make_tuple(224, 224, 3, 16, 64, 7, 7, 3, 3, 2, 2));
    shapes.push_back(std::make_tuple(28, 28, 192, 16, 32, 5, 5, 2, 2, 1, 1));
    shapes.push_back(std::make_tuple(28, 28, 192, 16, 64, 1, 1, 0, 0, 1, 1));
    shapes.push_back(std::make_tuple(14, 14, 512, 16, 48, 5, 5, 2, 2, 1, 1));
    shapes.push_back(std::make_tuple(14, 14, 512, 16, 192, 1, 1, 0, 0, 1, 1));
    shapes.push_back(std::make_tuple(7, 7, 832, 16, 256, 1, 1, 0, 0, 1, 1));
    shapes.push_back(std::make_tuple(7, 7, 832, 16, 128, 5, 5, 2, 2, 1, 1));
    param_t W, H, P, Q, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w;
    std::cout << "======================================================================" << std::endl;
    std::cout << "FCONV" << std::endl;
    std::cout << "======================================================================" << std::endl;
    print_results_header({"N", "K", "P", "Q", "C", "R", "S"});
    std::vector<double> speedup;
    for(auto shape: shapes){
      std::tie(W, H, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w) = shape;
      // P and Q are derived from the input size, filter size, padding and stride.
      P = (H - R + 1 + 2*pad_h)/stride_h;
      Q = (W - S + 1 + 2*pad_w)/stride_w;
      sc::scalar alpha(1., dtype);
      sc::scalar beta(0., dtype);
      // Buffers are sized in bytes (element count times dtsize).
      drv::Buffer O(ctx, N*K*P*Q*dtsize);
      drv::Buffer I(ctx, C*H*W*N*dtsize);
      drv::Buffer F(ctx, K*C*R*S*dtsize);
      // times[0] is the sc::CONV measurement, times[1] the cudnnConv one.
      std::vector<double> times;
      times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
      times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
      speedup.push_back(times[1]/times[0]);
      print_results(times, {str(N), str(K), str(P), str(Q), str(C), str(R), str(S)}, [&](double tsec){ return sc::templates::Conv::tflops(P,Q,K,N,C,R,S,tsec);});
    }
    std::cout << "======================================================================" << std::endl;
    std::cout << "Speedup: " << geometric_mean(speedup) << std::endl;
    std::cout << std::endl;
  }
  //GEMM
  {
    // Tuple layout (see the std::tie below): AT, BT, M, N, K
    typedef std::tuple<sc::IsaacOperation_t, sc::IsaacOperation_t, param_t, param_t, param_t> gemm_tuple;
    std::vector<gemm_tuple> shapes;
    // LinPack
    for(param_t N: std::vector<param_t>{512, 1024, 2048})
      shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
    // DeepBench [Forward]
    for(param_t M: std::vector<param_t>{1760})
      for(param_t N: std::vector<param_t>{8, 16, 32, 64, 128})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_N, M, N, M));
    // DeepBench [Backward]
    for(param_t M: std::vector<param_t>{1760})
      for(param_t N: std::vector<param_t>{8, 16, 32, 64, 128})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_T, sc::ISAAC_OP_N, M, N, M));
    // PCA/ICA
    for(param_t N: std::vector<param_t>{16, 64, 256})
      for(param_t K: std::vector<param_t>{64000})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
    // LaPACK
    for(param_t N: std::vector<param_t>{1024, 2048, 4096})
      for(param_t K: std::vector<param_t>{32})
        shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
    sc::IsaacOperation_t AT, BT;
    param_t M, N, K;
    std::cout << "======================================================================" << std::endl;
    std::cout << "GEMM:" << std::endl;
    std::cout << "======================================================================" << std::endl;
    // NOTE(review): print_results_header() labels the last column "cuDNN", but
    // this section measures cublasGemm - confirm the label is acceptable here.
    print_results_header({"AT", "BT", "M", "N", "K"});
    std::vector<double> speedup;
    for(auto shape: shapes){
      std::tie(AT, BT, M, N, K) = shape;
      sc::scalar alpha(1., dtype);
      sc::scalar beta(0., dtype);
      // Leading dimensions depend on whether each operand is transposed.
      size_t ldc = M;
      size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
      size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
      char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
      char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
      drv::Buffer C(ctx, M*N*dtsize);
      drv::Buffer A(ctx, M*K*dtsize);
      drv::Buffer B(ctx, K*N*dtsize);
      // times[0] is the sc::GEMM measurement, times[1] the cublasGemm one.
      std::vector<double> times;
      times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, device));
      times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, ctx, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device));
      speedup.push_back(times[1]/times[0]);
      print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, [&](double tsec){ return sc::templates::GEMM::tflops(M, N, K, tsec);});
    }
    std::cout << "======================================================================" << std::endl;
    std::cout << "Speedup: " << geometric_mean(speedup) << std::endl;
  }
}

View File

@@ -1,60 +0,0 @@
#include "isaac/array.h"
#include "isaac/symbolic/scheduler/dag.h"
namespace sc = isaac;
// Builds a scheduler DAG for the matrix product C = dot(A, B) by recursively
// halving the problem, and exports the DAG as a Graphviz file.
class carma_generator
{
  // Appends the nodes computing C = dot(A, B) to dag_.
  // While `depth` is below split_.size(), the problem is halved along M when M
  // is the largest of (M, N, K), else along N when N is the largest, else
  // along K. Once that depth is reached a single dot node is appended.
  void apply_impl(sc::array_base const & A, sc::array_base const & B, sc::view C, size_t depth)
  {
    if(depth>=split_.size()){
      dag_.append(sc::assign(C, sc::dot(A, B)), "C = dot(A, B)");
    }
    else
    {
      sc::int_t M = C.shape()[0], N = C.shape()[1], K = A.shape()[1];
      size_t new_depth = depth + 1;
      //Split along M
      if(M >= N && M >= K){
        apply_impl(A({0, M/2}, {sc::all}), B, C({0, M/2}, sc::all), new_depth);
        apply_impl(A({M/2, sc::end}, {sc::all}), B, C({M/2, sc::end}, sc::all), new_depth);
      }
      //Split along N
      else if(N >= M && N >= K){
        apply_impl(A, B(sc::all, {0, N/2}), C(sc::all, {0, N/2}), new_depth);
        apply_impl(A, B(sc::all, {N/2, sc::end}), C(sc::all, {N/2, sc::end}), new_depth);
      }
      //Split along K
      else{
        // Both halves produce a full-size partial result in a DAG temporary;
        // the two partial results are then added into C.
        sc::array_base & C1 = dag_.create_temporary(new sc::array(C.shape(), C.dtype(), C.context()));
        sc::array_base & C2 = dag_.create_temporary(new sc::array(C.shape(), C.dtype(), C.context()));
        apply_impl(A(sc::all, {0, K/2}), B({0, K/2}, sc::all), C1, new_depth);
        apply_impl(A(sc::all, {K/2, sc::end}), B({K/2, sc::end}, sc::all), C2, new_depth);
        dag_.append(sc::assign(C, C1 + C2), "C = C1 + C2");
      }
    }
  }
public:
  // `depth` is the number of recursive splits: split_ gets `depth` elements and
  // only its size is read by apply_impl().
  carma_generator(size_t depth): split_(depth)
  { }
  // Builds the DAG for C = dot(A, B) and writes it to "test.dot".
  void apply(sc::array_base const & A, sc::array_base const & B, sc::array_base & C)
  {
    apply_impl(A, B, sc::view(C), 0);
    dag_.export_graphviz("test.dot");
  }
private:
  sc::symbolic::scheduler::dag dag_;
  std::vector<sc::int_t> split_;
};
// Generates the split DAG for a (131 x 5023) * (5023 x 1402) product
// with at most three levels of recursion.
int main()
{
  sc::int_t M = 131;
  sc::int_t N = 1402;
  sc::int_t K = 5023;

  // C is M x N, A is M x K, B is K x N.
  sc::array C(M, N), A(M, K), B(K, N);

  carma_generator(3).apply(A, B, C);
}

View File

@@ -1,43 +0,0 @@
#include "isaac/array.h"
namespace sc = isaac;
// Indexing tutorial: builds a few small arrays and prints 1*s*x + x.
int main()
{
  static const char * dline = "====================";
  std::cout << dline << std::endl
            << "Tutorial: Indexing " << std::endl
            << dline << std::endl;

  // data = 0, 1, ..., M*N - 1
  sc::int_t M = 5, N = 12;
  std::vector<float> data(M*N);
  for(size_t pos = 0; pos != data.size(); ++pos)
    data[pos] = static_cast<float>(pos);

  // A, y and B are not used by the expression printed below; they are still
  // constructed, as in the original tutorial, in the same order.
  sc::array A(M, N, data);
  sc::array s({1,1}, std::vector<float>{5});
  sc::array x({1,3}, std::vector<float>{1,2,3});
  sc::array y({3,3}, std::vector<float>{1,2,3,4,5,6,7,8,9});
  sc::array B({4,3}, std::vector<float>{0,1,2,3,4,5,6,7,8,9,10,11});

  std::cout << 1*s*x + x << std::endl;
}

186
examples/ptx-conv.cpp Normal file
View File

@@ -0,0 +1,186 @@
#include <sstream>
#include <chrono>
#include <exception>
#include <fstream>
#include <iomanip>
#include "isaac/driver/backend.h"
#include "isaac/driver/module.h"
#include "isaac/driver/error.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/buffer.h"
#include "isaac/templates/error.hpp"
#include <string>
#include <iostream>
#include <cassert>
#include <cstdlib>
#include "isaac/tools/bench.hpp"
#include "isaac/templates/conv.h"
namespace sc = isaac;
namespace drv = isaac::driver;
// Linear offset of element (x, y, z, w) in a densely packed 4-D tensor whose
// extents are (s0, s1, s2, s3), with w the fastest-varying index.
// The outermost extent s0 does not influence the offset and is ignored.
inline int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w,
                   int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3)
{
  // Horner form of x*s1*s2*s3 + y*s2*s3 + z*s3 + w
  return ((x*s1 + y)*s2 + z)*s3 + w;
}
// CPU reference 2-D convolution (cross-correlation, no filter flip).
// Memory layouts, fastest index last:
//   I: [N][C][H][W]   F: [K][C][R][S]   O: [N][K][P][Q]
// Input positions that fall outside the image (padding) contribute zero.
// O is overwritten, not accumulated into.
void cpp_conv_nchw(int32_t C, int32_t N, int32_t K,
                   int32_t H, int32_t W,
                   int32_t R, int32_t S,
                   int32_t pad_h, int32_t pad_w,
                   int32_t stride_h, int32_t stride_w,
                   int32_t P, int32_t Q,
                   float* O, float* I, float* F)
{
  for(int32_t k = 0; k < K; ++k)
  for(int32_t p = 0; p < P; ++p)
  for(int32_t q = 0; q < Q; ++q)
  for(int32_t n = 0; n < N; ++n)
  {
    // Top-left corner of the receptive field in input coordinates.
    const int32_t h0 = p*stride_h - pad_h;
    const int32_t w0 = q*stride_w - pad_w;
    float sum = 0;
    for(int32_t c = 0; c < C; ++c)
    for(int32_t r = 0; r < R; ++r)
    {
      const int32_t h = h0 + r;
      if(h < 0 || h >= H)
        continue;
      for(int32_t s = 0; s < S; ++s)
      {
        const int32_t w = w0 + s;
        if(w < 0 || w >= W)
          continue;
        const int32_t f_off = ((k*C + c)*R + r)*S + s;
        const int32_t i_off = ((n*C + c)*H + h)*W + w;
        sum += F[f_off]*I[i_off];
      }
    }
    O[((n*K + k)*P + p)*Q + q] = sum;
  }
}
// CPU reference 2-D convolution (cross-correlation, no filter flip).
// Memory layouts, fastest index last:
//   I: [C][H][W][N]   F: [C][R][S][K]   O: [K][P][Q][N]
// Input positions that fall outside the image (padding) contribute zero.
// O is overwritten, not accumulated into.
void cpp_conv_chwn(int32_t C, int32_t N, int32_t K,
                   int32_t H, int32_t W,
                   int32_t R, int32_t S,
                   int32_t pad_h, int32_t pad_w,
                   int32_t stride_h, int32_t stride_w,
                   int32_t P, int32_t Q,
                   float* O, float* I, float* F)
{
  for(int32_t k = 0; k < K; ++k)
  for(int32_t p = 0; p < P; ++p)
  for(int32_t q = 0; q < Q; ++q)
  for(int32_t n = 0; n < N; ++n)
  {
    // Top-left corner of the receptive field in input coordinates.
    const int32_t h0 = p*stride_h - pad_h;
    const int32_t w0 = q*stride_w - pad_w;
    float sum = 0;
    for(int32_t c = 0; c < C; ++c)
    for(int32_t r = 0; r < R; ++r)
    {
      const int32_t h = h0 + r;
      if(h < 0 || h >= H)
        continue;
      for(int32_t s = 0; s < S; ++s)
      {
        const int32_t w = w0 + s;
        if(w < 0 || w >= W)
          continue;
        const int32_t f_off = ((c*R + r)*S + s)*K + k;
        const int32_t i_off = ((c*H + h)*W + w)*N + n;
        sum += F[f_off]*I[i_off];
      }
    }
    O[((k*P + p)*Q + q)*N + n] = sum;
  }
}
// Throughput of a convolution performing 2*P*Q*K*N*C*R*S floating-point
// operations in `time`, returned as flops / (time * 1e3).
// NOTE(review): the result is in TFLOPS only if `time` is in nanoseconds — confirm
// against what bench() returns.
double get_tflops(uint64_t P, uint64_t Q, uint64_t K, uint64_t N, uint64_t C, uint64_t R, uint64_t S, double time){
  const uint64_t flops = 2*P*Q*K*N*C*R*S;
  const double scaled_time = time*1e3;
  return flops/scaled_time;
}
// When true, every generated kernel is checked against the CPU reference
// (cpp_conv_chwn) after it has been benchmarked.
bool test = false;

// Exhaustively searches the tuning-parameter space of isaac::templates::Conv for
// one fixed convolution shape, printing the throughput of every valid
// configuration, then benchmarks cuDNN on the same problem for comparison.
int main(){
  auto ctx = drv::backend::contexts::get_default();
  int32_t dtsize = 4;
  //Arguments
  int32_t C = 1, N = 4, K = 32;
  int32_t H = 68, W = 260;
  int32_t R = 5, S = 5;
  int32_t pad_h = 0, pad_w = 0;
  int32_t stride_h = 1, stride_w = 1;
  // Output extent: floor((in + 2*pad - filter)/stride) + 1.
  // (in - filter + 1 + 2*pad)/stride gives the same value only when stride == 1.
  int32_t P = (H + 2*pad_h - R)/stride_h + 1;
  int32_t Q = (W + 2*pad_w - S)/stride_w + 1;

  //Host and device buffers
  std::vector<float> iO(K*P*Q*N);
  std::vector<float> iI(C*H*W*N);
  std::vector<float> iF(C*R*S*K);
  drv::Buffer O(ctx, iO.size()*dtsize);
  drv::Buffer I(ctx, iI.size()*dtsize);
  for(size_t i = 0; i < iI.size(); ++i) iI[i] = (float)rand()/RAND_MAX;
  drv::Buffer F(ctx, iF.size()*dtsize);
  for(size_t i = 0; i < iF.size(); ++i) iF[i] = (float)rand()/RAND_MAX;
  drv::Stream queue(ctx);
  queue.write(O, true, 0, iO.size()*dtsize, iO.data());
  queue.write(I, true, 0, iI.size()*dtsize, iI.data());
  queue.write(F, true, 0, iF.size()*dtsize, iF.data());
  sc::scalar alpha(1., sc::FLOAT_TYPE);
  // NOTE(review): beta is 1, so if the kernels accumulate into O the validation
  // below would compare an accumulated result to a single reference pass — confirm.
  sc::scalar beta(1., sc::FLOAT_TYPE);

  //Reference result (overwrites iO, which has already been uploaded as zeros)
  if(test)
    cpp_conv_chwn(C, N, K, H, W, R, S, pad_h, pad_w, stride_h, stride_w, P, Q, iO.data(), iI.data(), iF.data());
  std::vector<float> rO(iO.size());

  //Exhaustive search
  std::vector<int> rv = {2,4};
  std::vector<int> rl = {1,2,4};
  std::vector<int> rs = {1,2,4,8};
  float best = 0;
  for(size_t vec: rv)
  // This range used to be an empty vector, which turned the whole loop nest
  // into a no-op; it now mirrors the range used for bq.
  for(size_t bp: std::vector<int>{1,2,4})
  for(size_t bq: std::vector<int>{1,2,4})
  for(size_t bn: rl)
  for(size_t bk: rl)
  for(size_t bf_n: rl)
  for(size_t ps: std::vector<int>{1,2,4})
  for(size_t qs: std::vector<int>{1,2,4})
  for(size_t ns: rs)
  for(size_t ks: rs)
  for(size_t crs_l: rl)
  for(size_t crs_s: std::vector<int>{1})
  for(size_t cs: std::vector<int>{1})
  for(size_t bc: std::vector<int>{1})
  for(size_t gridc: std::vector<int>{1})
  {
    // Compile; parameter combinations the template rejects are skipped.
    isaac::templates::Conv conv(sc::FLOAT_TYPE, C, H, W, N, K, P, Q, R, S, pad_h, pad_w, stride_h, stride_w, vec, bp, bq, bn, bk, bf_n, ps, qs, ns, ks, crs_l, crs_s, cs, bc, gridc);
    std::string src;
    try{
      src = conv.dump(ctx.device(), "fconv");
    }catch(isaac::templates::invalid_parameters const &){
      continue;
    }
    drv::Module program(ctx, src, true);
    drv::Kernel kernel(program, "fconv");

    //Launch; configurations that exceed the device resources are skipped.
    float time;
    try{
      time = bench([&](){ conv.enqueue(kernel, queue, alpha, I, F, beta, O); },
                   [&](){ queue.synchronize(); }, ctx.device());
    }catch(drv::exception::cuda::launch_out_of_resources const &){
      continue;
    }

    //Report
    float tflops = get_tflops(P,Q,K,N,C,R,S,time);
    best = std::max(tflops, best);
    std::cout << "//" << vec << " " << bp << " " << bq << " " << bn << " " << bk << " " << bf_n << " " << ps << " " << qs << " " << ns << " " << ks << " " << crs_l << " " << crs_s << " " << cs << " " << bc << " " << gridc << ": " << std::setprecision(3) << tflops << " [ " << best << " ] " << std::endl;

    //Test: abort on the first element whose relative error exceeds 1e-4 or that is NaN.
    if(test){
      queue.read(O, true, 0, rO.size()*dtsize, rO.data());
      for(size_t i = 0 ; i < rO.size(); ++i){
        bool mismatch = fabs((iO[i] - rO[i])/rO[i]) > 1e-4 || std::isnan(rO[i]);
        if(mismatch){
          std::cout << "// Failure at idx " << i << ": " << iO[i] << " != " << rO[i] << std::endl;
          exit(1);
        }
      }
    }
  }

  //cuDNN
  float time = bench([&](){sc::driver::cudnnConv(sc::FLOAT_TYPE, ctx, queue, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); },
                     [&](){ queue.synchronize(); }, ctx.device());
  float tflops = get_tflops(P,Q,K,N,C,R,S,time);
  std::cout << "TFLOPs: " << tflops << std::endl;
}

84
examples/ptx-gemm.cpp Normal file
View File

@@ -0,0 +1,84 @@
#include <sstream>
#include <chrono>
#include <exception>
#include <iomanip>
#include <string>
#include <iostream>
#include <cassert>
#include "isaac/driver/backend.h"
#include "isaac/driver/error.h"
#include "isaac/driver/module.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/cublas.h"
#include "isaac/half.hpp"
#include "isaac/tools/bench.hpp"
#include "isaac/tools/collections.hpp"
#include "isaac/templates/gemm.h"
#include "isaac/templates/error.hpp"
namespace sc = isaac;
namespace drv = isaac::driver;
// Benchmarks an M x N x K GEMM: first with cuBLAS (its throughput is printed on
// its own line), then with every valid configuration of isaac::templates::GEMM
// from an exhaustive parameter sweep, printing each configuration's throughput
// together with the best value seen so far.
//   AT, BT: transposition of A and B (sc::ISAAC_OP_N / sc::ISAAC_OP_T)
//   dtype : element type of A, B and C
void do_bench(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){
  auto ctx = drv::backend::contexts::get_default();
  size_t dtsize = sc::size_of(dtype);

  //Buffers
  int32_t AS0 = M, AS1 = K;
  int32_t BS0 = K, BS1 = N;
  // Compare against the named constant the callers pass rather than a char literal.
  if(AT==sc::ISAAC_OP_T) std::swap(AS0, AS1);
  if(BT==sc::ISAAC_OP_T) std::swap(BS0, BS1);
  int32_t ldc = M, lda = AS0, ldb = BS0;
  int32_t offc = 0, offa = 0, offb = 0;
  // Widen before multiplying so the element count cannot overflow int32_t.
  drv::Buffer C(ctx, static_cast<size_t>(M)*N*dtsize);
  drv::Buffer A(ctx, static_cast<size_t>(M)*K*dtsize);
  drv::Buffer B(ctx, static_cast<size_t>(K)*N*dtsize);
  drv::Stream queue(ctx);
  sc::scalar alpha(1., dtype), beta(0., dtype);

  // 2*M*N*K operations, pre-scaled by 1e-3; dividing by a bench() time gives the reported figure.
  const double scaled_flops = 2*1e-3*M*N*K;

  // cuBlas
  double cublas_time = bench([&](){ sc::driver::cublasGemm(dtype, ctx, queue, AT, BT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);}
                            , [&](){ queue.synchronize(); }, ctx.device());
  std::cout << scaled_flops/cublas_time << std::endl;

  //Exhaustive search
  std::vector<int> r1 = {1};
  std::vector<int> rv = {4};
  std::vector<int> rr = {1, 2, 4};
  std::vector<int> rl = {2, 4, 8, 16, 32};
  std::vector<int> rs = {1, 2, 4, 8, 16};
  double best = 0;
  for(auto const & x: sc::cpp::cartesian({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, r1, r1}))
  {
    isaac::templates::GEMM gemm(dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);

    //Compile; parameter combinations the template rejects are skipped.
    std::string src;
    try{
      src = gemm.dump(ctx.device(), "gemm");
    }catch(isaac::templates::invalid_parameters const &){
      continue;
    }
    drv::Module program(ctx, src, true);
    drv::Kernel kernel(program, "gemm");

    //Launch; configurations that exceed the device resources are skipped.
    double time;
    try{
      time = bench([&](){ gemm.enqueue(kernel, queue, alpha, A, B, beta, C); }, [&](){ queue.synchronize(); }, ctx.device());
    }catch(drv::exception::cuda::launch_out_of_resources const &){
      continue;
    }

    //Report
    double tflops = scaled_flops/time;
    best = std::max(tflops, best);
    std::cout << "//" << x[0] << " " << x[1] << " " << x[2] << " " << x[3] << " " << x[4] << " " << x[5] << " " << x[6] << " " << x[7] << " " << x[8] << " " << x[9] << " " << x[10] << " " << x[11] << " " << x[12] << " " << x[13] << " " << std::setprecision(3) << tflops << " [ " << best << " ] " << std::endl;
  }
}
// Runs the GEMM benchmark on a 2048 x 2048 x 2048 float problem with
// A passed as ISAAC_OP_N and B as ISAAC_OP_T.
int main(){
  const int32_t size = 2048;
  do_bench(size, size, size, sc::ISAAC_OP_N, sc::ISAAC_OP_T, sc::FLOAT_TYPE);
}

View File

@@ -1,53 +0,0 @@
/* ************************************************************************
* Copyright 2013 Advanced Micro Devices, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************/
#ifndef CLBLAS_COMPLEX_H_
/* The guard macro must be spelled exactly as tested above; with a space in place
 * of the underscore the guard symbol is never defined and a second inclusion
 * redefines the functions below. */
#define CLBLAS_COMPLEX_H_

#ifdef __cplusplus
extern "C" {
#endif

/* Complex numbers are stored in 2-component OpenCL vectors:
 * component 0 is the real part, component 1 the imaginary part. */
typedef cl_float2 FloatComplex;
typedef cl_double2 DoubleComplex;

/* Builds a single-precision complex value from its real and imaginary parts. */
static __inline FloatComplex
floatComplex(float real, float imag)
{
    FloatComplex z;
    z.s[0] = real;
    z.s[1] = imag;
    return z;
}

/* Builds a double-precision complex value from its real and imaginary parts. */
static __inline DoubleComplex
doubleComplex(double real, double imag)
{
    DoubleComplex z;
    z.s[0] = real;
    z.s[1] = imag;
    return z;
}

/* Accessors for the real / imaginary part of a FloatComplex or DoubleComplex. */
#define CREAL(v) ((v).s[0])
#define CIMAG(v) ((v).s[1])

#ifdef __cplusplus
} /* extern "C" { */
#endif

#endif /* CLBLAS_COMPLEX_H_ */

10096
include/external/clBLAS.h vendored

File diff suppressed because it is too large Load Diff

View File

@@ -1,22 +0,0 @@
/* ************************************************************************
* Copyright 2013 Advanced Micro Devices, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ************************************************************************/
/* The configured version and settings for clBLAS.
 * The three macros below are the major, minor and patch components of the
 * library version (2.6.0).
*/
#define clblasVersionMajor 2
#define clblasVersionMinor 6
#define clblasVersionPatch 0

View File

@@ -1,64 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"

View File

@@ -1,412 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
#include "cuda_runtime_api.h"
#include "host_defines.h"
#include "vector_types.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
/* Generic fallback: any type without a specialization gets an empty descriptor
 * (all component widths zero, kind cudaChannelFormatKindNone). */
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}

/* Half-precision descriptors: each component has the width of an unsigned short
 * and is tagged as a floating-point channel. The numeric suffix is the number
 * of populated components. */
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}

static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}

static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
}

static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
}
/* 8-bit integer descriptors. Scalar and N-component vector types fill the first
 * N component widths; the remaining widths are zero. */

/* Plain char: signedness follows the compiler's char-signedness macros. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
  const int bits = static_cast<int>(sizeof(char)) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
  const int bits = static_cast<int>(sizeof(signed char)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned char)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
/* Short-integer descriptors. Scalar and N-component vector types fill the first
 * N component widths; the remaining widths are zero. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
  const int bits = static_cast<int>(sizeof(short)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned short)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
/* Int descriptors. Scalar and N-component vector types fill the first
 * N component widths; the remaining widths are zero. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
  const int bits = static_cast<int>(sizeof(int)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned int)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
/* Long descriptors are only provided when __LP64__ is not defined. Scalar and
 * N-component vector types fill the first N component widths; the remaining
 * widths are zero. */
#if !defined(__LP64__)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
  const int bits = static_cast<int>(sizeof(long)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
  const int bits = static_cast<int>(sizeof(unsigned long)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
/* Single-precision float descriptors. Scalar and N-component vector types fill
 * the first N component widths; the remaining widths are zero. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
  const int bits = static_cast<int>(sizeof(float)) * 8;
  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */

View File

@@ -1,338 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
/* When trying to include C header file in C++ Code extern "C" is required
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
* extern "C" cannot be nested
* Hence keep the header out of extern "C" block
*/
#include <math.h> /* import fabsf, sqrt */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
/* Single-precision complex number stored in a float2:
 * member .x carries the real part and member .y the imaginary part.
 */
typedef float2 cuFloatComplex;

/* Return the real part of x (its .x member). */
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
    const float re = x.x;
    return re;
}

/* Return the imaginary part of x (its .y member). */
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
    const float im = x.y;
    return im;
}
/* Assemble a cuFloatComplex from a real part r and an imaginary part i. */
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
                                                             (float r, float i)
{
    cuFloatComplex z;
    z.y = i;    /* imaginary component */
    z.x = r;    /* real component */
    return z;
}
/* Complex conjugate of x: the real part is kept, the imaginary part negated. */
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
    const float re = cuCrealf(x);
    const float negIm = -cuCimagf(x);
    return make_cuFloatComplex (re, negIm);
}
/* Component-wise sum x + y of two single-precision complex numbers. */
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    const float sumRe = cuCrealf(x) + cuCrealf(y);
    const float sumIm = cuCimagf(x) + cuCimagf(y);
    return make_cuFloatComplex (sumRe, sumIm);
}
/* Component-wise difference x - y of two single-precision complex numbers. */
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    const float diffRe = cuCrealf(x) - cuCrealf(y);
    const float diffIm = cuCimagf(x) - cuCimagf(y);
    return make_cuFloatComplex (diffRe, diffIm);
}
/* Single-precision complex product x * y.
 *
 * The partial products are formed directly, without any scaling, so an
 * intermediate value may overflow even when the final result would be in
 * range. This is a deliberate trade-off: other implementations commonly
 * skip the guard as well (presumably for performance), and this one does
 * the same to stay competitive.
 */
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    /* Re(x*y) = Re(x)*Re(y) - Im(x)*Im(y) */
    const float re = (cuCrealf(x) * cuCrealf(y)) - (cuCimagf(x) * cuCimagf(y));
    /* Im(x*y) = Re(x)*Im(y) + Im(x)*Re(y) */
    const float im = (cuCrealf(x) * cuCimagf(y)) + (cuCimagf(x) * cuCrealf(y));
    return make_cuFloatComplex (re, im);
}
/* Single-precision complex quotient x / y.
 *
 * Guards against intermediate underflow and overflow by scaling: dividend
 * and divisor are both multiplied by 1 / (|Re(y)| + |Im(y)|) before the
 * usual "multiply by the conjugate, divide by the squared magnitude" step.
 * Guarded versions like this are usually the default in complex libraries,
 * some of which additionally offer a faster unguarded variant.
 */
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    /* Scale factor: reciprocal of |Re(y)| + |Im(y)|. */
    float norm1 = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
    float invNorm1 = 1.0f / norm1;
    /* Scaled divisor (b) and scaled dividend (a). */
    float bRe = cuCrealf(y) * invNorm1;
    float bIm = cuCimagf(y) * invNorm1;
    float aRe = cuCrealf(x) * invNorm1;
    float aIm = cuCimagf(x) * invNorm1;
    /* Squared magnitude of the scaled divisor, and its reciprocal. */
    float magSq = (bRe * bRe) + (bIm * bIm);
    float invMagSq = 1.0f / magSq;
    return make_cuFloatComplex (((aRe * bRe) + (aIm * bIm)) * invMagSq,
                                ((aIm * bRe) - (aRe * bIm)) * invMagSq);
}
/*
 * Magnitude |x| of a single-precision complex number.
 *
 * hypotf() would be the natural choice, but it is not available on all
 * platforms, so the computation is spelled out here. To avoid losing half
 * the exponent range to intermediate underflow/overflow, the larger
 * component is factored out: |x| = hi * sqrt(1 + (lo/hi)^2). This is the
 * simplest and fastest of the possible guarded schemes; it may be inaccurate
 * if sqrt and division are not IEEE compliant.
 */
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
    float absRe = fabsf(cuCrealf(x));
    float absIm = fabsf(cuCimagf(x));
    /* hi takes absRe only when absRe > absIm holds; otherwise (ties
     * included) absIm is hi and absRe is lo. */
    float hi = (absRe > absIm) ? absRe : absIm;
    float lo = (absRe > absIm) ? absIm : absRe;
    float ratio = lo / hi;
    float radicand = 1.0f + ratio * ratio;
    float result = hi * sqrtf(radicand);
    /* Special cases where the scaled formula is not used: hi is zero (the
     * ratio above was 0/0), or a component exceeds 3.402823466e38f, the
     * largest finite float. The plain sum is returned instead. */
    if ((hi == 0.0f) || (hi > 3.402823466e38f) || (lo > 3.402823466e38f)) {
        result = hi + lo;
    }
    return result;
}
/* Double precision */
/* Double-precision complex number stored in a double2:
 * member .x carries the real part and member .y the imaginary part.
 */
typedef double2 cuDoubleComplex;

/* Return the real part of x (its .x member). */
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
    const double re = x.x;
    return re;
}

/* Return the imaginary part of x (its .y member). */
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
    const double im = x.y;
    return im;
}
/* Assemble a cuDoubleComplex from a real part r and an imaginary part i. */
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
                                                           (double r, double i)
{
    cuDoubleComplex z;
    z.y = i;    /* imaginary component */
    z.x = r;    /* real component */
    return z;
}
/* Complex conjugate of x: the real part is kept, the imaginary part negated. */
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
    const double re = cuCreal(x);
    const double negIm = -cuCimag(x);
    return make_cuDoubleComplex (re, negIm);
}
/* Component-wise sum x + y of two double-precision complex numbers. */
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    const double sumRe = cuCreal(x) + cuCreal(y);
    const double sumIm = cuCimag(x) + cuCimag(y);
    return make_cuDoubleComplex (sumRe, sumIm);
}
/* Component-wise difference x - y of two double-precision complex numbers. */
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    const double diffRe = cuCreal(x) - cuCreal(y);
    const double diffIm = cuCimag(x) - cuCimag(y);
    return make_cuDoubleComplex (diffRe, diffIm);
}
/* Double-precision complex product x * y.
 *
 * The partial products are formed directly, without any scaling, so an
 * intermediate value may overflow even when the final result would be in
 * range. This is a deliberate trade-off: other implementations commonly
 * skip the guard as well (presumably for performance), and this one does
 * the same to stay competitive.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    /* Re(x*y) = Re(x)*Re(y) - Im(x)*Im(y) */
    const double re = (cuCreal(x) * cuCreal(y)) - (cuCimag(x) * cuCimag(y));
    /* Im(x*y) = Re(x)*Im(y) + Im(x)*Re(y) */
    const double im = (cuCreal(x) * cuCimag(y)) + (cuCimag(x) * cuCreal(y));
    return make_cuDoubleComplex (re, im);
}
/* Double-precision complex quotient x / y.
 *
 * Guards against intermediate underflow and overflow by scaling: dividend
 * and divisor are both multiplied by 1 / (|Re(y)| + |Im(y)|) before the
 * usual "multiply by the conjugate, divide by the squared magnitude" step.
 * Guarded versions like this are usually the default in complex libraries,
 * some of which additionally offer a faster unguarded variant.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    /* Scale factor: reciprocal of |Re(y)| + |Im(y)|. */
    double norm1 = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
    double invNorm1 = 1.0 / norm1;
    /* Scaled divisor (b) and scaled dividend (a). */
    double bRe = cuCreal(y) * invNorm1;
    double bIm = cuCimag(y) * invNorm1;
    double aRe = cuCreal(x) * invNorm1;
    double aIm = cuCimag(x) * invNorm1;
    /* Squared magnitude of the scaled divisor, and its reciprocal. */
    double magSq = (bRe * bRe) + (bIm * bIm);
    double invMagSq = 1.0 / magSq;
    return make_cuDoubleComplex (((aRe * bRe) + (aIm * bIm)) * invMagSq,
                                 ((aIm * bRe) - (aRe * bIm)) * invMagSq);
}
/* Magnitude |x| of a double-precision complex number.
 *
 * To avoid losing half the exponent range to intermediate underflow or
 * overflow, the larger component is factored out:
 * |x| = hi * sqrt(1 + (lo/hi)^2). This is the simplest and fastest of the
 * possible guarded schemes; it may be inaccurate if sqrt and division are
 * not IEEE compliant.
 */
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
    double absRe = fabs(cuCreal(x));
    double absIm = fabs(cuCimag(x));
    /* hi takes absRe only when absRe > absIm holds; otherwise (ties
     * included) absIm is hi and absRe is lo. */
    double hi = (absRe > absIm) ? absRe : absIm;
    double lo = (absRe > absIm) ? absIm : absRe;
    double ratio = lo / hi;
    double radicand = 1.0 + ratio * ratio;
    double result = hi * sqrt(radicand);
    /* Special cases where the scaled formula is not used: hi is zero (the
     * ratio above was 0/0), or a component exceeds 1.79769313486231570e+308,
     * the largest finite double. The plain sum is returned instead. */
    if ((hi == 0.0) ||
        (hi > 1.79769313486231570e+308) || (lo > 1.79769313486231570e+308)) {
        result = hi + lo;
    }
    return result;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* Aliases: cuComplex is another name for the single-precision complex type. */
typedef cuFloatComplex cuComplex;

/* Build a cuComplex with real part x and imaginary part y by forwarding to
 * make_cuFloatComplex.
 */
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
                                                                float y)
{
    cuComplex z = make_cuFloatComplex (x, y);
    return z;
}
/* Float-to-double promotion: widen both components of c to double. */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
                                                      (cuFloatComplex c)
{
    const double re = (double)cuCrealf(c);
    const double im = (double)cuCimagf(c);
    return make_cuDoubleComplex (re, im);
}
/* Double-to-float conversion: cast both components of c down to float. */
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
    const float re = (float)cuCreal(c);
    const float im = (float)cuCimag(c);
    return make_cuFloatComplex (re, im);
}
/* Single-precision complex multiply-add: returns (x * y) + d.
 *
 * Each component is accumulated in two steps, and every step is a single
 * product plus an addend (presumably so each can map onto a fused
 * multiply-add -- confirm against the target compiler).
 */
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
    float re;
    float im;
    /* Real part: Re(x)*Re(y) + Re(d), then minus Im(x)*Im(y). */
    re = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
    re = -(cuCimagf(x) * cuCimagf(y)) + re;
    /* Imaginary part: Re(x)*Im(y) + Im(d), then plus Im(x)*Re(y). */
    im = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
    im = (cuCimagf(x) * cuCrealf(y)) + im;
    return make_cuComplex(re, im);
}
/* Double-precision complex multiply-add: returns (x * y) + d.
 *
 * Each component is accumulated in two steps, and every step is a single
 * product plus an addend (presumably so each can map onto a fused
 * multiply-add -- confirm against the target compiler).
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
    double re;
    double im;
    /* Real part: Re(x)*Re(y) + Re(d), then minus Im(x)*Im(y). */
    re = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
    re = -(cuCimag(x) * cuCimag(y)) + re;
    /* Imaginary part: Re(x)*Im(y) + Im(d), then plus Im(x)*Re(y). */
    im = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
    im = (cuCimag(x) * cuCreal(y)) + im;
    return make_cuDoubleComplex(re, im);
}
#endif /* !defined(CU_COMPLEX_H_) */

View File

@@ -1,565 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types
 * cublasStatus is a macro alias for cublasStatus_t. That type is not declared
 * in this header (presumably it comes from cublas_api.h, included above).
 */
#define cublasStatus cublasStatus_t
/* Library-level helper entry points; every one reports a cublasStatus.
 * NOTE(review): only the signatures are visible here -- the behavior suggested
 * by the names (init/shutdown, error query, device alloc/free, stream
 * selection) should be confirmed against the cuBLAS documentation.
 */
cublasStatus CUBLASWINAPI cublasInit (void);
cublasStatus CUBLASWINAPI cublasShutdown (void);
cublasStatus CUBLASWINAPI cublasGetError (void);
/* Writes its result through the `version` out-pointer. */
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
/* Takes an element count and element size; the pointer is returned via devicePtr. */
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* As the signatures show, the routine-name prefix selects the element type:
 * S = float, D = double, C = cuComplex, Z = cuDoubleComplex.
 * NOTE(review): the one-line operation summaries below follow reference-BLAS
 * routine naming; this header shows signatures only, so confirm the details
 * against the cuBLAS documentation.
 */
/* NRM2: vector 2-norm of x (by BLAS naming); the result is a real scalar
 * returned by value, also for the complex variants. */
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* DOT: dot product of x and y, returned by value. The complex variants come
 * in "u" and "c" forms (unconjugated / conjugated in reference BLAS --
 * confirm). */
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
int incy);
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SCAL: scale x by alpha; x is the only (non-const) vector argument.
 * Csscal / Zdscal take a real alpha together with a complex vector. */
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY: y = alpha * x + y (by BLAS naming); x is const, y is non-const. */
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
float *y, int incy);
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
int incx, double *y, int incy);
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* COPY: copy x into y (by BLAS naming); x is const, y is non-const. */
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
int incy);
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
int incy);
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SWAP: exchange x and y (by BLAS naming); both vectors are non-const. */
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* NOTE(review): the operation summaries in this group follow reference-BLAS
 * routine naming; this header shows signatures only, so confirm the details
 * against the cuBLAS documentation.
 */
/* AMAX: index of the element of x with the largest absolute value (by BLAS
 * naming), returned as int. NOTE(review): the index base (0 or 1) is not
 * visible here -- confirm. */
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN: as AMAX, but for the smallest absolute value (by naming). */
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM: sum of absolute values of x (by BLAS naming); the result is a real
 * scalar returned by value, also for the complex variants. */
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ROT: apply a plane rotation to x and y (by BLAS naming); both vectors are
 * non-const. Crot / Zrot take a real c-term and a complex s-term, whereas
 * Csrot / Zdrot take both terms as reals on complex vectors. */
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss);
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss);
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, cuComplex s);
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, float s);
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG: construct a plane rotation (by BLAS naming). All arguments are
 * pointers except `cb` in the complex variants, which is passed by value. */
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
cuComplex *cs);
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
cuDoubleComplex *cs);
/*------------------------------------------------------------------------*/
/* ROTM: apply a modified rotation described by the const `sparam` array
 * (by BLAS naming); real types only. */
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG: construct the modified-rotation parameters (by BLAS naming);
 * `sparam` is non-const here and `sy1` is the only const input. */
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
const float *sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
const double *sy1, double* sparam);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* As the signatures show, the routine-name prefix selects the element type:
 * S = float, D = double, C = cuComplex, Z = cuDoubleComplex. Option arguments
 * (trans, uplo, diag) are passed as single chars.
 * NOTE(review): the operation summaries below follow reference-BLAS routine
 * naming; this header shows signatures only, so confirm the details against
 * the cuBLAS documentation.
 */
/* GEMV: general matrix-vector product (by BLAS naming); A and x are const,
 * y is non-const. */
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GBMV: banded matrix-vector product (by BLAS naming); kl / ku are
 * presumably the sub- and super-diagonal counts -- confirm. */
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
float alpha, const float *A, int lda,
const float *x, int incx, float beta, float *y,
int incy);
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
double alpha, const double *A, int lda,
const double *x, int incx, double beta, double *y,
int incy);
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV: triangular matrix-vector product (by BLAS naming); A is const and x
 * is the only non-const argument. */
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV: triangular banded matrix-vector product (by BLAS naming); takes an
 * extra int k compared with TRMV. */
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV: triangular packed matrix-vector product (by BLAS naming); the matrix
 * is passed as AP with no leading-dimension argument. */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV: triangular solve with a single vector (by BLAS naming); x is the
 * only non-const argument. */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV: triangular packed solve (by BLAS naming); matrix passed as AP. */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
float *x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV: triangular banded solve (by BLAS naming); takes an extra int k. */
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
char diag, int n, int k, const float *A,
int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
char diag, int n, int k, const double *A,
int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
char diag, int n, int k, const cuComplex *A,
int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
char diag, int n, int k, const cuDoubleComplex *A,
int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* NOTE(review): the operation summaries in this group follow reference-BLAS
 * routine naming; this header shows signatures only, so confirm the details
 * against the cuBLAS documentation. In each family the real variants (S/D)
 * carry the SY/SB/SP name and the complex variants (C/Z) the HE/HB/HP name.
 */
/* SYMV/HEMV: symmetric / Hermitian matrix-vector product (by BLAS naming);
 * A and x are const, y is non-const. */
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
int lda, const float *x, int incx, float beta,
float *y, int incy);
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
int lda, const double *x, int incx, double beta,
double *y, int incy);
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
int lda, const cuComplex *x, int incx, cuComplex beta,
cuComplex *y, int incy);
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV: banded symmetric / Hermitian matrix-vector product (by BLAS
 * naming); takes an extra int k. */
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV: packed symmetric / Hermitian matrix-vector product (by BLAS
 * naming); the matrix is passed as AP with no leading dimension. */
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
const float *AP, const float *x,
int incx, float beta, float *y, int incy);
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
const double *AP, const double *x,
int incx, double beta, double *y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
const cuComplex *AP, const cuComplex *x,
int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *AP, const cuDoubleComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GER: rank-1 update of A from vectors x and y (by BLAS naming); x and y are
 * const, A is non-const. The complex variants come in "u" and "c" forms
 * (unconjugated / conjugated in reference BLAS -- confirm). */
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
const float *y, int incy, float *A, int lda);
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
const double *y, int incy, double *A, int lda);
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* NOTE(review): the operation summaries in this group follow reference-BLAS
 * routine naming; this header shows signatures only, so confirm the details
 * against the cuBLAS documentation.
 */
/* SYR/HER: symmetric / Hermitian rank-1 update of A from x (by BLAS naming).
 * Note that alpha is a real scalar even in the complex (HER) variants. */
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
int incx, float *A, int lda);
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
int incx, double *A, int lda);
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
const cuComplex *x, int incx, cuComplex *A, int lda);
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR: packed-storage counterpart of SYR/HER (by BLAS naming); the
 * matrix is passed as AP with no leading dimension, and alpha is real. */
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
int incx, float *AP);
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
int incx, double *AP);
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
int incx, cuComplex *AP);
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2: symmetric / Hermitian rank-2 update of A from x and y (by BLAS
 * naming); here alpha has the same type as the matrix elements. */
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *A,
int lda);
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
int incx, const double *y, int incy, double *A,
int lda);
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy, cuComplex *A,
int lda);
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2: packed-storage counterpart of SYR2/HER2 (by BLAS naming); the
 * matrix is passed as AP with no leading dimension. */
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *AP);
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *AP);
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
const cuComplex *x, int incx, const cuComplex *y,
int incy, cuComplex *AP);
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy, cuDoubleComplex *AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* As the signatures show, the routine-name prefix selects the element type:
 * S = float, D = double, C = cuComplex, Z = cuDoubleComplex. Option arguments
 * (transa, transb, uplo, trans) are passed as single chars.
 * NOTE(review): the operation summaries below follow reference-BLAS routine
 * naming; this header shows signatures only, so confirm the details against
 * the cuBLAS documentation.
 */
/* GEMM: general matrix-matrix product (by BLAS naming); A and B are const,
 * C is non-const. */
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc);
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK: symmetric rank-k update of C from A (by BLAS naming); declared for
 * all four element types, with alpha / beta of the element type. */
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HERK: Hermitian rank-k update (by BLAS naming); complex types only, and
 * both alpha and beta are real scalars. */
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
float alpha, const cuComplex *A, int lda,
float beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* SYR2K: symmetric rank-2k update of C from A and B (by BLAS naming). */
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc);
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HER2K: Hermitian rank-2k update (by BLAS naming); complex types only.
 * alpha is complex while beta is a real scalar. */
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, double beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *B, int ldb,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
/*
 * cublas{S,D,C,Z}trsm: TRSM entry points for float, double, cuComplex and
 * cuDoubleComplex data. All four share one argument layout:
 *   side, uplo, transa, diag - single-character mode selectors
 *   m, n                     - integer sizes
 *   alpha                    - scalar of the element type, passed by value
 *   A, lda                   - read-only (const) matrix pointer and its int
 *   B, ldb                   - NON-const matrix pointer and its int; there is
 *                              no separate output argument
 * NOTE(review): B is presumably overwritten with the result, as in the
 * reference BLAS xTRSM routine. Confirm against the cuBLAS documentation.
 */
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
/*
 * cublas{S,D,C,Z}trmm: TRMM entry points for float, double, cuComplex and
 * cuDoubleComplex data. The argument layout is identical to the TRSM
 * declarations above:
 *   side, uplo, transa, diag - single-character mode selectors
 *   m, n                     - integer sizes
 *   alpha                    - scalar of the element type, passed by value
 *   A, lda                   - read-only (const) matrix pointer and its int
 *   B, ldb                   - NON-const matrix pointer and its int; there is
 *                              no separate output argument
 * NOTE(review): B is presumably updated in place, as in the reference BLAS
 * xTRMM routine. Confirm against the cuBLAS documentation.
 */
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */

File diff suppressed because it is too large Load Diff

View File

@@ -1,228 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDABE__)
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
/*
 * Weak fallback definitions of six device-side runtime entry points:
 * cudaMalloc, cudaFuncGetAttributes, cudaDeviceGetAttribute, cudaGetDevice,
 * cudaOccupancyMaxActiveBlocksPerMultiprocessor and its WithFlags variant.
 *
 * Every stub ignores all of its arguments, writes nothing through its
 * pointer parameters, and returns cudaErrorUnknown.
 *
 * NOTE(review): the nv_weak attribute presumably lets the real device
 * runtime definitions take precedence at link time. Confirm against the
 * toolchain documentation.
 */
/* Forward declaration so the stub signature below can name the type. */
struct cudaFuncAttributes;
__device__ __attribute__((nv_weak)) cudaError_t cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
__device__ __attribute__((nv_weak)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#else /* defined(__CUDABE__) */
#if defined(__cplusplus) && defined(__CUDACC__) // Visible to nvcc front-end only
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel (describes ::cudaLaunchDevice, which is declared further below)
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// Device compilation with the per-thread default stream enabled:
// cudaLaunchDevice and cudaLaunchDeviceV2 become static inline forwarders
// to the matching *_ptsz entry points declared above.
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
    // Pass every argument through unchanged and hand back the callee's status.
    const cudaError_t status = cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
    return status;
}
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
    // Pass every argument through unchanged and hand back the callee's status.
    const cudaError_t status = cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
    return status;
}
#else
// In every other configuration the two names are plain external device
// functions with the same signatures.
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif // defined(__cplusplus) && defined(__CUDACC__)
#endif /* defined(__CUDABE__) */
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,69 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DEVICE_TYPES_H__)
#define __DEVICE_TYPES_H__
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * Rounding-mode selector. The enumerators are numbered 0..3 in declaration
 * order; the values are spelled out here but equal the implicit ones.
 * NOTE(review): by name these presumably select round-to-nearest,
 * toward-zero, toward +infinity and toward -infinity. Confirm with the
 * consumers of this enum.
 */
enum __device_builtin__ cudaRoundMode
{
    cudaRoundNearest = 0,
    cudaRoundZero    = 1,
    cudaRoundPosInf  = 2,
    cudaRoundMinInf  = 3
};
#endif /* !__DEVICE_TYPES_H__ */

View File

@@ -1,145 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__
#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"
/**
* \addtogroup CUDART_MEMORY
*
* @{
*/
/**
* \brief Returns a cudaPitchedPtr based on input parameters
*
* Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
* \p p, \p xsz, and \p ysz.
*
* \param d - Pointer to allocated memory
* \param p - Pitch of allocated memory in bytes
* \param xsz - Logical width of allocation in elements
* \param ysz - Logical height of allocation in elements
*
* \return
* ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
*
* \sa make_cudaExtent, make_cudaPos
*/
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
{
struct cudaPitchedPtr s;
s.ptr = d;
s.pitch = p;
s.xsize = xsz;
s.ysize = ysz;
return s;
}
/**
* \brief Returns a cudaPos based on input parameters
*
* Returns a ::cudaPos based on the specified input parameters \p x,
* \p y, and \p z.
*
* \param x - X position
* \param y - Y position
* \param z - Z position
*
* \return
* ::cudaPos specified by \p x, \p y, and \p z
*
* \sa make_cudaExtent, make_cudaPitchedPtr
*/
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
{
struct cudaPos p;
p.x = x;
p.y = y;
p.z = z;
return p;
}
/**
* \brief Returns a cudaExtent based on input parameters
*
* Returns a ::cudaExtent based on the specified input parameters \p w,
* \p h, and \p d.
*
* \param w - Width in bytes
* \param h - Height in elements
* \param d - Depth in elements
*
* \return
* ::cudaExtent specified by \p w, \p h, and \p d
*
* \sa make_cudaPitchedPtr, make_cudaPos
*/
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
{
struct cudaExtent e;
e.width = w;
e.height = h;
e.depth = d;
return e;
}
/** @} */ /* END CUDART_MEMORY */
#endif /* !__DRIVER_FUNCTIONS_H__ */

File diff suppressed because it is too large Load Diff

View File

@@ -1,201 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
/* Baseline: the host compiler must define __GNUC__ or target _WIN32. */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
/* Intel compiler: accepted only when __ICC is exactly 1500 and both
   __GNUC__ and __LP64__ are defined. */
#if defined(__ICC)
#if __ICC != 1500 || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0 on Linux x86_64 is supported!
#endif /* __ICC != 1500 || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
/* PGI compiler: accepted only for __PGIC__ 15 / __PGIC_MINOR__ 4 with both
   __GNUC__ and __LP64__ defined. */
#if defined(__PGIC__)
#if __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported pgc++ configuration! Only pgc++ 15.4 on Linux x86_64 is supported!
#endif /* __PGIC__ != 15 || __PGIC_MINOR__ != 4 || !__GNUC__ || !__LP64__ */
#endif /* __PGIC__ */
/*
 * PowerPC hosts: only 64-bit little-endian targets are accepted, and when
 * the IBM XL compiler is in use (__ibmxl_vrm__ defined) only versions in
 * the half-open range [0x0d010000, 0x0d020000), which the error message
 * calls xlC 13.1.
 */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
/*
 * The two out-of-range tests must be joined with ||. A value cannot be
 * below the lower bound and at or above the upper bound at the same time,
 * so joining them with && would make this check unreachable.
 */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
#error -- unsupported xlC version! only xlC 13.1 is supported
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
#endif /* __powerpc__ */
/* GNU-compatible compilers: reject anything newer than 4.9, and on
   Apple/Mach targets reject any such compiler that is not clang. */
#if defined(__GNUC__)
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9)
#error -- unsupported GNU version! gcc versions later than 4.9 are not supported!
#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 9) */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
#endif /* __GNUC__ */
/* Windows targets: _MSC_VER must lie in [1600, 1800]. An undefined _MSC_VER
   evaluates to 0 in #if and is therefore rejected as well. */
#if defined(_WIN32)
#if _MSC_VER < 1600 || _MSC_VER > 1800
#error -- unsupported Microsoft Visual Studio version! Only the versions 2010, 2012, and 2013 are supported!
#endif /* _MSC_VER < 1600 || _MSC_VER > 1800 */
#endif /* _WIN32 */
/* configure host compiler */
/*
 * Per-platform setup of _CRTIMP and __THROW. The first matching branch
 * wins, in the order Apple, Android, QNX, generic GNU, Windows.
 */
/* Apple: both macros expand to nothing; __BLOCKS__ is also undefined. */
#if defined(__APPLE__)
#define _CRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
/* Android: both macros expand to nothing. */
#elif defined(__ANDROID__)
#define _CRTIMP
#define __THROW
/* QNX: both macros expand to nothing. */
#elif defined(__QNX__)
#define _CRTIMP
#define __THROW
/* Other GNU-compatible hosts: _CRTIMP is empty and __THROW is taken from
   <features.h>. */
#elif defined(__GNUC__)
#define _CRTIMP
#include <features.h> /* for __THROW */
/* Windows: force _USE_DECLSPECS_FOR_SAL to 1 on _MSC_VER >= 1500, define the
   CRT warning-suppression macros and NOMINMAX if not already defined, take
   _CRTIMP from <crtdefs.h>, and make __THROW empty. */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */

View File

@@ -1,241 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
/*
 * Compiler-specific spellings of the attribute macros used across the CUDA
 * headers (__no_return__, __noinline__, __forceinline__, __align__,
 * __thread__, __import__, __export__, __annotate__, __location__, CUDARTAPI).
 * Three branches: GNU-style attributes, MSVC __declspec, and a fallback that
 * requires the user to have supplied __align__ and CUDARTAPI.
 */
/* Branch 1: GNU-style __attribute__ syntax. */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
__attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__)
/* gcc allows users to define attributes with underscores,
e.g., __attribute__((__noinline__)).
Consider a non-CUDA source file (e.g. .cpp) that has the
above attribute specification, and includes this header file. In that case,
defining __noinline__ as below would cause a gcc compilation error.
Hence, only define __noinline__ when the code is being processed
by a CUDA compiler component.
*/
#define __noinline__ \
__attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ */
#define __forceinline__ \
__inline__ __attribute__((always_inline))
#define __align__(n) \
__attribute__((aligned(n)))
#define __thread__ \
__thread
/* __import__, __export__, __cdecl and CUDARTAPI expand to nothing here. */
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \
__annotate__(a)
#define CUDARTAPI
/* Branch 2: MSVC, using __declspec and __stdcall. */
#elif defined(_MSC_VER)
#if _MSC_VER >= 1400
#define __restrict__ \
__restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
__inline
#define __no_return__ \
__declspec(noreturn)
#define __noinline__ \
__declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \
__declspec(align(n))
#define __thread__ \
__declspec(thread)
#define __import__ \
__declspec(dllimport)
#define __export__ \
__declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
/* Unlike branch 1, the location name is wrapped in double underscores. */
#define __location__(a) \
__annotate__(__##a##__)
#define CUDARTAPI \
__stdcall
/* Branch 3: unknown compiler. __inline__ becomes empty; __align__ and
   CUDARTAPI must already be defined by the user or compilation stops. */
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
/*
 * __specialization_static expands to `static` when any of these holds:
 *   - __GNUC__ is defined and is below 4, or is 4 with __GNUC_MINOR__ below 3
 *     and __clang__ not defined;
 *   - _MSC_VER is defined and below 1900;
 *   - neither __GNUC__ nor _MSC_VER is defined.
 * In every other case it expands to nothing.
 */
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
(defined(_MSC_VER) && _MSC_VER < 1900) || \
(!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
(_MSC_VER && _MSC_VER < 1900) ||
(!__GNUC__ && !_MSC_VER) */
/*
 * When neither __CUDACC__ nor __CUDABE__ is defined, __annotate__ is
 * redefined to expand to nothing, so every macro built on top of it
 * disappears as well. Otherwise __launch_bounds__(...) is provided and
 * forwards its arguments to __annotate__ as launch_bounds(...).
 * Note that __launch_bounds__ is not defined at all in the first case.
 */
#if !defined(__CUDACC__) && !defined(__CUDABE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDABE__ */
#define __launch_bounds__(...) \
__annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDABE__ */
/*
 * __builtin_align__(a) is an alias for __align__(a) when __CUDACC__,
 * __CUDABE__, __GNUC__ or _WIN64 is defined; in all other configurations
 * it expands to nothing, i.e. no alignment is requested.
 */
#if defined(__CUDACC__) || defined(__CUDABE__) || \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
/*
 * CUDA qualifiers. Each one is expressed through __location__, which in
 * turn expands through __annotate__; they therefore expand to nothing
 * whenever __annotate__ has been redefined as empty earlier in this header.
 */
#define __host__ \
__location__(host)
#define __device__ \
__location__(device)
#define __global__ \
__location__(global)
#define __shared__ \
__location__(shared)
#define __constant__ \
__location__(constant)
#define __managed__ \
__location__(managed)
/*
 * Markers attached to built-in types and runtime declarations.
 * They expand to nothing when __CUDACC__ is not defined, or when __CUDABE__
 * is defined without __CUDACC_INTEGRATED__. Otherwise each one expands to
 * the matching __location__(...) annotation.
 */
#if (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !__CUDACC__ */
#define __device_builtin__ \
__location__(device_builtin)
#define __device_builtin_texture_type__ \
__location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
__location__(device_builtin_surface_type)
#define __cudart_builtin__ \
__location__(cudart_builtin)
#endif /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !__CUDACC__ */
/*
 * Shim used only when both __CUDACC__ and __clang__ are defined: the
 * _Atomic(X) spelling is redirected to the class template declared below.
 * Compilation is aborted if the compiler does not provide __has_feature.
 */
#if defined(__CUDACC__) && defined(__clang__)
#if !defined(__has_feature)
#error --- !!! The Clang version does not support __has_feature !!! ---
#endif /* !__has_feature */
#if defined(__cplusplus) && defined(__CUDACC__)
/* Exception-specification helpers: use noexcept when the cxx_noexcept
   feature is available, otherwise fall back to throw() for the plain form
   and to nothing for the conditional form. */
#if (__has_feature(cxx_noexcept))
#define NV_CLANG_ATOMIC_NOEXCEPT noexcept
#define NV_CLANG_ATOMIC_NOEXCEPT_(x) noexcept(x)
#else /* !__has_feature(cxx_noexcept) */
#define NV_CLANG_ATOMIC_NOEXCEPT throw()
#define NV_CLANG_ATOMIC_NOEXCEPT_(x)
#endif /* __has_feature(cxx_noexcept) */
/* Stand-in type for _Atomic(T). Only declarations are given here: a default
   constructor, a constructor from const T&, and conversions to T for both
   volatile and non-volatile objects. No member is defined in this header. */
template <typename T> struct __nv_clang_atomic_t {
__nv_clang_atomic_t() NV_CLANG_ATOMIC_NOEXCEPT;
__nv_clang_atomic_t(const T &x) NV_CLANG_ATOMIC_NOEXCEPT;
operator T() volatile NV_CLANG_ATOMIC_NOEXCEPT;
operator T() NV_CLANG_ATOMIC_NOEXCEPT;
};
#define _Atomic(X) __nv_clang_atomic_t<X>
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#endif /* __CUDACC__ && __clang__ */
#endif /* !__HOST_DEFINES_H__ */

View File

@@ -1,119 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__SURFACE_TYPES_H__)
#define __SURFACE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * Numeric codes identifying the kind of a surface (1D/2D/3D, cubemap, and
 * their layered variants). The values are identical to the corresponding
 * cudaTextureType* codes defined in texture_types.h.
 */
#define cudaSurfaceType1D 0x01
#define cudaSurfaceType2D 0x02
#define cudaSurfaceType3D 0x03
#define cudaSurfaceTypeCubemap 0x0C
#define cudaSurfaceType1DLayered 0xF1
#define cudaSurfaceType2DLayered 0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
/**
 * CUDA Surface boundary modes
 *
 * The enumerator values are assigned explicitly (0, 1, 2).
 * NOTE(review): the out-of-range behaviour selected by each mode is not
 * described in this header - confirm against the CUDA surface documentation.
 */
enum __device_builtin__ cudaSurfaceBoundaryMode
{
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
};
/**
 * CUDA Surface format modes
 *
 * The enumerator values are assigned explicitly (0 and 1).
 */
enum __device_builtin__ cudaSurfaceFormatMode
{
cudaFormatModeForced = 0, /**< Forced format mode */
cudaFormatModeAuto = 1 /**< Auto format mode */
};
/**
 * CUDA Surface reference
 *
 * Holds a single member: the channel format descriptor. The type
 * cudaChannelFormatDesc is not declared in this header; driver_types.h is
 * the only header included here.
 */
struct __device_builtin__ surfaceReference
{
/**
 * Channel descriptor for surface reference
 */
struct cudaChannelFormatDesc channelDesc;
};
/**
 * An opaque value that represents a CUDA Surface object
 *
 * Declared as an unsigned long long.
 */
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
/** @} */ /* END CUDART_TYPES */
#endif /* !__SURFACE_TYPES_H__ */

View File

@@ -1,213 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * Numeric codes identifying the kind of a texture (1D/2D/3D, cubemap, and
 * their layered variants). The values are identical to the corresponding
 * cudaSurfaceType* codes defined in surface_types.h.
 */
#define cudaTextureType1D 0x01
#define cudaTextureType2D 0x02
#define cudaTextureType3D 0x03
#define cudaTextureTypeCubemap 0x0C
#define cudaTextureType1DLayered 0xF1
#define cudaTextureType2DLayered 0xF2
#define cudaTextureTypeCubemapLayered 0xFC
/**
 * CUDA texture address modes
 *
 * The enumerator values are assigned explicitly (0 through 3).
 * Used per dimension by the addressMode[3] members of textureReference
 * and cudaTextureDesc.
 */
enum __device_builtin__ cudaTextureAddressMode
{
cudaAddressModeWrap = 0, /**< Wrapping address mode */
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
cudaAddressModeMirror = 2, /**< Mirror address mode */
cudaAddressModeBorder = 3 /**< Border address mode */
};
/**
 * CUDA texture filter modes
 *
 * The enumerator values are assigned explicitly (0 and 1).
 * The same enumeration is used for both the filterMode and the
 * mipmapFilterMode members of textureReference and cudaTextureDesc.
 */
enum __device_builtin__ cudaTextureFilterMode
{
cudaFilterModePoint = 0, /**< Point filter mode */
cudaFilterModeLinear = 1 /**< Linear filter mode */
};
/**
 * CUDA texture read modes
 *
 * The enumerator values are assigned explicitly (0 and 1).
 * Used by the readMode member of cudaTextureDesc.
 */
enum __device_builtin__ cudaTextureReadMode
{
cudaReadModeElementType = 0, /**< Read texture as specified element type */
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
};
/**
 * CUDA texture reference
 *
 * Compared with cudaTextureDesc, this structure additionally carries a
 * channel descriptor and a reserved tail, and has no read-mode member.
 */
struct __device_builtin__ textureReference
{
/**
 * Indicates whether texture reads are normalized or not
 * NOTE(review): presumably this refers to normalized texture coordinates
 * (compare cudaTextureDesc::normalizedCoords) - confirm against the CUDA
 * documentation.
 */
int normalized;
/**
 * Texture filter mode
 */
enum cudaTextureFilterMode filterMode;
/**
 * Texture address mode for up to 3 dimensions
 */
enum cudaTextureAddressMode addressMode[3];
/**
 * Channel descriptor for the texture reference
 */
struct cudaChannelFormatDesc channelDesc;
/**
 * Perform sRGB->linear conversion during texture read
 */
int sRGB;
/**
 * Limit to the anisotropy ratio
 */
unsigned int maxAnisotropy;
/**
 * Mipmap filter mode
 */
enum cudaTextureFilterMode mipmapFilterMode;
/**
 * Offset applied to the supplied mipmap level
 */
float mipmapLevelBias;
/**
 * Lower end of the mipmap level range to clamp access to
 */
float minMipmapLevelClamp;
/**
 * Upper end of the mipmap level range to clamp access to
 */
float maxMipmapLevelClamp;
/**
 * Reserved tail of 15 ints; its purpose is not described in this header.
 */
int __cudaReserved[15];
};
/**
 * CUDA texture descriptor
 *
 * Describes addressing, filtering, read mode and mipmap settings of a
 * texture. Unlike textureReference it has a readMode member and carries
 * neither a channel descriptor nor reserved fields.
 */
struct __device_builtin__ cudaTextureDesc
{
/**
 * Texture address mode for up to 3 dimensions
 */
enum cudaTextureAddressMode addressMode[3];
/**
 * Texture filter mode
 */
enum cudaTextureFilterMode filterMode;
/**
 * Texture read mode
 */
enum cudaTextureReadMode readMode;
/**
 * Perform sRGB->linear conversion during texture read
 */
int sRGB;
/**
 * Indicates whether texture coordinates are normalized or not
 */
int normalizedCoords;
/**
 * Limit to the anisotropy ratio
 */
unsigned int maxAnisotropy;
/**
 * Mipmap filter mode
 */
enum cudaTextureFilterMode mipmapFilterMode;
/**
 * Offset applied to the supplied mipmap level
 */
float mipmapLevelBias;
/**
 * Lower end of the mipmap level range to clamp access to
 */
float minMipmapLevelClamp;
/**
 * Upper end of the mipmap level range to clamp access to
 */
float maxMipmapLevelClamp;
};
/**
 * An opaque value that represents a CUDA texture object
 *
 * Declared as an unsigned long long.
 */
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
/** @} */ /* END CUDART_TYPES */
#endif /* !__TEXTURE_TYPES_H__ */

View File

@@ -1,177 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
/*
 * Declaration prefix shared by every make_* prototype below.
 * With __CUDACC_RTC__ defined the functions are plain __host__ __device__
 * functions; otherwise they are additionally `static __inline__`.
 * The macro is #undef'd again after the prototypes.
 */
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * Prototypes of the make_<type><N> helpers, N = 1..4. Each takes one scalar
 * per component (x, then y, z, w) and returns the matching vector type by
 * value. When __CUDACC_RTC__ is not defined, this header includes
 * vector_functions.hpp, which holds the definitions.
 */
/* signed / unsigned char vectors */
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
/* short / unsigned short vectors */
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
/* int / unsigned int vectors */
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
/* long / unsigned long vectors */
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
/* float vectors */
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
/* long long / unsigned long long vectors */
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
/* double vectors */
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
#undef __VECTOR_FUNCTIONS_DECL__
#if !defined(__CUDACC_RTC__)
#include "vector_functions.hpp"
#endif /* !__CUDACC_RTC__ */
#endif /* !__VECTOR_FUNCTIONS_H__ */

View File

@@ -1,318 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_HPP__)
#define __VECTOR_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
/*
 * Definition prefix shared by every make_* function in this file.
 * With __CUDACC_RTC__ defined the functions are plain __host__ __device__
 * functions; otherwise they are additionally `static __inline__`.
 * The macro is #undef'd again at the end of the file.
 */
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* Builds a char1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
{
    char1 result;
    result.x = x;
    return result;
}
/* Builds a uchar1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
{
    uchar1 result;
    result.x = x;
    return result;
}
/* Builds a char2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
{
    char2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a uchar2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
{
    uchar2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a char3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
{
    char3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a uchar3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
{
    uchar3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a char4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
{
    char4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a uchar4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
{
    uchar4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a short1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
{
    short1 result;
    result.x = x;
    return result;
}
/* Builds a ushort1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
{
    ushort1 result;
    result.x = x;
    return result;
}
/* Builds a short2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
{
    short2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a ushort2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
{
    ushort2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a short3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
{
    short3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a ushort3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
{
    ushort3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a short4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
{
    short4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a ushort4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
{
    ushort4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds an int1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
{
    int1 result;
    result.x = x;
    return result;
}
/* Builds a uint1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
{
    uint1 result;
    result.x = x;
    return result;
}
/* Builds an int2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
{
    int2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a uint2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
{
    uint2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds an int3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
{
    int3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a uint3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
{
    uint3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds an int4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
{
    int4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a uint4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
{
    uint4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a long1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
{
    long1 result;
    result.x = x;
    return result;
}
/* Builds a ulong1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
{
    ulong1 result;
    result.x = x;
    return result;
}
/* Builds a long2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
{
    long2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a ulong2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{
    ulong2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a long3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
{
    long3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a ulong3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{
    ulong3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a long4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
{
    long4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a ulong4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{
    ulong4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a float1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
{
    float1 result;
    result.x = x;
    return result;
}
/* Builds a float2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
{
    float2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a float3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
{
    float3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a float4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
{
    float4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a longlong1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
{
    longlong1 result;
    result.x = x;
    return result;
}
/* Builds a ulonglong1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
{
    ulonglong1 result;
    result.x = x;
    return result;
}
/* Builds a longlong2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
{
    longlong2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a ulonglong2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
{
    ulonglong2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a longlong3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
{
    longlong3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a ulonglong3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
{
    ulonglong3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a longlong4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
{
    longlong4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a ulonglong4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
{
    ulonglong4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
/* Builds a double1 whose x component is the given value. */
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
{
    double1 result;
    result.x = x;
    return result;
}
/* Builds a double2 from its x and y components. */
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
{
    double2 result;
    result.x = x;
    result.y = y;
    return result;
}
/* Builds a double3 from its x, y and z components. */
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
{
    double3 result;
    result.x = x;
    result.y = y;
    result.z = z;
    return result;
}
/* Builds a double4 from its x, y, z and w components. */
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
{
    double4 result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
}
#undef __VECTOR_FUNCTIONS_DECL__
#endif /* !__VECTOR_FUNCTIONS_HPP__ */

View File

@@ -1,431 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDA_LIBDEVICE__) && !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "builtin_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDA_LIBDEVICE__ && !__CUDACC_RTC__ */
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * __cuda_builtin_vector_align8(tag, members) declares the built-in vector
 * struct `tag` containing `members`.
 *
 *  - Host-only builds on 32-bit Windows (none of __CUDACC__, __CUDACC_RTC__,
 *    __CUDABE__ defined; _WIN32 defined, _WIN64 not): the members are placed
 *    in an anonymous struct inside a union, next to a struct holding only
 *    `long long int` bit-fields, and no __align__ attribute is used.
 *    Presumably the bit-field struct is what gives the type 8-byte
 *    alignment in that configuration -- TODO confirm. MSVC warnings 4201 and
 *    4408 are disabled here; the matching `#pragma warning(pop)` follows the
 *    last vector struct.
 *  - Every other configuration: a plain struct marked __align__(8).
 *
 * The macro is #undef'd at the end of this header.
 */
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDABE__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(push)
#pragma warning(disable: 4201 4408)
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ tag \
{ \
union \
{ \
struct { members }; \
struct { long long int :1,:0; }; \
}; \
}
#else /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ __align__(8) tag \
{ \
members \
}
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
/*
 * 8-bit vector types (signed char / unsigned char), 1 to 4 components.
 * The 2- and 4-component variants request an alignment equal to their size
 * (__align__(2) / __align__(4)); the 1- and 3-component variants carry no
 * explicit alignment attribute.
 */
struct __device_builtin__ char1
{
signed char x;
};
struct __device_builtin__ uchar1
{
unsigned char x;
};
struct __device_builtin__ __align__(2) char2
{
signed char x, y;
};
struct __device_builtin__ __align__(2) uchar2
{
unsigned char x, y;
};
struct __device_builtin__ char3
{
signed char x, y, z;
};
struct __device_builtin__ uchar3
{
unsigned char x, y, z;
};
struct __device_builtin__ __align__(4) char4
{
signed char x, y, z, w;
};
struct __device_builtin__ __align__(4) uchar4
{
unsigned char x, y, z, w;
};
/*
 * short / unsigned short vector types, 1 to 4 components.
 * short2/ushort2 request __align__(4); short4/ushort4 are declared through
 * __cuda_builtin_vector_align8 (8-byte alignment); the 1- and 3-component
 * variants carry no explicit alignment attribute.
 */
struct __device_builtin__ short1
{
short x;
};
struct __device_builtin__ ushort1
{
unsigned short x;
};
struct __device_builtin__ __align__(4) short2
{
short x, y;
};
struct __device_builtin__ __align__(4) ushort2
{
unsigned short x, y;
};
struct __device_builtin__ short3
{
short x, y, z;
};
struct __device_builtin__ ushort3
{
unsigned short x, y, z;
};
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
/*
 * int / unsigned int vector types, 1 to 4 components.
 * int2/uint2 are declared through __cuda_builtin_vector_align8 (8-byte
 * alignment); int4/uint4 request __builtin_align__(16); the 1- and
 * 3-component variants carry no explicit alignment attribute.
 */
struct __device_builtin__ int1
{
int x;
};
struct __device_builtin__ uint1
{
unsigned int x;
};
__cuda_builtin_vector_align8(int2, int x; int y;);
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
struct __device_builtin__ int3
{
int x, y, z;
};
struct __device_builtin__ uint3
{
unsigned int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) int4
{
int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) uint4
{
unsigned int x, y, z, w;
};
/*
 * long / unsigned long vector types, 1 to 4 components.
 * The alignment of long2/ulong2 depends on the configuration: with
 * __CUDACC_RTC__ or _WIN32 defined they go through
 * __cuda_builtin_vector_align8 (8 bytes); otherwise they request
 * __align__(2*sizeof(long int)), i.e. twice the platform's long size.
 * long4/ulong4 request __builtin_align__(16); the 1- and 3-component
 * variants carry no explicit alignment attribute.
 */
struct __device_builtin__ long1
{
long int x;
};
struct __device_builtin__ ulong1
{
unsigned long x;
};
#if defined(__CUDACC_RTC__) || defined(_WIN32)
__cuda_builtin_vector_align8(long2, long int x; long int y;);
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* __CUDACC_RTC__ || _WIN32 */
struct __device_builtin__ __align__(2*sizeof(long int)) long2
{
long int x, y;
};
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
{
unsigned long int x, y;
};
#endif /* __CUDACC_RTC__ || _WIN32 */
struct __device_builtin__ long3
{
long int x, y, z;
};
struct __device_builtin__ ulong3
{
unsigned long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) long4
{
long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
/*
 * float vector types, 1 to 4 components.
 * float2 is normally declared through __cuda_builtin_vector_align8. For host
 * compilation with GCC 4.6 on ARM with __ARM_PCS_VFP defined, it is instead
 * spelled out with __attribute__((aligned(8))) and an extra zero-length
 * array member; the member's name indicates it works around an internal
 * compiler error. That member is poisoned immediately afterwards so no code
 * can refer to it, and -pedantic diagnostics are suppressed around the
 * declaration.
 * float4 requests __builtin_align__(16); float1/float3 carry no explicit
 * alignment attribute.
 */
struct __device_builtin__ float1
{
float x;
};
#if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(__arm__) && \
defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-pedantic"
struct __device_builtin__ __attribute__((aligned(8))) float2
{
float x; float y; float __cuda_gnu_arm_ice_workaround[0];
};
#pragma GCC poison __cuda_gnu_arm_ice_workaround
#pragma GCC diagnostic pop
#else /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
__cuda_builtin_vector_align8(float2, float x; float y;);
#endif /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP &&
__GNUC__ == 4&& __GNUC_MINOR__ == 6 */
struct __device_builtin__ float3
{
float x, y, z;
};
struct __device_builtin__ __builtin_align__(16) float4
{
float x, y, z, w;
};
/*
 * long long / unsigned long long vector types, 1 to 4 components.
 * The 2- and 4-component variants request __builtin_align__(16); the 1- and
 * 3-component variants carry no explicit alignment attribute.
 */
struct __device_builtin__ longlong1
{
long long int x;
};
struct __device_builtin__ ulonglong1
{
unsigned long long int x;
};
struct __device_builtin__ __builtin_align__(16) longlong2
{
long long int x, y;
};
struct __device_builtin__ __builtin_align__(16) ulonglong2
{
unsigned long long int x, y;
};
struct __device_builtin__ longlong3
{
long long int x, y, z;
};
struct __device_builtin__ ulonglong3
{
unsigned long long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) longlong4
{
long long int x, y, z ,w;
};
struct __device_builtin__ __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
};
/*
 * double vector types, 1 to 4 components.
 * double2/double4 request __builtin_align__(16); double1/double3 carry no
 * explicit alignment attribute.
 */
struct __device_builtin__ double1
{
double x;
};
struct __device_builtin__ __builtin_align__(16) double2
{
double x, y;
};
struct __device_builtin__ double3
{
double x, y, z;
};
struct __device_builtin__ __builtin_align__(16) double4
{
double x, y, z, w;
};
/*
 * Restores the MSVC warning state saved by the `#pragma warning(push)` issued
 * where __cuda_builtin_vector_align8 is defined.
 * NOTE(review): this guard does not test __CUDACC_RTC__ although the guard
 * around the push does; the two only disagree if __CUDACC_RTC__ is defined
 * without __CUDACC__ -- confirm that combination cannot occur.
 */
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \
defined(_WIN32) && !defined(_WIN64)
#pragma warning(pop)
#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * One typedef per vector struct declared above, giving each struct tag an
 * ordinary type name of the same spelling so the types can be used without
 * the `struct` keyword (needed when this header is compiled as C).
 */
typedef __device_builtin__ struct char1 char1;
typedef __device_builtin__ struct uchar1 uchar1;
typedef __device_builtin__ struct char2 char2;
typedef __device_builtin__ struct uchar2 uchar2;
typedef __device_builtin__ struct char3 char3;
typedef __device_builtin__ struct uchar3 uchar3;
typedef __device_builtin__ struct char4 char4;
typedef __device_builtin__ struct uchar4 uchar4;
typedef __device_builtin__ struct short1 short1;
typedef __device_builtin__ struct ushort1 ushort1;
typedef __device_builtin__ struct short2 short2;
typedef __device_builtin__ struct ushort2 ushort2;
typedef __device_builtin__ struct short3 short3;
typedef __device_builtin__ struct ushort3 ushort3;
typedef __device_builtin__ struct short4 short4;
typedef __device_builtin__ struct ushort4 ushort4;
typedef __device_builtin__ struct int1 int1;
typedef __device_builtin__ struct uint1 uint1;
typedef __device_builtin__ struct int2 int2;
typedef __device_builtin__ struct uint2 uint2;
typedef __device_builtin__ struct int3 int3;
typedef __device_builtin__ struct uint3 uint3;
typedef __device_builtin__ struct int4 int4;
typedef __device_builtin__ struct uint4 uint4;
typedef __device_builtin__ struct long1 long1;
typedef __device_builtin__ struct ulong1 ulong1;
typedef __device_builtin__ struct long2 long2;
typedef __device_builtin__ struct ulong2 ulong2;
typedef __device_builtin__ struct long3 long3;
typedef __device_builtin__ struct ulong3 ulong3;
typedef __device_builtin__ struct long4 long4;
typedef __device_builtin__ struct ulong4 ulong4;
typedef __device_builtin__ struct float1 float1;
typedef __device_builtin__ struct float2 float2;
typedef __device_builtin__ struct float3 float3;
typedef __device_builtin__ struct float4 float4;
typedef __device_builtin__ struct longlong1 longlong1;
typedef __device_builtin__ struct ulonglong1 ulonglong1;
typedef __device_builtin__ struct longlong2 longlong2;
typedef __device_builtin__ struct ulonglong2 ulonglong2;
typedef __device_builtin__ struct longlong3 longlong3;
typedef __device_builtin__ struct ulonglong3 ulonglong3;
typedef __device_builtin__ struct longlong4 longlong4;
typedef __device_builtin__ struct ulonglong4 ulonglong4;
typedef __device_builtin__ struct double1 double1;
typedef __device_builtin__ struct double2 double2;
typedef __device_builtin__ struct double3 double3;
typedef __device_builtin__ struct double4 double4;
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/*
 * Three unsigned extents (x, y, z).
 *
 * When compiled as C++ the struct additionally provides:
 *  - a constructor whose components each default to 1, so dim3(), dim3(a)
 *    and dim3(a, b) leave the unspecified components at 1;
 *  - a converting constructor from uint3;
 *  - a conversion operator back to uint3. It is const-qualified so that it
 *    can also be applied to const dim3 objects and references; it only reads
 *    the three members.
 */
struct __device_builtin__ dim3
{
    unsigned int x, y, z;
#if defined(__cplusplus)
    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
    __host__ __device__ operator uint3(void) const { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus */
};
typedef __device_builtin__ struct dim3 dim3;
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */

87
include/isaac/api.h Normal file
View File

@@ -0,0 +1,87 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "isaac/runtime/predict.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/context.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/tools/collections.hpp"
#include "isaac/templates/conv.h"
#include "isaac/templates/gemm.h"
namespace isaac{
// Enqueues a GEMM on `stream`, generating and compiling the kernel the first
// time a given (stream, dtype, AT, BT, M, N, K, offsets, leading dimensions)
// combination is seen and reusing the cached kernel afterwards.
//
//   device        device whose architecture selects the tuning profile and
//                 for which the source is generated
//   stream        stream the kernel is enqueued on; its context is used to
//                 build the module
//   dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc
//                 problem description; forwarded to the profile's predict()
//                 and used as the cache key together with `stream`
//   alpha, A, B, beta, C
//                 forwarded unchanged to templates::GEMM::enqueue
//
// NOTE(review): the function-local statics are shared by all callers; nothing
// here synchronises concurrent calls -- confirm callers are single-threaded.
void GEMM(driver::Device const & device, driver::Stream & stream,
          DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
          size_t offa, size_t lda, size_t offb, size_t ldb, size_t offc, size_t ldc,
          scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C)
{
  typedef std::tuple<driver::Stream, DType, IsaacOperation_t, IsaacOperation_t,
                     param_t, param_t, param_t, size_t, size_t, size_t, size_t, size_t, size_t> key_type;
  typedef std::pair<std::shared_ptr<templates::GEMM>, std::shared_ptr<driver::Kernel>> value_type;

  // Arguments of the call currently in flight. `compile` and `cache` are
  // function-local statics, i.e. they are created once, during the very first
  // call. A lambda capturing by reference would therefore stay bound to the
  // parameters of that first call and read dead stack slots on every later
  // cache miss. Instead the (capture-less) lambda reads the arguments through
  // `current`, which is re-pointed at the live arguments on every call.
  struct request
  {
    driver::Device const * device;
    driver::Stream * stream;
    DType dtype;
    IsaacOperation_t AT, BT;
    param_t M, N, K;
    size_t offa, lda, offb, ldb, offc, ldc;
  };
  static request const * current = nullptr;
  request const now = {&device, &stream, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc};
  current = &now;

  static std::function<value_type()> compile = []() -> value_type {
    request const & r = *current;
    //Fetch profile
    runtime::GEMMProfile* profile = (runtime::GEMMProfile*)runtime::database.at({r.device->architecture(), runtime::GEMM}).get();
    templates::GEMM generator = profile->predict(*r.device, r.dtype, r.AT, r.BT, r.M, r.N, r.K, r.offa, r.lda, r.offb, r.ldb, r.offc, r.ldc);
    //Execute
    std::string src = generator.dump(*r.device, "gemm");
    driver::Module module(r.stream->context(), src);
    return value_type(std::make_shared<templates::GEMM>(generator), std::make_shared<driver::Kernel>(module, "gemm"));
  };
  static cpp::CachedMap<key_type, value_type> cache(compile);

  //Retrieve profile/kernel and execute
  value_type const & value = cache.get(key_type(stream, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc));
  // `now` dies when this call returns; do not leave a pointer to it behind.
  current = nullptr;
  value.first->enqueue(*value.second, stream, alpha, A, B, beta, C);
}
// Enqueues a convolution on `stream`, generating and compiling the kernel the
// first time a given key is seen and reusing the cached kernel afterwards.
//
//   device        device whose architecture selects the tuning profile and
//                 for which the source is generated
//   stream        stream the kernel is enqueued on; its context is used to
//                 build the module
//   dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w
//                 problem description; forwarded to the profile's predict()
//   alpha, I, F, beta, O
//                 forwarded unchanged to templates::Conv::enqueue
//
// NOTE(review): the cache key holds stream, dtype, N, K, P, Q, C, R, S and the
// pad/stride values but NOT H and W, although both are passed to predict().
// If two calls can differ only in H/W (e.g. with a stride larger than 1) the
// second one reuses the first one's generator -- confirm whether H and W
// should be part of the key.
// NOTE(review): the function-local statics are shared by all callers; nothing
// here synchronises concurrent calls -- confirm callers are single-threaded.
void CONV(driver::Device const & device, driver::Stream & stream,
          DType dtype, param_t N, param_t K, param_t P, param_t Q, param_t C, param_t R, param_t S,
          param_t H, param_t W, param_t pad_h, param_t pad_w, param_t stride_h, param_t stride_w,
          scalar const & alpha, driver::Buffer const & I, driver::Buffer const & F, scalar const & beta, driver::Buffer& O)
{
  typedef std::tuple<driver::Stream, DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> key_type;
  typedef std::pair<std::shared_ptr<templates::Conv>, std::shared_ptr<driver::Kernel>> value_type;

  // Arguments of the call currently in flight. `compile` and `cache` are
  // function-local statics, i.e. they are created once, during the very first
  // call. A lambda capturing by reference would therefore stay bound to the
  // parameters of that first call and read dead stack slots on every later
  // cache miss. Instead the (capture-less) lambda reads the arguments through
  // `current`, which is re-pointed at the live arguments on every call.
  struct request
  {
    driver::Device const * device;
    driver::Stream * stream;
    DType dtype;
    param_t N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w;
  };
  static request const * current = nullptr;
  request const now = {&device, &stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w};
  current = &now;

  static std::function<value_type()> compile = []() -> value_type {
    request const & r = *current;
    //Fetch profile
    runtime::ConvProfile* profile = (runtime::ConvProfile*)runtime::database.at({r.device->architecture(), runtime::CONV}).get();
    templates::Conv generator = profile->predict(*r.device, r.dtype, r.C, r.H, r.W, r.N, r.K, r.P, r.Q, r.R, r.S, r.pad_h, r.pad_w, r.stride_h, r.stride_w);
    //Execute
    std::string src = generator.dump(*r.device, "fconv");
    driver::Module module(r.stream->context(), src);
    return value_type(std::make_shared<templates::Conv>(generator), std::make_shared<driver::Kernel>(module, "fconv"));
  };
  static cpp::CachedMap<key_type, value_type> cache(compile);

  //Retrieve profile/kernel and execute
  value_type const & value = cache.get(key_type(stream, dtype, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w));
  // `now` dies when this call returns; do not leave a pointer to it behind.
  current = nullptr;
  value.first->enqueue(*value.second, stream, alpha, I, F, beta, O);
}
}

View File

@@ -1,337 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_ARRAY_H_
#define ISAAC_ARRAY_H_
#include <iostream>
#include "isaac/defines.h"
#include "isaac/driver/backend.h"
#include "isaac/jit/syntax/expression/expression.h"
#include "isaac/runtime/handler.h"
#include "isaac/types.h"
#include "isaac/tools/cpp/tuple.hpp"
namespace isaac
{
class scalar;
class view;
// Abstract base of the array classes declared in this header (array, view and
// scalar all derive from it). It stores the element type, shape, start offset,
// strides, driver context and data buffer (see the protected members) and
// declares the constructors, getters, assignment / compound-assignment
// operators and indexing operators shared by the derived classes.
// The destructor is pure virtual, so array_base itself cannot be instantiated.
class ISAACAPI array_base
{
// Private helper, defined outside this header.
// NOTE(review): presumably the size of the backing storage -- confirm in the implementation.
int_t dsize();
public:
//1D Constructors
explicit array_base(int_t size1, numeric_type dtype = FLOAT_TYPE, driver::Context const & context = driver::backend::contexts::get_default());
// Takes an existing buffer plus a start offset and an increment.
array_base(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc);
template<typename DT>
array_base(std::vector<DT> const & data, driver::Context const & context = driver::backend::contexts::get_default());
// Built from another array and one slice.
array_base(array_base & v, slice const & s1);
//2D Constructors
array_base(int_t size1, int_t size2, numeric_type dtype = FLOAT_TYPE, driver::Context const & context = driver::backend::contexts::get_default());
// Takes an existing buffer plus a start offset and a leading dimension.
array_base(int_t size1, int_t size2, numeric_type dtype, driver::Buffer data, int_t start, int_t ld);
template<typename DT>
array_base(int_t size1, int_t size2, std::vector<DT> const & data, driver::Context const & context = driver::backend::contexts::get_default());
// Built from another array and one slice per dimension.
array_base(array_base & M, slice const & s1, slice const & s2);
//3D Constructors
array_base(int_t size1, int_t size2, int_t size3, numeric_type dtype = FLOAT_TYPE, driver::Context const & context = driver::backend::contexts::get_default());
//General constructor
template<typename DT>
array_base(tuple const & shape, std::vector<DT> const & data, driver::Context const & context = driver::backend::contexts::get_default());
array_base(tuple const & shape, numeric_type dtype, driver::Context const & context = driver::backend::contexts::get_default());
array_base(tuple const & shape, numeric_type dtype, int_t start, tuple const & stride, driver::Context const & context = driver::backend::contexts::get_default());
array_base(tuple const & shape, numeric_type dtype, int_t start, tuple const & stride, driver::Buffer const & data);
explicit array_base(runtime::execution_handler const &);
//Make the class virtual
// Pure virtual: the class is abstract. A definition must still exist elsewhere
// because derived destructors call it.
virtual ~array_base() = 0;
//Getters
numeric_type dtype() const;
tuple const & shape() const;
size_t dim() const;
int_t start() const;
tuple const & stride() const;
driver::Context const & context() const;
driver::Buffer const & data() const;
driver::Buffer & data();
//Setters
array_base& resize(int_t size1, int_t size2=1);
//Numeric operators
// Assignment from another array, an expression tree, an execution handler,
// a host vector or a single scalar value.
array_base& operator=(array_base const &);
array_base& operator=(expression_tree const &);
array_base& operator=(runtime::execution_handler const &);
template<class T>
array_base & operator=(std::vector<T> const & rhs);
array_base & operator=(value_scalar const & rhs);
// Unary minus and logical not return an expression tree, not an array.
expression_tree operator-();
expression_tree operator!();
// Compound assignment, each with a scalar value, an array or an expression
// tree on the right-hand side.
array_base& operator+=(value_scalar const &);
array_base& operator+=(array_base const &);
array_base& operator+=(expression_tree const &);
array_base& operator-=(value_scalar const &);
array_base& operator-=(array_base const &);
array_base& operator-=(expression_tree const &);
array_base& operator*=(value_scalar const &);
array_base& operator*=(array_base const &);
array_base& operator*=(expression_tree const &);
array_base& operator/=(value_scalar const &);
array_base& operator/=(array_base const &);
array_base& operator/=(expression_tree const &);
//Indexing (1D)
// An integer index yields a scalar, a slice yields a view.
const scalar operator[](int_t) const;
scalar operator[](int_t);
view operator[](slice const &);
//Indexing (2D)
// Every 2D form yields a view, including the (int_t, int_t) one.
view operator()(int_t, int_t);
view operator()(slice const &, int_t);
view operator()(int_t, slice const &);
view operator()(slice const &, slice const &);
const view operator()(int_t, int_t) const;
const view operator()(slice const &, int_t) const;
const view operator()(int_t, slice const &) const;
const view operator()(slice const &, slice const &) const;
protected:
// State exposed through the getters of the same name (without the underscore).
numeric_type dtype_;
tuple shape_;
int_t start_;
tuple stride_;
driver::Context context_;
driver::Buffer data_;
public:
// NOTE(review): presumably the expression for the transpose of this array
// (cf. the `trans` operator declared in this header) -- confirm in the implementation.
const expression_tree T;
};
// Concrete array type. Inherits every array_base constructor and assignment
// operator, and adds construction from another array_base, from another array
// (copy constructor) and from an expression tree.
class ISAACAPI array : public array_base
{
public:
using array_base::array_base;
//Copy Constructor
array(array_base const &);
array(array const &);
// Non-explicit: an expression_tree converts implicitly to an array.
array(expression_tree const & proxy);
using array_base::operator=;
};
// Array type constructed from an existing array_base (optionally restricted by
// one or two slices) or from an existing buffer with a start offset and
// increment. It is the return type of the slice-based indexing operators of
// array_base.
// NOTE(review): presumably a view aliases the storage of the array it is built
// from and does not copy it -- confirm in the implementation.
class ISAACAPI view : public array_base
{
public:
view(array_base & data);
view(array_base& data, slice const & s1);
view(array_base& data, slice const & s1, slice const & s2);
view(int_t size1, numeric_type dtype, driver::Buffer data, int_t start, int_t inc);
using array_base::operator=;
};
// Array type holding a single value; it is the return type of integer indexing
// on array_base. It can be built from a buffer and an offset, from a
// value_scalar, from a dtype alone, or from an expression tree, and it
// declares conversion operators to the built-in arithmetic types.
class ISAACAPI scalar : public array_base
{
// The two value_scalar constructors below are granted access to the private
// members (inject / cast).
friend value_scalar::value_scalar(const scalar &);
friend value_scalar::value_scalar(const expression_tree &);
private:
void inject(values_holder&) const;
template<class T> T cast() const;
public:
explicit scalar(numeric_type dtype, const driver::Buffer &data, int_t offset);
explicit scalar(value_scalar value, driver::Context const & context = driver::backend::contexts::get_default());
explicit scalar(numeric_type dtype, driver::Context const & context = driver::backend::contexts::get_default());
// Non-explicit: an expression_tree converts implicitly to a scalar.
scalar(expression_tree const & proxy);
scalar& operator=(value_scalar const &);
// scalar& operator=(scalar const & s);
using array_base::operator =;
// INSTANTIATE(type) declares `operator type() const`, i.e. an implicit
// conversion from scalar to `type`; the macro is removed again right after
// the list.
#define INSTANTIATE(type) operator type() const;
INSTANTIATE(char)
INSTANTIATE(unsigned char)
INSTANTIATE(short)
INSTANTIATE(unsigned short)
INSTANTIATE(int)
INSTANTIATE(unsigned int)
INSTANTIATE(long)
INSTANTIATE(unsigned long)
INSTANTIATE(long long)
INSTANTIATE(unsigned long long)
INSTANTIATE(float)
INSTANTIATE(double)
#undef INSTANTIATE
};
// Copy overloads between an array_base and either a raw pointer or a
// std::vector<T>. The parameter order gives the direction: (source, destination).
// Each form exists with and without an explicit driver::CommandQueue;
// `blocking` defaults to true in all of them.
// NOTE(review): the overloads without a queue presumably use a default queue
// of the array's context -- confirm in the implementation.
ISAACAPI void copy(void const * data, array_base & gx, driver::CommandQueue & queue, bool blocking = true);
ISAACAPI void copy(array_base const & gx, void* data, driver::CommandQueue & queue, bool blocking = true);
ISAACAPI void copy(void const *data, array_base &gx, bool blocking = true);
ISAACAPI void copy(array_base const & gx, void* data, bool blocking = true);
template<class T> ISAACAPI void copy(std::vector<T> const & cA, array_base& gA, driver::CommandQueue & queue, bool blocking = true);
template<class T> ISAACAPI void copy(array_base const & gA, std::vector<T> & cA, driver::CommandQueue & queue, bool blocking = true);
template<class T> ISAACAPI void copy(std::vector<T> const & cA, array_base & gA, bool blocking = true);
template<class T> ISAACAPI void copy(array_base const & gA, std::vector<T> & cA, bool blocking = true);
//Operators
//Binary operators
// ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(OPNAME) declares the eight overloads
// of the binary function/operator OPNAME, all returning an expression_tree:
//   array_base      with expression_tree, value_scalar or array_base
//   expression_tree with expression_tree, value_scalar or array_base
//   value_scalar    with expression_tree or array_base
// (there is no value_scalar/value_scalar overload).
// The last line of the macro body must NOT end with a backslash: a trailing
// line continuation would pull the first invocation below into the macro
// definition itself.
#define ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(OPNAME) \
ISAACAPI expression_tree OPNAME (array_base const & x, expression_tree const & y);\
ISAACAPI expression_tree OPNAME (array_base const & x, value_scalar const & y);\
ISAACAPI expression_tree OPNAME (array_base const & x, array_base const & y);\
\
ISAACAPI expression_tree OPNAME (expression_tree const & x, expression_tree const & y);\
ISAACAPI expression_tree OPNAME (expression_tree const & x, value_scalar const & y);\
ISAACAPI expression_tree OPNAME (expression_tree const & x, array_base const & y);\
\
ISAACAPI expression_tree OPNAME (value_scalar const & y, expression_tree const & x);\
ISAACAPI expression_tree OPNAME (value_scalar const & y, array_base const & x);

ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator +)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator -)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator *)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator /)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator >)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator >=)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator <)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator <=)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator ==)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(operator !=)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(maximum)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(minimum)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(pow)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(dot)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(outer)
ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR(assign)
#undef ISAAC_DECLARE_ELEMENT_BINARY_OPERATOR
// ISAAC_DECLARE_ROT(LTYPE, RTYPE, CTYPE, STYPE) declares one overload of
// rot(x, y, c, s) returning an expression_tree. The list below covers every
// combination of {array_base, expression_tree} for x and y, with c and s both
// being scalar, both value_scalar, or both expression_tree.
// NOTE(review): unlike the other declarations in this header these overloads
// are not marked ISAACAPI -- confirm whether they are meant to be exported.
#define ISAAC_DECLARE_ROT(LTYPE, RTYPE, CTYPE, STYPE) \
expression_tree rot(LTYPE const & x, RTYPE const & y, CTYPE const & c, STYPE const & s);
ISAAC_DECLARE_ROT(array_base, array_base, scalar, scalar)
ISAAC_DECLARE_ROT(expression_tree, array_base, scalar, scalar)
ISAAC_DECLARE_ROT(array_base, expression_tree, scalar, scalar)
ISAAC_DECLARE_ROT(expression_tree, expression_tree, scalar, scalar)
ISAAC_DECLARE_ROT(array_base, array_base, value_scalar, value_scalar)
ISAAC_DECLARE_ROT(expression_tree, array_base, value_scalar, value_scalar)
ISAAC_DECLARE_ROT(array_base, expression_tree, value_scalar, value_scalar)
ISAAC_DECLARE_ROT(expression_tree, expression_tree, value_scalar, value_scalar)
ISAAC_DECLARE_ROT(array_base, array_base, expression_tree, expression_tree)
ISAAC_DECLARE_ROT(expression_tree, array_base, expression_tree, expression_tree)
ISAAC_DECLARE_ROT(array_base, expression_tree, expression_tree, expression_tree)
ISAAC_DECLARE_ROT(expression_tree, expression_tree, expression_tree, expression_tree)
// Helper macro is local to this list; do not leak it to files including this header.
#undef ISAAC_DECLARE_ROT
//--------------------------------
//Unary operators
// ISAAC_DECLARE_UNARY_OPERATOR(OPNAME) declares two overloads of the unary
// function OPNAME -- one taking an array_base, one taking an expression_tree --
// both returning an expression_tree. The macro is removed after the list.
#define ISAAC_DECLARE_UNARY_OPERATOR(OPNAME) \
ISAACAPI expression_tree OPNAME (array_base const & x);\
ISAACAPI expression_tree OPNAME (expression_tree const & x);
ISAAC_DECLARE_UNARY_OPERATOR(abs)
ISAAC_DECLARE_UNARY_OPERATOR(acos)
ISAAC_DECLARE_UNARY_OPERATOR(asin)
ISAAC_DECLARE_UNARY_OPERATOR(atan)
ISAAC_DECLARE_UNARY_OPERATOR(ceil)
ISAAC_DECLARE_UNARY_OPERATOR(cos)
ISAAC_DECLARE_UNARY_OPERATOR(cosh)
ISAAC_DECLARE_UNARY_OPERATOR(exp)
ISAAC_DECLARE_UNARY_OPERATOR(floor)
ISAAC_DECLARE_UNARY_OPERATOR(log)
ISAAC_DECLARE_UNARY_OPERATOR(log10)
ISAAC_DECLARE_UNARY_OPERATOR(sin)
ISAAC_DECLARE_UNARY_OPERATOR(sinh)
ISAAC_DECLARE_UNARY_OPERATOR(sqrt)
ISAAC_DECLARE_UNARY_OPERATOR(tan)
ISAAC_DECLARE_UNARY_OPERATOR(tanh)
ISAAC_DECLARE_UNARY_OPERATOR(trans)
#undef ISAAC_DECLARE_UNARY_OPERATOR
// cast(x, dtype): returns an expression tree for `x` (an array or another
// expression) with the requested numeric type as second argument.
// NOTE(review): presumably an element-wise type conversion -- confirm in the implementation.
ISAACAPI expression_tree cast(array_base const &, numeric_type dtype);
ISAACAPI expression_tree cast(expression_tree const &, numeric_type dtype);
//Matrix reduction
// ISAAC_DECLARE_REDUCTION(OPNAME) declares two overloads of the reduction
// OPNAME -- one for array_base, one for expression_tree -- each taking an
// `axis` argument that defaults to -1 and returning an expression_tree.
// `max` and `min` are passed in parentheses so that a function-like macro of
// the same name (e.g. from <windows.h>) is not expanded in the declaration.
#define ISAAC_DECLARE_REDUCTION(OPNAME) \
ISAACAPI expression_tree OPNAME(array_base const & M, int_t axis = -1);\
ISAACAPI expression_tree OPNAME(expression_tree const & M, int_t axis = -1);
ISAAC_DECLARE_REDUCTION(sum)
ISAAC_DECLARE_REDUCTION(argmax)
ISAAC_DECLARE_REDUCTION((max))
ISAAC_DECLARE_REDUCTION((min))
ISAAC_DECLARE_REDUCTION(argmin)
// Helper macro is local to this list; do not leak it to files including this header.
#undef ISAAC_DECLARE_REDUCTION
//Shortcuts
// norm: `order` defaults to 2 and `axis` to -1; mean: `axis` defaults to -1.
ISAACAPI expression_tree norm(array_base const &, unsigned int order = 2, int_t axis = -1);
ISAACAPI expression_tree norm(expression_tree const &, unsigned int order = 2, int_t axis = -1);
ISAACAPI expression_tree mean(array_base const &, int_t axis = -1);
ISAACAPI expression_tree mean(expression_tree const &, int_t axis = -1);
//ISAACAPI expression_tree var(array_base const &, int_t axis = -1);
//ISAACAPI expression_tree var(expression_tree const &, int_t axis = -1);
//Fusion
// Combines two expression trees into one.
// NOTE(review): presumably so that both are evaluated together -- confirm in the implementation.
ISAACAPI expression_tree fuse(expression_tree const & x, expression_tree const & y);
//Initializers
// Both return an expression tree rather than an allocated array.
ISAACAPI expression_tree eye(int_t, int_t, isaac::numeric_type, driver::Context const & context = driver::backend::contexts::get_default());
ISAACAPI expression_tree zeros(tuple const & shape, numeric_type dtype, driver::Context const & context = driver::backend::contexts::get_default());
//Swap
// Takes both views by value.
ISAACAPI void swap(view x, view y);
//Reshape
ISAACAPI expression_tree reshape(array_base const &, tuple const &);
ISAACAPI expression_tree reshape(expression_tree const &, tuple const &);
ISAACAPI expression_tree ravel(array_base const &);
ISAACAPI expression_tree ravel(expression_tree const & x);
//Diag
// `offset` defaults to 0. Returns an array, not an expression tree.
// NOTE(review): not marked ISAACAPI, unlike the surrounding declarations --
// confirm whether it is meant to be exported.
array diag(array_base & x, int offset = 0);
//
// Stream insertion for arrays and expression trees.
ISAACAPI std::ostream& operator<<(std::ostream &, array_base const &);
ISAACAPI std::ostream& operator<<(std::ostream &, expression_tree const &);
}
#endif

View File

@@ -1,63 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_COMMON_EXPRESSION_TYPE_H
#define ISAAC_COMMON_EXPRESSION_TYPE_H
#include <string>
#include <stdexcept>
namespace isaac
{
// Kinds of expression known to the library. The numeric values are the ones
// the compiler would assign implicitly (0, 1, 2, ...); they are spelled out
// so that the mapping is visible at a glance.
enum expression_type
{
  INVALID_EXPRESSION_TYPE = 0,
  ELEMENTWISE_1D          = 1,
  ELEMENTWISE_2D          = 2,
  REDUCE_1D               = 3,
  REDUCE_2D_ROWS          = 4,
  REDUCE_2D_COLS          = 5,
  GEMM_NN                 = 6,
  GEMM_TN                 = 7,
  GEMM_NT                 = 8,
  GEMM_TT                 = 9
};
// Maps the lower-case textual name of an expression kind (e.g. "gemm_nt") to
// its expression_type enumerator. The comparison is exact and case-sensitive.
// Throws std::invalid_argument("Unrecognized expression: " + name) when the
// name is not one of the nine known spellings.
inline expression_type expression_type_from_string(std::string const & name)
{
  // Accepted spellings paired with the enumerator each one selects.
  struct spelling
  {
    const char * text;
    expression_type kind;
  };
  static const spelling known[] = {
    {"elementwise_1d", ELEMENTWISE_1D},
    {"elementwise_2d", ELEMENTWISE_2D},
    {"reduce_1d",      REDUCE_1D},
    {"reduce_2d_rows", REDUCE_2D_ROWS},
    {"reduce_2d_cols", REDUCE_2D_COLS},
    {"gemm_nn",        GEMM_NN},
    {"gemm_nt",        GEMM_NT},
    {"gemm_tn",        GEMM_TN},
    {"gemm_tt",        GEMM_TT}
  };
  for(unsigned int i = 0; i < sizeof(known)/sizeof(known[0]); ++i)
  {
    if(name == known[i].text)
      return known[i].kind;
  }
  throw std::invalid_argument("Unrecognized expression: " + name);
}
}
#endif

View File

@@ -1,144 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_COMMON_NUMERIC_TYPE_H
#define ISAAC_COMMON_NUMERIC_TYPE_H
#include <stdexcept>
#include "isaac/exception/api.h"
namespace isaac
{
// Placeholder type for half precision: it has no data members and exists only
// so that code mentioning `half` (such as the to_numeric_type<half>
// specialization in this header) compiles.
class half
{
public:
  half() {}
};
// Element types understood by the library. The numeric values are the ones
// the compiler would assign implicitly (0, 1, 2, ...); they are spelled out
// so that the mapping is visible at a glance.
enum numeric_type
{
  INVALID_NUMERIC_TYPE = 0,
  // BOOL_TYPE is intentionally disabled.
  CHAR_TYPE   = 1,
  UCHAR_TYPE  = 2,
  SHORT_TYPE  = 3,
  USHORT_TYPE = 4,
  INT_TYPE    = 5,
  UINT_TYPE   = 6,
  LONG_TYPE   = 7,
  ULONG_TYPE  = 8,
  HALF_TYPE   = 9,
  FLOAT_TYPE  = 10,
  DOUBLE_TYPE = 11
};
// Returns the short textual name of a numeric type ("char", "uchar", ...,
// "half", "float", "double").
// Throws unknown_datatype(type) for INVALID_NUMERIC_TYPE and for any value
// that is not one of the named enumerators.
inline std::string to_string(numeric_type const & type)
{
  if(type == CHAR_TYPE)   return "char";
  if(type == UCHAR_TYPE)  return "uchar";
  if(type == SHORT_TYPE)  return "short";
  if(type == USHORT_TYPE) return "ushort";
  if(type == INT_TYPE)    return "int";
  if(type == UINT_TYPE)   return "uint";
  if(type == LONG_TYPE)   return "long";
  if(type == ULONG_TYPE)  return "ulong";
  if(type == HALF_TYPE)   return "half";
  if(type == FLOAT_TYPE)  return "float";
  if(type == DOUBLE_TYPE) return "double";
  // Nothing matched (BOOL_TYPE is disabled in the enum).
  throw unknown_datatype(type);
}
// Parses a floating-point type name. Only "float16", "float32" and "float64"
// are accepted (mapping to HALF_TYPE, FLOAT_TYPE and DOUBLE_TYPE); note these
// are not the spellings produced by to_string().
// Throws std::invalid_argument("Invalid datatype: " + name) for anything else.
inline numeric_type numeric_type_from_string(std::string const & name)
{
  // Accepted spellings paired with the enumerator each one selects.
  struct spelling
  {
    const char * text;
    numeric_type dtype;
  };
  static const spelling known[] = {
    {"float16", HALF_TYPE},
    {"float32", FLOAT_TYPE},
    {"float64", DOUBLE_TYPE}
  };
  for(unsigned int i = 0; i < sizeof(known)/sizeof(known[0]); ++i)
  {
    if(name == known[i].text)
      return known[i].dtype;
  }
  throw std::invalid_argument("Invalid datatype: " + name);
}
// Returns the size in bytes of one element of the given numeric type:
// 1 for (u)char, 2 for half and (u)short, 4 for (u)int and float,
// 8 for (u)long and double.
// Throws unknown_datatype(type) for INVALID_NUMERIC_TYPE and for any value
// that is not one of the named enumerators.
inline unsigned int size_of(numeric_type type)
{
  if(type == UCHAR_TYPE || type == CHAR_TYPE)
    return 1;
  if(type == HALF_TYPE || type == USHORT_TYPE || type == SHORT_TYPE)
    return 2;
  if(type == UINT_TYPE || type == INT_TYPE || type == FLOAT_TYPE)
    return 4;
  if(type == ULONG_TYPE || type == LONG_TYPE || type == DOUBLE_TYPE)
    return 8;
  // Nothing matched (BOOL_TYPE is disabled in the enum).
  throw unknown_datatype(type);
}
// Compile-time lookup from (size in bytes, signedness) to the matching
// integer numeric_type. The primary template is only declared, so asking for
// a size other than 1, 2, 4 or 8 is a compile error.
template<size_t size, bool is_unsigned>
struct to_int_numeric_type_impl;
// Defines the specialization for one (SIZE, IS_UNSIGNED) pair. The macro body
// has no trailing semicolon; each use below supplies it.
#define ISAAC_INSTANTIATE_INT_TYPE_IMPL(SIZE, IS_UNSIGNED, TYPE) \
template<> struct to_int_numeric_type_impl<SIZE, IS_UNSIGNED> { static const numeric_type value = TYPE; }
ISAAC_INSTANTIATE_INT_TYPE_IMPL(1, false, CHAR_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(2, false, SHORT_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(4, false, INT_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(8, false, LONG_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(1, true, UCHAR_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(2, true, USHORT_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(4, true, UINT_TYPE);
ISAAC_INSTANTIATE_INT_TYPE_IMPL(8, true, ULONG_TYPE);
#undef ISAAC_INSTANTIATE_INT_TYPE_IMPL
// Picks the integer numeric_type for T from sizeof(T) and
// std::is_unsigned<T>, independently of how T is spelled.
// NOTE(review): std::is_unsigned comes from <type_traits>, which this header
// does not include directly -- presumably it arrives through
// "isaac/exception/api.h"; confirm.
template<class T>
struct to_int_numeric_type
{
static const numeric_type value = to_int_numeric_type_impl<sizeof(T), std::is_unsigned<T>::value>::value;
};
// Compile-time mapping from a C++ type to its numeric_type.
// The primary template falls back to the size/signedness lookup of
// to_int_numeric_type; the explicit specializations below fix the result for
// the listed built-in types, for half, and for float/double (which the
// integer lookup could not classify correctly).
template<class T> struct to_numeric_type { static const numeric_type value = to_int_numeric_type<T>::value; };
template<> struct to_numeric_type<char> { static const numeric_type value = CHAR_TYPE; };
template<> struct to_numeric_type<unsigned char> { static const numeric_type value = UCHAR_TYPE ; };
template<> struct to_numeric_type<short> { static const numeric_type value = SHORT_TYPE ; };
template<> struct to_numeric_type<unsigned short> { static const numeric_type value = USHORT_TYPE ; };
template<> struct to_numeric_type<int> { static const numeric_type value = INT_TYPE ; };
template<> struct to_numeric_type<unsigned int> { static const numeric_type value = UINT_TYPE ; };
template<> struct to_numeric_type<long> { static const numeric_type value = LONG_TYPE ; };
template<> struct to_numeric_type<unsigned long> { static const numeric_type value = ULONG_TYPE ; };
template<> struct to_numeric_type<half> { static const numeric_type value = HALF_TYPE; };
template<> struct to_numeric_type<float> { static const numeric_type value = FLOAT_TYPE; };
template<> struct to_numeric_type<double> { static const numeric_type value = DOUBLE_TYPE; };
// Returns the numeric_type tag associated with a value.
// Arithmetic types: resolved at compile time through to_numeric_type<T>; the value itself is ignored.
template<class T> typename std::enable_if<std::is_arithmetic<T>::value, numeric_type>::type numeric_type_of(T) { return to_numeric_type<T>::value; }
// Non-arithmetic types: the object is asked for its tag through its dtype() member.
template<class T> typename std::enable_if<!std::is_arithmetic<T>::value, numeric_type>::type numeric_type_of(T const & x) { return x.dtype(); }
}
#endif

View File

@@ -1,49 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DEFINES_H
#define ISAAC_DEFINES_H
// ISAACAPI: symbol export/import decoration for the library's public classes.
//  - Windows / MSVC: dllexport when ISAAC_DLL is defined, dllimport otherwise
//    (presumably ISAAC_DLL is defined only while building the library itself -- confirm in the build files).
//  - Other toolchains: default ELF-style symbol visibility.
#if defined(_WIN32) || defined(_MSC_VER)
#ifdef ISAAC_DLL
#define ISAACAPI __declspec(dllexport)
#else
#define ISAACAPI __declspec(dllimport)
#endif
#else
#define ISAACAPI __attribute__((visibility("default")))
#endif
// Helpers to locally silence MSVC warnings C4251 / C4275 (both concern
// DLL-interface requirements on members / base classes of exported classes).
// Usage: place DISABLE_... before the offending declaration and RESTORE_...
// right after it. RESTORE_... resets the warning to its default state.
// On every other toolchain the macros expand to nothing.
#if defined(_WIN32) || defined(_MSC_VER)
#define DISABLE_MSVC_WARNING_C4251 __pragma(warning(disable: 4251))
#define RESTORE_MSVC_WARNING_C4251 __pragma(warning(default: 4251))
#define DISABLE_MSVC_WARNING_C4275 __pragma(warning(disable: 4275))
// Fixed: this used to expand to warning(disable: 4275), so C4275 stayed
// disabled for the rest of the translation unit instead of being restored.
#define RESTORE_MSVC_WARNING_C4275 __pragma(warning(default: 4275))
#else
#define DISABLE_MSVC_WARNING_C4251
#define RESTORE_MSVC_WARNING_C4251
#define DISABLE_MSVC_WARNING_C4275
#define RESTORE_MSVC_WARNING_C4275
#endif
#endif

View File

@@ -27,12 +27,6 @@
#include <list> #include <list>
#include <vector> #include <vector>
#include "isaac/common/expression_type.h"
#include "isaac/common/numeric_type.h"
#include "isaac/driver/dispatch.h"
#include "isaac/defines.h"
#include "isaac/types.h"
namespace isaac namespace isaac
{ {
@@ -40,93 +34,78 @@ namespace driver
{ {
class Buffer; class Buffer;
class CommandQueue; class Stream;
class Context; class Context;
class Platform; class Platform;
class Program; class Module;
class Kernel; class Kernel;
class ProgramCache;
class ISAACAPI backend struct backend
{ {
public:
class ISAACAPI workspaces class modules
{ {
friend class backend;
public: public:
static const size_t SIZE = 8000000; //8MB of temporary workspace per queue static void release();
static void release(); static Module& get(Stream const & stream, std::string const & name, std::string const &src);
static driver::Buffer & get(CommandQueue const & key);
private: private:
DISABLE_MSVC_WARNING_C4251 static std::map<std::tuple<Stream, std::string>, Module * > cache_;
static std::map<CommandQueue, Buffer * > cache_;
RESTORE_MSVC_WARNING_C4251
}; };
class ISAACAPI programs class kernels
{ {
friend class backend; friend class backend;
public: public:
static void release(); static void release();
static ProgramCache & get(CommandQueue const & queue, expression_type expression, numeric_type dtype); static Kernel & get(Module const & program, std::string const & name);
private: private:
DISABLE_MSVC_WARNING_C4251 static std::map<std::tuple<Module, std::string>, Kernel * > cache_;
static std::map<std::tuple<CommandQueue, expression_type, numeric_type>, ProgramCache * > cache_;
RESTORE_MSVC_WARNING_C4251
}; };
class ISAACAPI kernels class contexts
{ {
friend class backend; friend class backend;
public:
static void release();
static Kernel & get(Program const & program, std::string const & name);
private: private:
DISABLE_MSVC_WARNING_C4251 static void init(std::vector<Platform> const &);
static std::map<std::tuple<Program, std::string>, Kernel * > cache_; static void release();
RESTORE_MSVC_WARNING_C4251 public:
static Context const & get_default();
template<class T>
static Context const & import(T context)
{
for(driver::Context const * x: cache_)
if((T)*x==context)
return *x;
cache_.emplace_back(new Context(context, false));
return *cache_.back();
}
static void get(std::list<Context const *> &);
private:
static std::list<Context const *> cache_;
}; };
class ISAACAPI contexts class streams
{ {
friend class backend; friend class backend;
private: private:
static void init(std::vector<Platform> const &); static void init(std::list<Context const *> const &);
static void release(); static void release();
public: public:
static Context const & get_default(); static void get(Context const &, std::vector<Stream *> &streams);
static Context const & import(CUcontext context); static Stream & get(Context const &, unsigned int id = 0);
static Context const & import(cl_context context); static Stream & get_default();
static void get(std::list<Context const *> &);
private: private:
DISABLE_MSVC_WARNING_C4251 static std::map< Context, std::vector<Stream*> > cache_;
static std::list<Context const *> cache_;
RESTORE_MSVC_WARNING_C4251
};
class ISAACAPI queues
{
friend class backend;
private:
static void init(std::list<Context const *> const &);
static void release();
public:
static void get(Context const &, std::vector<CommandQueue *> &queues);
static CommandQueue & get(Context const &, unsigned int id = 0);
private:
DISABLE_MSVC_WARNING_C4251
static std::map< Context, std::vector<CommandQueue*> > cache_;
RESTORE_MSVC_WARNING_C4251
}; };
static void init(); static void init();
static void release(); static void release();
static void platforms(std::vector<Platform> &); static std::vector<Platform> platforms();
static void synchronize(Context const &); static void synchronize(Context const &);
public:
static unsigned int default_device; static unsigned int default_device;
static cl_command_queue_properties default_queue_properties;
}; };
} }

View File

@@ -23,61 +23,30 @@
#ifndef ISAAC_DRIVER_BUFFER_H #ifndef ISAAC_DRIVER_BUFFER_H
#define ISAAC_DRIVER_BUFFER_H #define ISAAC_DRIVER_BUFFER_H
#include "isaac/types.h"
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include "isaac/driver/context.h"
#include "isaac/driver/handle.h" #include "isaac/driver/handle.h"
#include "isaac/driver/dispatch.h"
namespace isaac namespace isaac
{ {
namespace driver namespace driver
{ {
class Stream;
// Buffer // Buffer
class ISAACAPI Buffer: public has_handle_comparators<Buffer> class Buffer: public Handle<CUdeviceptr>
{ {
public: typedef Handle<CUdeviceptr> base_type;
typedef Handle<cl_mem, CUdeviceptr> handle_type;
private:
friend class CommandQueue;
friend class Kernel;
//Wrapper to get CUDA context from Memory
static CUcontext context(CUdeviceptr h)
{
CUcontext res;
check(dispatch::cuPointerGetAttribute((void*)&res, CU_POINTER_ATTRIBUTE_CONTEXT, h));
return res;
}
public: public:
//Constructors using base_type::base_type;
Buffer(CUdeviceptr h = 0, bool take_ownership = true);
Buffer(cl_mem Buffer = 0, bool take_ownership = true);
Buffer(Context const & context, size_t size); Buffer(Context const & context, size_t size);
//Accessors void set_zero(Stream const & queue);
handle_type& handle();
handle_type const & handle() const;
Context const & context() const;
private: private:
backend_type backend_; size_t size_;
Context context_;
handle_type h_;
}; };
inline Buffer make_buffer(backend_type backend, cl_mem clh = 0, CUdeviceptr cuh = 0, bool take_ownership = true)
{
if(backend==OPENCL)
return Buffer(clh, take_ownership);
else
return Buffer(cuh, take_ownership);
} }
}
} }
#endif #endif

View File

@@ -23,10 +23,6 @@
#ifndef ISAAC_DRIVER_CONTEXT_H #ifndef ISAAC_DRIVER_CONTEXT_H
#define ISAAC_DRIVER_CONTEXT_H #define ISAAC_DRIVER_CONTEXT_H
#include <map>
#include <memory>
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include "isaac/driver/device.h" #include "isaac/driver/device.h"
#include "isaac/driver/handle.h" #include "isaac/driver/handle.h"
@@ -36,42 +32,25 @@ namespace isaac
namespace driver namespace driver
{ {
class ISAACAPI Context: public has_handle_comparators<Context> class Context: public Handle<CUcontext>
{ {
friend class Program; typedef Handle<CUcontext> base_type;
friend class CommandQueue;
friend class Buffer;
public:
typedef Handle<cl_context, CUcontext> handle_type;
private: private:
static std::string cache_path(); static std::string get_cache_path();
static CUdevice device(CUcontext);
static CUdevice device(CUcontext)
{
CUdevice res;
dispatch::cuCtxGetDevice(&res);
return res;
}
public: public:
//Constructors //Constructors
explicit Context(CUcontext const & context, bool take_ownership = true); explicit Context(CUcontext const & context, bool take_ownership = true);
explicit Context(cl_context const & context, bool take_ownership = true);
explicit Context(Device const & device); explicit Context(Device const & device);
//Accessors //Accessors
backend_type backend() const;
Device const & device() const; Device const & device() const;
handle_type const & handle() const; std::string const & cache_path() const;
private: private:
DISABLE_MSVC_WARNING_C4251
backend_type backend_;
Device device_; Device device_;
std::string cache_path_; std::string cache_path_;
handle_type h_;
RESTORE_MSVC_WARNING_C4251
}; };
} }

View File

@@ -0,0 +1,114 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_CUBLAS_H
#define ISAAC_DRIVER_CUBLAS_H
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include "isaac/templates/common.hpp"
#include "isaac/driver/dispatch.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/backend.h"
namespace isaac
{
namespace driver
{
// Tag-dispatched GEMM entry points: the first (unnamed) argument only selects
// the overload by element type; every remaining argument is forwarded by value,
// unchanged, to the cuBLAS routine for that type.
// half   -> cublasHgemm
template<typename... Args> void cublasGemm_impl(half, Args... args){ driver::dispatch::cublasHgemm(args...); }
// float  -> cublasSgemm_v2
template<typename... Args> void cublasGemm_impl(float, Args... args){ driver::dispatch::cublasSgemm_v2(args...); }
// double -> cublasDgemm_v2
template<typename... Args> void cublasGemm_impl(double, Args... args){ driver::dispatch::cublasDgemm_v2(args...); }
// Typed GEMM launcher: binds the context's cuBLAS handle to the given stream,
// then calls the cuBLAS GEMM routine matching cuType.
//   AT / BT     : 'N' selects CUBLAS_OP_N; any other character selects CUBLAS_OP_T.
//   alpha, beta : type-erased pointers to scalars, reinterpreted as cuType*.
//   A, B, C     : device buffers, reinterpreted as arrays of cuType.
template<class cuType>
inline void cublasGemm_dispatch(Context const & ctx, Stream& queue, char AT, char BT, int32_t M, int32_t N, int32_t K, void* alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, void* beta, Buffer& C, int32_t ldc){
  auto to_cublas_op = [](char flag) -> cublasOperation_t {
    if(flag == 'N')
      return CUBLAS_OP_N;
    return CUBLAS_OP_T;
  };

  // Fetch the handle for this context and route its work to the caller's stream.
  cublasHandle_t handle = dispatch::cublasHandle(ctx);
  dispatch::cublasSetStream_v2(handle, (CUstream)queue);

  // Raw device addresses of the three operands.
  CUdeviceptr ptrA = A;
  CUdeviceptr ptrB = B;
  CUdeviceptr ptrC = C;

  cublasGemm_impl(cuType(), handle,
                  to_cublas_op(AT), to_cublas_op(BT),
                  M, N, K,
                  (cuType*)alpha,
                  (const cuType*)ptrA, lda,
                  (const cuType*)ptrB, ldb,
                  (cuType*)beta,
                  (cuType*)ptrC, ldc);
}
// Runtime-typed GEMM front end: selects the half / float / double cuBLAS path
// from `dtype` and forwards all arguments to cublasGemm_dispatch.
//   alpha, beta : passed on through their data() pointers.
// Throws std::invalid_argument when `dtype` is not HALF_TYPE, FLOAT_TYPE or
// DOUBLE_TYPE. (The previous bare `throw;` had no active exception to rethrow
// and therefore called std::terminate.)
inline void cublasGemm(DType dtype, Context const & ctx, Stream& queue, char AT, char BT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc){
  switch(dtype){
    case HALF_TYPE: return cublasGemm_dispatch<half>(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
    case FLOAT_TYPE: return cublasGemm_dispatch<float>(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
    case DOUBLE_TYPE: return cublasGemm_dispatch<double>(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
    default: throw std::invalid_argument("cublasGemm: unsupported data type");
  }
}
// Translates a DType tag into the corresponding cuDNN data type enumerator.
// Throws std::invalid_argument for any tag other than HALF_TYPE, FLOAT_TYPE
// or DOUBLE_TYPE. (The previous bare `throw;` had no active exception to
// rethrow and therefore called std::terminate.)
inline cudnnDataType_t cudnnDtype(DType dtype){
  switch(dtype){
    case HALF_TYPE: return CUDNN_DATA_HALF;
    case FLOAT_TYPE: return CUDNN_DATA_FLOAT;
    case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE;
    default: break;
  }
  throw std::invalid_argument("cudnnDtype: unsupported data type");
}
// Runs a 2-D forward convolution through cuDNN on the given stream.
//   I : input tensor, described as NCHW with dimensions (N, C, H, W)
//   F : filter, described as NCHW with dimensions (K, C, R, S)
//   O : output tensor, described as NCHW with dimensions (N, K, P, Q)
//   pad_h/pad_w, stride_h/stride_w : padding and stride per spatial axis; upscale is fixed to 1.
//   alpha, beta : scaling factors, handed to cuDNN through their data() pointers.
// NOTE(review): O is taken by const reference although it is passed as the
// output pointer of cudnnConvolutionForward.
// NOTE(review): the tensor/filter/convolution descriptors created below are
// never destroyed in this function, so each call appears to leak them --
// confirm whether matching destroy entry points should be added and called.
inline void cudnnConv(DType dtype, Context const & ctx, Stream& queue, int32_t H, int32_t W, int32_t N, int32_t K, int32_t P, int32_t Q, int32_t C, int32_t R, int32_t S,
int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){
// Per-context cuDNN handle, bound to the caller's stream.
cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
cudnnDataType_t cutype = cudnnDtype(dtype);
dispatch::cudnnSetStream(handle, (CUstream)queue);
cudnnTensorDescriptor_t tO, tI;
cudnnFilterDescriptor_t tF;
cudnnConvolutionDescriptor_t conv;
cudnnConvolutionFwdAlgo_t algo;
// Describe output, filter and input layouts.
dispatch::cudnnCreateTensorDescriptor(&tO);
dispatch::cudnnCreateTensorDescriptor(&tI);
dispatch::cudnnCreateFilterDescriptor(&tF);
dispatch::cudnnSetTensor4dDescriptor(tO, CUDNN_TENSOR_NCHW, cutype, N, K, P, Q);
dispatch::cudnnSetFilter4dDescriptor(tF, cutype, CUDNN_TENSOR_NCHW, K, C, R, S);
dispatch::cudnnSetTensor4dDescriptor(tI, CUDNN_TENSOR_NCHW, cutype, N, C, H, W);
// Describe the convolution itself (cross-correlation mode, 2 spatial dimensions).
dispatch::cudnnCreateConvolutionDescriptor(&conv);
int pad[] = {pad_h, pad_w};
int stride[] = {stride_h, stride_w};
int upscale[] = {1, 1};
dispatch::cudnnSetConvolutionNdDescriptor(conv, 2, pad, stride, upscale, CUDNN_CROSS_CORRELATION, cutype);
// dispatch::cudnnSetConvolution2dDescriptor(conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION);
// dispatch::cudnnGetConvolutionForwardAlgorithm(handle, tI, tF, conv, tO, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 1024*1024, &algo);
// The algorithm is hard-coded rather than queried from cuDNN (see the disabled call above).
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
size_t workspace_size;
dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size);
// Allocate at least one byte so a zero-sized workspace request still yields a buffer.
Buffer work(ctx, std::max((size_t)1,workspace_size));
CUdeviceptr twork = work;
CUdeviceptr pI = I, pF = F, pO = O;
dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO);
}
}
}
#endif

View File

@@ -23,8 +23,6 @@
#ifndef ISAAC_DRIVER_DEVICE_H #ifndef ISAAC_DRIVER_DEVICE_H
#define ISAAC_DRIVER_DEVICE_H #define ISAAC_DRIVER_DEVICE_H
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include "isaac/driver/platform.h" #include "isaac/driver/platform.h"
#include "isaac/driver/handle.h" #include "isaac/driver/handle.h"
@@ -35,60 +33,26 @@ namespace driver
{ {
// Device // Device
class ISAACAPI Device: public has_handle_comparators<Device> class Device: public Handle<CUdevice>
{ {
private:
friend class Context;
friend class CommandQueue;
public: public:
typedef Handle<cl_device_id, CUdevice> handle_type; typedef Handle<CUdevice> base_type;
//Supported types
enum Type
{
GPU = CL_DEVICE_TYPE_GPU,
CPU = CL_DEVICE_TYPE_CPU,
ACCELERATOR = CL_DEVICE_TYPE_ACCELERATOR,
UNKNOWN
};
//Supported vendors
enum class Vendor
{
AMD,
INTEL,
NVIDIA,
UNKNOWN
};
//Supported architectures //Supported architectures
enum class Architecture enum class Architecture
{ {
//Intel //NVidia
HASWELL, SM_2_0,
BROADWELL, SM_2_1,
SKYLAKE, SM_3_0,
KABYLAKE, SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
//NVidia UNKNOWN
SM_2_0,
SM_2_1,
SM_3_0,
SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
//AMD
TERASCALE_2,
TERASCALE_3,
GCN_1,
GCN_2,
GCN_3,
GCN_4,
UNKNOWN
}; };
private: private:
@@ -96,34 +60,32 @@ private:
template<CUdevice_attribute attr> template<CUdevice_attribute attr>
int cuGetInfo() const; int cuGetInfo() const;
inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
inline nvmlDevice_t nvml_device() const;
public: public:
//Constructors using base_type::base_type;
explicit Device(CUdevice const & device, bool take_ownership = true);
explicit Device(cl_device_id const & device, bool take_ownership = true);
//Accessors //Accessors
handle_type const & handle() const;
Vendor vendor() const;
Architecture architecture() const; Architecture architecture() const;
backend_type backend() const;
//Informations //Informations
std::string infos() const; std::string infos() const;
size_t clock_rate() const; size_t address_bits() const;
unsigned int address_bits() const;
driver::Platform platform() const; driver::Platform platform() const;
std::vector<size_t> max_block_dim() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t warp_size() const;
std::pair<size_t, size_t> compute_capability() const;
//Identifier
std::string name() const; std::string name() const;
std::string vendor_str() const; std::string pci_bus_id() const;
std::vector<size_t> max_work_item_sizes() const; //Clocks
Type type() const; size_t current_sm_clock() const;
std::string extensions() const; size_t current_mem_clock() const;
size_t max_work_group_size() const;
size_t local_mem_size() const; size_t max_sm_clock() const;
size_t warp_wavefront_size() const; size_t max_mem_clock() const;
bool fp64_support() const;
std::pair<unsigned int, unsigned int> nv_compute_capability() const;
private:
backend_type backend_;
handle_type h_;
}; };
} }

View File

@@ -26,15 +26,14 @@
#include <type_traits> #include <type_traits>
#include <dlfcn.h> #include <dlfcn.h>
//OpenCL Backend
#include "isaac/driver/external/CL/cl.h"
#include "isaac/driver/external/CL/cl_ext.h"
//CUDA Backend //CUDA Backend
#include "isaac/driver/external/CUDA/cuda.h" #include "isaac/driver/external/CUDA/cuda.h"
#include "isaac/driver/external/CUDA/nvrtc.h" #include "isaac/driver/external/CUDA/nvrtc.h"
#include "isaac/driver/external/CUDA/cublas.h" #include "isaac/driver/external/CUDA/cublas.h"
#include "isaac/driver/external/CUDA/cudnn.h"
#include "isaac/driver/external/CUDA/nvml.h"
//Exceptions //Exceptions
#include "isaac/driver/common.h"
#include <iostream> #include <iostream>
namespace isaac namespace isaac
@@ -48,211 +47,189 @@ template<class T> void check(T){}
void check(nvrtcResult err); void check(nvrtcResult err);
void check(CUresult err); void check(CUresult err);
void check(cublasStatus_t err); void check(cublasStatus_t err);
void check(cl_int err); void check(cudnnStatus_t err);
void check_destruction(CUresult); void check_destruction(CUresult);
class dispatch class dispatch
{ {
private: private:
template <class F> template <class F>
struct return_type; struct return_type;
template <class R, class... A> template <class R, class... A>
struct return_type<R (*)(A...)> struct return_type<R (*)(A...)>
{ typedef R type; }; { typedef R type; };
typedef bool (*f_init_t)(); typedef bool (*f_init_t)();
template<f_init_t initializer, typename FunPtrT, typename... Args> template<f_init_t initializer, typename FunPtrT, typename... Args>
static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args) static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
{ {
initializer(); initializer();
if(cache == nullptr) if(cache == nullptr)
cache = dlsym(lib_h, name); cache = dlsym(lib_h, name);
FunPtrT fptr; FunPtrT fptr;
*reinterpret_cast<void **>(&fptr) = cache; *reinterpret_cast<void **>(&fptr) = cache;
typename return_type<FunPtrT>::type res = (*fptr)(args...); typename return_type<FunPtrT>::type res = (*fptr)(args...);
check(res); check(res);
return res; return res;
} }
public: public:
static bool clinit(); static bool nvrtcinit();
static bool cublasinit(); static bool nvmlinit();
static bool nvrtcinit(); static bool cuinit();
static bool cuinit(); static bool cublasinit();
static bool cudnninit();
static void release(); static void release();
//OpenCL //CUDA
static cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *); static CUresult cuCtxGetCurrent(CUcontext *pctx);
static cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); static CUresult cuCtxDestroy_v2(CUcontext ctx);
static cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *); static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
static cl_int clReleaseMemObject(cl_mem); static CUresult cuDeviceGet(CUdevice *device, int ordinal);
static cl_int clFinish(cl_command_queue); static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
static cl_int clGetMemObjectInfo(cl_mem, cl_mem_info, size_t, void *, size_t *); static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
static cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *); static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
static cl_int clReleaseContext(cl_context); static CUresult cuMemFree_v2(CUdeviceptr dptr);
static cl_int clReleaseEvent(cl_event); static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
static cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); static CUresult cuDriverGetVersion(int *driverVersion);
static cl_int clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
static cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
static cl_int clReleaseDevice(cl_device_id);
static cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *);
static cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
static cl_int clGetContextInfo(cl_context, cl_context_info, size_t, void *, size_t *);
static cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *);
static cl_int clReleaseCommandQueue(cl_command_queue);
static cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *);
static cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *);
static cl_int clGetEventProfilingInfo(cl_event, cl_profiling_info, size_t, void *, size_t *);
static cl_program clCreateProgramWithBinary(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
static cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
static cl_int clRetainEvent(cl_event);
static cl_int clReleaseProgram(cl_program);
static cl_int clFlush(cl_command_queue);
static cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *);
static cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *);
static cl_int clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
static cl_kernel clCreateKernel(cl_program, const char *, cl_int *);
static cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *);
static cl_mem clCreateImage(cl_context, cl_mem_flags, const cl_image_format *, const cl_image_desc *, void *, cl_int *);
static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *);
static cl_int clReleaseKernel(cl_kernel);
static cl_int clEnqueueCopyBufferToImage(cl_command_queue, cl_mem, cl_mem, size_t, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
static cl_int clSetEventCallback(cl_event, cl_int, void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), void *);
//CUDA static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
static CUresult cuCtxDestroy_v2(CUcontext ctx); static CUresult cuModuleLoad(CUmodule *module, const char *fname);
static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
static CUresult cuDeviceGet(CUdevice *device, int ordinal); static CUresult cuModuleUnload(CUmodule hmod);
static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); static CUresult cuDeviceGetCount(int *count);
static CUresult cuMemFree_v2(CUdeviceptr dptr); static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); static CUresult cuInit(unsigned int Flags);
static CUresult cuDriverGetVersion(int *driverVersion); static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
static CUresult cuDeviceGetName(char *name, int len, CUdevice dev); static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
static CUresult cuModuleLoad(CUmodule *module, const char *fname); static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
static CUresult cuModuleUnload(CUmodule hmod); static CUresult cuStreamSynchronize(CUstream hStream);
static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); static CUresult cuStreamDestroy_v2(CUstream hStream);
static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); static CUresult cuEventDestroy_v2(CUevent hEvent);
static CUresult cuDeviceGetCount(int *count); static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
static CUresult cuInit(unsigned int Flags); static CUresult cuCtxGetDevice(CUdevice* result);
static CUresult cuEventRecord(CUevent hEvent, CUstream hStream); static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
static CUresult cuStreamSynchronize(CUstream hStream);
static CUresult cuStreamDestroy_v2(CUstream hStream);
static CUresult cuEventDestroy_v2(CUevent hEvent);
static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
static CUresult cuCtxGetDevice(CUdevice* result);
static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options); static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static cublasHandle_t cublasHandle(Context const & ctx); static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId); static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc); static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc); static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
static cublasHandle_t cublasHandle(Context const & ctx);
static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc);
static cudnnHandle_t cudnnHandle(Context const & ctx);
static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc);
static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w);
static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w);
static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode);
static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType);
static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo);
static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes);
static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
private: private:
static void* opencl_; static void* cuda_;
static void* cuda_; static void* nvrtc_;
static void* nvrtc_; static void* nvml_;
static void* cublas_; static void* cublas_;
static void* cudnn_;
//CUDA
static void* cuCtxGetCurrent_;
static void* cuCtxDestroy_v2_;
static void* cuEventCreate_;
static void* cuDeviceGet_;
static void* cuMemcpyDtoH_v2_;
static void* cuStreamCreate_;
static void* cuEventElapsedTime_;
static void* cuMemFree_v2_;
static void* cuMemcpyDtoHAsync_v2_;
static void* cuDriverGetVersion_;
static void* cuDeviceGetName_;
static void* cuDeviceGetPCIBusId_;
//OpenCL static void* cuMemcpyHtoDAsync_v2_;
static void* clBuildProgram_; static void* cuModuleLoad_;
static void* clEnqueueNDRangeKernel_; static void* cuLaunchKernel_;
static void* clSetKernelArg_; static void* cuModuleUnload_;
static void* clReleaseMemObject_; static void* cuModuleLoadDataEx_;
static void* clFinish_; static void* cuDeviceGetAttribute_;
static void* clGetMemObjectInfo_; static void* cuDeviceGetCount_;
static void* clGetCommandQueueInfo_; static void* cuMemcpyHtoD_v2_;
static void* clReleaseContext_; static void* cuInit_;
static void* clReleaseEvent_; static void* cuEventRecord_;
static void* clEnqueueWriteBuffer_; static void* cuCtxCreate_v2_;
static void* clEnqueueReadBuffer_; static void* cuModuleGetFunction_;
static void* clGetProgramBuildInfo_; static void* cuStreamSynchronize_;
static void* clReleaseDevice_; static void* cuStreamDestroy_v2_;
static void* clCreateContext_; static void* cuEventDestroy_v2_;
static void* clGetDeviceIDs_; static void* cuMemAlloc_v2_;
static void* clGetContextInfo_; static void* cuPointerGetAttribute_;
static void* clGetDeviceInfo_; static void* cuCtxGetDevice_;
static void* clReleaseCommandQueue_; static void* cuMemsetD8Async_;
static void* clGetPlatformIDs_; static void* cuCtxPushCurrent_v2_;
static void* clGetPlatformInfo_; static void* cuCtxPopCurrent_v2_;
static void* clGetEventProfilingInfo_;
static void* clCreateProgramWithBinary_;
static void* clCreateCommandQueue_;
static void* clRetainEvent_;
static void* clReleaseProgram_;
static void* clFlush_;
static void* clGetProgramInfo_;
static void* clGetKernelInfo_;
static void* clGetKernelWorkGroupInfo_;
static void* clCreateKernel_;
static void* clCreateBuffer_;
static void* clCreateImage_;
static void* clCreateProgramWithSource_;
static void* clReleaseKernel_;
static void* clEnqueueCopyBufferToImage_;
static void* clSetEventCallback_;
//CUDA static void* nvmlInit_v2_;
static void* cuCtxDestroy_v2_; static void* nvmlDeviceGetHandleByPciBusId_v2_;
static void* cuEventCreate_; static void* nvmlDeviceGetClockInfo_;
static void* cuDeviceGet_; static void* nvmlDeviceGetMaxClockInfo_;
static void* cuMemcpyDtoH_v2_;
static void* cuStreamCreate_;
static void* cuEventElapsedTime_;
static void* cuMemFree_v2_;
static void* cuMemcpyDtoHAsync_v2_;
static void* cuDriverGetVersion_;
static void* cuDeviceGetName_;
static void* cuMemcpyHtoDAsync_v2_;
static void* cuModuleLoad_;
static void* cuLaunchKernel_;
static void* cuModuleUnload_;
static void* cuModuleLoadDataEx_;
static void* cuDeviceGetAttribute_;
static void* cuDeviceGetCount_;
static void* cuMemcpyHtoD_v2_;
static void* cuInit_;
static void* cuEventRecord_;
static void* cuCtxCreate_v2_;
static void* cuModuleGetFunction_;
static void* cuStreamSynchronize_;
static void* cuStreamDestroy_v2_;
static void* cuEventDestroy_v2_;
static void* cuMemAlloc_v2_;
static void* cuPointerGetAttribute_;
static void* cuCtxGetDevice_;
static void* nvrtcCompileProgram_; static void* nvrtcCompileProgram_;
static void* nvrtcGetProgramLogSize_; static void* nvrtcGetProgramLogSize_;
static void* nvrtcGetPTX_; static void* nvrtcGetPTX_;
static void* nvrtcGetPTXSize_; static void* nvrtcGetPTXSize_;
static void* nvrtcCreateProgram_; static void* nvrtcCreateProgram_;
static void* nvrtcGetProgramLog_; static void* nvrtcGetProgramLog_;
static void* cublasCreate_v2_;
static void* cublasGetStream_v2_;
static void* cublasSetStream_v2_;
static void* cublasHgemm_;
static void* cublasSgemm_v2_;
static void* cublasDgemm_v2_;
static void* cudnnCreateConvolutionDescriptor_;
static void* cudnnCreateTensorDescriptor_;
static void* cudnnCreateFilterDescriptor_;
static void* cudnnCreate_;
static void* cudnnSetTensor4dDescriptor_;
static void* cudnnSetFilter4dDescriptor_;
static void* cudnnSetConvolution2dDescriptor_;
static void* cudnnSetConvolutionNdDescriptor_;
static void* cudnnGetConvolutionForwardAlgorithm_;
static void* cudnnGetConvolutionForwardWorkspaceSize_;
static void* cudnnConvolutionForward_;
static void* cudnnSetStream_;
static void* cublasCreate_v2_;
static void* cublasGetStream_v2_;
static void* cublasSetStream_v2_;
static void* cublasSgemm_v2_;
static void* cublasDgemm_v2_;
}; };
} }

View File

@@ -0,0 +1,224 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_EXCEPTION_DRIVER_H
#define ISAAC_EXCEPTION_DRIVER_H
#include <exception>
#include "isaac/driver/dispatch.h"
namespace isaac
{
namespace driver
{
namespace exception
{
// Exception types for NVRTC failures.
// Every class derives directly from std::exception, and its what() returns the
// literal prefix "NVRTC: Error- " followed by a short description.
// NOTE(review): the mapping from nvrtcResult codes to these classes is not in
// this header - presumably done by the driver's error-checking code; confirm.
namespace nvrtc
{

// Declares an exception class `name` whose what() yields "NVRTC: Error- " msg.
// `msg` must be a string literal, because it is joined to the prefix by
// adjacent-literal concatenation. what() is declared `noexcept override`
// instead of the deprecated `throw()` specification, which C++20 rejects.
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) \
  class name: public std::exception \
  { \
  public: \
    const char * what() const noexcept override { return "NVRTC: Error- " msg; } \
  }

ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory,             "out of memory");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure,  "program creation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input,             "invalid input");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program,           "invalid program");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option,            "invalid option");
ISAAC_CREATE_NVRTC_EXCEPTION(compilation,               "compilation");
ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure, "builtin operation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error,             "unknown error");

// The generator macro is an implementation detail; do not leak it to includers.
#undef ISAAC_CREATE_NVRTC_EXCEPTION

}
// Exception types for CUDA driver failures.
// All classes derive from cuda::base (itself a std::exception), so callers can
// catch every CUDA error with a single `catch(cuda::base const &)`.
// Each what() returns the literal prefix "CUDA: Error- " followed by a short
// description.
// NOTE(review): the mapping from CUresult codes to these classes is not in
// this header - presumably done by the driver's error-checking code; confirm.
namespace cuda
{

// Common base of every CUDA exception declared below.
class base: public std::exception{};

// Declares an exception class `name` (derived from base) whose what() yields
// "CUDA: Error- " msg. `msg` must be a string literal, because it is joined to
// the prefix by adjacent-literal concatenation. what() is declared
// `noexcept override` instead of the deprecated `throw()` specification,
// which C++20 rejects.
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) \
  class name: public base \
  { \
  public: \
    const char * what() const noexcept override { return "CUDA: Error- " msg; } \
  }

ISAAC_CREATE_CUDA_EXCEPTION(invalid_value,                  "invalid value");
ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory,                  "out of memory");
ISAAC_CREATE_CUDA_EXCEPTION(not_initialized,                "not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(deinitialized,                  "deinitialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled,              "profiler disabled");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized,       "profiler not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started,       "profiler already started");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped,       "profiler already stopped");
ISAAC_CREATE_CUDA_EXCEPTION(no_device,                      "no device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_device,                 "invalid device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_image,                  "invalid image");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_context,                "invalid context");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_current,        "context already current");
ISAAC_CREATE_CUDA_EXCEPTION(map_failed,                     "map failed");
ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed,                   "unmap failed");
ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped,                "array is mapped");
ISAAC_CREATE_CUDA_EXCEPTION(already_mapped,                 "already mapped");
ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu,              "no binary for gpu");
ISAAC_CREATE_CUDA_EXCEPTION(already_acquired,               "already acquired");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped,                     "not mapped");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array,            "not mapped as array");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer,          "not mapped as pointer");
ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable,              "ecc uncorrectable");
ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit,              "unsupported limit");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use,         "context already in use");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported,        "peer access unsupported");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx,                    "invalid ptx");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context,       "invalid graphics context");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_source,                 "invalid source");
ISAAC_CREATE_CUDA_EXCEPTION(file_not_found,                 "file not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found, "shared object symbol not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed,      "shared object init failed");
ISAAC_CREATE_CUDA_EXCEPTION(operating_system,               "operating system");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle,                 "invalid handle");
ISAAC_CREATE_CUDA_EXCEPTION(not_found,                      "not found");
ISAAC_CREATE_CUDA_EXCEPTION(not_ready,                      "not ready");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_address,                "illegal address");
ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources,        "launch out of resources");
ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout,                 "launch timeout");
ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing,  "launch incompatible texturing");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled,    "peer access already enabled");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled,        "peer access not enabled");
ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active,         "primary context active");
ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed,           "context is destroyed");
ISAAC_CREATE_CUDA_EXCEPTION(assert_error,                   "assert");
ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers,                 "too many peers");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered, "host memory already registered");
// Message previously read "hot memory not registered" (typo).
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered,     "host memory not registered");
ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error,           "hardware stack error");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction,            "illegal instruction");
ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address,             "misaligned address");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space,          "invalid address space");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc,                     "invalid pc");
ISAAC_CREATE_CUDA_EXCEPTION(launch_failed,                  "launch failed");
ISAAC_CREATE_CUDA_EXCEPTION(not_permitted,                  "not permitted");
ISAAC_CREATE_CUDA_EXCEPTION(not_supported,                  "not supported");
ISAAC_CREATE_CUDA_EXCEPTION(unknown,                        "unknown");

// The generator macro is an implementation detail; do not leak it to includers.
#undef ISAAC_CREATE_CUDA_EXCEPTION

}
// Exception types for cuBLAS failures.
// Every class derives directly from std::exception, and its what() returns the
// literal prefix "CUBLAS: Error- " followed by a short description.
// NOTE(review): the mapping from cublasStatus_t codes to these classes is not
// in this header - presumably done by the driver's error-checking code; confirm.
namespace cublas
{

// Declares an exception class `name` whose what() yields "CUBLAS: Error- " msg.
// `msg` must be a string literal, because it is joined to the prefix by
// adjacent-literal concatenation. what() is declared `noexcept override`
// instead of the deprecated `throw()` specification, which C++20 rejects.
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) \
  class name: public std::exception \
  { \
  public: \
    const char * what() const noexcept override { return "CUBLAS: Error- " msg; } \
  }

ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized,  "not initialized");
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed,     "alloc failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value,    "invalid value");
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch,    "arch mismatch");
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error,    "mapping error");
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error,   "internal error");
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported,    "not supported");
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error,    "license error");
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown,          "unknown");

// The generator macro is an implementation detail; do not leak it to includers.
#undef ISAAC_CREATE_CUBLAS_EXCEPTION

}
// Exception types for cuDNN failures.
// Every class derives directly from std::exception, and its what() returns the
// literal prefix "CUDNN: Error- " followed by a short description.
// NOTE(review): the mapping from cudnnStatus_t codes to these classes is not
// in this header - presumably done by the driver's error-checking code; confirm.
namespace cudnn
{

// Declares an exception class `name` whose what() yields "CUDNN: Error- " msg.
// `msg` must be a string literal, because it is joined to the prefix by
// adjacent-literal concatenation. what() is declared `noexcept override`
// instead of the deprecated `throw()` specification, which C++20 rejects.
#define ISAAC_CREATE_CUDNN_EXCEPTION(name, msg) \
  class name: public std::exception \
  { \
  public: \
    const char * what() const noexcept override { return "CUDNN: Error- " msg; } \
  }

ISAAC_CREATE_CUDNN_EXCEPTION(not_initialized,  "not initialized");
ISAAC_CREATE_CUDNN_EXCEPTION(alloc_failed,     "allocation failed");
ISAAC_CREATE_CUDNN_EXCEPTION(bad_param,        "bad param");
ISAAC_CREATE_CUDNN_EXCEPTION(internal_error,   "internal error");
ISAAC_CREATE_CUDNN_EXCEPTION(invalid_value,    "invalid value");
ISAAC_CREATE_CUDNN_EXCEPTION(arch_mismatch,    "arch mismatch");
ISAAC_CREATE_CUDNN_EXCEPTION(mapping_error,    "mapping error");
ISAAC_CREATE_CUDNN_EXCEPTION(execution_failed, "execution failed");
ISAAC_CREATE_CUDNN_EXCEPTION(not_supported,    "not supported");
ISAAC_CREATE_CUDNN_EXCEPTION(license_error,    "license error");

// Previously missing: without this the generator macro leaked into every
// translation unit that includes this header.
#undef ISAAC_CREATE_CUDNN_EXCEPTION

}
// Exception types for OpenCL failures.
// All classes derive from ocl::base (itself a std::exception), so callers can
// catch every OpenCL error with a single `catch(ocl::base const &)`.
// Each what() returns the literal prefix "OpenCL: Error- " followed by a short
// description.
// NOTE(review): the mapping from cl_int error codes to these classes is not in
// this header - presumably done by the driver's error-checking code; confirm.
namespace ocl
{

// Common base of every OpenCL exception declared below.
class base: public std::exception{};

// Declares an exception class `name` (derived from base) whose what() yields
// "OpenCL: Error- " msg. `msg` must be a string literal, because it is joined
// to the prefix by adjacent-literal concatenation. what() is declared
// `noexcept override` instead of the deprecated `throw()` specification,
// which C++20 rejects.
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) \
  class name: public base \
  { \
  public: \
    const char * what() const noexcept override { return "OpenCL: Error- " msg; } \
  }

ISAAC_CREATE_CL_EXCEPTION(device_not_found,                "device not found");
ISAAC_CREATE_CL_EXCEPTION(device_not_available,            "device not available");
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available,          "compiler not available");
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure,   "object allocation failure");
ISAAC_CREATE_CL_EXCEPTION(out_of_resources,                "launch out of resources");
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory,              "out of host memory");
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available,    "profiling info not available");
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap,                "mem copy overlap");
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch,           "image format mismatch");
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported,      "image format not supported");
ISAAC_CREATE_CL_EXCEPTION(build_program_failure,           "build program failure");
ISAAC_CREATE_CL_EXCEPTION(map_failure,                     "map failure");
ISAAC_CREATE_CL_EXCEPTION(invalid_value,                   "invalid value");
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type,             "invalid device type");
ISAAC_CREATE_CL_EXCEPTION(invalid_platform,                "invalid platform");
ISAAC_CREATE_CL_EXCEPTION(invalid_device,                  "invalid device");
ISAAC_CREATE_CL_EXCEPTION(invalid_context,                 "invalid context");
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties,        "invalid queue properties");
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue,           "invalid command queue");
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr,                "invalid host pointer");
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object,              "invalid mem object");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size,              "invalid image size");
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler,                 "invalid sampler");
ISAAC_CREATE_CL_EXCEPTION(invalid_binary,                  "invalid binary");
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options,           "invalid build options");
ISAAC_CREATE_CL_EXCEPTION(invalid_program,                 "invalid program");
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable,      "invalid program executable");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name,             "invalid kernel name");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition,       "invalid kernel definition");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel,                  "invalid kernel");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index,               "invalid arg index");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value,               "invalid arg value");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size,                "invalid arg size");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args,             "invalid kernel args");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension,          "invalid work dimension");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size,         "invalid work group size");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size,          "invalid work item size");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset,           "invalid global offset");
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list,         "invalid event wait list");
ISAAC_CREATE_CL_EXCEPTION(invalid_event,                   "invalid event");
ISAAC_CREATE_CL_EXCEPTION(invalid_operation,               "invalid operation");
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object,               "invalid GL object");
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size,             "invalid buffer size");
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level,               "invalid MIP level");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size,        "invalid global work size");
// Only declared when the included OpenCL headers define CL_INVALID_PROPERTY.
#ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property,                "invalid property");
#endif

// Previously missing: without this the generator macro leaked into every
// translation unit that includes this header.
#undef ISAAC_CREATE_CL_EXCEPTION

}
}
}
}
#endif

View File

@@ -23,8 +23,6 @@
#ifndef ISAAC_DRIVER_EVENT_H #ifndef ISAAC_DRIVER_EVENT_H
#define ISAAC_DRIVER_EVENT_H #define ISAAC_DRIVER_EVENT_H
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include "isaac/driver/handle.h" #include "isaac/driver/handle.h"
namespace isaac namespace isaac
@@ -34,26 +32,14 @@ namespace driver
{ {
// Event // Event
class ISAACAPI Event: public has_handle_comparators<Event> class Event: public Handle<cu_event_t>
{ {
private: private:
friend class CommandQueue; typedef Handle<cu_event_t> base_type;
public: public:
typedef Handle<cl_event, cu_event_t> handle_type; using base_type::base_type;
float elapsed_time() const;
public:
//Constructors
Event(cl_event const & event, bool take_ownership = true);
Event(backend_type backend);
//Accessors
handle_type const & handle() const;
//Profiling
long elapsed_time() const;
private:
backend_type backend_;
handle_type h_;
}; };
} }

File diff suppressed because it is too large Load Diff

View File

@@ -1,346 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2013 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include "isaac/driver/external/CL/cl_ext.h"
/* cl_khr_fp64 extension - no extension #define since it has no functions */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
* clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
/* Extension: cl_khr_image2D_buffer
*
* This extension allows a 2D image to be created from a cl_mem buffer without a copy.
* The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
* Both the sampler and sampler-less read_image built-in functions are supported for 2D images
* and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
* for 2D images created from a buffer.
*
* When the 2D image from buffer is created, the client must specify the width,
* height, image format (i.e. channel order and channel data type) and optionally the row pitch
*
* The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
* The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
*/
/**************************************
* cl_khr_initialize_memory extension *
**************************************/
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x200E
/**************************************
* cl_khr_terminate_context extension *
**************************************/
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x200F
#define CL_CONTEXT_TERMINATE_KHR 0x2010
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
/*
* Extension: cl_khr_spir
*
* This extension adds support to create an OpenCL program object from a
* Standard Portable Intermediate Representation (SPIR) instance
*/
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_memory_flags *
*********************************/
#define cl_amd_device_memory_flags 1
#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6) // Alloc from GPU's CPU visible heap
/* cl_device_info */
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
/* Two overlapping views of the same storage; both start with a cl_uint `type`.
 * NOTE(review): presumably the value returned by a CL_DEVICE_TOPOLOGY_AMD
 * device query, with `type` == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD selecting the
 * `pcie` view - confirm against the cl_amd_device_attribute_query spec.
 */
typedef union
{
/* Untyped view: the type tag followed by five cl_uint words of payload. */
struct { cl_uint type; cl_uint data[5]; } raw;
/* PCIe view: the type tag, 17 unused bytes, then bus / device / function. */
struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
} cl_device_topology_amd;
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1
/**************************
* cl_amd_offline_devices *
**************************/
#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F
#ifdef CL_VERSION_1_1
/***********************************
* cl_ext_device_fission extension *
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
/* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
* no extension #define since they have no functions
*/
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
/*********************************
* cl_qcom_ext_host_ptr extension
*********************************/
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
typedef cl_uint cl_image_pitch_info_qcom;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceImageInfoQCOM(cl_device_id device,
size_t image_width,
size_t image_height,
const cl_image_format *image_format,
cl_image_pitch_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
// Descriptor for an externally allocated host buffer, declared in the
// cl_qcom_ext_host_ptr section of this header. cl_mem_ion_host_ptr embeds it
// as its first member.
// NOTE(review): presumably passed as host_ptr together with
// CL_MEM_EXT_HOST_PTR_QCOM - confirm against the extension spec.
typedef struct _cl_mem_ext_host_ptr
{
// Type of external memory allocation.
// Legal values will be defined in layered extensions.
cl_uint allocation_type;
// Host cache policy for this external memory allocation.
// NOTE(review): presumably one of the CL_MEM_HOST_*_QCOM values defined
// earlier in this section - confirm.
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
/*********************************
* cl_qcom_ion_host_ptr extension
*********************************/
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
// Descriptor for an ION-backed host allocation (cl_qcom_ion_host_ptr section).
// It begins with the generic cl_mem_ext_host_ptr header and adds the ION file
// descriptor and the host mapping of that allocation.
typedef struct _cl_mem_ion_host_ptr
{
// Type of external memory allocation.
// Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations.
cl_mem_ext_host_ptr ext_host_ptr;
// ION file descriptor
int ion_filedesc;
// Host pointer to the ION allocated memory
void* ion_hostptr;
} cl_mem_ion_host_ptr;
#endif /* CL_VERSION_1_1 */
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

File diff suppressed because it is too large Load Diff

View File

@@ -57,7 +57,7 @@
#if !defined(CUBLAS_H_) #if !defined(CUBLAS_H_)
#define CUBLAS_H_ #define CUBLAS_H_
#include <cuda_runtime.h> #include "cuda_runtime.h"
#ifndef CUBLASWINAPI #ifndef CUBLASWINAPI
#ifdef _WIN32 #ifdef _WIN32

File diff suppressed because it is too large Load Diff

1651
include/isaac/driver/external/CUDA/cudnn.h vendored Normal file

File diff suppressed because it is too large Load Diff

4406
include/isaac/driver/external/CUDA/nvml.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -24,10 +24,11 @@
#define ISAAC_DRIVER_HANDLE_H #define ISAAC_DRIVER_HANDLE_H
#include <memory> #include <memory>
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include <iostream> #include <iostream>
#include <functional>
#include <type_traits>
#include "isaac/driver/dispatch.h"
namespace isaac namespace isaac
{ {
@@ -35,68 +36,59 @@ namespace driver
{ {
struct cu_event_t{ struct cu_event_t{
operator bool() const { return first && second; } operator bool() const { return first && second; }
CUevent first; CUevent first;
CUevent second; CUevent second;
}; };
template<class CLType, class CUType> struct cu_platform{
class ISAACAPI Handle cu_platform() : status_(dispatch::cuInit(0)) {}
{ operator bool() const { return status_; }
private: private:
static void _delete(CUcontext x); CUresult status_;
static void _delete(CUdeviceptr x); };
static void _delete(CUstream x);
static void _delete(CUdevice);
static void _delete(CUevent x);
static void _delete(CUfunction);
static void _delete(CUmodule x);
static void _delete(cu_event_t x);
static void release(cl_context x); template<typename T> struct remove_class { };
static void release(cl_mem x); template<typename C, typename R, typename... A>
static void release(cl_command_queue x); struct remove_class<R(C::*)(A...)> { using type = R(A...); };
static void release(cl_device_id x); template<typename C, typename R, typename... A>
static void release(cl_event x); struct remove_class<R(C::*)(A...) const> { using type = R(A...); };
static void release(cl_kernel x); template<typename C, typename R, typename... A>
static void release(cl_program x); struct remove_class<R(C::*)(A...) volatile> { using type = R(A...); };
template<typename C, typename R, typename... A>
struct remove_class<R(C::*)(A...) const volatile> { using type = R(A...); };
template<typename T>
struct get_signature_impl { using type = typename remove_class<
decltype(&std::remove_reference<T>::type::operator())>::type; };
template<typename R, typename... A>
struct get_signature_impl<R(A...)> { using type = R(A...); };
template<typename R, typename... A>
struct get_signature_impl<R(&)(A...)> { using type = R(A...); };
template<typename R, typename... A>
struct get_signature_impl<R(*)(A...)> { using type = R(A...); };
template<typename T> using get_signature = typename get_signature_impl<T>::type;
template<class CUType>
class Handle
{
public: public:
//Constructors //Constructors
Handle(backend_type backend, bool take_ownership = true); Handle(CUType cu, bool take_ownership = true);
Handle(bool take_ownership = true);
~Handle();
//Comparison //Comparison
bool operator==(Handle const & other) const; bool operator==(Handle const & other) const;
bool operator!=(Handle const & other) const; bool operator!=(Handle const & other) const;
bool operator<(Handle const & other) const; bool operator<(Handle const & other) const;
//Accessors //Accessors
backend_type backend() const; operator CUType() const;
CLType & cl();
CLType const & cl() const;
CUType & cu();
CUType const & cu() const;
~Handle();
private: protected:
DISABLE_MSVC_WARNING_C4251
std::shared_ptr<CLType> cl_;
std::shared_ptr<CUType> cu_; std::shared_ptr<CUType> cu_;
RESTORE_MSVC_WARNING_C4251
private:
backend_type backend_;
bool has_ownership_; bool has_ownership_;
}; };
//Helper for automatic implementation of comparison operators
template<class T>
class has_handle_comparators
{
public:
friend bool operator==(T const & x, T const & y) { return x.handle() == y.handle(); }
friend bool operator!=(T const & x, T const & y) { return x.handle() != y.handle(); }
friend bool operator<(T const & x, T const & y) { return x.handle() < y.handle(); }
};
} }
} }

View File

@@ -23,11 +23,8 @@
#ifndef ISAAC_DRIVER_KERNEL_H #ifndef ISAAC_DRIVER_KERNEL_H
#define ISAAC_DRIVER_KERNEL_H #define ISAAC_DRIVER_KERNEL_H
#include "isaac/defines.h" #include "isaac/driver/module.h"
#include "isaac/driver/common.h"
#include "isaac/driver/program.h"
#include "isaac/driver/handle.h" #include "isaac/driver/handle.h"
#include "isaac/value_scalar.h"
#include <memory> #include <memory>
@@ -40,30 +37,25 @@ namespace driver
class Buffer; class Buffer;
// Kernel // Kernel
class ISAACAPI Kernel: public has_handle_comparators<Kernel> class Kernel: public Handle<CUfunction>
{ {
friend class CommandQueue;
public: public:
typedef Handle<cl_kernel, CUfunction> handle_type; typedef Handle<CUfunction> base_type;
public: public:
//Constructors //Constructors
Kernel(Program const & program, const char * name); Kernel(Module const & program, const char * name);
//Accessors
handle_type const & handle() const;
//Arguments setters //Arguments setters
void setArg(unsigned int index, value_scalar const & scal);
void setArg(unsigned int index, std::size_t size, void* ptr); void setArg(unsigned int index, std::size_t size, void* ptr);
void setArg(unsigned int index, Buffer const &); void setArg(unsigned int index, Buffer const &);
void setSizeArg(unsigned int index, std::size_t N);
template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); } template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
//Arguments getters
void* const* cu_params() const;
private: private:
backend_type backend_; Module program_;
unsigned int address_bits_; unsigned int address_bits_;
std::vector<std::shared_ptr<void> > cu_params_store_; std::vector<std::shared_ptr<void> > cu_params_store_;
std::vector<void*> cu_params_; std::vector<void*> cu_params_;
handle_type h_;
}; };
} }

View File

@@ -20,28 +20,38 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
#include <string> #ifndef ISAAC_DRIVER_MODULE_H
#define ISAAC_DRIVER_MODULE_H
#include <map>
#include "isaac/driver/handle.h"
#include "isaac/driver/context.h"
namespace isaac namespace isaac
{ {
namespace tools
namespace driver
{ {
inline void cpuid(int code, int *a, int *b, int *c, int *d) { class Context;
__asm__ __volatile__("cpuid":"=a"(*a),"=b"(*b), class Device;
"=c"(*c),"=d"(*d):"a"(code));
}
inline std::string cpu_brand(){ class Module: public Handle<CUmodule>
char name[48]; {
int* ptr = (int*)name; typedef Handle<CUmodule> base_type;
cpuid(0x80000002, ptr, ptr+1, ptr+2, ptr+3);
cpuid(0x80000003, ptr+4, ptr+5, ptr+6, ptr+7);
cpuid(0x80000004, ptr+8, ptr+9, ptr+10, ptr+11);
return std::string(name, name+48);
}
public:
Module(Context const & context, std::string const & source, bool is_ir = true);
Context const & context() const;
private:
Context context_;
std::string source_;
};
} }
} }
#endif

View File

@@ -26,8 +26,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include "isaac/defines.h" #include "isaac/driver/handle.h"
#include "isaac/driver/common.h"
namespace isaac namespace isaac
{ {
@@ -37,20 +36,15 @@ namespace driver
class Device; class Device;
class ISAACAPI Platform class Platform: public Handle<cu_platform>
{ {
typedef Handle<cu_platform> base_type;
public: public:
//Constructors using base_type::base_type;
Platform(backend_type);
Platform(cl_platform_id const &);
//Accessors //Accessors
std::string name() const; std::string name() const;
std::string version() const; std::string version() const;
void devices(std::vector<Device> &) const; std::vector<Device> devices() const;
cl_platform_id cl_id() const;
private:
backend_type backend_;
cl_platform_id cl_platform_;
}; };
} }

View File

@@ -1,70 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_PROGRAM_H
#define ISAAC_DRIVER_PROGRAM_H
#include <map>
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include "isaac/driver/handle.h"
#include "isaac/driver/context.h"
namespace isaac
{
namespace driver
{
class Context;
class Device;
// Driver-level program object holding either an OpenCL cl_program or a CUDA
// CUmodule (see handle_type), together with the context and source it was
// created from.
// NOTE(review): has_handle_comparators<Program> presumably supplies ==, != and <
// by comparing handle() -- confirm in handle.h.
class ISAACAPI Program: public has_handle_comparators<Program>
{
public:
// Dual-backend handle type stored in h_
typedef Handle<cl_program, CUmodule> handle_type;
private:
// Kernel is granted access to the private members below
friend class Kernel;
public:
//Constructors
// Creates a program for `context` from the text in `source`.
// NOTE(review): whether compilation happens eagerly here is not visible in this header -- confirm in the implementation
Program(Context const & context, std::string const & source);
//Accessors
// Underlying backend handle
handle_type const & handle() const;
// Context this program was created for
Context const & context() const;
private:
DISABLE_MSVC_WARNING_C4251
// Members are declared in this order; C++ initialises them in declaration order
backend_type backend_;
Context context_;
std::string source_;
handle_type h_;
RESTORE_MSVC_WARNING_C4251
};
}
}
#endif

View File

@@ -1,59 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_PROGRAM_CACHE_H
#define ISAAC_DRIVER_PROGRAM_CACHE_H
#include <map>
#include "isaac/defines.h"
#include "isaac/driver/program.h"
namespace isaac
{
namespace driver
{
// Cache of Program objects indexed by a string name.
class ISAACAPI ProgramCache
{
// `backend` may access the private members of the cache
friend class backend;
public:
//Clearing the cache
void clear();
//Adding a program to the cache
// Takes the context, a cache key (`name`) and the program source; returns a reference to a Program.
// NOTE(review): behaviour when `name` is already cached is not visible here -- check the implementation
Program & add(Context const & context, std::string const & name, std::string const & src);
//Finding a program in the cache
// NOTE(review): presumably returns a null pointer when `name` is not cached -- confirm in the implementation
Program const *find(std::string const & name);
private:
DISABLE_MSVC_WARNING_C4251
// name -> program
std::map<std::string, Program> cache_;
RESTORE_MSVC_WARNING_C4251
};
}
}
#endif

View File

@@ -20,12 +20,10 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
#ifndef ISAAC_DRIVER_COMMAND_QUEUE_H #ifndef ISAAC_DRIVER_STREAM_H
#define ISAAC_DRIVER_COMMAND_QUEUE_H #define ISAAC_DRIVER_STREAM_H
#include <map> #include <map>
#include "isaac/defines.h"
#include "isaac/driver/common.h"
#include "isaac/driver/context.h" #include "isaac/driver/context.h"
#include "isaac/driver/device.h" #include "isaac/driver/device.h"
#include "isaac/driver/handle.h" #include "isaac/driver/handle.h"
@@ -38,40 +36,29 @@ namespace driver
class Kernel; class Kernel;
class Event; class Event;
class NDRange; class Range;
class Buffer; class Buffer;
// Command Queue // Command Queue
class ISAACAPI CommandQueue: public has_handle_comparators<CommandQueue> class Stream: public Handle<CUstream>
{ {
public: typedef Handle<CUstream> base_type;
typedef Handle<cl_command_queue, CUstream> handle_type;
public: public:
//Constructors //Constructors
CommandQueue(cl_command_queue const & queue, bool take_ownership = true); using base_type::base_type;
CommandQueue(Context const & context, Device const & device, cl_command_queue_properties properties = 0); Stream(Context const & context);
//Accessors //Accessors
handle_type & handle();
handle_type const & handle() const;
backend_type backend() const;
Context const & context() const; Context const & context() const;
Device const & device() const;
//Synchronize //Synchronize
void synchronize(); void synchronize();
//Profiling
void enable_profiling();
void disable_profiling();
//Enqueue calls //Enqueue calls
void enqueue(Kernel const & kernel, NDRange global, driver::NDRange local, std::vector<Event> const *, Event *event); void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr); void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
private: private:
backend_type backend_;
Context context_; Context context_;
Device device_;
handle_type h_;
}; };

View File

@@ -1,82 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_EXCEPTION_API_H
#define ISAAC_EXCEPTION_API_H
#include <string>
#include <exception>
#include "isaac/defines.h"
namespace isaac
{
/** @brief Exception for the case the generator is unable to deal with the operation */
// NOTE(review): unlike unknown_datatype and semantic_error in this header, this
// class is not marked ISAACAPI although its members are defined out of line --
// confirm whether the omission is intentional.
DISABLE_MSVC_WARNING_C4275
class operation_not_supported_exception : public std::exception
{
public:
operation_not_supported_exception();
// Takes the message by value.
// NOTE(review): presumably stored in message_ and returned by what() -- confirm in the implementation
operation_not_supported_exception(std::string message);
virtual const char* what() const throw();
private:
DISABLE_MSVC_WARNING_C4251
std::string message_;
RESTORE_MSVC_WARNING_C4251
};
RESTORE_MSVC_WARNING_C4275
/** @brief Exception for an unrecognized data type */
DISABLE_MSVC_WARNING_C4275
class ISAACAPI unknown_datatype : public std::exception
{
public:
// NOTE(review): the int is presumably the offending data-type identifier,
// used to build message_ -- confirm in the implementation
unknown_datatype(int);
virtual const char* what() const throw();
private:
DISABLE_MSVC_WARNING_C4251
std::string message_;
RESTORE_MSVC_WARNING_C4251
};
RESTORE_MSVC_WARNING_C4275
/** @brief Exception signalling a semantic error, described by the message given at construction */
DISABLE_MSVC_WARNING_C4275
class ISAACAPI semantic_error : public std::exception
{
public:
// NOTE(review): `message` is presumably stored in message_ and returned by what() -- confirm in the implementation
semantic_error(std::string const & message);
virtual const char* what() const throw();
private:
DISABLE_MSVC_WARNING_C4251
std::string message_;
RESTORE_MSVC_WARNING_C4251
};
RESTORE_MSVC_WARNING_C4275
}
#endif

View File

@@ -1,216 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_EXCEPTION_DRIVER_H
#define ISAAC_EXCEPTION_DRIVER_H
#include <exception>
#include "isaac/driver/dispatch.h"
#include "isaac/defines.h"
DISABLE_MSVC_WARNING_C4275
namespace isaac
{
namespace exception
{
// Exception carrying the name of an architecture that was not recognised.
class ISAACAPI unknown_architecture: public std::exception
{
public:
  // Builds the message by prefixing `msg` with "Unrecognized architecture: ".
  unknown_architecture(std::string const & msg)
    : msg_(std::string("Unrecognized architecture: ").append(msg))
  {}

  // Returns the stored message; the pointer stays valid while this object lives.
  const char * what() const throw()
  {
    return msg_.c_str();
  }

private:
  std::string msg_;
};
namespace nvrtc
{
#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } }
ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory");
ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program");
ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option ,"invalid option");
ISAAC_CREATE_NVRTC_EXCEPTION(compilation ,"compilation");
ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure ,"builtin operation failure");
ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error");
#undef ISAAC_CREATE_NVRTC_EXCEPTION
}
namespace cuda
{

// Common base of every CUDA driver exception, so callers can catch them all at once.
class base: public std::exception{};

// Generates one exception class per CUDA driver error. what() returns the
// literal "CUDA: Error- " immediately followed by the description given here.
#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class ISAACAPI name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } }

ISAAC_CREATE_CUDA_EXCEPTION(invalid_value,                  "invalid value");
ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory,                  "out of memory");
ISAAC_CREATE_CUDA_EXCEPTION(not_initialized,                "not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(deinitialized,                  "deinitialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled,              "profiler disabled");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized,       "profiler not initialized");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started,       "profiler already started");
ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped,       "profiler already stopped");
ISAAC_CREATE_CUDA_EXCEPTION(no_device,                      "no device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_device,                 "invalid device");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_image,                  "invalid image");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_context,                "invalid context");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_current,        "context already current");
ISAAC_CREATE_CUDA_EXCEPTION(map_failed,                     "map failed");
ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed,                   "unmap failed");
ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped,                "array is mapped");
ISAAC_CREATE_CUDA_EXCEPTION(already_mapped,                 "already mapped");
ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu,              "no binary for gpu");
ISAAC_CREATE_CUDA_EXCEPTION(already_acquired,               "already acquired");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped,                     "not mapped");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array,            "not mapped as array");
ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer,          "not mapped as pointer");
ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable,              "ecc uncorrectable");
ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit,              "unsupported limit");
ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use,         "context already in use");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported,        "peer access unsupported");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx,                    "invalid ptx");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context,       "invalid graphics context");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_source,                 "invalid source");
ISAAC_CREATE_CUDA_EXCEPTION(file_not_found,                 "file not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found, "shared object symbol not found");
ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed,      "shared object init failed");
ISAAC_CREATE_CUDA_EXCEPTION(operating_system,               "operating system");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle,                 "invalid handle");
ISAAC_CREATE_CUDA_EXCEPTION(not_found,                      "not found");
ISAAC_CREATE_CUDA_EXCEPTION(not_ready,                      "not ready");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_address,                "illegal address");
ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources,        "launch out of resources");
ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout,                 "launch timeout");
ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing,  "launch incompatible texturing");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled,    "peer access already enabled");
ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled,        "peer access not enabled");
ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active,         "primary context active");
ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed,           "context is destroyed");
ISAAC_CREATE_CUDA_EXCEPTION(assert_error,                   "assert");
ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers,                 "too many peers");
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered, "host memory already registered");
// Message previously read "hot memory not registered"; corrected to match the class name.
ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered,     "host memory not registered");
ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error,           "hardware stack error");
ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction,            "illegal instruction");
ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address,             "misaligned address");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space,          "invalid address space");
ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc,                     "invalid pc");
ISAAC_CREATE_CUDA_EXCEPTION(launch_failed,                  "launch failed");
ISAAC_CREATE_CUDA_EXCEPTION(not_permitted,                  "not permitted");
ISAAC_CREATE_CUDA_EXCEPTION(not_supported,                  "not supported");
ISAAC_CREATE_CUDA_EXCEPTION(unknown,                        "unknown");

// The generator macro is local to this header.
#undef ISAAC_CREATE_CUDA_EXCEPTION

}
namespace cublas
{
#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class ISAACAPI name: public std::exception { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } }
ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized");
ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value");
ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch");
ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error");
ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed");
ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error");
ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported");
ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error");
ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown");
#undef ISAAC_CREATE_CUBLAS_EXCEPTION
}
namespace ocl
{

// Common base of every OpenCL exception, so callers can catch them all at once.
class ISAACAPI base: public std::exception{};

// Generates one exception class per OpenCL error. what() returns the literal
// "OpenCL: Error- " immediately followed by the description given here.
#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class ISAACAPI name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }

ISAAC_CREATE_CL_EXCEPTION(device_not_found,                "device not found");
ISAAC_CREATE_CL_EXCEPTION(device_not_available,            "device not available");
ISAAC_CREATE_CL_EXCEPTION(compiler_not_available,          "compiler not available");
ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure,   "object allocation failure");
ISAAC_CREATE_CL_EXCEPTION(out_of_resources,                "launch out of resources");
ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory,              "out of host memory");
ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available,    "profiling info not available");
ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap,                "mem copy overlap");
ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch,           "image format mismatch");
ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported,      "image format not supported");
ISAAC_CREATE_CL_EXCEPTION(build_program_failure,           "build program failure");
ISAAC_CREATE_CL_EXCEPTION(map_failure,                     "map failure");
ISAAC_CREATE_CL_EXCEPTION(invalid_value,                   "invalid value");
ISAAC_CREATE_CL_EXCEPTION(invalid_device_type,             "invalid device type");
ISAAC_CREATE_CL_EXCEPTION(invalid_platform,                "invalid platform");
ISAAC_CREATE_CL_EXCEPTION(invalid_device,                  "invalid device");
ISAAC_CREATE_CL_EXCEPTION(invalid_context,                 "invalid context");
ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties,        "invalid queue properties");
ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue,           "invalid command queue");
ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr,                "invalid host pointer");
ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object,              "invalid mem object");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
ISAAC_CREATE_CL_EXCEPTION(invalid_image_size,              "invalid image size");
ISAAC_CREATE_CL_EXCEPTION(invalid_sampler,                 "invalid sampler");
ISAAC_CREATE_CL_EXCEPTION(invalid_binary,                  "invalid binary");
ISAAC_CREATE_CL_EXCEPTION(invalid_build_options,           "invalid build options");
ISAAC_CREATE_CL_EXCEPTION(invalid_program,                 "invalid program");
ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable,      "invalid program executable");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name,             "invalid kernel name");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition,       "invalid kernel definition");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel,                  "invalid kernel");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index,               "invalid arg index");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value,               "invalid arg value");
ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size,                "invalid arg size");
ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args,             "invalid kernel args");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension,          "invalid work dimension");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size,         "invalid work group size");
ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size,          "invalid work item size");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset,           "invalid global offset");
ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list,         "invalid event wait list");
ISAAC_CREATE_CL_EXCEPTION(invalid_event,                   "invalid event");
ISAAC_CREATE_CL_EXCEPTION(invalid_operation,               "invalid operation");
ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object,               "invalid GL object");
ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size,             "invalid buffer size");
ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level,               "invalid MIP level");
ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size,        "invalid global work size");
// Only declared when the OpenCL headers in use define CL_INVALID_PROPERTY.
#ifdef CL_INVALID_PROPERTY
ISAAC_CREATE_CL_EXCEPTION(invalid_property,                "invalid property");
#endif

// Keep the generator macro local to this header, like the NVRTC, CUDA and
// CUBLAS generators in the same file; previously it leaked into every includer.
#undef ISAAC_CREATE_CL_EXCEPTION

}
}
}
RESTORE_MSVC_WARNING_C4275
#endif

2909
include/isaac/external/half.hpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,118 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_base_
#define ISAAC_TEMPLATES_base_
#include <list>
#include <set>
#include <cmath>
#include <stdint.h>
#include "isaac/types.h"
#include "isaac/jit/generation/engine/stream.h"
#include "isaac/runtime/handler.h"
#include "isaac/jit/syntax/engine/binder.h"
#include "isaac/jit/syntax/engine/object.h"
namespace isaac
{
namespace templates
{
//Error codes
// Integer status codes for template validation: TEMPLATE_VALID is 0 and each
// failure reason has its own distinct negative value (-1 .. -20).
// NOTE(review): presumably these are the values returned by is_invalid() -- confirm in the implementation.
static const int TEMPLATE_VALID = 0;
static const int TEMPLATE_LOCAL_MEMORY_OVERFLOW = -1;
static const int TEMPLATE_WORK_GROUP_SIZE_OVERFLOW = -2;
static const int TEMPLATE_LOCAL_SIZE_0_OVERFLOW = -3;
static const int TEMPLATE_LOCAL_SIZE_1_OVERFLOW = -4;
static const int TEMPLATE_LOCAL_SIZE_2_OVERFLOW = -5;
static const int TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE = -6;
static const int TEMPLATE_INVALID_SIMD_WIDTH = -7;
static const int TEMPLATE_ALIGNMENT_MUST_BE_BLOCK_SIZE_MULTIPLE = -8;
static const int TEMPLATE_INVALID_FETCHING_POLICY_TYPE= -9;
static const int TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH = -10;
static const int TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE = -11;
static const int TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL = -12;
static const int TEMPLATE_SIMD_WIDTH_MUST_BE_ONE = -13;
static const int TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT = -14;
static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE = -15;
static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE = -16;
static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE = -17;
static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE = -18;
static const int TEMPLATE_TEMPORARY_TOO_LARGE = -19;
static const int TEMPLATE_BLOCK_SIZE_TOO_LARGE = -20;
// Abstract interface shared by all kernel templates.
// Inherits std::enable_shared_from_this<base>.
// NOTE(review): getptr() presumably wraps shared_from_this(), which requires the
// object to be owned by a std::shared_ptr -- confirm in the implementation.
class base: public std::enable_shared_from_this<base>
{
private:
// Source-generation hook that concrete templates must implement.
// NOTE(review): generate() presumably builds the symbols table and forwards to this -- confirm in the implementation
virtual std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const = 0;
public:
base();
virtual ~base();
// Overridable resource queries for an expression tree.
// NOTE(review): units (bytes, elements, register count) are not visible in this header
virtual unsigned int temporary_workspace(expression_tree const &) const;
virtual unsigned int lmem_usage(expression_tree const &) const;
virtual unsigned int registers_usage(expression_tree const &) const;
// Pure virtual: sizes of the inputs of `expressions`
virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
// Pure virtual: returns an int status for `expressions` on `device`
// NOTE(review): presumably one of the TEMPLATE_* codes declared above -- confirm
virtual int is_invalid(expression_tree const & expressions, driver::Device const & device) const = 0;
// Pure virtual: enqueues the template's work on `queue` using `program`
virtual void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & expressions) = 0;
// Pure virtual: kind of expression this template handles
virtual expression_type type() const = 0;
// Public, non-virtual entry point for source generation
std::string generate(std::string const & suffix, expression_tree const & expressions, driver::Device const & device);
std::shared_ptr<base> getptr();
};
// Intermediate base that provides generate_impl and the three resource queries.
// It remains abstract: input_sizes is re-declared pure here, and is_invalid,
// enqueue and type from `base` are not overridden.
// NOTE(review): the name suggests templates backed by an external library -- confirm against the subclasses.
class external_base: public base
{
private:
// Concrete override of base::generate_impl (no longer pure)
virtual std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
public:
external_base();
virtual unsigned int temporary_workspace(expression_tree const &) const;
virtual unsigned int lmem_usage(expression_tree const &) const;
virtual unsigned int registers_usage(expression_tree const &) const;
// Still pure: subclasses must supply the input sizes
virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const = 0;
};
/**
 * @brief Specialisation of base that stores three tuning parameters:
 *        vwidth_, ls0_ and ls1_.
 *
 * NOTE(review): the constructor takes _ls0/_ls1 as int_t while the members and
 * the ls0()/ls1() accessors are unsigned int, so a conversion happens somewhere
 * in the out-of-line constructor - confirm that negative values cannot reach it.
 */
class parameterized_base : public base
{
private:
// Virtual hook with a non-pure declaration; subclasses in this code base re-declare it.
// Note the argument order (device, tree) is the reverse of is_invalid() (tree, device).
virtual int is_invalid_impl(driver::Device const &, expression_tree const &) const;
public:
parameterized_base(unsigned int _vwidth, int_t _ls0, int_t _ls1);
unsigned int ls0() const;
unsigned int ls1() const;
/** @brief Returns whether or not the profile has undefined behavior on a particular device. */
int is_invalid(expression_tree const & expressions, driver::Device const & device) const;
protected:
// Accessible to subclasses.
unsigned int vwidth_;
unsigned int ls0_;
unsigned int ls1_;
};
}
}
#endif

View File

@@ -1,49 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_TEMPLATES_VAXPY_H
#define ISAAC_BACKEND_TEMPLATES_VAXPY_H
#include "isaac/jit/generation/base.h"
namespace isaac
{
namespace templates
{
/**
 * @brief Kernel template derived from parameterized_base with one extra
 *        parameter, ng.
 *
 * NOTE(review): the name suggests element-wise operations on 1-D expressions,
 * and ng is presumably a number of work-groups - neither is visible from this header.
 */
class elementwise_1d : public parameterized_base
{
private:
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & symbols) const;
public:
elementwise_1d(unsigned int vwidth, unsigned int ls, unsigned int ng);
// Same signatures as the pure virtuals input_sizes(), enqueue() and type() of base.
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
expression_type type() const;
private:
unsigned int ng_;
};
}
}
#endif

View File

@@ -1,52 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_TEMPLATES_MAXPY_H
#define ISAAC_BACKEND_TEMPLATES_MAXPY_H
#include <vector>
#include "isaac/jit/generation/base.h"
namespace isaac
{
namespace templates
{
/**
 * @brief Kernel template derived from parameterized_base with two extra
 *        parameters, ng0 and ng1.
 *
 * NOTE(review): the name suggests element-wise operations on 2-D expressions,
 * and ng0/ng1 are presumably work-group counts per dimension - confirm in the .cpp.
 */
class elementwise_2d : public parameterized_base
{
private:
// Same signature as parameterized_base::is_invalid_impl.
int is_invalid_impl(driver::Device const &, expression_tree const &) const;
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
public:
elementwise_2d(unsigned int vwidth, unsigned int ls0, unsigned int ls1, unsigned int ng0, unsigned int ng1);
// Same signatures as the pure virtuals input_sizes(), enqueue() and type() of base.
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
expression_type type() const;
private:
unsigned int ng0_;
unsigned int ng1_;
};
}
}
#endif

View File

@@ -1,98 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_KEYWORDS_H
#define ISAAC_BACKEND_KEYWORDS_H
#include <ostream>
#include <stdexcept>
#include <string>
#include "isaac/driver/common.h"
#include "isaac/driver/device.h"
namespace isaac
{
/**
 * @brief A source-code token that has one spelling for OpenCL and one for CUDA.
 *
 * The constructor receives the backend together with both spellings.
 * NOTE(review): get() presumably returns the spelling selected by backend_ -
 * its definition is not part of this header.
 */
class keyword
{
public:
keyword(driver::backend_type backend, std::string const & opencl, std::string const & cuda);
// Returns a reference to a string, so the result must not outlive the keyword it came from
// if it refers to one of the members below (TODO confirm in the implementation).
std::string const & get() const;
private:
driver::backend_type backend_;
std::string opencl_;
std::string cuda_;
};
/**
 * @brief Name of the integer type used for sizes in generated kernel source.
 *
 * @param device Device whose backend selects the spelling.
 * @return "int" for both the CUDA and the OPENCL backend.
 * @throws std::invalid_argument if the device reports any other backend.
 */
static inline std::string size_type(driver::Device const & device)
{
  switch(device.backend())
  {
    case driver::CUDA:
    case driver::OPENCL:
      return "int";
    default:
      // The previous bare `throw;` had no active exception to rethrow here and
      // therefore ended in std::terminate(); raise a catchable error instead.
      throw std::invalid_argument("isaac::size_type: unsupported backend");
  }
}
// Stream insertion for keywords (defined out of line).
std::ostream & operator<<(std::ostream & ss, keyword const & kw);

// Declares a keyword subclass NAME whose constructor only needs the backend;
// OCLKW is the OpenCL spelling and CUDAKW the CUDA spelling.
#define ADD_KEYWORD(NAME, OCLKW, CUDAKW) class NAME : public keyword { public: NAME(driver::backend_type backend) : keyword(backend, OCLKW, CUDAKW){} };

// Qualifiers
ADD_KEYWORD(KernelPrefix, "__kernel", "extern \"C\" __global__")
ADD_KEYWORD(Local, "__local", "__shared__")
ADD_KEYWORD(Global, "__global", "")
ADD_KEYWORD(LocalPtr, "__local", "")

// Global work-item index / size
ADD_KEYWORD(GlobalIdx0, "get_global_id(0)", "(blockIdx.x*blockDim.x + threadIdx.x)")
ADD_KEYWORD(GlobalIdx1, "get_global_id(1)", "(blockIdx.y*blockDim.y + threadIdx.y)")
ADD_KEYWORD(GlobalIdx2, "get_global_id(2)", "(blockIdx.z*blockDim.z + threadIdx.z)")
ADD_KEYWORD(GlobalSize0, "get_global_size(0)", "(blockDim.x*gridDim.x)")
ADD_KEYWORD(GlobalSize1, "get_global_size(1)", "(blockDim.y*gridDim.y)")
ADD_KEYWORD(GlobalSize2, "get_global_size(2)", "(blockDim.z*gridDim.z)")

// Index / size inside a work-group (CUDA: thread block)
ADD_KEYWORD(LocalIdx0, "get_local_id(0)", "threadIdx.x")
ADD_KEYWORD(LocalIdx1, "get_local_id(1)", "threadIdx.y")
ADD_KEYWORD(LocalIdx2, "get_local_id(2)", "threadIdx.z")
ADD_KEYWORD(LocalSize0, "get_local_size(0)", "blockDim.x")
ADD_KEYWORD(LocalSize1, "get_local_size(1)", "blockDim.y")
ADD_KEYWORD(LocalSize2, "get_local_size(2)", "blockDim.z")

// Work-group index / number of work-groups
ADD_KEYWORD(GroupIdx0, "get_group_id(0)", "blockIdx.x")
ADD_KEYWORD(GroupIdx1, "get_group_id(1)", "blockIdx.y")
ADD_KEYWORD(GroupIdx2, "get_group_id(2)", "blockIdx.z")
// Fixed spellings: the OpenCL built-in is get_num_groups() (there is no get_ng()),
// and the CUDA built-in is gridDim with a lower-case 'g', as used by GlobalSize* above.
ADD_KEYWORD(GroupSize0, "get_num_groups(0)", "gridDim.x")
ADD_KEYWORD(GroupSize1, "get_num_groups(1)", "gridDim.y")
ADD_KEYWORD(GroupSize2, "get_num_groups(2)", "gridDim.z")

ADD_KEYWORD(LocalBarrier, "barrier(CLK_LOCAL_MEM_FENCE)", "__syncthreads()")

// Keywords whose spelling also depends on a data type name.
struct CastPrefix: public keyword{ CastPrefix(driver::backend_type backend, std::string const & datatype): keyword(backend, "convert_" + datatype, "make_" + datatype){} };
struct InitPrefix: public keyword{ InitPrefix(driver::backend_type backend, std::string const & datatype): keyword(backend, "", "make_" + datatype){} };
struct Infinity: public keyword{ Infinity(driver::backend_type backend, std::string const & datatype): keyword(backend, "INFINITY", "infinity<" + datatype + ">()"){} };
// Conditional selection: OpenCL select(else, if, cond) versus a C ternary for CUDA.
// NOTE(review): the CUDA form has no outer parentheses, so embedding it inside a larger
// expression relies on the caller to parenthesize it.
struct Select: public keyword{ Select(driver::backend_type backend, std::string cond, std::string if_value, std::string else_value): keyword(backend, "select(" + else_value + "," + if_value + "," + cond + ")", "(" + cond + ")?" + if_value + ":" + else_value) {} };

#undef ADD_KEYWORD
}
#endif

View File

@@ -1,62 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_STREAM_H
#define ISAAC_BACKEND_STREAM_H
#include <sstream>
#include "isaac/driver/common.h"
namespace isaac
{
/**
 * @brief std::ostream subclass used for producing kernel source text.
 *
 * It keeps a tab counter (changed through inc_tab()/dec_tab()), the backend it
 * was created for, and an internal std::ostringstream; str() returns a std::string.
 * NOTE(review): presumably the tab counter drives indentation of the emitted
 * lines and str() returns the accumulated text - confirm in the implementation.
 */
class kernel_generation_stream : public std::ostream
{
// Private stream buffer type built on std::stringbuf.
class kgenstream : public std::stringbuf
{
public:
kgenstream(std::ostringstream& oss,unsigned int const & tab_count) ;
// Same name and signature as the virtual std::streambuf::sync(), which it therefore overrides.
int sync();
~kgenstream();
private:
// Both members are references: the referenced stream and counter must outlive this buffer.
std::ostream& oss_;
unsigned int const & tab_count_;
};
void process(std::string& str);
public:
kernel_generation_stream(driver::backend_type backend);
~kernel_generation_stream();
std::string str();
void inc_tab();
void dec_tab();
private:
unsigned int tab_count_;
driver::backend_type backend_;
std::ostringstream oss;
};
}
#endif

View File

@@ -1,155 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_TEMPLATES_MPRODUCT_H
#define ISAAC_BACKEND_TEMPLATES_MPRODUCT_H
#include "isaac/jit/generation/base.h"
#include "isaac/jit/syntax/expression/expression.h"
#include "isaac/jit/syntax/expression/preset.h"
namespace isaac
{
namespace templates
{
/**
 * @brief external_base subclass constructed from two transposition flags
 *        (A_trans, B_trans), which are stored as const members.
 *
 * NOTE(review): the name suggests that GEMM is delegated to cuBLAS - confirm in the .cpp.
 */
class cublas_gemm : public external_base
{
// Declared before any access specifier, hence private.
// NOTE(review): init_ below is presumably the cached result of init() - confirm.
bool init();
public:
cublas_gemm(char A_trans, char B_trans);
// Same signatures as the pure virtuals is_invalid(), input_sizes(), enqueue() and type() of base.
int is_invalid(expression_tree const &, driver::Device const &) const;
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
expression_type type() const;
private:
const char A_trans_;
const char B_trans_;
bool init_;
};
/**
 * @brief external_base subclass constructed from two transposition flags
 *        (A_trans, B_trans); unlike cublas_gemm it also re-declares generate_impl().
 *
 * NOTE(review): the name suggests a GEMM backed by Intel's BLAS kernels - confirm in the .cpp.
 */
class intelblas_gemm : public external_base
{
// Declared before any access specifier, hence private.
bool init();
public:
intelblas_gemm(char A_trans, char B_trans);
// Same signatures as the pure virtuals is_invalid(), input_sizes(), enqueue() and type() of base.
int is_invalid(expression_tree const &, driver::Device const &) const;
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
expression_type type() const;
private:
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const &, driver::Device const & device, symbolic::symbols_table const &) const;
const char A_trans_;
const char B_trans_;
bool init_;
};
/**
 * @brief external_base subclass with the same declared interface as intelblas_gemm.
 *
 * NOTE(review): the "_image" suffix suggests a variant working on image objects
 * rather than buffers - not visible from this header, confirm in the .cpp.
 */
class intelblas_gemm_image : public external_base
{
// Declared before any access specifier, hence private.
bool init();
public:
intelblas_gemm_image(char A_trans, char B_trans);
// Same signatures as the pure virtuals is_invalid(), input_sizes(), enqueue() and type() of base.
int is_invalid(expression_tree const &, driver::Device const &) const;
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const &, std::string const &, runtime::execution_handler const & h);
expression_type type() const;
private:
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const &, driver::Device const & device, symbolic::symbols_table const &) const;
const char A_trans_;
const char B_trans_;
bool init_;
};
/**
 * @brief Parameterized GEMM kernel template; base class of gemm_nn/tn/nt/tt.
 *
 * The constructor takes the tuning parameters and the two transposition flags.
 * NOTE(review): the mapping between constructor arguments (ls0, KL, ls1, D, ms,
 * ks, ns, lf0, lf1) and the members below (mL_, kL_, nL_, depth_, mS_, kS_, nS_,
 * lf0_, lf1_) is defined in the .cpp; prefetch_ and unroll_outer_ have no
 * matching constructor argument, so they are set by other means - confirm.
 */
class gemm : public parameterized_base
{
private:
// Same signatures as the resource queries declared in base.
unsigned int temporary_workspace(expression_tree const & expressions) const;
unsigned int lmem_usage(expression_tree const & expressions) const;
unsigned int registers_usage(expression_tree const & expressions) const;
// Same signature as parameterized_base::is_invalid_impl.
int is_invalid_impl(driver::Device const &, expression_tree const &) const;
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const &) const;
// Private helper taking explicit sizes M, N, K, three tree nodes A, B, C and the scalars alpha, beta.
// NOTE(review): presumably called by enqueue() for one block of the product - confirm.
void enqueue_block(driver::CommandQueue & queue, int_t M, int_t N, int_t K, const expression_tree::node &A, const expression_tree::node &B, const expression_tree::node &C,
value_scalar const &alpha, value_scalar const &beta, driver::Program const & program, std::string const & suffix, runtime::execution_options_type const & options);
public:
// The first argument is called `simd` here but `vwidth` in the gemm_xx subclasses.
gemm(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, int_t lf0, int_t lf1
, char A_trans, char B_trans);
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const & h);
expression_type type() const;
private:
//Parameters
unsigned int mL_;
unsigned int kL_;
unsigned int nL_;
unsigned int depth_;
unsigned int mS_;
unsigned int kS_;
unsigned int nS_;
unsigned int lf0_;
unsigned int lf1_;
bool prefetch_;
bool unroll_outer_;
// Transposition flags and expression type
const char A_trans_;
const char B_trans_;
expression_type type_;
};
// The four classes below only add a constructor to gemm. Each takes the same tuning
// parameters as gemm's constructor minus the A_trans/B_trans flags.
// NOTE(review): presumably each one forwards fixed flags matching its suffix
// (nn, tn, nt, tt) - confirm in the .cpp.
class gemm_nn : public gemm
{
public:
gemm_nn(unsigned int vwidth, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, int_t lf0, int_t lf1);
};
class gemm_tn : public gemm
{
public:
gemm_tn(unsigned int vwidth, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, int_t lf0, int_t lf1);
};
class gemm_nt : public gemm
{
public:
gemm_nt(unsigned int vwidth, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, int_t lf0, int_t lf1);
};
class gemm_tt : public gemm
{
public:
gemm_tt(unsigned int vwidth, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, int_t lf0, int_t lf1);
};
}
}
#endif

View File

@@ -1,57 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_TEMPLATES_DOT_H
#define ISAAC_BACKEND_TEMPLATES_DOT_H
#include "isaac/jit/generation/base.h"
namespace isaac
{
namespace templates
{
/**
 * @brief Kernel template derived from parameterized_base with one extra
 *        parameter, ng, and two vectors of driver buffers (tmp_, tmpidx_).
 *
 * NOTE(review): the name suggests 1-D reductions; tmp_/tmpidx_ are presumably
 * temporaries for partial values and their indices - confirm in the .cpp.
 */
class reduce_1d : public parameterized_base
{
private:
// Same signatures as the resource queries declared in base.
unsigned int lmem_usage(expression_tree const & expressions) const;
unsigned int temporary_workspace(expression_tree const & expressions) const;
// Private helper writing to a kernel_generation_stream.
// It is declared inline but has no body here, so its definition must be visible
// in every translation unit that calls it.
inline void reduce_1d_local_memory(kernel_generation_stream & stream, unsigned int size, std::vector<symbolic::reduce_1d*> exprs,
std::string const & buf_str, std::string const & buf_value_str, driver::backend_type backend) const;
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
public:
reduce_1d(unsigned int vwidth, unsigned int ls, unsigned int ng);
std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
expression_type type() const;
private:
unsigned int ng_;
std::vector< driver::Buffer > tmp_;
std::vector< driver::Buffer > tmpidx_;
};
}
}
#endif

View File

@@ -1,69 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_TEMPLATES_MDOT_H
#define ISAAC_BACKEND_TEMPLATES_MDOT_H
#include <vector>
#include "isaac/jit/syntax/expression/expression.h"
#include "isaac/jit/generation/base.h"
namespace isaac
{
namespace templates
{
/**
 * @brief Kernel template derived from parameterized_base; common base of
 *        reduce_2d_rows and reduce_2d_cols.
 *
 * The constructor is protected, so instances are created through subclasses.
 * It takes an operation_type_family in addition to the tuning parameters.
 * NOTE(review): that argument is presumably what ends up in reduction_type_ - confirm.
 */
class reduce_2d : public parameterized_base
{
protected:
reduce_2d(unsigned int vwidth, unsigned int ls0, unsigned int ls1, unsigned int ng0, unsigned int ng1, operation_type_family);
private:
// Same signatures as the resource queries declared in base.
unsigned int lmem_usage(expression_tree const &) const;
unsigned int temporary_workspace(expression_tree const & expressions) const;
// Same signature as base::generate_impl.
std::string generate_impl(std::string const & suffix, expression_tree const &, driver::Device const & device, symbolic::symbols_table const &) const;
public:
virtual std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
expression_type type() const;
private:
unsigned int ng0_;
unsigned int ng1_;
operation_type_family reduction_type_;
};
// The two classes below only add a public constructor to reduce_2d; it takes the same
// tuning parameters as reduce_2d's protected constructor minus the operation_type_family.
// NOTE(review): presumably each one passes a fixed family matching its name - confirm in the .cpp.
class reduce_2d_rows : public reduce_2d
{
public:
reduce_2d_rows(unsigned int vwidth, unsigned int ls0, unsigned int ls1, unsigned int ng0, unsigned int ng1);
};
class reduce_2d_cols : public reduce_2d
{
public:
reduce_2d_cols(unsigned int vwidth, unsigned int ls0, unsigned int ls1, unsigned int ng0, unsigned int ng1);
};
}
}
#endif

View File

@@ -1,85 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_BACKEND_BINDER_H
#define ISAAC_BACKEND_BINDER_H
#include <map>
#include "isaac/driver/buffer.h"
#include "isaac/jit/syntax/expression/expression.h"
namespace isaac
{
class array_base;
/**
 * @brief Abstract binder associating handle_t values with unsigned integers.
 *
 * Subclasses implement bind() and get(handle, bool); the association is kept
 * in the protected map `memory`, next to the counter `current_arg_`.
 */
class symbolic_binder
{
  // Ordering used as the key comparator of `memory`: handles are compared
  // through their .cl member for the OPENCL backend and through .cu otherwise.
  class cmp
  {
  public:
    cmp(driver::backend_type backend) : backend_(backend) {}

    bool operator()(handle_t const & x, handle_t const & y) const
    {
      return (backend_ == driver::OPENCL) ? (x.cl < y.cl) : (x.cu < y.cu);
    }

  private:
    driver::backend_type backend_;
  };

public:
  symbolic_binder(driver::backend_type backend);
  virtual ~symbolic_binder();

  // Interface implemented by bind_sequential and bind_independent.
  virtual bool bind(handle_t const &, bool) = 0;
  virtual unsigned int get(handle_t const &, bool) = 0;

  // Argument-less overload, not virtual.
  unsigned int get();

protected:
  unsigned int current_arg_;
  std::map<handle_t,unsigned int, cmp> memory;
};
/**
 * @brief Concrete symbolic_binder declaring both pure virtuals, bind() and get().
 *
 * NOTE(review): the name suggests that every handle gets a fresh sequential
 * index - the policy itself lives in the .cpp, confirm there.
 */
class bind_sequential : public symbolic_binder
{
public:
bind_sequential(driver::backend_type backend);
bool bind(handle_t const & a, bool);
unsigned int get(handle_t const & a, bool);
};
/**
 * @brief Concrete symbolic_binder declaring both pure virtuals, bind() and get().
 *
 * NOTE(review): the name suggests that identical handles share one index -
 * the policy itself lives in the .cpp, confirm there.
 */
class bind_independent : public symbolic_binder
{
public:
bind_independent(driver::backend_type backend);
bool bind(handle_t const & a, bool);
// `const handle_t &` is the same type as `handle_t const &` used by the base declaration.
unsigned int get(const handle_t &a, bool);
};
}
#endif

View File

@@ -1,54 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_SYMBOLIC_ENGINE_MACRO_H
#define ISAAC_SYMBOLIC_ENGINE_MACRO_H
#include <string>
#include <vector>
namespace isaac
{
namespace symbolic
{
//Macro
/**
 * @brief A textual macro built from a code string.
 *
 * Stores the code plus a name, a list of arguments and a list of tokens.
 * NOTE(review): presumably the constructors parse `code` into name_/args_/tokens_
 * and expand() rewrites occurrences inside `str`, returning a count - confirm in the .cpp.
 */
class macro
{
public:
// Both constructors are non-explicit, so strings and string literals convert implicitly.
macro(std::string const & code);
macro(const char * code);
// Takes the string by non-const reference, i.e. it may modify it in place.
int expand(std::string & str) const;
// Ordering operator; needed for storing macros in ordered containers such as std::set.
bool operator<(macro const & o) const;
private:
std::string code_;
std::string name_;
std::vector<std::string> args_;
std::vector<std::string> tokens_;
};
}
}
#endif

View File

@@ -1,207 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_MAPPED_OBJECT_H
#define ISAAC_MAPPED_OBJECT_H
#include <set>
#include <map>
#include <string>
#include "isaac/jit/syntax/engine/macro.h"
#include "isaac/jit/syntax/expression/expression.h"
#include "isaac/jit/generation/engine/stream.h"
#include "isaac/types.h"
namespace isaac
{
namespace symbolic
{
// Forward declaration: the table below only stores pointers to object.
class object;
// Maps a size_t key to a shared pointer to a symbolic object.
using symbols_table = std::map<size_t, std::shared_ptr<object>>;
//Node
/**
 * @brief Operator node: an op_element, the index of its root in the tree and
 *        raw pointers to the objects of its two operands.
 *
 * NOTE(review): lhs_/rhs_ are raw pointers, presumably looked up in the
 * symbols_table passed to the constructor and not owned by the node - confirm.
 */
class node
{
public:
node(size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
op_element op() const;
object const * lhs() const;
object const * rhs() const;
size_t root() const;
protected:
op_element op_;
object* lhs_;
object* rhs_;
size_t root_;
};
//Object
/**
 * @brief Base class of all symbolic objects.
 *
 * Holds a reference to a driver::Context, a string-to-string attribute map,
 * a set of macros and a list of strings called hierarchy_.
 *
 * NOTE(review): std::list is used below and std::shared_ptr in symbols_table,
 * but this header includes neither <list> nor <memory> directly; it relies on
 * another header pulling them in.
 */
class object
{
protected:
// Helpers for subclasses. NOTE(review): presumably add_base() appends to hierarchy_
// and add_load() registers load macros - confirm in the .cpp.
void add_base(std::string const & name);
void add_load(bool contiguous);
public:
object(driver::Context const & context, std::string const & scalartype, unsigned int id);
object(driver::Context const & context, std::string const & scalartype, std::string const & name);
virtual ~object();
bool hasattr(std::string const & name) const;
// process.h calls process("#name") to obtain a string identifying the object.
std::string process(std::string const & in) const;
virtual std::string evaluate(std::map<std::string, std::string> const & table) const;
protected:
// Reference member: the Context must outlive this object.
driver::Context const & context_;
std::map<std::string, std::string> attributes_;
std::set<macro> macros_;
std::list<std::string> hierarchy_;
};
//Leaf
// Object subclass that only adds constructors mirroring the two constructors of object.
// It is the base of host_scalar, placeholder and array.
class leaf: public object
{
public:
leaf(driver::Context const & context, std::string const & scalartype, unsigned int id);
leaf(driver::Context const & context, std::string const & scalartype, std::string const & name);
};
//Arithmetic node
// Combines object and node (multiple inheritance) and adds a string op_str_.
// NOTE(review): op_str_ is presumably the textual form of the operator - confirm in the .cpp.
class arithmetic_node : public object, public node
{
public:
arithmetic_node(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
protected:
std::string op_str_;
};
//Binary arithmetic
// arithmetic_node subclass; evaluate() has the same signature as the virtual object::evaluate.
class binary_arithmetic_node: public arithmetic_node
{
public:
binary_arithmetic_node(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
std::string evaluate(std::map<std::string, std::string> const & table) const;
};
//Unary arithmetic
// arithmetic_node subclass; evaluate() has the same signature as the virtual object::evaluate.
class unary_arithmetic_node: public arithmetic_node
{
public:
unary_arithmetic_node(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
std::string evaluate(std::map<std::string, std::string> const & table) const;
};
//Sfor
// Combines object and node; only a constructor is declared.
// NOTE(review): the name presumably stands for a symbolic "for" construct - confirm.
class sfor: public object, public node
{
public:
sfor(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
//Reductions
// Common base of the symbolic reduce_1d and reduce_2d; combines object and node.
class reduction : public object, public node
{
public:
reduction(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
// Distinct type per reduction kind; only a constructor with the same parameters as reduction's.
class reduce_1d : public reduction
{
public:
reduce_1d(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
// Distinct type per reduction kind; only a constructor with the same parameters as reduction's.
class reduce_2d : public reduction
{
public:
reduce_2d(unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
//Host scalar
// Leaf subclass with a single constructor taking the same arguments as leaf's id-based constructor.
class host_scalar : public leaf
{
public:
host_scalar(driver::Context const & context, std::string const & scalartype, unsigned int id);
};
//Placeholder
// Leaf subclass constructed from a context and an unsigned `level`; no scalar type is passed.
// NOTE(review): how `level` maps onto leaf's constructor arguments is decided in the .cpp.
class placeholder : public leaf
{
public:
placeholder(driver::Context const & context, unsigned int level);
};
//Arrays
// Leaf subclass that is the base of buffer and index_modifier.
class array : public leaf
{
protected:
// Helper available to subclasses; builds a string from a shape tuple.
// NOTE(review): presumably an indexing expression handling broadcast dimensions - confirm.
std::string make_broadcast(tuple const & shape);
public:
array(driver::Context const & context, std::string const & scalartype, unsigned int id);
};
//Buffer
// Array subclass constructed from a shape and strides; stores three strings and a dimension count.
// NOTE(review): ld_/start_/stride_ are presumably names used in generated code - confirm in the .cpp.
class buffer : public array
{
public:
buffer(driver::Context const & context, std::string const & scalartype, unsigned int id, tuple const & shape, tuple const &strides);
// Returns the stored dim_ member.
unsigned int dim() const { return dim_; }
private:
std::string ld_;
std::string start_;
std::string stride_;
unsigned int dim_;
};
//Index modifier
// Combines array and node; base of reshape, trans and diag_vector.
// process.h uses dynamic_cast to this type to decide whether traversal should descend.
class index_modifier: public array, public node
{
public:
index_modifier(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
// The three classes below are distinct index_modifier types; each declares only a
// constructor with the same parameters as index_modifier's.
class reshape : public index_modifier
{
public:
reshape(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
class trans : public index_modifier
{
public:
trans(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
class diag_vector : public index_modifier
{
public:
diag_vector(std::string const & scalartype, unsigned int id, size_t root, op_element op, expression_tree const & tree, symbols_table const & table);
};
}
}
#endif

View File

@@ -1,123 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_SYMBOLIC_ENGINE_PROCESS
#define ISAAC_SYMBOLIC_ENGINE_PROCESS
#include <functional>
#include <typeinfo>
#include "isaac/tools/cpp/string.hpp"
#include "isaac/jit/syntax/expression/expression.h"
#include "isaac/jit/syntax/engine/binder.h"
#include "isaac/jit/syntax/engine/object.h"
#include "isaac/array.h"
namespace isaac
{
namespace symbolic
{
//Traverse
// Post-order walk of the sub-tree rooted at `root`.
//
// For a COMPOSITE_OPERATOR_TYPE node the lhs and rhs operands are visited
// first, but only if recurse(root) returns true; `recurse` is never queried
// for other node types. Afterwards fun(root) is called on the node itself,
// unless its type is INVALID_SUBTYPE.
template<class FUN>
inline void traverse(expression_tree const & tree, size_t root, FUN const & fun,
                     std::function<bool(size_t)> const & recurse)
{
  expression_tree::node const & current = tree[root];
  bool const is_composite = (current.type == COMPOSITE_OPERATOR_TYPE);
  // Short-circuit: `recurse` is evaluated for composite nodes only.
  if(is_composite && recurse(root))
  {
    traverse(tree, current.binary_operator.lhs, fun, recurse);
    traverse(tree, current.binary_operator.rhs, fun, recurse);
  }
  // Invalid nodes are skipped by the visitor.
  if(current.type == INVALID_SUBTYPE)
    return;
  fun(root);
}
// Convenience overload: walks the sub-tree rooted at `root` and always
// descends into composite operators.
template<class FUN>
inline void traverse(expression_tree const & tree, size_t root, FUN const & fun)
{
  auto always_descend = [](size_t){ return true; };
  traverse(tree, root, fun, always_descend);
}
// Convenience overload: walks the whole tree, starting from tree.root().
template<class FUN>
inline void traverse(expression_tree const & tree, FUN const & fun)
{
  size_t const top = tree.root();
  traverse(tree, top, fun);
}
//Extract symbolic types
// Appends to `result` every symbolic object of dynamic type T found in the
// sub-tree rooted at `root`.
//
// An object is appended only if the string returned by process("#name") could
// be inserted into `processed`, i.e. it was not collected before. When
// `array_recurse` is false, the walk does not descend below a node whose table
// entry is an index_modifier. Note that this check uses table.at(), which
// throws if the node has no entry in `table`.
template<class T>
inline void extract(expression_tree const & tree, symbols_table const & table,
                    size_t root, std::set<std::string>& processed, std::vector<T*>& result, bool array_recurse = true)
{
  // Visitor: collect the node's symbol if it has one of the requested type.
  auto collect = [&](size_t idx)
  {
    symbols_table::const_iterator entry = table.find(idx);
    if(entry==table.end())
      return;
    T* symbol = dynamic_cast<T*>(&*entry->second);
    if(!symbol)
      return;
    if(processed.insert(symbol->process("#name")).second)
      result.push_back(symbol);
  };
  // Descent policy: optionally stop at index modifiers.
  auto descend = [&](size_t idx) -> bool
  {
    if(array_recurse)
      return true;
    return dynamic_cast<index_modifier*>(&*table.at(idx))==0;
  };
  traverse(tree, root, collect, descend);
}
template<class T>
inline std::vector<T*> extract(expression_tree const & tree, symbols_table const & table, std::vector<size_t> roots, bool array_recurse = true)
{
std::vector<T*> result;
std::set<std::string> processed;
for(size_t root: roots)
extract(tree, table, root, processed, result, array_recurse);
return result;
}
// Single-root convenience overload: wraps `root` in a one-element vector and
// forwards to the multi-root extract<T>().
template<class T>
inline std::vector<T*> extract(expression_tree const & tree, symbols_table const & table, size_t root, bool array_recurse = true)
{
  return extract<T>(tree, table, std::vector<size_t>(1, root), array_recurse);
}
// Whole-tree convenience overload: extracts from tree.root() with the default
// `array_recurse` of the single-root overload.
template<class T>
inline std::vector<T*> extract(expression_tree const & tree, symbols_table const & table)
{
  size_t const top = tree.root();
  return extract<T>(tree, table, top);
}
// Filter nodes
// The functions below are only declared here; the notes describe what the
// signatures show, and anything about behavior is marked as an assumption.
//
// find: takes a predicate over nodes and returns a list of size_t values.
// NOTE(review): presumably the indices of the nodes for which pred() is true,
// restricted to the sub-tree at `root` in the first overload - confirm.
std::vector<size_t> find(expression_tree const & tree, size_t root, std::function<bool (expression_tree::node const &)> const & pred);
std::vector<size_t> find(expression_tree const & tree, std::function<bool (expression_tree::node const &)> const & pred);
// NOTE(review): presumably the indices of the assignment nodes of `tree` - confirm.
std::vector<size_t> assignments(expression_tree const & tree);
// lhs_of / rhs_of: map a list of node indices `in` to another list of indices.
// NOTE(review): presumably the lhs (resp. rhs) operand of each node in `in` - confirm.
std::vector<size_t> lhs_of(expression_tree const & tree, std::vector<size_t> const & in);
std::vector<size_t> rhs_of(expression_tree const & tree, std::vector<size_t> const & in);
// Hash
// Returns a string computed from the tree. NOTE(review): what the string
// encodes is not visible from this header.
std::string hash(expression_tree const & tree);
//Set arguments
// Takes the kernel and `current_arg` by non-const reference, so both may be
// modified. NOTE(review): presumably binds the tree's operands as kernel
// arguments and advances current_arg - confirm against the definition.
void set_arguments(expression_tree const & tree, driver::Kernel & kernel, unsigned int& current_arg);
//Symbolize
// Builds a symbols_table from an expression tree. The table is what extract()
// in this header looks nodes up in by index.
symbols_table symbolize(isaac::expression_tree const & expression);
}
}
#endif

View File

@@ -1,154 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _ISAAC_SYMBOLIC_EXPRESSION_H
#define _ISAAC_SYMBOLIC_EXPRESSION_H
#include <utility>
#include <vector>
#include <list>
#include "isaac/driver/backend.h"
#include "isaac/driver/context.h"
#include "isaac/driver/command_queue.h"
#include "isaac/driver/event.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/ndrange.h"
#include "isaac/driver/buffer.h"
#include "isaac/jit/syntax/expression/operations.h"
#include "isaac/tools/cpp/tuple.hpp"
#include "isaac/types.h"
#include "isaac/value_scalar.h"
#include <memory>
#include <iostream>
namespace isaac
{
// Forward declaration: this header only uses array_base through pointers and references.
class array_base;
// Empty tag type accepted by one of the expression_tree::node constructors.
struct invalid_node{};
// Tag stored in expression_tree::node::type.
// NOTE(review): presumably selects the active member of the node's anonymous
// union (binary_operator / scalar / array) - confirm in the node constructors.
enum node_type
{
// Zero value.
INVALID_SUBTYPE = 0,
COMPOSITE_OPERATOR_TYPE,
VALUE_SCALAR_TYPE,
DENSE_ARRAY_TYPE,
};
// Raw buffer handle: either an OpenCL cl_mem or a CUDA CUdeviceptr.
// The union does not record which member is active.
union handle_t
{
cl_mem cl;
CUdeviceptr cu;
};
// Payload of an array node: a start value, a raw backend handle and a pointer
// to the array_base object.
// NOTE(review): `start` is presumably an offset into the buffer, and `base` is
// presumably non-owning - confirm.
struct array_holder
{
int_t start;
handle_t handle;
array_base* base;
};
// Expression tree held as a flat std::vector of nodes (data_type) together
// with the index of the root node and a pointer to a driver::Context.
class expression_tree
{
public:
// One element of the tree: common fields followed by a type-specific payload.
struct node
{
//Constructors
node();
node(invalid_node);
node(value_scalar const & x);
node(array_base const & x);
// Operator node: lhs and rhs are integer references to the operand nodes.
node(int_t lhs, op_element op, int_t rhs, numeric_type dtype, tuple const & shape);
//Common
node_type type;
numeric_type dtype;
tuple shape;
// NOTE(review): presumably the leading dimensions / strides of the operand - confirm.
tuple ld;
//Type-specific
// Anonymous union: only one of the members below is meaningful for a given node.
union
{
//Operator
struct{
int_t lhs;
op_element op;
int_t rhs;
}binary_operator;
//Scalar
values_holder scalar;
//Array
array_holder array;
};
};
typedef std::vector<node> data_type;
public:
// Four constructors covering every node / expression_tree combination for the
// two operands. NOTE(review): presumably each builds a tree whose root applies
// `op` to lhs and rhs - definitions are not visible here.
expression_tree(node const & lhs, node const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
expression_tree(expression_tree const & lhs, node const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
expression_tree(node const & lhs, expression_tree const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
expression_tree(expression_tree const & lhs, expression_tree const & rhs, op_element const & op, driver::Context const * context, numeric_type const & dtype, tuple const & shape);
// Read-only accessors.
tuple shape() const;
int_t dim() const;
data_type const & data() const;
// Index of the root node.
std::size_t root() const;
driver::Context const & context() const;
numeric_type const & dtype() const;
// Access to a node by index.
node const & operator[](size_t) const;
node & operator[](size_t);
// Unary operators returning a new expression_tree.
// NOTE(review): both are declared non-const; presumably they do not modify *this - confirm.
expression_tree operator-();
expression_tree operator!();
private:
data_type tree_;
std::size_t root_;
driver::Context const * context_;
};
// Identity overload, selected when T is not an arithmetic type: returns a
// reference to the argument itself, without copying it.
template<class T>
typename std::enable_if<!std::is_arithmetic<T>::value, T const &>::type
wrap_generic(T const & x)
{
  return x;
}
// Arithmetic overload, selected when T is an arithmetic type: wraps the value
// in a value_scalar.
template<class T>
typename std::enable_if<std::is_arithmetic<T>::value, value_scalar>::type
wrap_generic(T x)
{
  return value_scalar(x);
}
// Terminal case: a single element is returned through wrap_generic(), i.e. as
// a value_scalar for arithmetic T and as a reference to x otherwise. The
// context parameter is unnamed and unused in this overload.
template<typename T>
ISAACAPI typename std::conditional<std::is_arithmetic<T>::value, value_scalar, T const &>::type make_tuple(driver::Context const &, T const & x)
{ return wrap_generic(x); }
// Recursive case: pairs the wrapped first element with make_tuple() of the
// remaining arguments under a BINARY_ARITHMETIC / PAIR_TYPE operator, which
// yields right-nested pairs. The resulting tree uses numeric_type_of(x) as
// dtype and {1} as shape. The trailing arguments are taken by value.
template<typename T, typename... Args>
ISAACAPI expression_tree make_tuple(driver::Context const & context, T const & x, Args... args)
{ return expression_tree(wrap_generic(x), make_tuple(context, args...), op_element(BINARY_ARITHMETIC, PAIR_TYPE), &context, numeric_type_of(x), {1}); }
//io
// String conversion for node types, single nodes and whole trees, plus a
// stream insertion operator for nodes. The exact output format is defined in
// the implementation and is not visible from this header.
std::string to_string(node_type const & f);
std::string to_string(expression_tree::node const & e);
std::ostream & operator<<(std::ostream & os, expression_tree::node const & s_node);
std::string to_string(isaac::expression_tree const & s);
}
#endif

View File

@@ -1,157 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _ISAAC_SYMBOLIC_OPERATIONS_H
#define _ISAAC_SYMBOLIC_OPERATIONS_H
#include <string>
namespace isaac
{
/** @brief Coarse family an operation belongs to (element-wise unary/binary, reductions, GEMM). Stored next to the precise operation_type in op_element; per the original note, it exists to speed up lookups. */
enum operation_type_family
{
// Zero value.
INVALID_ = 0,
// BLAS1-type
UNARY_ARITHMETIC,
BINARY_ARITHMETIC,
REDUCE,
// BLAS2-type
REDUCE_ROWS,
REDUCE_COLUMNS,
// BLAS3-type
GEMM
};
/** @brief Enumeration for identifying the possible operations.
 *
 * Enumerator values are assigned implicitly from declaration order, so
 * inserting or reordering entries changes every following value.
 */
enum operation_type
{
// Zero value.
INVALID_TYPE = 0,
// unary operator
MINUS_TYPE,
NEGATE_TYPE,
// unary expression
// Casts, one per target scalar type
CAST_BOOL_TYPE,
CAST_CHAR_TYPE,
CAST_UCHAR_TYPE,
CAST_SHORT_TYPE,
CAST_USHORT_TYPE,
CAST_INT_TYPE,
CAST_UINT_TYPE,
CAST_LONG_TYPE,
CAST_ULONG_TYPE,
CAST_HALF_TYPE,
CAST_FLOAT_TYPE,
CAST_DOUBLE_TYPE,
// Math functions
ABS_TYPE,
ACOS_TYPE,
ASIN_TYPE,
ATAN_TYPE,
CEIL_TYPE,
COS_TYPE,
COSH_TYPE,
EXP_TYPE,
FABS_TYPE,
FLOOR_TYPE,
LOG_TYPE,
LOG10_TYPE,
SIN_TYPE,
SINH_TYPE,
SQRT_TYPE,
TAN_TYPE,
TANH_TYPE,
TRANS_TYPE,
// binary expression
ASSIGN_TYPE,
INPLACE_ADD_TYPE,
INPLACE_SUB_TYPE,
ADD_TYPE,
SUB_TYPE,
MULT_TYPE,
DIV_TYPE,
ELEMENT_ARGFMAX_TYPE,
ELEMENT_ARGFMIN_TYPE,
ELEMENT_ARGMAX_TYPE,
ELEMENT_ARGMIN_TYPE,
ELEMENT_PROD_TYPE,
ELEMENT_DIV_TYPE,
ELEMENT_EQ_TYPE,
ELEMENT_NEQ_TYPE,
ELEMENT_GREATER_TYPE,
ELEMENT_GEQ_TYPE,
ELEMENT_LESS_TYPE,
ELEMENT_LEQ_TYPE,
ELEMENT_POW_TYPE,
ELEMENT_FMAX_TYPE,
ELEMENT_FMIN_TYPE,
ELEMENT_MAX_TYPE,
ELEMENT_MIN_TYPE,
//Products
OUTER_PROD_TYPE,
// NOTE(review): the N/T suffixes presumably mark normal/transposed operands - confirm.
GEMM_NN_TYPE,
GEMM_TN_TYPE,
GEMM_NT_TYPE,
GEMM_TT_TYPE,
//Access modifiers
RESHAPE_TYPE,
SHIFT_TYPE,
DIAG_MATRIX_TYPE,
DIAG_VECTOR_TYPE,
ACCESS_INDEX_TYPE,
// Used by make_tuple() in expression.h to pair elements.
PAIR_TYPE,
OPERATOR_FUSE,
SFOR_TYPE,
};
// Operator descriptor: a coarse family together with the precise operation.
struct op_element
{
// Default constructor. NOTE(review): presumably sets both fields to their
// INVALID values - the definition is not visible here.
op_element();
op_element(operation_type_family const & _type_family, operation_type const & _type);
operation_type_family type_family;
operation_type type;
};
// String conversion of an operation_type; the format is defined in the implementation.
std::string to_string(operation_type type);
// Classification predicates over operation_type. Which enumerators belong to
// each category is decided in the implementation, not in this header.
bool is_assignment(operation_type op);
bool is_operator(operation_type op);
bool is_function(operation_type op);
bool is_cast(operation_type op);
bool is_indexing(operation_type op);
}
#endif

Some files were not shown because too many files have changed in this diff Show More