555 Commits
isaac ... v0.1

Author SHA1 Message Date
Philippe Tillet
b9e36331be [PACKAGING] sdist now generates working .tar.gz file 2020-05-04 10:43:38 -04:00
Philippe Tillet
c3d4a8b7ae [PACKAGING] Fixed import error 2020-05-04 10:14:34 -04:00
Philippe Tillet
0685fcb804 [PACKAGING] Added MANIFEST.in and some symlinks for better packaging 2020-05-04 10:09:30 -04:00
Philippe Tillet
1e7fee573b [PACKAGING] Fixed typo in setup.py 2020-05-04 09:09:54 -04:00
Philippe Tillet
dc2ded4165 [PACKAGING] Added some more files for packaging 2020-05-04 08:58:58 -04:00
Philippe Tillet
57a0b0a132 [CODEGEN] Fixed bug for phi nodes with constant incoming value 2020-05-02 17:30:15 -04:00
Philippe Tillet
a9efb27fde [CODEGEN][ANALYSIS] bugfix in alignment analysis 2020-05-01 17:38:23 -04:00
Philippe Tillet
4c2c244b91 [PYTHON][SRC][BINDING] Improved code portability across compilers 2020-04-17 13:51:47 -04:00
Philippe Tillet
eaabfb1d8e [PYTHON][EXAMPLES][EINSUM] Updated configs for matmul 2020-04-10 12:42:48 -04:00
Philippe Tillet
28f845eab1 [PYTHON][EXAMPLES][EINSUM] Added stride in CONV2D example 2020-04-10 00:14:31 -04:00
Philippe Tillet
8e276484ea [PYTHON][EXAMPLES][EINSUM] Added group-convolution test/benchmark 2020-04-09 23:37:39 -04:00
Philippe Tillet
840af73c8c [PYTHON][EINSUM] re-established auto-tuning 2020-04-09 11:01:57 -04:00
Philippe Tillet
25055cbba7 [TESTS] Simplified testing of half-precision transposes 2020-04-09 01:10:11 -04:00
Philippe Tillet
4114351c7c [PYTHON][KERNEL] Added thread-safety when caching custom torch op 2020-04-07 20:21:50 -04:00
Philippe Tillet
fa5e4af93e [CORE][RUNTIME] Better error message on internal compilation error 2020-04-07 14:01:21 -04:00
Philippe Tillet
0b35c5ff3f [PYTHON][KERNEL] Better handling of case where cache directory already
exists
2020-04-07 13:18:11 -04:00
Philippe Tillet
e04efc1c85 [GENERAL] Error messages now no longer make terminal color green 2020-04-03 23:25:29 -04:00
Philippe Tillet
7c09ff80eb [CORE] Fixed several issues that arose in the development of the
torch-blocksparse package:

* Now using warp shuffle in reductions when possible
* Various bugfixes in layout inference
* Added INFINITY, exponential and select
* Better error messages for unimplemented constructs
2020-03-31 18:57:28 -04:00
Philippe Tillet
f68ed147fa Merge pull request #35 from jack-willturner/iss33
[PYTHON] Add empty string to llvm-config versions in setup.py
2020-03-13 15:06:32 -04:00
Jack Turner
53b9e4790e [PYTHON] Add empty string to llvm-config versions in setup.py 2020-03-13 18:03:25 +00:00
Philippe Tillet
699711724a [DOCS] Fixed typo: triton.function -> torch.autograd.Function 2020-03-13 11:44:55 -04:00
Philippe Tillet
7ee3380faf [DOCS] Made documentation consistent with the new kernel API 2020-03-10 13:25:57 -04:00
Philippe Tillet
2dcf381bdc [CORE][TOOLS][BENCH] Disabled normalization for auto-tuning 2020-03-06 18:41:36 -05:00
Philippe Tillet
a27b96cad1 [CORE][DRIVER] Now only using PTX6.4 if CUDA10.1+ is detected 2020-03-05 21:29:22 -05:00
Philippe Tillet
20c0246d88 [CMAKE] target_link_directories -> link_directories 2020-03-05 20:16:46 -05:00
Philippe Tillet
b7895c653f [PYTHON][EXAMPLES] Removed BlockSparse examples; see
https://github.com/ptillet/torch-blocksparse.git
2020-03-05 13:32:42 -05:00
Philippe Tillet
1f1e4ee9ec [PYTHON] Merged blocksparse branch:
* Example for blocksparse matrix multiplication
* Simplified Triton kernel API
* Revived auto-tuning in einsum
2020-03-05 13:08:07 -05:00
Philippe Tillet
51bc244b2c [PYTHON][OPS] Fixed typo in einsum 2020-02-25 16:39:04 -05:00
Philippe Tillet
cc6892f9f4 [CMAKE] Fixed issue in LLVM link directory 2020-02-25 12:52:23 -08:00
Philippe Tillet
3ee342d056 [TRITON][NN][CONV] Renamed input -> x to not modify built-in functions 2020-02-25 10:56:39 -05:00
Philippe Tillet
1e76bb4145 [PYTHON][NN][CONV] Fixed typo in dx computation 2020-02-24 22:45:49 -05:00
Philippe Tillet
cc094936d0 [PYTHON][SETUP] Removed obsolete debug print() 2020-02-24 21:35:45 -05:00
Philippe Tillet
d0ca849be2 [DOCUMENTATION] Updated installation instructions in README.md 2020-02-24 18:07:25 -05:00
Philippe Tillet
a59c24ab67 [CORE] Added missing sha1.hpp file 2020-02-24 18:02:18 -05:00
Philippe Tillet
8c35bd775f [PYTHON] Added missing files for nn submodule 2020-02-24 17:58:24 -05:00
Philippe Tillet
67c633aa2d [PYTHON] Better packaging 2020-02-24 17:46:20 -05:00
Philippe Tillet
f2daff85d2 [GENERAL] Improved caching mechanism:
* Now computing hash in libtriton
* Now only compiling a single pytorch hook per function signature
2020-02-24 16:36:50 -05:00
Philippe Tillet
29c38b38e5 [PYTHON][OPS][EINSUM] Now throwing error for automatic differentiation
of extended einsum
2020-02-20 17:29:01 -05:00
Philippe Tillet
9693fe1441 [PYTHON][OP][EINSUM] simplified API 2020-02-19 23:42:22 -05:00
Philippe Tillet
c30c4861be [PYTHON][OPS][EINSUM] Added support for inner tensor strides 2020-02-19 11:50:17 -05:00
Philippe Tillet
7621aeda3f [CODEGEN][TRANSFORM][PEEPHOLE] Fixed bug in *1 multiplication 2020-02-19 00:18:55 -05:00
Philippe Tillet
304b003969 [PYTHON][EXAMPLES] Removed obsolete files 2020-02-18 12:26:06 -05:00
Philippe Tillet
d11d2db6ee [PYTHON][EINSUM] Now handling reduction sizes that are not a multiple of
TK
2020-02-17 13:52:58 -05:00
Philippe Tillet
fcdc65ffb0 [PYTHON][OPS][EINSUM] Added support for masked accumulator 2020-02-13 18:11:12 -05:00
Philippe Tillet
ae462e4fa1 [DOCS][TUTORIALS] Fixed typo 2020-02-10 04:22:45 -05:00
Philippe Tillet
6a4d42c1b8 [PYTHON][CORE] Deprecating Tensorflow support 2020-02-10 04:20:33 -05:00
Philippe Tillet
7c37258549 [DOCS] Fixed README.md 2020-02-10 03:47:49 -05:00
Philippe Tillet
0493863ee9 [DOC] Fixed formatting issues in tutorial 2020-02-10 03:27:36 -05:00
Philippe Tillet
868922e9b0 [DOC] Added more tutorials 2020-02-10 03:18:23 -05:00
Philippe Tillet
56330f1fdf [DOC] Basic Sphinx skeleton 2020-02-10 01:56:45 -05:00
Philippe Tillet
a099c6f7f3 [TRITON][LANG] Added support for bitcast 2020-02-09 20:11:13 -05:00
Philippe Tillet
0767c27b3b [README] Added comment about ISAAC 2020-02-06 14:09:09 -05:00
Philippe Tillet
4ff0205275 [GENERAL] Updated README.md 2020-02-06 13:59:42 -05:00
Philippe Tillet
5a3c30148e [PYTHON][EXAMPLES] Changed shape of einsum examples 2020-02-06 13:57:30 -05:00
Philippe Tillet
e6fc28050a [GENERAL] Updated README.md 2020-02-06 13:20:25 -05:00
Philippe Tillet
48a4dc172f [GENERAL] ISAAC -> Triton 2020-02-06 00:49:18 -05:00
Philippe Tillet
fa29e63838 [GENERAL] Deleted ISAAC Files 2020-02-06 00:48:45 -05:00
Philippe Tillet
3e92901bd5 [TRITON][PYTHON] Cleaned up API 2020-02-05 19:44:19 -05:00
Philippe Tillet
2fcf5cec5b [TRITON][CODEGEN] Fixed flawed assert() 2020-01-24 15:25:00 -05:00
Philippe Tillet
db941161ed [PYTHON][EXAMPLES] Cleaned self-attention benchmarks 2020-01-22 18:09:00 -05:00
Philippe Tillet
ce7a00674a [PYTHON][EXAMPLES] Added self-attention example using triton.ops.einsum 2020-01-21 16:45:04 -05:00
Philippe Tillet
78b98fb7cf [GENERAL] Cleaned polymorphic structure of layouts analysis pass 2020-01-21 11:38:39 -05:00
Philippe Tillet
382ca2c745 [CODEGEN][ANALYSIS] cleaning: moving towards better polymorphism for
tile layouts
2020-01-20 12:43:04 -05:00
Philippe Tillet
fbf2a3f56f [CODEGEN][TRANSFORM] some bug-fixes for FP32 einsum 2020-01-20 12:42:53 -05:00
Philippe Tillet
f278d9741a [GENERAL] Merged einsum feature branch. Various feature, performance
improvements and bugfixes:

* Added preliminary support for extended Einstein summation in PyTriton
* Significant performance improvement on FP32 kernels containing matrix
multiplication
* Added re-coalescing pass for FP16 kernels containing matrix
multiplication
* Various bugfixes
2020-01-20 12:42:48 -05:00
Philippe Tillet
50a52df489 [PYTHON][OPS] Convolution: Some cleaning of Triton-C kernel 2019-11-01 11:21:30 -04:00
Philippe Tillet
f4bbbbe5e4 [PYTHON][OPS] Bugfix in conv fprop 2019-11-01 00:43:02 -04:00
Philippe Tillet
739a8d9061 some work on conv 2019-10-31 18:08:27 -04:00
Philippe Tillet
91a2fd463b [PYTHON][TENSORFLOW] More bugfixes for forward/backward signatures 2019-10-31 01:49:30 -04:00
Philippe Tillet
93a86d4fc6 [PYTHON][TENSORFLOW] Signature of function.forward() does not have to
match signature of kernel anymore
2019-10-30 20:29:23 -04:00
Philippe Tillet
e0fe8d9058 [PYTHON][TENSORFLOW] More work 2019-10-30 18:39:58 -04:00
Philippe Tillet
fd09f9c99d fixup 2019-10-30 13:48:55 -04:00
Philippe Tillet
9b0f1a0807 more stuff 2019-10-30 13:44:31 -04:00
Philippe Tillet
bf3dc63858 [PYTHON] Removed dead code for alloc_empty and register_scalar 2019-10-30 10:37:30 -04:00
Philippe Tillet
f4fcaf84df [PYTHON][EXAMPLES] Added example for batchnorm 2019-10-30 01:49:42 -04:00
Philippe Tillet
2b9355c9e4 [PYTHON][TENSORFLOW] Got rid of alloc_empty entirely; now
generating allocation code inside the tensorflow op
2019-10-30 01:38:30 -04:00
Philippe Tillet
d65a94c768 [PYTHON][OPS] Added batch normalization op 2019-10-29 17:29:11 -04:00
Philippe Tillet
d9eacf937c [PYTHON][FUNCTION] Now using common grad output format for both
tensorflow and pytorch
2019-10-29 14:09:40 -04:00
Philippe Tillet
76651a065f [PYTHON][EXAMPLES] Better einsum example 2019-10-29 12:56:58 -04:00
Philippe Tillet
448f4433d9 [PYTHON][KERNEL] Enforcing shapes to be known at compile-time for
TensorFlow Graph Execution
2019-10-29 00:48:53 -04:00
Philippe Tillet
e9c787ef05 [PYTHON][EINSUM] Added support for FP16 2019-10-28 14:07:17 -04:00
Philippe Tillet
0ec213547c [PYTHON][KERNEL] Added benchmarking functionalities for kernels 2019-10-28 00:30:04 -04:00
Philippe Tillet
e11557855f [PYTHON] [OPS] Added einsum implementation 2019-10-26 22:14:50 -04:00
Philippe Tillet
655f43fb5b more work 2019-10-26 15:10:19 -04:00
Philippe Tillet
76adcb755a [PYTHON][EXAMPLES] Tentative support for einsum with transpositions 2019-10-25 19:01:21 -04:00
Philippe Tillet
8bd87fa19d [TEST][DOT] There seems to be a bug in casting tiles before ternary.
Reverting for now
2019-10-25 17:00:53 -04:00
Philippe Tillet
b615af2e7e [codegen] [generator] fixed issue when tile size is 1 along one or more
dimensions
2019-10-25 14:22:28 -04:00
Philippe Tillet
0770ccf537 [codegen] [selection] disassociation prototype 2019-10-25 09:39:46 -04:00
Philippe Tillet
943bf41b5c [python] [op] added Triton NT einsum 2019-10-21 23:37:39 -04:00
Philippe Tillet
099918b3c0 [python] [ops] added skeleton for einsum op 2019-10-21 18:58:02 -04:00
Philippe Tillet
4b0c43bb7b [python][example] added test for einsum 2019-10-21 17:13:12 -04:00
Philippe Tillet
b81734553b [lang] added support for batched matrix multiplication 2019-10-21 15:41:50 -04:00
Philippe Tillet
e827d4f467 [python] [bindings] removed obsolete #include 2019-10-20 20:37:37 -04:00
Philippe Tillet
de6fdd5625 [general] removed useless files and includes 2019-10-20 19:29:48 -04:00
Philippe Tillet
96cba9036a [tests] [unit] added 1D and 3D reduction test 2019-10-20 17:48:19 -04:00
Philippe Tillet
abe3fbb480 [test] [reduce] added test for 1D reduction 2019-10-20 01:01:53 -04:00
Philippe Tillet
23db500edf [tests] [common] added reduce.h to common headers 2019-10-19 16:53:48 -04:00
Philippe Tillet
a76efd326d [selection] [codegen] added reduction 2019-10-19 14:47:16 -04:00
Philippe Tillet
d76c6bc3c7 Merge branch 'master' into auto-coalesce 2019-10-18 16:21:28 -04:00
Philippe Tillet
50efd9c82f [codegen] [liveness] bugfix in live range computation 2019-10-18 14:54:26 -04:00
Philippe Tillet
cfde3dd766 [codegen] [layout] fixed padding issue for row-major HMMA 2019-10-18 13:42:15 -04:00
Philippe Tillet
b43454c9b7 [codegen] [membar] view do not write to shared memory 2019-10-17 22:38:41 -04:00
Philippe Tillet
cf4fbfefee [codegen] [selection] no longer using llvm::IRBuilder<>::Insert() 2019-10-17 13:12:37 -04:00
Philippe Tillet
be25e954f6 [codegen] [selection] merged selection into generator visit 2019-10-17 12:55:37 -04:00
Philippe Tillet
f4f70db234 [codegen] [selection] re-arranged file structure 2019-10-17 12:31:26 -04:00
Philippe Tillet
a0182f41dd more cleaning 2019-10-17 08:17:23 -04:00
Philippe Tillet
a157177267 [codegen] [selection] more cleaning 2019-10-17 00:51:26 -04:00
Philippe Tillet
ae24621825 more cleaning 2019-10-17 00:36:46 -04:00
Philippe Tillet
4bfe998cc8 [codegen] [selection] everything is now implemented with visitor 2019-10-16 18:10:03 -04:00
Philippe Tillet
1b5b76b629 [codegen] [selection] machine layouts now create machine tiles 2019-10-15 16:12:08 -04:00
Philippe Tillet
3d5ab4bc0d [codegen] [selection] created machine layouts 2019-10-15 12:29:58 -04:00
Philippe Tillet
6f5f511a33 [doc][pytriton] now showing full requirements of triton.function 2019-10-14 11:36:54 -04:00
Philippe Tillet
0a2a4d9fdd more cleaning 2019-10-13 19:59:33 -04:00
Philippe Tillet
ee387ff567 more cleaning 2019-10-13 14:43:17 -04:00
Philippe Tillet
e787ce0cab [codegen] more cleaning 2019-10-13 02:26:30 -04:00
Philippe Tillet
cb12fc1a87 [codegen] adding visitor 2019-10-13 00:25:06 -04:00
Philippe Tillet
6beef4be1f more cleaning 2019-10-12 01:25:08 -04:00
Philippe Tillet
7d77f34db0 [codegen] more cleaning 2019-10-11 23:40:27 -04:00
Philippe Tillet
ee3803b577 more cleaning 2019-10-11 19:29:24 -04:00
Philippe Tillet
323c90e431 ugh 2019-10-11 19:05:54 -04:00
Philippe Tillet
4efd0a3c6b [codegen] more cleaning 2019-10-10 15:52:03 -04:00
Philippe Tillet
a3f76b6eb1 [codegen] more cleaning 2019-10-09 21:59:35 -04:00
Philippe Tillet
9bc6df4fd1 [codegen] more cleaning 2019-10-09 15:05:44 -04:00
Philippe Tillet
10ab94d1c5 [codegen] added missing file 2019-10-08 17:10:34 -04:00
Philippe Tillet
254ed52958 [codegen] more cleaning 2019-10-08 11:26:22 -04:00
Philippe Tillet
650c43ca07 [codegen] more cleaning 2019-10-07 18:06:54 -04:00
Philippe Tillet
1783d45bef [codegen] better handling of row/column-major 2019-10-04 16:07:31 -04:00
Philippe Tillet
a1e0512703 [codegen] more progress 2019-10-03 14:11:50 -04:00
Philippe Tillet
1bf0c8adeb [test] re-added bounds checking in dot test 2019-10-02 15:08:32 -04:00
Philippe Tillet
adbc56d10a [tests] delete redundant code in dot benchmark and unit tests 2019-10-02 14:26:09 -04:00
Philippe Tillet
86a3e5d897 [codegen] now matrix-multiplication is bank-conflict free for all
layouts
2019-10-01 16:57:59 -04:00
Philippe Tillet
ed1b2bc563 more work on padding 2019-09-27 22:15:30 -04:00
Philippe Tillet
575dd06be3 [codegen] more progress towards unified dot implementation 2019-09-26 14:01:28 -04:00
Philippe Tillet
69800a0318 [tests] [dot] now testing row-major 2019-09-24 20:36:55 -04:00
Philippe Tillet
a3bf3a1804 [codegen] more hmma row-major handling 2019-09-24 19:35:46 -04:00
Philippe Tillet
c24d55db23 [codegen] more work on hmma coalescing 2019-09-23 20:38:27 -04:00
Philippe Tillet
f0013f8bf1 [codegen] [allocation] fixed issues in HMMA 2019-09-23 17:54:42 -04:00
Philippe Tillet
b95ac15d48 [codegen] [selection] fixed synchronization issue with double-buffering 2019-09-23 13:56:46 -04:00
Philippe Tillet
856e7baa04 [test] added tests for copy 2019-09-23 12:07:24 -04:00
Philippe Tillet
001973630e [codegen] cleaned up shared memory and double-buffering logic 2019-09-21 22:21:40 -04:00
Philippe Tillet
43d88154bd [codegen] cleaning-up / formalizing shared-memory passes 2019-09-20 16:01:12 -04:00
Philippe Tillet
e35be1ddcf [ir][instruction] added identifier for each instruction 2019-09-19 16:25:36 -04:00
Philippe Tillet
1fd9be27ee [tests][bench] now benchmarking all variants of copy 2019-09-17 22:17:58 -04:00
Philippe Tillet
307c1128d5 [codegen] removed vectorization pass (now part of selection) 2019-09-17 15:21:10 -04:00
Philippe Tillet
e01e623333 [codegen][auto-coalesce] more debugging 2019-09-16 20:34:08 -04:00
Philippe Tillet
e184bad9a1 [auto-coalesce] more bugfixes 2019-09-16 13:28:23 -04:00
Philippe Tillet
8d37a55a21 [codegen][analysis] cleaned-up tiling formalism 2019-09-15 21:14:14 -04:00
Philippe Tillet
031f4dfe96 no performance regression 2019-09-14 19:13:54 -04:00
Philippe Tillet
495163e0e8 some more cleaning 2019-09-14 16:53:13 -04:00
Philippe Tillet
0d8f59dcec [codegen][selection] some cleaning 2019-09-14 16:04:06 -04:00
Philippe Tillet
66e32b3074 [codegen][grid] some cleaning 2019-09-14 13:05:53 -04:00
Philippe Tillet
8ae779206f more fixes 2019-09-14 02:36:11 -04:00
Philippe Tillet
eae02b99e5 [codegen][coalesce] fixed stale users in cloned instructions 2019-09-13 19:16:04 -04:00
Philippe Tillet
579a662e60 [codegen][coalesce] more bugfixes 2019-09-13 14:17:21 -04:00
Philippe Tillet
3fa3b90f16 test 2019-09-12 23:02:51 -04:00
Philippe Tillet
0dc7313e3b fixup 2019-09-12 22:46:03 -04:00
Philippe Tillet
11ff27d638 [codegen][coalesce] some bugfix for phi-nodes 2019-09-12 22:44:07 -04:00
Philippe Tillet
981ffb6d85 Merge branch 'c-reduction' 2019-09-12 17:04:09 -04:00
Philippe Tillet
7f2bc5bb66 [testing] re-arranged util.h 2019-09-12 16:20:29 -04:00
Philippe Tillet
f4beb713ab [test] added support for max, min reduction and made it easy to add more 2019-09-12 16:11:57 -04:00
Philippe Tillet
c4c93943df [codegen] fixed bug in reduction 2019-09-12 00:32:10 -04:00
Philippe Tillet
178094b5f7 [codegen] exposed a bug in reductions 2019-09-11 20:47:17 -04:00
Philippe Tillet
04a0fbd8e3 [tests] basic test for reduction in python passes 2019-09-11 17:35:56 -04:00
Philippe Tillet
0c41bade07 [codegen] basic recoalescing working 2019-09-10 23:25:47 -04:00
Philippe Tillet
d7be0edb15 [documentation] swapped the order of pytriton and triton-c tutorial in README.md 2019-09-10 21:17:22 -04:00
Philippe Tillet
2781cdcf93 [lang] added templates for reductions 2019-09-10 15:54:16 -04:00
Philippe Tillet
41acac6ba1 [documentation] added description of the __multipleof attribute 2019-09-10 14:16:52 -04:00
Philippe Tillet
df2455f4b8 [documentation][triton-c] grammar 2019-09-10 13:43:03 -04:00
Philippe Tillet
8111d56ee9 [documentation][triton-c] improved wording on Triton-C being
single-threaded
2019-09-10 13:36:34 -04:00
Philippe Tillet
7f21a63ae1 [documentation][triton-c] clearer motivations; now starting each snippet
with the language it's written in
2019-09-10 13:34:26 -04:00
Philippe Tillet
59c667801c [documentation][triton-c] fixed syntax highlighting on conditional transposition example 2019-09-10 12:28:42 -04:00
Philippe Tillet
d3491e01a9 [documentation][triton-c] making it clearer that it is not (yet) an MLIR dialect 2019-09-10 12:27:28 -04:00
Philippe Tillet
ab33e84337 [documentation] improved wording in triton-c tutorial 2019-09-10 03:01:41 -04:00
Philippe Tillet
ef1feefe7f [lang] added __global__ storage specifier 2019-09-10 02:01:09 -04:00
Philippe Tillet
060498cad1 [documentation] fixed broken references in PyTriton tutorial 2019-09-10 01:36:11 -04:00
Philippe Tillet
3c88a206c3 [documentation] fixed formatting issue in pytriton tutorial 2019-09-10 01:33:30 -04:00
Philippe Tillet
898b116f30 [documentation] added pytriton tutorial 2019-09-10 01:32:31 -04:00
Philippe Tillet
c622619bcb more progress 2019-09-10 00:37:51 -04:00
Philippe Tillet
ef99baa743 Merge branch 'master' of https://github.com/ptillet/triton into auto-coalesce 2019-09-10 00:37:31 -04:00
Philippe Tillet
e1019cff3a [documentations] updated pytriton tutorial 2019-09-10 00:35:02 -04:00
Philippe Tillet
7d3fb6c390 [documentation] updated triton-c tutorial 2019-09-09 19:02:57 -04:00
Philippe Tillet
b953051eee [documentation] improved wording of triton-c tutorial 2019-09-09 04:09:23 -04:00
Philippe Tillet
433b08b39b [documentation] added [coming soon...] for tutorials in progress 2019-09-09 02:38:23 -04:00
Philippe Tillet
c0fa4e0a57 Merge branch 'master' of https://github.com/ptillet/triton 2019-09-09 02:29:54 -04:00
Philippe Tillet
4a69af08e7 [documentation] added README.md and first part of the Triton-C tutorial 2019-09-09 02:29:18 -04:00
Philippe Tillet
0cbbcce5c0 added missing file 2019-09-08 21:38:08 -04:00
Philippe Tillet
3daef1726d more progress 2019-09-08 21:36:54 -04:00
Philippe Tillet
3d78810d5e more progress 2019-09-08 21:29:40 -04:00
Philippe Tillet
32234c2612 ugh 2019-09-08 17:35:24 -04:00
Philippe Tillet
0ff81badac [driver] added TRITON_LIBCUDA environment variable to specify libcuda
path if not in LD_LIBRARY_PATH
2019-09-06 13:26:51 -04:00
Philippe Tillet
96bdae25d5 [python][example] now executing tensorflow and/or pytorch example
automatically
2019-09-05 21:35:23 -04:00
Philippe Tillet
b79bcbaee8 [auto-tuning] now not compiling kernels that use too much shared memory 2019-09-05 21:03:09 -04:00
Philippe Tillet
1f8fd525b5 [python] fixed warnings for pybind11 and pytorch 2019-09-05 20:28:00 -04:00
Philippe Tillet
3fd61c1a02 [cmake] better FindLLVM 2019-09-05 17:48:29 -04:00
Philippe Tillet
0405509190 [python] setup.py now finds LLVM version if available 2019-09-05 17:47:53 -04:00
Philippe Tillet
18848cbb71 [driver] now passing std::unique_ptr<> instead of cloning LLVM module
when compiling it
2019-09-05 17:25:58 -04:00
Philippe Tillet
0a6329ea7d [python] more robust way to add triton includes to python package 2019-09-05 16:01:56 -04:00
Philippe Tillet
945593e847 [python] using generic path for triton include directories 2019-09-05 15:42:43 -04:00
Philippe Tillet
7bfbb89612 [python] now packaging include and libtriton in triton._C submodule 2019-09-05 15:37:00 -04:00
Philippe Tillet
9ab2880fba [python][examples] cleaned up dot example 2019-09-05 12:54:35 -04:00
Philippe Tillet
2d6c8311e8 [python] upgraded pybind11 ; forcing torch tensors to be contiguous() 2019-09-05 12:30:51 -04:00
Philippe Tillet
58544d0523 [python] renamed src/tensorflow.cc -> src/bindings.cc 2019-09-05 09:39:58 -04:00
Philippe Tillet
b2629da1fe [python] more cleaning of frameworks logic 2019-09-05 02:21:07 -04:00
Philippe Tillet
44896ee777 [pytorch] clean-up of dynamic framework load 2019-09-05 02:16:27 -04:00
Philippe Tillet
65133cdf33 [python] basic support for pytorch seems to be working 2019-09-05 01:32:21 -04:00
Philippe Tillet
ed0f706005 [python] fixed various issues in pytorch support 2019-09-05 00:19:42 -04:00
Philippe Tillet
945b5d0de9 [python] modularized triton package 2019-09-04 21:55:47 -04:00
Philippe Tillet
f6e9c24fe8 [python] more progress towards tensorflow/pytorch unification 2019-09-04 19:45:50 -04:00
Philippe Tillet
cdbc9d4ecd [python] more generic gradient registration 2019-09-04 03:12:23 -04:00
Philippe Tillet
b747959a57 trying to work around tensorflow limitations 2019-09-04 01:54:43 -04:00
Philippe Tillet
2ccc915011 [python][examples] added template for blocksparse 2019-09-03 20:44:27 -04:00
Philippe Tillet
5e03f0a065 [codegen][align] reverted some changes 2019-09-03 15:28:07 -04:00
Philippe Tillet
97fdb5b6be [tests] added missing files 2019-09-03 12:44:35 -04:00
Philippe Tillet
a842d337c5 [general] various cleaning and bugfix:
* added copy1d and copy2d benchmark
* fixed issue in reassociation pass
2019-09-02 23:00:49 -04:00
Philippe Tillet
90d80c3b2e [codegen][selection] bugfix in scanline dot lowering 2019-09-01 16:30:53 -04:00
Philippe Tillet
2d4ddab4d0 [ir][print] improved pretty-printing of constants and instructions 2019-08-30 18:02:33 -07:00
Philippe Tillet
5db3a7adfe [python][examples] some more cleaning of dot product example 2019-08-30 17:05:03 -07:00
Philippe Tillet
7e0af2118c [codegen] worked around bug seemingly from nvptx/ptxas by simplifying multiplications by 1:
- Generated LLVM-IR looked correct
- Illegal addressing disappeared when running cuda-memcheck
- Illegal addressing disappeared when using nvptx-short-pointer
2019-08-30 16:45:14 -07:00
Philippe Tillet
141a823799 [python] refactoring in anticipation of pytorch support 2019-08-29 18:08:51 -07:00
Philippe Tillet
e3c953e79f [test] added more re-usable code in common/util.h 2019-08-28 18:06:36 -07:00
Philippe Tillet
d457482539 [codegen] fixed issue in double buffering pointer update 2019-08-28 17:50:45 -07:00
Philippe Tillet
59281f5794 [structure] better directory structure for tests 2019-08-27 20:33:38 -07:00
Philippe Tillet
37cbcfabd0 [examples] back to 96 TFLOPS on V100 2019-08-26 22:49:14 -07:00
Philippe Tillet
b4ae06a714 tracking down performance regression 2019-08-26 20:38:39 -07:00
Philippe Tillet
7cb73f66e2 testing some register gradient 2019-08-26 19:25:58 -07:00
Philippe Tillet
9ece3eccc6 some cleaning 2019-08-26 17:28:24 -07:00
Philippe Tillet
4075949f80 [python] basic tensorflow wrapper working 2019-08-26 16:53:49 -07:00
Philippe Tillet
0e0399f866 more tests 2019-08-26 11:00:00 -07:00
Philippe Tillet
321d268a4a more progress 2019-08-25 21:26:09 -07:00
Philippe Tillet
96b4d5e411 [examples] multiple transposition schemes now supported 2019-08-24 13:08:38 -07:00
Philippe Tillet
0b1c389894 [lang] changed array declarations from [{}] to [] 2019-08-23 20:34:24 -07:00
Philippe Tillet
44eb3891ae [lang] added support for restrict; added macros for attributes 2019-08-23 20:29:12 -07:00
Philippe Tillet
8c6bac49d1 [lang][codegen] added basic attribute support 2019-08-23 19:49:06 -07:00
Philippe Tillet
cb04ec0b3b some more cleaning 2019-08-23 19:22:38 -07:00
Philippe Tillet
732156b942 [general] rename *.cpp -> *.cc 2019-08-23 19:06:39 -07:00
Philippe Tillet
6158d96ff7 [general] cleaned include guards and added #pragma once 2019-08-23 18:08:05 -07:00
Philippe Tillet
606e799948 [LICENSING] updated license to incorporate credit for wgtcc 2019-08-23 17:56:30 -07:00
Philippe Tillet
a110a7e8cf [ir] changed type of tile shapes from constant_int* to int 2019-08-23 17:49:21 -07:00
Philippe Tillet
c9371c7234 [general] error messages no longer depend on a program name 2019-08-23 17:32:05 -07:00
Philippe Tillet
f98b0b8e2a [general] deleted the old compiler frontend 2019-08-23 17:28:02 -07:00
Philippe Tillet
8798d240dc matmul test passes 2019-08-23 17:13:30 -07:00
Philippe Tillet
64a6910644 [lang][parser] better support for attributes 2019-08-22 21:02:38 -07:00
Philippe Tillet
845c0e5b93 adding tunable parameters 2019-08-22 19:21:01 -07:00
Philippe Tillet
87072203c1 [codegen] triton-ir code generation does not crash 2019-08-22 17:27:10 -07:00
Philippe Tillet
a6ec807223 more debugging 2019-08-21 21:53:41 -07:00
Philippe Tillet
a23225ad37 more progress 2019-08-21 18:27:02 -07:00
Philippe Tillet
5224bbbe06 preparing codegen 2019-08-20 18:06:30 -07:00
Philippe Tillet
61f25f90eb basic parsing doesn't throw error 2019-08-20 16:22:43 -07:00
Philippe Tillet
bc11e31419 [lang] more progress on parser 2019-08-19 20:56:39 -07:00
Philippe Tillet
0970fe12dd [general] cleaned tensorflow source code generation 2019-08-18 15:39:36 -07:00
Philippe Tillet
457c330f15 more cleaning 2019-08-18 14:20:42 -07:00
Philippe Tillet
c787ebae68 more cleaning 2019-08-18 14:09:55 -07:00
Philippe Tillet
81571246cf [general] fixed some warnings 2019-08-18 14:08:57 -07:00
Philippe Tillet
c05445d001 [general] removed dnn/ module and runtime/jit.cpp 2019-08-18 00:41:05 -07:00
Philippe Tillet
b58b0d8b27 [general] removed unnecessary includes 2019-08-18 00:34:30 -07:00
Philippe Tillet
b4a9ed9663 [python] added basic tensorflow support 2019-08-17 18:18:26 -07:00
Philippe Tillet
078f0052fe more cleaning 2019-08-17 16:12:17 -07:00
Philippe Tillet
11a6a92598 [python][tensorflow] basic op generation is working 2019-08-16 20:50:18 -07:00
Philippe Tillet
c7cb5f82ad [general] removed LLVM #include's in all Triton headers 2019-08-16 15:56:58 -07:00
Philippe Tillet
4de22df930 [python] added skeleton for python interface 2019-08-15 20:50:10 -07:00
Philippe Tillet
3ece461ce2 added tensorflow code generator 2019-08-15 15:59:53 -07:00
Philippe Tillet
38a8b0ab19 [runtime] overhaul of the run-time API 2019-08-14 20:26:11 -07:00
Philippe Tillet
b8cd63e0da [codegen] separated lower_dot_inst into lower_outer_dot ||
lower_hmma_dot || lower_scanline_dot
2019-08-12 21:48:30 -07:00
Philippe Tillet
4bc5758a22 [general] some cleaning:
* trans/dot -> peephole
* isel -> added function for tile-level lowering
2019-08-12 21:15:21 -07:00
Philippe Tillet
1400d960a6 [auto-tuning] much smaller parameters space 2019-08-12 21:15:21 -07:00
Philippe Tillet
fd49cdc92b [dnn][blocksparse] added dw code 2019-08-08 19:15:35 -07:00
Philippe Tillet
f93099bda1 [codegen][transform][trans] fixed incorrect replace_all_uses_with 2019-08-07 21:50:16 -07:00
Philippe Tillet
7578c27d3d [general][filesystem] added structure and namespace to code generation files 2019-08-07 21:17:17 -07:00
Philippe Tillet
392b55280d [codegen] some cleaning for batched matmul 2019-08-07 21:17:17 -07:00
Philippe Tillet
7b75b68edc dirty but working warp-splitting 2019-08-06 21:07:13 -07:00
Philippe Tillet
494bfa7671 didn't break correctness of existing HMMA 2019-08-06 17:34:00 -07:00
Philippe Tillet
46e9863ebe better fp16 support for dot 2019-08-06 17:19:13 -07:00
Philippe Tillet
0e201e18ff fixed simple FP16 test 2019-08-06 17:14:16 -07:00
Philippe Tillet
6c39cdbace making sure changes didn't break HMMA 2019-08-06 16:48:53 -07:00
Philippe Tillet
cf256a636c fixup 2019-08-06 16:44:16 -07:00
Philippe Tillet
5efdb7978e more improvements and regressions 2019-08-06 16:21:20 -07:00
Philippe Tillet
26c9849462 [ir][instructions] added permutations option for trans 2019-08-05 21:19:13 -07:00
Philippe Tillet
d62e581ab3 basic split-k across warps working for GEMM 2019-08-05 19:33:28 -07:00
Philippe Tillet
899b2b72e1 simple constexpr 2019-08-05 13:06:56 -07:00
Philippe Tillet
d869d9a924 [codegen][selection] more flexible instruction selection for reduce_inst 2019-08-04 16:34:36 -07:00
Philippe Tillet
6be532c6a2 [codegen][selection] adding support for reduction along arbitrary axis 2019-08-02 21:29:36 -07:00
Philippe Tillet
d9945692a9 [dnn] better specification of recompilation key 2019-08-02 17:42:48 -07:00
Philippe Tillet
3b92ddf7e6 [codegen/reassociation] now recursively takes pointer arguments into account as well 2019-07-31 18:41:56 -07:00
Philippe Tillet
f7bd976fc7 [dnn/blocksparse] added heuristics for block-sparse dot 2019-07-31 17:12:36 -07:00
Philippe Tillet
bb32ac56c9 [codegen/optimize_dce.cpp] fixed bugs whereby barriers were removed by DCE 2019-07-31 15:11:10 -07:00
Philippe Tillet
5af7e5adac Made sure it works for FP16 2019-07-30 20:02:16 -07:00
Philippe Tillet
080bf1af88 [dnn/blocksparse/dot]: BlocksparseDx also working 2019-07-30 11:42:31 -07:00
Philippe Tillet
dc11f70fad [dnn/blocksparse] FPROP test passes! 2019-07-29 17:06:20 -07:00
Philippe Tillet
17cb2db356 [dnn/blocksparse/dot] prototype version seems to pass basic test 2019-07-27 21:21:36 -07:00
Philippe Tillet
2a377bc8b1 [ir] deleted mask/merge instructions; will be replaced by masked_load/store and select 2019-07-25 15:06:15 -07:00
Philippe Tillet
6ce82dfcdb FINALLY 2019-07-23 22:19:57 -07:00
Philippe Tillet
b7fadb9986 more stuff 2019-07-23 21:22:47 -07:00
Philippe Tillet
397d76156b progress on re-association 2019-07-23 17:21:24 -07:00
Philippe Tillet
38b3771c26 some reassociation 2019-07-23 14:43:18 -07:00
Philippe Tillet
c448876178 better benchmarking 2019-07-22 19:26:12 -07:00
Philippe Tillet
ead368d1ed [general] a bunch of fixes in anticipation of proper triton vs cudnn
benchmarks

* DNN: Added partial auto-tuning mode and skeleton for heuristics
* Examples: Modularized benchmarking and now evaluating ResNet-18 shapes
2019-07-21 20:17:56 -07:00
Philippe Tillet
b1d81a5802 more work on heuristics 2019-07-21 18:11:54 -07:00
Philippe Tillet
484e3871cf [dnn/shift] added base pointer for a, b 2019-07-20 23:00:27 -07:00
Philippe Tillet
d159455f7b [codegen/alignment_info] better alignment information 2019-07-20 21:44:18 -07:00
Philippe Tillet
28c250216c [dnn/gemm] added some bounds checking 2019-07-19 21:32:55 -07:00
Philippe Tillet
5215fb0424 [codegen] some more optimizations 2019-07-19 20:29:03 -07:00
Philippe Tillet
71594da66f [dnn/gemm]: fixed leading dimension in transposed variants 2019-07-18 16:35:48 -07:00
Philippe Tillet
f0d8306437 [codegen/alignment_info] better handling of constants 2019-07-18 16:12:06 -07:00
Philippe Tillet
86f70f8224 [codegen/selection] performance fix-up when A is transposed for hmma 2019-07-17 21:46:23 -07:00
Philippe Tillet
2f0817b2cd [codegen/selection] tensor cores now used for transposed layouts 2019-07-17 17:20:38 -07:00
Philippe Tillet
bfa39b8992 preparing the field for tensor cores transposes 2019-07-17 13:20:33 -07:00
Philippe Tillet
d2e116d057 testing GEMM 2019-07-17 12:38:30 -07:00
Philippe Tillet
791c91ee63 [dnn/shift] bugfix in static shape division 2019-07-17 11:39:17 -07:00
Philippe Tillet
a55b098e88 [dnn/shift] now using constant divisions 2019-07-16 21:05:21 -07:00
Philippe Tillet
07c964919c [dnn/shift] now strictly only shifting the interior 2019-07-16 20:18:48 -07:00
Philippe Tillet
ec24e1e7df trying to remove interior logic 2019-07-16 18:47:50 -07:00
Philippe Tillet
5f6dd23fc2 [dnn/dot] reverted back to peak tensorcores performance 2019-07-16 16:14:58 -07:00
Philippe Tillet
164d85077f more stuff 2019-07-16 15:03:53 -07:00
Philippe Tillet
28959fe165 [runtime/jit] made auto-tuning silent 2019-07-16 14:41:38 -07:00
Philippe Tillet
7d1797cd32 ugh 2019-07-16 12:59:27 -07:00
Philippe Tillet
f50d7a420a [runtime/jit] fixed bug in multi-threaded auto-tuning 2019-07-15 21:16:50 -07:00
Philippe Tillet
aa8bcf6bde [dnn/shift] added split-k for shift-conv 2019-07-15 21:03:58 -07:00
Philippe Tillet
434f65737f [runtime] put jit::launch_info in another file 2019-07-15 12:35:53 -07:00
Philippe Tillet
3c128fc2e2 [jit/autotune] added support for multi-threaded auto-tuning 2019-07-14 22:31:30 -07:00
Philippe Tillet
3e7a3ed67a [dnn/shift]: added support for fp16 2019-07-13 21:05:34 -07:00
Philippe Tillet
fe42cb7142 [dnn/shift] optimizations for NCHW layout 2019-07-12 20:22:32 -07:00
Philippe Tillet
54617b4e51 some cleaning 2019-07-12 20:10:15 -07:00
Philippe Tillet
7512c7ebed some cleaning 2019-07-12 20:03:05 -07:00
Philippe Tillet
c1c7062914 blabla 2019-07-12 17:42:29 -07:00
Philippe Tillet
f36a646ffc [dnn/shift-conv] added and tested NCHW layout 2019-07-11 21:00:33 -07:00
Philippe Tillet
fe8caf12f0 [dnn/conv]: skeleton for NCHW layout 2019-07-11 20:34:38 -07:00
Philippe Tillet
207e021973 [codegen/shift] substantial cleaning of triton-c shift-conv code 2019-07-11 20:11:23 -07:00
Philippe Tillet
75cf2df110 [dnn/shift] many bugfixes in strided shift-conv 2019-07-10 19:49:31 -07:00
Philippe Tillet
4ca83f1935 ugh bug in shift-conv striding 2019-07-10 17:00:22 -07:00
Philippe Tillet
f665c742f9 testing a simple shiftnet 2019-07-10 13:33:08 -07:00
Philippe Tillet
3b89bc8463 [examples/python/pytorch] added skeleton of wrapper for shift-conv and batch-norm 2019-07-09 21:54:37 -07:00
Philippe Tillet
63b249c1d6 [examples/python/pytorch] added batchnorm cpp extension 2019-07-09 20:59:04 -07:00
Philippe Tillet
b7986baffa [dnn]: Now implementing all existing DNN routines using common base template and auto-tuner 2019-07-09 19:52:55 -07:00
Philippe Tillet
88675fa01a [dnn] added base template class for mutualized auto-tuning 2019-07-09 16:09:34 -07:00
Philippe Tillet
066ae338f1 [dnn/shift]: added stride to shift 2019-07-09 14:08:51 -07:00
Philippe Tillet
cc41604784 [codegen/batchnorm] forward and backward now seemingly working 2019-07-09 13:03:16 -07:00
Philippe Tillet
f74dcb7e30 [dnn/batchnorm]: added some more code in Triton-C batchnorm implementations 2019-07-08 20:18:20 -07:00
Philippe Tillet
fa3270dcf2 [codegen/selection] bugfix in code generation for reduction instructions 2019-07-08 18:53:37 -07:00
Philippe Tillet
f9db0449b7 [dnn] Adding batchnorm 2019-07-08 18:44:37 -07:00
Philippe Tillet
b0cf3143c5 [dnn/shift] bugfix in wgrad 2019-07-06 11:27:49 -07:00
Philippe Tillet
3e49dbe6ab [dnn/shift] fixed in leading dimensions for shift-conv operation 2019-07-05 17:17:22 -07:00
Philippe Tillet
c666f71fd6 fixed bug 2019-07-05 15:07:20 -07:00
Philippe Tillet
88ebdddf3d makes more sense now 2019-07-03 20:45:03 -07:00
Philippe Tillet
bd1040510f dx works but that makes no sense? 2019-07-03 20:24:52 -07:00
Philippe Tillet
1b2ceadf0d weight gradient seem to work 2019-07-03 20:04:38 -07:00
Philippe Tillet
39aa22babb more tinkering 2019-07-03 19:52:31 -07:00
Philippe Tillet
1d88f0a36b stuff 2019-07-03 19:25:16 -07:00
Philippe Tillet
0d8faa5b1e fixup 2019-07-02 21:38:10 -07:00
Philippe Tillet
5144dc3a6c [examples/python] added framework code for shift-conv 2019-07-02 20:45:10 -07:00
Philippe Tillet
8fc253946c [codegen] shift: added sketch for shift-convolution backpropagation 2019-07-02 16:39:07 -07:00
Philippe Tillet
6cfb575d29 [lang] fixup in cast type 2019-06-30 17:43:18 -07:00
Philippe Tillet
c172bd518b more stuff 2019-06-30 16:55:02 -07:00
Philippe Tillet
9a86bc51e1 [language] added alignment metadata for variables 2019-06-29 13:58:46 -07:00
Philippe Tillet
d8c3d58593 more optimization 2019-06-28 20:22:52 -07:00
Philippe Tillet
83b753512c prefetching with shift 2019-06-28 17:17:50 -07:00
Philippe Tillet
ab1afbf082 more performance optimizations 2019-06-28 17:04:07 -07:00
Philippe Tillet
a567f3f8a8 more cleaning 2019-06-28 15:10:39 -07:00
Philippe Tillet
21fd0fd65e fixup 2019-06-28 11:13:36 -07:00
Philippe Tillet
f4dedb522c fixup 2019-06-27 17:05:48 -07:00
Philippe Tillet
12e6036e5f trying interior shift 2019-06-27 14:13:48 -07:00
Philippe Tillet
d8526669f5 fixup 2019-06-27 12:39:17 -07:00
Philippe Tillet
9028e40f1d [dnn] added shift in the DNN libs 2019-06-27 11:37:19 -07:00
Philippe Tillet
6300ec5080 [examples] added conv2d op in tensorflow 2019-06-26 18:50:53 -07:00
Philippe Tillet
f1a8972267 [examples] added tensorflow dense convolution templates 2019-06-26 11:39:22 -07:00
Philippe Tillet
25e9a10917 changed auto-tuner parameter ranges 2019-06-25 19:27:49 -07:00
Philippe Tillet
d945ce5e1b Now showing valid parameter for NN 2019-06-25 19:18:43 -07:00
Philippe Tillet
616f22c610 confirmed this is the fastest bounds checking 2019-06-25 16:35:43 -07:00
Philippe Tillet
64513fb407 [codegen] added fallback when tensor cores cannot be used 2019-06-25 15:49:58 -07:00
Philippe Tillet
62000738f0 [codegen] renamed axis_info -> alignment_info 2019-06-25 15:10:47 -07:00
Philippe Tillet
d52abc9379 [codegen] bugfix in alignment inference 2019-06-25 15:06:15 -07:00
Philippe Tillet
edc31cabb0 [codegen] rough template for axis_info pass 2019-06-24 18:57:32 -07:00
Philippe Tillet
72867d17d4 more cleaning 2019-06-24 12:37:13 -07:00
Philippe Tillet
f257884eb7 some cleaning 2019-06-24 09:31:34 -07:00
Philippe Tillet
67989e7d18 fixup 2019-06-13 20:03:28 -07:00
Philippe Tillet
f7dcea1187 Now doing double-buffering 2019-06-13 19:48:02 -07:00
Philippe Tillet
36e3667a9a removed shared conflicts for 8x32x4 and 32x8x4 configurations 2019-06-13 17:51:54 -07:00
Philippe Tillet
21a9b92c87 disabling interleaving 2019-06-13 17:16:00 -07:00
Philippe Tillet
d487cf31ce trying 128 bits loads 2019-06-12 21:07:01 -07:00
Philippe Tillet
1c6372711b added interleaving 2019-06-12 20:30:28 -07:00
Philippe Tillet
a6b580ec05 interleaving fails with B 2019-06-12 19:46:43 -07:00
Philippe Tillet
1b5a742a88 [triton/codegen] added shared memory padding for HMMA arguments and vectorized loads 2019-06-11 19:51:08 -07:00
Philippe Tillet
cbd916994d [example/tensorflow] no longer hardcoding library dir 2019-06-11 11:06:02 -07:00
Philippe Tillet
7d50b87681 [selection/codegen] bugfix in distributed tile indices initialization 2019-06-11 10:45:19 -07:00
Philippe Tillet
06b5992509 [feature] added basic tensor core support 2019-06-11 10:24:49 -07:00
Philippe Tillet
d074a166e2 [feature] basic tensor core utilization works 2019-06-08 14:39:45 -07:00
Philippe Tillet
5f3d48c1d0 [tensor cores] added basic codegen template for using wmma 2019-06-07 21:19:47 -07:00
Philippe Tillet
ec4c6aaaaa Added inline PTX for mma.sync 2019-06-07 19:39:33 -07:00
Philippe Tillet
6fce9f28ae added fragmented axis 2019-06-07 10:32:56 -07:00
Philippe Tillet
781b6d377d seleciton now segfault (expected 2019-06-06 20:34:56 -07:00
Philippe Tillet
6045209d5b Now find correct tuning configuration 2019-06-06 20:13:26 -07:00
Philippe Tillet
0a0b48e9a2 adding hmma tuning parameters 2019-06-06 19:51:02 -07:00
Philippe Tillet
81eba3e1ec ugh 2019-06-06 19:36:41 -07:00
Philippe Tillet
cdf5a0d011 [codegen/tune]: added fragmentation types 2019-06-06 16:48:32 -07:00
Philippe Tillet
f58c9a4d2b [general] hmma baseline setup 2019-06-05 14:43:38 -07:00
Philippe Tillet
49fcfd6fc7 [examples/tensorflow] fixed #include issue 2019-06-05 11:09:41 -07:00
Philippe Tillet
383b5b2a2a [triton/ast] renamed ast -> lang in namespace and file structure 2019-05-28 17:28:02 -04:00
Philippe Tillet
d2a46afe00 [triton/ast]: cleaned the ast module 2019-05-28 17:07:54 -04:00
Philippe Tillet
8102efc064 [triton/examples/cpp] removed common.hpp helper 2019-05-28 14:14:33 -04:00
Philippe Tillet
a9d078c06f [triton/dnn/conv] merged optimizations branch
- Added forward/backward support for strided convolution
- Added support for bias
- Added support for reduction splitting
2019-05-28 14:04:53 -04:00
Philippe Tillet
e526ffc62b [examples/pytorch] added a bunch of models for more thorough testing 2019-05-28 14:04:31 -04:00
Philippe Tillet
3f3eb1c2a4 [dnn/conv] Added the option to have look-up table for filters for all
operations
2019-05-22 19:03:33 -04:00
Philippe Tillet
f8291af7ef [dnn/conv] removed divergent paths in LUT computations 2019-05-22 17:49:40 -04:00
Philippe Tillet
2672812ad0 [dnn/conv] No more divergent path in conv::set_arg 2019-05-22 15:25:43 -04:00
Philippe Tillet
e8f23bcade [dnn/conv] Added bias and forward stride 2019-05-22 13:27:08 -04:00
Philippe Tillet
f33a1f3fe3 [examples/pytorch] Fixed issues in backward pass of conv 2019-05-19 01:31:08 -04:00
Philippe Tillet
b2b55c52c9 [triton/python/conv]: Added cache for compiled kernels 2019-05-18 11:51:49 -04:00
Philippe Tillet
600aef72d5 [conv/dnn] now created a separate .h and .cpp file 2019-05-17 12:29:11 -04:00
Philippe Tillet
34f8617709 [dnn/conv] fixed formatting of generated Triton-C code 2019-05-16 15:48:02 -04:00
Philippe Tillet
ece7beea3c [dnn/conv]: now using look-up table for wgrad computation as well 2019-05-16 15:26:16 -04:00
Philippe Tillet
15a967c81e [dnn/conv] minor cleaning 2019-05-15 11:32:47 -04:00
Philippe Tillet
be2ba03382 [dnn/conv] optimizations of backpropagation with look-up tables 2019-05-14 19:10:59 -04:00
Philippe Tillet
cbfbe72e46 [general] added LICENSE file 2019-05-13 22:29:53 -04:00
Philippe Tillet
5941501f70 [dnn] added Triton-C derivative computations in conv 2019-05-13 18:04:11 -04:00
Philippe Tillet
f6fe9492e4 [dnn/conv] added triton-c code for wgrad 2019-05-11 18:09:23 -04:00
Philippe Tillet
fc4daf11dd [examples/conv] now deferring shape computations to conv configuration 2019-05-08 13:58:25 -04:00
Philippe Tillet
54f888a270 [dnn/conv] some minor fixes 2019-05-08 10:09:30 -04:00
Philippe Tillet
615569287e more cleaning of conv 2019-05-06 19:30:22 -04:00
Philippe Tillet
fd91368f98 [general] creation of dnn module for gemm/conv triton routines 2019-05-06 17:47:06 -04:00
Philippe Tillet
f80441017c [codegen] added leading dimension padding for transposition in shared
memory
2019-05-06 11:53:35 -04:00
Philippe Tillet
4813bb007c [codegen] bugfix in builder insert point for predicated instructions 2019-05-04 12:09:27 -04:00
Philippe Tillet
30833c18f1 [codegen/tune] bugfix in heuristics for nano-tile sizes 2019-05-04 01:32:34 -04:00
Philippe Tillet
0d694445e6 [examples] added skeleton for pytorch wrapper 2019-05-03 14:30:06 -04:00
Philippe Tillet
208d1525de [driver] added spirv-llvm dispatch functions 2019-05-02 10:44:29 -04:00
Philippe Tillet
70f49a56c1 [examples/python/tensorflow] better skeleton for blocksparse 2019-05-01 17:09:01 -04:00
Philippe Tillet
55866f1ef6 [examples/python/tensorflow] fixed ABI compatibility issue in JIT
destructor
2019-05-01 13:38:56 -04:00
Philippe Tillet
7b6efc0463 [examples/python/tensorflow] bugfix in tensorflow wrapper example 2019-04-30 21:04:30 -04:00
Philippe Tillet
d934d8fb40 [examples/python/tensorflow] improved matmul wrapper 2019-04-30 12:25:35 -04:00
Philippe Tillet
8e809a9536 [examples] added skeleton for tensorflow op 2019-04-30 10:50:54 -04:00
Philippe Tillet
93f53501c6 [triton-c] added implicit conversion to bool in while/for loops 2019-04-28 00:32:51 -04:00
Philippe Tillet
af58b8bd81 [triton-c] predicate in assignment statement now propagates to rhs
computations
2019-04-27 14:00:15 -04:00
Philippe Tillet
4b77b764ba [triton-c] added support for while loops 2019-04-26 15:08:02 -04:00
Philippe Tillet
b6af06910d [examples] deleted placeholders for not implemented examples 2019-04-25 16:24:16 -04:00
Philippe Tillet
3413aad582 [general] major overhaul of triton-c/triton-ir/triton-jit:
- Added alloc const
- Added atomics
- Pruning tuning space
- Added example for dot/conv/shift
- Bugfixes
2019-04-25 16:18:15 -04:00
Philippe Tillet
0c607c9392 [examples] normalize benchmark by max_clock / current_clock 2019-03-28 07:58:37 -04:00
Philippe Tillet
2c3ae0675e [JIT] re-added nvidia compatibility 2019-03-27 21:12:01 -04:00
Philippe Tillet
fdf8559806 [general] added missing files 2019-03-27 20:01:35 -04:00
Philippe Tillet
bc2a257d5c [code generation] more flexibility in backend selection 2019-03-27 11:29:42 -07:00
Philippe Tillet
e04253c0dd [code generation] basic CPU backend 2019-03-27 11:13:36 -07:00
Philippe Tillet
9d6fc1c051 [code generation] bugfix in single buffering 2019-03-26 15:55:48 -07:00
Philippe Tillet
8d35c98920 [code generation] search space pruning 2019-03-25 14:10:24 -07:00
Philippe Tillet
deb7a1cc5c Hack to make OpenCL for AMD work 2019-03-23 18:58:25 -07:00
Philippe Tillet
be55b3a081 saving progress 2019-03-23 16:52:53 -07:00
Philippe Tillet
9de9feff4a [jit] added runtime for host but compilation still needs to be implemented 2019-03-23 13:40:42 -07:00
Philippe Tillet
49fd6ece99 some cleaning 2019-03-21 23:51:47 -07:00
Philippe Tillet
907bbb1ad2 [driver] now debugging AMD runtime 2019-03-20 17:32:17 -07:00
Philippe Tillet
b6305f4388 [driver] added more genericity for opencl support 2019-03-20 00:25:08 -07:00
Philippe Tillet
02775a226e [driver] adding opencl in the driver API 2019-03-18 23:12:14 -07:00
Philippe Tillet
b73c3bdd25 [examples] removed dependency on isaac for auto-tuning 2019-03-11 22:22:43 -04:00
Philippe Tillet
87c85ed50d [code generation] reparameterization 2019-03-11 19:30:21 -04:00
Philippe Tillet
614f83baee [jit] basic auto-tuning support 2019-03-11 12:00:50 -04:00
Philippe Tillet
94e315ea8a Reparameterized in terms of micro- and nano- tiles 2019-03-10 23:10:17 -04:00
Philippe Tillet
c96a263896 [jit] changed default metaparameter ranges 2019-03-10 10:45:21 -04:00
Philippe Tillet
d2e7d7890d [jit] preparing auto-tuning 2019-03-10 00:42:36 -05:00
Philippe Tillet
9e2cfddf4c [examples] some cleaning 2019-03-09 17:17:55 -05:00
Philippe Tillet
9a3537662d [jit] can now infer launch parameters from triton module 2019-03-09 14:44:13 -05:00
Philippe Tillet
b721202812 [code generation] uniformized shape and layout metaparameters 2019-03-09 12:31:21 -05:00
Philippe Tillet
5f29263044 [code generation] now using ir::metaparameter* for all tunable
metaparameters
2019-03-09 12:05:12 -05:00
Philippe Tillet
d049679aa2 [general] added simple jit interface 2019-03-08 23:58:42 -05:00
Philippe Tillet
c5073a5af6 [abstract syntax tree] better error messages 2019-03-05 23:45:58 -05:00
Philippe Tillet
20ff9543ac [abstract syntax tree] improved the grammar 2019-03-05 21:03:19 -05:00
Philippe Tillet
4189e130bf [general] added support for constant memory declaration 2019-03-03 23:16:33 -05:00
Philippe Tillet
1f30e111ec [code generation] more optimizations 2019-03-02 16:03:26 -05:00
Philippe Tillet
2467c5e504 [code generation] added ternary operator 2019-03-01 21:53:35 -05:00
Philippe Tillet
08fcfbca47 [code generation] better predication 2019-03-01 14:36:17 -05:00
Philippe Tillet
36acf22fd3 better masking 2019-02-28 23:46:11 -05:00
Philippe Tillet
017702590b [intermediate representation] added ternary_inst 2019-02-26 14:20:58 -05:00
Philippe Tillet
68dea75aa0 [syntax tree] more fixes in lowering phi nodes 2019-02-26 12:36:37 -05:00
Philippe Tillet
338f291835 [code generation] now ordered iterations across distributed tiles 2019-02-25 11:41:45 -05:00
Philippe Tillet
6dc88878ac [code generation] bugfix in double-buffering 2019-02-24 23:22:28 -05:00
Philippe Tillet
daa828ec18 [general] rename namespace tdl -> triton 2019-02-24 14:35:16 -05:00
Philippe Tillet
6b49818282 [filesystem] rename tdl -> triton 2019-02-24 14:20:40 -05:00
Philippe Tillet
1b5f7f2139 [code generation] basic metaparameter support 2019-02-23 22:24:12 -05:00
Philippe Tillet
8f4798b81a [intermediate representation] transitioning towards more flexible tile
shapes
2019-02-23 11:37:01 -05:00
Philippe Tillet
7cda55df16 [code generation] implements hidden operands in user (e.g., mask) 2019-02-21 18:00:27 -05:00
Philippe Tillet
5618a15dc1 [code generation] more bugfixes in control flow 2019-02-20 22:55:20 -05:00
Philippe Tillet
90ec0ae2c0 [code generation] some more bugfixing with nested control flow 2019-02-18 22:54:08 -05:00
Philippe Tillet
f3094a512b [syntax tree] fixed bugs in control flow lowering 2019-02-17 21:35:03 -05:00
Philippe Tillet
cf1a583dbf bla 2019-02-15 22:03:09 -05:00
Philippe Tillet
5f5959dc6e [code generation] added masked loads 2019-02-15 11:14:50 -05:00
Philippe Tillet
896e856b07 [syntax] added support for predicated expressions 2019-02-13 15:41:03 -05:00
Philippe Tillet
32562677e9 [code generation] added barriers placement 2019-02-12 19:36:16 -05:00
Philippe Tillet
41aad4800c [code generation] added double-buffering 2019-02-12 11:47:52 -05:00
Philippe Tillet
e45d6bbb60 some cleaning 2019-02-12 11:00:24 -05:00
Philippe Tillet
f8e522ada8 blabla 2019-02-11 17:27:16 -05:00
Philippe Tillet
b2e487491f [code generation] now vectorizing shared memory stores 2019-02-10 21:59:41 -05:00
Philippe Tillet
8ab5ca3de3 blabla 2019-02-10 20:41:07 -05:00
Philippe Tillet
3d07e909c6 attempting vectorization 2019-02-10 18:29:25 -05:00
Philippe Tillet
4a0736ce20 [code generation] in-place CSE in shared memory reads 2019-02-09 23:56:53 -05:00
Philippe Tillet
d39f97ef38 [code generation] simple matrix-multiplication working 2019-02-09 19:20:50 -05:00
Philippe Tillet
4c8dbcccdc test 2019-02-08 23:49:18 -05:00
Philippe Tillet
77dd99efe8 [code generation] bug fixes in grid axes binding 2019-02-08 23:32:17 -05:00
Philippe Tillet
f697fcb887 [driver] fixed some bugs 2019-02-08 18:05:43 -05:00
Philippe Tillet
a9d219cdf5 [driver] added driver source code from isaac repository 2019-02-08 14:47:56 -05:00
Philippe Tillet
937bc464a3 [examples] debugging matrix multiplication code 2019-02-08 13:15:04 -05:00
Philippe Tillet
90c0474974 [examples] improved template for testing matrix multiplication 2019-02-08 12:54:20 -05:00
Philippe Tillet
dd35277858 [examples] added basic skeleton to generate matrix multiplication PTX 2019-02-07 22:42:54 -05:00
Philippe Tillet
1b9a7a8e97 [code generation] added basic shared copy/read 2019-02-07 17:03:19 -05:00
Philippe Tillet
5fdb27d9ae [code generation] fixed bug in tile phi nodes 2019-02-06 23:34:45 -05:00
Philippe Tillet
53aca3fa89 [code generation] fixed bugs in tile instructions lowering 2019-02-06 17:30:33 -05:00
Philippe Tillet
4490061950 test 2019-02-06 17:21:07 -05:00
Philippe Tillet
5aec34a094 [code generation] improved handling of constants 2019-02-06 15:02:01 -05:00
Philippe Tillet
e522b06be2 [code generation]: more progress for instruction selection 2019-01-26 02:05:56 -05:00
Philippe Tillet
e2de27dfe2 [project] added missing files 2019-01-23 02:07:53 -05:00
Philippe Tillet
7eebdceb6a [code generation] fixed bug in on-the-fly AST to IR lowering 2019-01-23 00:11:42 -05:00
Philippe Tillet
a0ecdba5a2 [code generation] testing analysis passes 2019-01-12 23:24:25 -05:00
Philippe Tillet
80d019ec16 [syntax tree] added syntactic support for dereferencing 2019-01-10 23:53:27 -05:00
Philippe Tillet
b5c8c25d43 more debugging 2019-01-10 16:50:47 -05:00
Philippe Tillet
63459228f8 [syntax tree] added some slicing/retiling syntax 2019-01-09 13:41:12 -05:00
Philippe Tillet
4f923accd7 [syntax tree] added basic support for range 2019-01-09 02:07:34 -05:00
Philippe Tillet
7dfa578c9d [syntax tree] fixed bug in pointer arithmetic 2019-01-08 18:04:19 -05:00
Philippe Tillet
73db84c8ba [syntax tree] fixed broadcast semantics lowering 2019-01-08 17:44:31 -05:00
Philippe Tillet
7a14693f51 [code generation] added constraints checking 2019-01-08 15:57:45 -05:00
Philippe Tillet
58757b8f10 [code generation] some more cleaning 2019-01-08 12:39:25 -05:00
Philippe Tillet
297d1a99d1 [code generation] adding missing files 2019-01-07 22:49:37 -05:00
Philippe Tillet
a1c0c9762c [code generation] added basic structure 2019-01-07 22:44:26 -05:00
Philippe Tillet
c48b7fb676 [intermediate representation] bugfix in getelementptr_inst 2019-01-07 04:09:30 -05:00
Philippe Tillet
ce1c0a62c0 [syntax tree] trivial phi-node elimination 2019-01-06 23:49:48 -05:00
Philippe Tillet
0dd4a52ce5 [syntax tree]: debugging phi-nodes simplification 2019-01-06 21:33:53 -05:00
Philippe Tillet
179890c7ad [ast] laying down the ground work for on-the-fly phi-node simplification 2019-01-06 15:16:02 -05:00
Philippe Tillet
6bfceae4a6 [code generation] some more bugfixes 2019-01-06 03:36:56 -05:00
Philippe Tillet
c12ec9f214 [code generator] more bugfixes 2019-01-06 00:53:11 -05:00
Philippe Tillet
f9ba69f1a4 [code generation] some bugfixes 2019-01-05 19:23:00 -05:00
Philippe Tillet
ec656af57c [code generation] basic to-llvm lowering 2019-01-05 14:50:31 -05:00
Philippe Tillet
f131ebb0bc [intermediate representation] fixed some bugs 2019-01-04 01:43:02 -05:00
Philippe Tillet
88504ca172 [build sysem] better llvm handling 2019-01-03 17:14:54 -05:00
Philippe Tillet
c35ca8353e [intermediate representation] defined more symbols 2019-01-03 15:32:22 -05:00
Philippe Tillet
9a1739957d [intermediate representation] added some builder function definitions 2019-01-03 12:44:33 -05:00
Philippe Tillet
8f4aafb4ac [intermediate representation] improvements on constants 2019-01-03 03:42:10 -05:00
Philippe Tillet
8dbb565200 [general] added missing file 2019-01-03 00:55:24 -05:00
Philippe Tillet
b039498d15 [intermediate representation] added subdefinitions in types submodule 2019-01-03 00:42:37 -05:00
Philippe Tillet
22a83ab526 [intermediate representation] added some instruction definitions 2019-01-02 19:29:59 -05:00
Philippe Tillet
0378b9eb43 [intermediate representation] more implementation 2019-01-02 14:37:14 -05:00
Philippe Tillet
24bd2145df [intermediate representation] improved skeleton 2019-01-02 01:06:43 -05:00
Philippe Tillet
e7a4e70e22 [Intermediate Representation] Added skeleton 2018-12-31 22:47:31 -05:00
Philippe Tillet
d260aefbd1 [Codegen] More debugging 2018-12-31 13:16:25 -05:00
Philippe Tillet
8f9e6a3655 [AST] Adding indexing operations 2018-12-29 17:06:48 -05:00
Philippe Tillet
1b8199b82d [Code generation] added support for FOR and IF/THEN/ELSE 2018-12-24 01:04:55 -05:00
Philippe Tillet
9d3224754e [Code generation] Prototype for phi node 2018-12-22 21:45:45 -05:00
Philippe Tillet
91c9ede021 [Code generation] Adding functions to construct SSA form 2018-12-22 18:25:03 -05:00
Philippe Tillet
d06f0fa593 [AST] disambiguate named_expression vs identifier 2018-12-22 11:55:04 -05:00
Philippe Tillet
eab275dc99 [Code generation] Added skeleton for expressions generation 2018-12-20 10:32:07 -05:00
Philippe Tillet
9247ed3714 [Code generation] 2018-12-19 11:25:29 -05:00
Philippe Tillet
951e9733ea [Code generation] added missing file 2018-12-18 23:04:02 -05:00
Philippe Tillet
176a437b21 [Code generation] bugfixes in type logic 2018-12-18 23:02:28 -05:00
Philippe Tillet
97acf52dca TDL [codegen]: improving class structure 2018-12-17 18:38:02 -05:00
Philippe Tillet
9dfa6993fb TDL [codegen]: added basic structure 2018-12-17 10:43:49 -05:00
Philippe Tillet
50573052f7 TDL: restructured project directories 2018-12-16 16:15:40 -05:00
Philippe Tillet
986b158833 TDL [Parser]: better handling of operator/specifier tokens 2018-12-16 12:35:28 -05:00
Philippe Tillet
dc755612b9 TDL [Parser]: Initial commit 2018-12-15 22:29:36 -05:00
Philippe Tillet
a7a3d57f3c FindLLVM 2018-12-03 07:44:45 -05:00
Philippe Tillet
8b040b4645 updates 2018-12-03 07:42:05 -05:00
Philippe Tillet
68c8de88f5 More cleaning of masks 2018-11-27 12:20:51 +01:00
Philippe Tillet
e0cd621bb8 more tinkering 2018-11-27 09:39:56 +01:00
Philippe Tillet
bd5b213921 Initial commit 2018-11-24 10:14:26 +01:00
Philippe Tillet
16a63c5d49 first commit 2018-11-17 08:27:27 +01:00
347 changed files with 69080 additions and 84846 deletions

52
CMakeLists.txt Executable file → Normal file
View File

@@ -1,29 +1,45 @@
cmake_minimum_required(VERSION 2.8.7)
project(isaac)
cmake_minimum_required(VERSION 2.8)
project(triton)
include(CTest)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
#Default build type
# Options
option(BUILD_TESTS "Build C++ Triton tests" ON)
option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
# LLVM
find_package(LLVM REQUIRED)
link_directories(${LLVM_LIBRARY_DIRS})
include_directories(${LLVM_INCLUDE_DIRS})
add_definitions(${LLVM_DEFINITIONS})
# Default build type
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Default build type: Release")
set(CMAKE_BUILD_TYPE "Release")
endif()
#QtCreator: add visibility of headers
file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py)
add_custom_target( ALL SOURCES ${ALL_SRC} )
#Compiler flags
# Compiler flags
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include/isaac/external/CUDA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic -Wno-strict-aliasing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
#Source
file(GLOB_RECURSE LIBISAAC_SRC lib/*.cpp)
add_library(isaac SHARED ${LIBISAAC_SRC})
target_link_libraries(isaac "dl")
# Tests
if(BUILD_TESTS)
message(STATUS "Adding C++ tests")
add_subdirectory(tests)
endif()
#Examples
add_subdirectory(examples)
# Python module
if(BUILD_PYTHON_MODULE)
message(STATUS "Adding Python module")
# PyBind11 wrapper source file
set(PYTHON_SRC bindings.cc)
include_directories("." ${PYTHON_INCLUDE_DIRS})
endif()
# Triton
file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc)
add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC})
target_link_libraries(triton ${LLVM_LIBRARIES})
#Tests
add_subdirectory(tests)

View File

@@ -1,4 +1,4 @@
/* Copyright 2015-2017 Philippe Tillet
/* Copyright 2018-2020 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
@@ -20,3 +20,7 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// The compiler front-end is based on a modified version of WGTCC
// https://github.com/wgtdkp/wgtcc
// Copyright (c) 2016 wgtdkp

83
README.md Executable file → Normal file
View File

@@ -1,76 +1,39 @@
# ISAAC
# Triton
This is the development repository for ISAAC, an input-aware auto-tuning framework and code-generator for HPC/DL. This version is only compatible with NVIDIA hardware (it generates PTX source code). For OpenCL/CUDA compatibility, visit the Intel fork (https://github.com/intel/isaac) or the v1.0 branch (deprecated) or the
This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with much higher flexibility than [TVM](https://github.com/apache/incubator-tvm) and without having to manually specify compute schedules.
### License
The main components of Triton at the moment are:
ISAAC is distributed under the MIT/X11 license.
- **Triton-C**: An imperative, single-threaded language for writing highly efficient compute-kernels at a relatively high abstraction level (think numpy-like array operations in a C-like language).
- **Triton-IR**: A special-purpose intermediate representation (Triton-IR) for aiding array-level program analysis and optimizations in Triton-C programs.
- **Triton-JIT**: An optimizing just-in-time compiler for Triton-IR, which generates GPU code on par with state-of-the-art CUDA-C (e.g., [CUTLASS](https://github.com/NVIDIA/cutlass)). This includes transparent support for mixed-precision and Tensor Cores.
### Getting started - Deep Learning Inference
Bindings for **automatic** PyTorch custom op generations are included in **PyTriton**, along with a small DSL based on einsum that supports convolutions, shift-convolutions, direct einsums, etc.
The formal foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please cite us if you use our work!
## Installation
Triton is a fairly self-contained package and uses its own parser (forked from [wgtcc](https://github.com/wgtdkp/wgtcc)) and LLVM-8.0+ for code generation.
Execute the following commands on a python environment that contains a recent version of pytorch:
```
git clone https://github.com/ptillet/isaac.git
cd isaac/python;
python setup.py build;
python setup.py install;
cd examples/pytorch;
python imagenet.py --arch resnet152 /path/to/imagenet/;
sudo apt-get install llvm-8-dev
pip install -e "git+https://github.com/ptillet/triton.git#egg=triton&subdirectory=python"
```
This should give you 78.1% accuracy, and roughly 4x speed-up over pytorch.
### Getting started - C++ API
## Getting Started
In order to compile and use the ISAAC C++ API, only a proprietary NVIDIA driver is necessary. No CUDA SDK is required (except for testing and benchmarking against cuBLAS/cuDNN):
```
git clone https://github.com/ptillet/isaac.git
cd isaac;
mkdir build;
cd build;
cmake ../ ; make -j8;
./examples/isaac-tools --gemm --bench --suite deepbench --dtype float32
./examples/isaac-tools --conv --bench --suite deepbench --dtype float32
```
If you want, you can also dump the PTX source code generated by ISAAC for some shapes:
```
./examples/isaac-tools --gemm --dump --format ptx --shape 2048,2048,2048 --layout NT --dtype float32
```
If you really know what you're doing, you can also capture the tiling parameters found by ISAAC:
```
./examples/isaac-tools --gemm --dump --format params --shape 2048,2048,2048 --layout NT --dtype float32
```
You will get the following output:
```
Tuning parameters: 4, 16, 8, 8, 8, 8, 16, 8, 16, 8, 1, 1, 1
```
The parameters respectively mean:
(1) that shared memory loads have a width of **4** ;
(2) each block comprises **16**x**8** threads ;
(3) each threads computes a tile of **8**x**8** elements;
(4) Each loop iteration processes **8** elements along the K axis ;
(5) threads are rearranged as a **16** x **8** block for loading A, and a **16** x **8** block for loading B;
(6) the reduction is split accross **1**, **1** and **1** independent batches within each thread, thread-block and grid, and the results are accumulated after the inner-loop
Please visit the [documentation](https://docs.triton-lang.org) to get started with Triton
### Benchmarks - C++ API
ISAAC often provides
Tesla P100 - SGEMM:
![sgemm-gv100](https://github.com/ptillet/isaac/blob/master/documentation/bench/gv100/sgemm.png?raw=true)
## Contributing
Tesla P100 - DGEMM:
![sgemm-gv100](https://github.com/ptillet/isaac/blob/master/documentation/bench/gv100/dgemm.png?raw=true)
Please keep in mind that this is a project I have been carrying out completely on my own as part of my Ph.D. thesis. While I am confident in the approach, there are still many things to fix and to polish. Please contact me (ptillet AT g.harvard.edu) or raise an issue if you want to contribute!
Tesla P100 - SCONV (vs cuDNN's IMPLICIT_PRECOMP_GEMM)
![sgemm-gv100](https://github.com/ptillet/isaac/blob/master/documentation/bench/gv100/sconv.png?raw=true)
## Acknowledgments
This work was partially supported by the National Science Foundation (IIS 1409097) and by IARPA (contract D16PC00002).
## ISAAC (deprecated) for fast inference
Before working on Triton, I wrote custom auto-tuned PTX code for fast, quantized inference on GPUs. While this project is now deprecated, you can use it at your own risk by checking out the "isaac" tag in this repository.

166
cmake/FindLLVM.cmake Normal file
View File

@@ -0,0 +1,166 @@
# - Find LLVM headers and libraries.
# This module locates LLVM and adapts the llvm-config output for use with
# CMake.
#
# A given list of COMPONENTS is passed to llvm-config.
#
# The following variables are defined:
# LLVM_FOUND - true if LLVM was found
# LLVM_CXXFLAGS - C++ compiler flags for files that include LLVM headers.
# LLVM_HOST_TARGET - Target triple used to configure LLVM.
# LLVM_INCLUDE_DIRS - Directory containing LLVM include files.
# LLVM_LDFLAGS - Linker flags to add when linking against LLVM
# (includes -LLLVM_LIBRARY_DIRS).
# LLVM_LIBRARIES - Full paths to the library files to link against.
# LLVM_LIBRARY_DIRS - Directory containing LLVM libraries.
# LLVM_ROOT_DIR - The root directory of the LLVM installation.
# llvm-config is searched for in ${LLVM_ROOT_DIR}/bin.
# LLVM_VERSION_MAJOR - Major version of LLVM.
# LLVM_VERSION_MINOR - Minor version of LLVM.
# LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn).
# LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0).
#
# Note: The variable names were chosen in conformance with the official CMake
# guidelines, see ${CMAKE_ROOT}/Modules/readme.txt.
# Try suffixed versions to pick up the newest LLVM install available on Debian
# derivatives.
# We also want a user-specified LLVM_ROOT_DIR to take precedence over the
# system default locations such as /usr/local/bin. Executing find_program()
# multiple times is the approach recommended in the docs.
set(llvm_config_names llvm-config-9 llvm-config-9.0 llvm-config90
llvm-config-8 llvm-config-8.0 llvm-config80
llvm-config)
find_program(LLVM_CONFIG
NAMES ${llvm_config_names}
PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH
DOC "Path to llvm-config tool.")
find_program(LLVM_CONFIG NAMES ${llvm_config_names})
# Prints a warning/failure message depending on the required/quiet flags. Copied
# from FindPackageHandleStandardArgs.cmake because it doesn't seem to be exposed.
# Report a failure message with severity depending on how find_package was
# invoked: FATAL_ERROR (aborting configuration) when REQUIRED was given,
# otherwise a STATUS message unless QUIET suppressed it.
#   _msg: the message text to report.
macro(_LLVM_FAIL _msg)
  if(LLVM_FIND_REQUIRED)
    message(FATAL_ERROR "${_msg}")
  else()
    if(NOT LLVM_FIND_QUIETLY)
      message(STATUS "${_msg}")
    endif()
  endif()
endmacro()
if(NOT LLVM_CONFIG)
if(NOT LLVM_FIND_QUIETLY)
message(WARNING "Could not find llvm-config (LLVM >= ${LLVM_FIND_VERSION}). Try manually setting LLVM_CONFIG to the llvm-config executable of the installation to use.")
endif()
else()
# llvm_set(<var> <flag> [<to_cmake_path>])
# Run `llvm-config --<flag>` and store its stripped output in LLVM_<var>.
# If the optional third argument evaluates to true, the output is also
# normalized to a CMake-style path. Errors are funneled through _LLVM_FAIL.
macro(llvm_set var flag)
  if(LLVM_FIND_QUIETLY)
    # Suppress llvm-config's stderr when the caller asked for QUIET.
    set(_quiet_arg ERROR_QUIET)
  endif()
  set(result_code)
  execute_process(
    COMMAND ${LLVM_CONFIG} --${flag}
    RESULT_VARIABLE result_code
    OUTPUT_VARIABLE LLVM_${var}
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ${_quiet_arg}
  )
  if(result_code)
    # Fixed message quoting: close the quote after the value, then the paren.
    _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code}')")
  else()
    if(${ARGV2})
      file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var})
    endif()
  endif()
endmacro()
# llvm_set_libs(<var> <flag>)
# Run `llvm-config --<flag> <components>` and store the resulting
# space-separated library list in LLVM_<var> as a CMake list.
macro(llvm_set_libs var flag)
  if(LLVM_FIND_QUIETLY)
    # Suppress llvm-config's stderr when the caller asked for QUIET.
    set(_quiet_arg ERROR_QUIET)
  endif()
  set(result_code)
  execute_process(
    COMMAND ${LLVM_CONFIG} --${flag} ${LLVM_FIND_COMPONENTS}
    RESULT_VARIABLE result_code
    OUTPUT_VARIABLE tmplibs
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ${_quiet_arg}
  )
  if(result_code)
    # Fixed message quoting: close the quote after the value, then the paren.
    _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code}')")
  else()
    file(TO_CMAKE_PATH "${tmplibs}" tmplibs)
    # NOTE(review): ${pattern} is never set anywhere in this module, so the
    # regex degenerates to "[^ ]+" (split on spaces) — confirm before relying
    # on it. Quoting "${tmplibs}" keeps MATCHALL valid when the output is
    # empty (an unquoted empty expansion would drop the input argument).
    string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} "${tmplibs}")
  endif()
endmacro()
llvm_set(VERSION_STRING version)
llvm_set(CXXFLAGS cxxflags)
llvm_set(HOST_TARGET host-target)
llvm_set(INCLUDE_DIRS includedir true)
llvm_set(ROOT_DIR prefix true)
llvm_set(ENABLE_ASSERTIONS assertion-mode)
# The LLVM version string _may_ contain a git/svn suffix, so cut that off
string(SUBSTRING "${LLVM_VERSION_STRING}" 0 5 LLVM_VERSION_BASE_STRING)
# Versions below 4.0 do not support components debuginfomsf and demangle
if(${LLVM_VERSION_STRING} MATCHES "^3\\..*")
list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfomsf" index)
list(REMOVE_ITEM LLVM_FIND_COMPONENTS "demangle" index)
endif()
# Versions below 8.0 not supported
if(${LLVM_VERSION_STRING} MATCHES "^[3-7]\\..*")
message(FATAL_ERROR "LLVM version below 8.0 not supported")
endif()
llvm_set(LDFLAGS ldflags)
# In LLVM 3.5+, the system library dependencies (e.g. "-lz") are accessed
# using the separate "--system-libs" flag.
llvm_set(SYSTEM_LIBS system-libs)
string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}")
llvm_set(LIBRARY_DIRS libdir true)
llvm_set_libs(LIBRARIES libs)
# LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0
# but code for it is not in shared library
if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen")
if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen")
set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen")
endif()
endif()
# Versions below 4.0 do not support llvm-config --cmakedir
if(${LLVM_VERSION_STRING} MATCHES "^3\\..*")
set(LLVM_CMAKEDIR ${LLVM_LIBRARY_DIRS}/cmake/llvm)
else()
llvm_set(CMAKEDIR cmakedir)
endif()
llvm_set(TARGETS_TO_BUILD targets-built)
string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD})
endif()
# Remove some clang-specific flags for gcc.
if(CMAKE_COMPILER_IS_GNUCXX)
string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
endif()
# Remove gcc-specific flags for clang.
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS})
endif()
string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" )
string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" )
# Use the default CMake facilities for handling QUIET/REQUIRED.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LLVM
REQUIRED_VARS LLVM_ROOT_DIR LLVM_HOST_TARGET
VERSION_VAR LLVM_VERSION_STRING)

20
docs/Makefile Normal file
View File

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = Triton
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

171
docs/conf.py Normal file
View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Triton documentation build configuration file, created by
# sphinx-quickstart on Mon Feb 10 01:19:09 2020.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = []
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'Triton'
copyright = '2020, Philippe Tillet'
author = 'Philippe Tillet'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = ''
# The full version, including alpha/beta/rc tags.
release = ''
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
import sphinx_rtd_theme
html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
html_sidebars = {
'**': [
'relations.html', # needs 'show_related': True theme option to display
'searchbox.html',
]
}
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'Tritondoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'Triton.tex', 'Triton Documentation',
'Philippe Tillet', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'triton', 'Triton Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Triton', 'Triton Documentation',
author, 'Triton', 'One line description of project.',
'Miscellaneous'),
]

22
docs/index.rst Normal file
View File

@@ -0,0 +1,22 @@
.. Triton documentation master file, created by
sphinx-quickstart on Mon Feb 10 01:01:37 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to Triton's documentation!
==================================
.. toctree::
:maxdepth: 1
:caption: Contents:
installation/index
tutorials/index
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@@ -0,0 +1,21 @@
***************
From Source
***************
Triton is a fairly self-contained package and uses its own parser (forked from `wgtcc <https://github.com/wgtdkp/wgtcc>`_) and LLVM-8.0+ for code generation.
.. code-block:: bash
sudo apt-get install llvm-8-dev
git clone https://github.com/ptillet/triton.git;
cd triton/python/;
python setup.py develop;
This should take about 15-20 seconds to compile on a modern machine.
You can then test your installation by running the *einsum.py* example in an environment that contains pytorch:
.. code-block:: bash
cd examples;
python einsum.py

View File

@@ -0,0 +1,7 @@
Installation
============
.. toctree::
:maxdepth: 1
from-source

View File

@@ -0,0 +1,99 @@
===========================
Writing a Custom Operation
===========================
--------------
Compute Kernel
--------------
Let us start with something simple, and see how Triton can be used to create a custom vector addition for PyTorch. The Triton compute kernel for this operation is the following:
.. code-block:: C
// Triton
// launch on a grid of (N + TILE - 1) / TILE programs
__global__ void add(float* z, float* x, float* y, int N){
// program id
int pid = get_program_id(0);
// create arrays of pointers
int offset[TILE] = pid * TILE + 0 ... TILE;
float* pz[TILE] = z + offset;
float* px[TILE] = x + offset;
float* py[TILE] = y + offset;
// bounds checking
bool check[TILE] = offset < N;
// write-back
*?(check)pz = *?(check)px + *?(check)py;
}
As you can see, arrays are first-class citizen in Triton. This has a number of important advantages that will be highlighted in the next tutorial. For now, let's keep it simple and see how to execute the above operation in PyTorch.
---------------
PyTorch Wrapper
---------------
As you will see, a wrapper for the above Triton function can be created in just a few lines of pure python code.
.. code-block:: python
import torch
import triton
class _add(torch.autograd.Function):
# source-code for Triton compute kernel
src = """
__global__ void add(float* z, float* x, float* y, int N){
// program id
int pid = get_program_id(0);
// create arrays of pointers
int offset[TILE] = pid * TILE + 0 ... TILE;
float* pz[TILE] = z + offset;
float* px[TILE] = x + offset;
float* py[TILE] = y + offset;
// bounds checking
bool check[TILE] = offset < N;
// write-back
*?(check)pz = *?(check)px + *?(check)py;
}
"""
# create callable kernel for the source-code
# options: 4 warps and a -DTILE=1024
kernel = triton.kernel(src, defines = {'TILE': 1024}, num_warps = [4])
# Forward pass
@staticmethod
def forward(ctx, x, y):
# type checking
assert x.dtype == torch.float32
# allocate output
z = torch.empty_like(x).cuda()
# create launch grid
# this is a function of the launch parameters
# triton.cdiv indicates ceil division
N = x.numel()
grid = lambda opt: (triton.cdiv(N, opt.d('TILE')), )
# launch kernel
_add.kernel(z, x, y, N, grid = grid)
# return output
return z
# get callable from Triton function
add = _add.apply
# test
torch.manual_seed(0)
x = torch.rand(98432).cuda()
y = torch.rand(98432).cuda()
za = x + y
zb = add(x, y)
diff = (za - zb).abs().max()
print(diff)
Executing the above code will:
- Generate a .cpp file containing PyTorch bindings for the Triton function
- Compile this .cpp file using distutils
- Cache the resulting custom op
- Call the resulting custom op
In other words, the first program run will generate and cache a bunch of files in $HOME/.triton/cache, but subsequent runs should be just as fast as using a handwritten custom operation.

10
docs/tutorials/index.rst Normal file
View File

@@ -0,0 +1,10 @@
Tutorials
==========
.. toctree::
:maxdepth: 1
custom-operation
triton-vs-cuda
matrix-transposition
matrix-multiplication

View File

@@ -0,0 +1,184 @@
*********************
Matrix Multiplication
*********************
The purpose of this section is to present a Triton-C implementation of matrix multiplication that achieves performance competitive with the best existing hand-written CUDA kernels (see `CUTLASS <https://github.com/NVIDIA/cutlass>`_). We will also see how pre-processors macros can be leveraged to fuse transposition operations as well as to provide support for auto-tuning and FP16 Tensor Cores.
*Note: Bounds-checking is omitted throughout for the sake of clarity. This feature can be easily added into our kernel, but may result in a slight performance hit because LLVM and PTXAS have issues dealing with conditionals and predicates inside loops.*
==============
Compute Kernel
==============
Matrix multiplications of the form `C = A x B` can be implemented in Triton-C fairly concisely, as shown below:
.. code-block:: C
// Triton-C
// launched on a grid of (M / TM) x (N / TN) programs
__global__ void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K,
int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
// prologue
int pm = get_program_id(0); //(1)
int pn = get_program_id(1); //(2)
int rm[TM] = pm * TM + 0 ... TM; //(3)
int rn[TN] = pn * TN + 0 ... TN; //(4)
int rk[TK] = 0 ... TK; //(5)
// initialize accumulator
float c[TM, TN] = 0; //(6)
// pointers to operands
TYPE* pa[TM, TK] = A + rk[newaxis, :] * 1 + rm[:, newaxis] * lda; //(7)
TYPE* pb[TK, TN] = B + rk[:, newaxis] * ldb + rn[newaxis, :] * 1; //(8)
// reduction loop
for(int k = K; k > 0; k-= TK){
// fetch operands
TYPE a[TM, TK] = *pa; //(9)
TYPE b[TK, TN] = *pb; //(10)
// matrix-multiply accumulate
c += dot(a, b); //(11)
// increment pointers
pa = pa + TK * 1; //(12)
pb = pb + TK * ldb; //(13)
}
// epilogue
TYPE* pc[TM, TN] = C + rn[newaxis, :] + rm[:, newaxis] * ldc; //(14)
*pc = c; //(15)
}
Here, each kernel instance produces a :code:`TM x TN` tile of the output matrix C as follows:
- Statements (1) - (2) fetch the id of the current program instance.
- Statements (3) - (4) construct ranges of indices to process for the vertical and horizontal axes of the output matrix :code:`C`
- Statement (5) constructs a range of indices along the reduction axis: :code:`rk = [0, 1, ..., TK - 1]`
- Statement (6) initializes a :code:`TM x TN` array of accumulators to hold the result of :code:`A[rm, :] x B[:, rn]`
- Statements (7) - (8) initializes arrays of pointers :code:`pa` and :code:`pb` to the operands :code:`A` and :code:`B` using logic similar to that of the above transposition kernel
- Statements (9) - (10) load tiles of operands by dereferencing :code:`pa` and :code:`pb`
- Statement (11) updates the accumulator array using Triton-C's matrix multiplication operator :code:`@`
- Statements (12) - (13) updates :code:`pa` and :code:`pb`
- Statement (14) creates an array of pointers `pc` to the result matrix :code:`C`
- Statement (15) writes back the accumulator to :code:`C`
Internally, the Triton compiler will perform quite a few optimizations that will ensure good performance for this kernel:
- Automatic coalescing of load/store operations
- Automatic vectorization of load/store operations
- Stashing `a` and `b` to shared memory
- Automatic allocation of shared memory
- Automatic synchronization of shared memory
- Automatic padding of shared memory to avoid bank conflicts
- Automatic usage of tensor cores when TYPE = half and TK % 4 = 0
==============
Optimizations
==============
Nonetheless, there are two important optimizations that the Triton compiler does not do automatically at the moment yet are critical to achieve peak performance: pre-fetching and rematerialization. In this subsection we describe how these optimizations can be done manually by modifying the above source-code.
-------------
Pre-Fetching
-------------
The purpose of pre-fetching is to overlap the update of the accumulator `c` with the memory loads for the next tiles that will need to be multiplied. This can be done by modifying the above reduction loop as follows:
.. code-block:: C
// pre-fetch operands
TYPE a[TM, TK] = *pa; //(9)
TYPE b[TK, TN] = *pb; //(10)
for(int k = K; k > 0; k-= TK){
c += dot(a, b);
pa = pa + TK * 1;
pb = pb + TK * ldb;
// don't prefetch last iteration
bool check = k > TK;
// pre-fetch operands
a = check ? *pa : 0;
b = check ? *pb : 0;
}
Note that the Triton-C compiler will now also be able to use double-buffering techniques to make sure that the array `a` can be used and updated at the same time without any memory hazard.
-----------------
Rematerialization
-----------------
`Rematerialization <https://en.wikipedia.org/wiki/Rematerialization>`_ is a compiler optimization which consists in recomputing some values instead of storing and reloading them from (register) memory, so as to decrease register pressure in the compute kernel. Although LLVM does this automatically to some extent, it fails to find good heuristics for the above kernel -- thereby requiring some source code modification to achieve optimal performance. Fortunately, only :code:`rm` and :code:`rn` need to be rematerialized, leading to the following epilogue:
.. code-block:: C
// epilogue
int rcm[TM] = pm * TM + 0 ... TM;
int rcn[TN] = pn * TN + 0 ... TN;
TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
*pc = c;
------------------------------------
Fused Transpositions and Auto-Tuning
------------------------------------
It is common for optimized matrix-multiplication implementations (e.g., BLAS) to provide variants in which one or both operands are transposed. Fortunately, this can be done by using pre-processors macros for tile shapes and broadcasting directives, leading to the following kernel:
.. code-block:: C
// Triton-C
// launched on a grid of (M / TM) x (N / TN) programs
void dot(TYPE * A, TYPE * B, TYPE * C,
int M, int N, int K,
int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
// prologue
int pm = get_program_id(0);
int pn = get_program_id(1);
int rm[TM] = pm * TM + 0 ... TM;
int rn[TN] = pn * TN + 0 ... TN;
int rk[TK] = 0 ... TK;
float c[TM, TN] = 0;
// pointers to operands
TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM;
TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN;
// prefetches operands
TYPE a[SHAPE_A] = (*pa);
TYPE b[SHAPE_B] = (*pb);
// reduction loop
for(int k = K; k > 0; k-= TK){
c += dot(USE_A, USE_B);
pa = pa + TK * STRIDE_AK;
pb = pb + TK * STRIDE_BK;
a = *pa;
b = *pb;
}
// epilogue
int rcm[TM] = pm * TM + 0 ... TM;
int rcn[TN] = pn * TN + 0 ... TN;
TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
*pc = c;
}
All matrix multiplications variants can then be retrieved using the following compilation option:
.. code-block:: C
// A is not transposed
-DUSE_A=a -DSTRIDE_AK=1 -DSTRIDE_AM=lda
-DBROADCAST_AK=newaxis,: -DBROADCAST_AM=:,newaxis -DSHAPE_A=TM,TK
// A is transposed
-DUSE_A=^a -DSTRIDE_AK=lda -DSTRIDE_AM=1
-DBROADCAST_AK=:,newaxis -DBROADCAST_AM=newaxis,: -DSHAPE_A=TK,TM
// B is not transposed
-DUSE_B=b -DSTRIDE_BK=ldb -DSTRIDE_BN=1
-DBROADCAST_BK=:,newaxis -DBROADCAST_BN=newaxis,: -DSHAPE_B=TK,TN
// B is transposed
-DUSE_B=^b -DSTRIDE_BK=1 -DSTRIDE_BN=ldb
-DBROADCAST_BK=newaxis,: -DBROADCAST_BN=:,newaxis -DSHAPE_B=TN,TK
Auto-tuning can also be handled using pre-processor macros:
.. code-block:: C
// Auto-tuning TM and TN in {32, 64, 128}; TK in {8, 16}
-DTM=[32, 64, 128] -DTN=[32, 64, 128] -DTK=[8, 16]

View File

@@ -0,0 +1,113 @@
*********************
Matrix Transpositions
*********************
Transpositions are (relatively) hard to efficiently write in CUDA because naive implementations typically suffer from *uncoalesced* memory operations when writing back the transposed matrix to DRAM.
Of course, this can be fixed by using shared memory as shown `here <https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc>`_, but this comes at the cost of simplicity and interferes with auto-tuning.
==============
Compute Kernel
==============
In Triton, however, kernels are single-threaded and the compiler automatically detects if and when data should be temporarily stashed to shared memory. Therefore, an optimal Triton kernel for this operation would look like:
.. code-block:: C
// launched on a grid of (M / TM) x (N / TN) programs of 1 thread each
__global__ void transpose(TYPE * X, TYPE * Y,
int M, int N, int ldx, int ldy) {
// extract program ID
int pidm = get_program_id(0); //(1)
int pidn = get_program_id(1); //(2)
// create 1D range along the two matrix's axes
int rm[TM] = pidm * TM + 0 ... TM; //(3)
int rn[TN] = pidn * TN + 0 ... TN; //(4)
// create 2D array of pointers
TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; //(5)
TYPE* py[TN, TM] = Y + rm[newaxis, :] * ldy + rn[:, newaxis]; //(6)
// write back using the transposition operator '^'
*py = ^(*px); //(7)
}
At a high level, this kernel loads a :code:`TM x TN` tile from the input matrix :code:`X`, transposes it and writes the resulting :code:`TN x TM` tile to the output matrix :code:`Y`. Eventually, transposition of the full input matrix is achieved by launching a grid of :code:`(M / TM) x (N / TN)` programs decomposed as follows:
- Statements (1) and (2) extract the coordinates of the program in the above 2D launch grid. For example, the program producing the output tile `Y[TN:2TN-1, 2TM:3TM-1]` holds the values:
.. code-block:: C
pidm = 2
pidn = 1
- Statements (3) and (4) construct the ranges of indices:
.. code-block:: C
rm = [pidm*TM + 0, pidm*TM + 1, ..., pidm*TM + (TM - 1)]
rn = [pidn*TN + 0, pidn*TN + 1, ..., pidn*TN + (TN - 1)]
which will be used in statements (5) and (6) to construct tiles of pointers
- Statements (5) constructs the following array of pointers `px` using numpy-style broadcasting semantics:
::
│ X + (pidm*TM + 0) + (pidn*TN + 0)*ldx, ..., ..., X + (pidm*TM + 0) + (pidn*TN + TN - 1)*ldx) │
│ ⋮ ⋮ │
│ ⋮ ⋮ │
│ X + (pidm*TM + TM - 1) + (pidn*TN + 0)*ldx, ..., ..., X + (pidm*TM + TM - 1) + (pidn*TN + TN - 1)*ldx) │
- Statement (6) constructs the following array of pointers `py` using numpy-style broadcasting semantics:
::
│ Y + (pidn*TN + 0) + (pidm*TM + 0)*ldy, ..., ..., Y + (pidn*TN + 0) + (pidm*TM + TM - 1)*ldy) │
│ ⋮ ⋮ │
│ ⋮ ⋮ │
│ Y + (pidn*TN + TN - 1) + (pidm*TM + 0)*ldy, ..., ..., Y + (pidn*TN + TN - 1) + (pidm*TM + TM - 1)*ldy) │
- Statement (7) element-wise dereferences the above array of pointers `*px`, transposes it using the unary transposition operator `^`, and writes it back at the location specified by `py`.
==========================
The __multipleof attribute
==========================
The memory loads and store in our transposition kernel are not vectorizable by default, since `X + ldx` (and `Y + ldy`) may be misaligned when `ldx` (and `ldy`) are not multiples of e.g., 4. This is unfortunate because tensor dimensions can be easily made into nice powers of two in Deep Learning, due to batch-sizes and layer width being flexible.
For this reason, Triton provides a __multipleof(N) attribute for variables that are guaranteed to always be a multiple of N. In the case of Matrix Transpositions, vector loads can be enabled by modifying the function's signature as follows:
.. code-block:: C
__global__ void transpose(TYPE * X, TYPE * Y, int M, int N,
int ldx __multipleof(8),
int ldy __multipleof(8)) {
// ...
}
==========================
Bounds Checking
==========================
You might have noticed that the above code will fail when `M` and `N` are not multiples of `TM` and `TN` respectively. Fortunately, the above kernel can be slightly modified to handle this situation, as shown below:
.. code-block:: C
// launched on a grid of ((M + TM - 1) / TM) x ((N + TN - 1) / TN) programs
__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) {
// ...
// create bounds-checking mask
bool checkx[TM, TN] = (rm[:, newaxis] < M) && (rn[newaxis, :] < N); //(7a)
bool checky[TN, TM] = (rm[newaxis, :] < M) && (rn[:, newaxis] < N); //(7b)
// conditional write-back using the conditional dereferencing operator '*?()'
*?(checky)py = ^(*?(checkx)px); //(7)
}
Here, statement (7a) creates an array of booleans :code:`checkx[TM, TN]` such that :code:`checkx(i, j) = True` if and only if `px(i, j)` should be dereferenced. Statement (7b) does the same for `py`. Both `px` and `py` are then conditionally dereferenced using Triton-C's conditional dereferencing operator :code:`*?(predicate) pointer`.

View File

@@ -0,0 +1,180 @@
====================================================
Putting It All Together
====================================================
In the previous tutorial, we saw how to write tensor-core-friendly matrix multiplication code competitive with cuBLAS in 20 lines of Triton code. Here, we will see how to wrap it into an automatically differentiable PyTorch function for easy integration in your Deep Learning pipeline.
-----------------
PyTriton Function
-----------------
The PyTriton API provides a :code:`triton.function` class which automatically handles the interaction with automatic differentiation in whichever framework was detected. Therefore, every differentiable custom operation written with PyTriton should inherit from this class
.. code-block:: python
import triton
# Entry point
class _dot(torch.autograd.Function):
@staticmethod
# Forward Pass
def forward(ctx, *args):
#...
@staticmethod
# Backward Pass
def backward(ctx, dy):
#...
-----------------
PyTriton Kernels
-----------------
PyTriton also provides a :code:`triton.kernel` class which automatically takes care of interaction with the Triton-JIT as well as the generation and compilation of C++ framework bindings code. For our dot operation we create a kernel from the Triton code shown at the end of the previous tutorial.
.. code-block:: python
src = """
__global__ void dot(TYPE * A, TYPE * B, TYPE * C,
int M, int N, int K,
int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
// prologue
int pm = get_program_id(0);
int pn = get_program_id(1);
int rm[TM] = pm * TM + 0 ... TM;
int rn[TN] = pn * TN + 0 ... TN;
int rk[TK] = 0 ... TK;
float c[TM, TN] = 0;
// pointers to operands
TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM;
TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN;
// prefetches operands
TYPE a[SHAPE_A] = (*pa);
TYPE b[SHAPE_B] = (*pb);
// reduction loop
for(int k = K; k > 0; k-= TK){
c += USE_A @ USE_B;
pa = pa + TK * STRIDE_AK;
pb = pb + TK * STRIDE_BK;
a = *pa;
b = *pb;
}
// epilogue
int rcm[TM] = pm * TM + 0 ... TM;
int rcn[TN] = pn * TN + 0 ... TN;
TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
*pc = c;
}
"""
kernel = triton.kernel(src)
At this point, `kernel` is a callable object which takes the same signature as the :code:`dot` function in our source code, except that pointers are treated as tensors: :code:`[tensor, tensor, tensor, int, int, int, int, int, int]`.
-----------------------
Using PyTriton Kernels
-----------------------
However, in practice only A, B are provided by the user, and all the other :code:`int` arguments should be derived from these operands only. Hence, we create a helper function that extracts shapes from the :code:`A` and :code:`B` tensors, and then returns the results of a call to :code:`kernel`:
.. code:: python
@staticmethod
def _call(a, b, transpose_a, transpose_b):
# extract shapes
shape_a = a.shape
shape_b = b.shape
M, Ka = shape_a[0], shape_a[1]
Kb, N = shape_b[0], shape_b[1]
# transpose shapes
if transpose_a:
M, Ka = Ka, M
if transpose_b:
Kb, N = N, Kb
# contiguous dimensions
lda = M if transpose_a else Ka
ldb = Kb if transpose_b else N
ldc = N
# data-type
dtype = a.dtype
# allocate output
c = triton.empty([M, N], dtype = dtype)
# launch grid
grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))]
# pre-processor definitions
defines = {# tile sizes
'TYPE' : dtype,
'AT' : transpose_a,
'BT' : transpose_b,
'TM' : [32, 64, 128],
'TN' : [32, 64, 128],
'TK' : [8],
# handle A transposition
'USE_A' : '^a' if transpose_a else 'a',
'STRIDE_AK' : 'lda' if transpose_a else '1',
'STRIDE_AM' : '1' if transpose_a else 'lda',
'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :',
'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis',
'SHAPE_A' : 'TK, TM' if transpose_a else 'TM, TK',
# handle B transposition
'USE_B' : '^b' if transpose_b else 'b',
'STRIDE_BK' : '1' if transpose_b else 'ldb',
'STRIDE_BN' : 'ldb' if transpose_b else '1',
'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis',
'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :',
'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'}
return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc,
grid=grid, num_warps=4, defines=defines)
--------------------------------------------
Automatic Differentiation
--------------------------------------------
At this point, our custom operation only takes two tensor arguments and transposition information, which is good. However, it is still not compatible with PyTorch's or TensorFlow's automatic differentiation engine, and a small amount of additional effort is needed.
Creating custom operations for Triton and PyTorch is very similar; programmers have to provide two static methods :code:`forward` and :code:`backward` that take a context as their first input:
.. code:: python
@staticmethod
def forward(ctx, a, b, transpose_a = False, transpose_b = False):
    """Forward pass: c = op(a) x op(b).

    Stashes the operands and the transposition flags on the autograd
    context so backward() can form the gradients.
    """
    # remember the layout first, then the tensors
    ctx.t_a = transpose_a
    ctx.t_b = transpose_b
    ctx.save_for_backward(a, b)
    return _dot._call(a, b, transpose_a, transpose_b)
@staticmethod
def backward(ctx, dy):
    """Backward pass for c = op(a) @ op(b).

    Recovers the saved operands and transposition flags, then expresses
    each gradient as another matmul so the same Triton kernel is reused.
    Returns one gradient per forward() argument (None for non-tensors).
    """
    a, b = ctx.saved_tensors
    t_a, t_b = ctx.t_a, ctx.t_b
    if not t_a and not t_b:
        # c = a @ b:      da = dy @ b^T,   db = a^T @ dy
        da = _dot._call(dy, b, False, True)
        db = _dot._call(a, dy, True, False)
    elif not t_a and t_b:
        # c = a @ b^T:    da = dy @ b,     db = dy^T @ a
        da = _dot._call(dy, b, False, False)
        db = _dot._call(dy, a, True, False)
    elif t_a and not t_b:
        # c = a^T @ b:    da = b @ dy^T,   db = a @ dy
        da = _dot._call(b, dy, False, True)
        db = _dot._call(a, dy, False, False)
    elif t_a and t_b:
        # c = a^T @ b^T:  da = b^T @ dy^T, db = dy^T @ a^T
        da = _dot._call(b, dy, True, True)
        db = _dot._call(dy, a, True, True)
    else:
        # unreachable: the four cases above are exhaustive
        assert False
    # NOTE(review): 9 gradients are returned but the forward() shown above
    # takes only 4 arguments — presumably the full listing's forward takes
    # extra non-tensor arguments; confirm the count matches.
    return da, db, None, None, None, None, None, None, None
A callable operation can be created using the :code:`apply` method of the :code:`torch.autograd.Function` class.
.. code:: python
dot = _dot.apply
And that's it! In just ~100 lines of pure python, we have written a fully functional matrix multiplication that will not only work with automatic differentiation but also provide performance very close to cuBLAS. And it's all open-source!

View File

@@ -0,0 +1,109 @@
***************
Triton vs. CUDA
***************
The purpose of this tutorial is to explore in more depth the major differences between Triton and CUDA. To keep things simple, we will still be focusing on the following vector addition code:
.. code-block:: C
// Triton
// launch on a grid of (N + TILE - 1) / TILE programs
__global__ void add(float* z, float* x, float* y, int N){
int offset[TILE] = get_program_id(0) * TILE + 0 ... TILE;
bool check[TILE] = offset < N;
float* pz[TILE] = z + offset;
float* px[TILE] = x + offset;
float* py[TILE] = y + offset;
*?(check)pz = *?(check)px + *?(check)py;
}
And its CUDA equivalent:
.. code-block:: C
// CUDA
// launch on a grid of (N + TILE - 1) / TILE programs
__global__ void add(float *z, float *x, float *y, int N) {
int off = blockIdx.x * TILE + threadIdx.x;
if(off < N){
float *pz = z + off;
float *px = x + off;
float *py = y + off;
*pz = *px + *py;
}
}
==========================
Automatic parallelization
==========================
While the two above pieces of code may look at first sight similar, a closer look reveals one *fundamental* difference: While CUDA kernels are launched on a cooperative array of threads, **Triton kernels are single-threaded and automatically parallelized**.
This is a major difference in programming model, which not only makes your life much easier as a programmer, but also allows the Triton compiler to automatically do all sorts of nice optimizations:
- *Automatic shared memory allocation and synchronization*
That's right; programmers don't need to worry about shared memory allocation, usage and synchronization. Instead, the Triton compiler will use complex program analysis techniques to determine when shared memory should be used, where it should be synchronized and how threads should access it to avoid memory bank conflicts.
- *Automatic memory coalescing*
When you write Triton code, you also don't need to worry about memory coalescing. The compiler will arrange threads so that global memory accesses are coalesced when possible.
- *Automatic tensor core utilization*
Using tensor cores on Volta and Turing is notoriously difficult. Code is hard to write and even harder to optimize. Fortunately, the Triton compiler can also generate very efficient tensor core instructions (e.g., :code:`mma.sync.m8n8k4`) when low-precision matrices are multiplied together:
.. code-block:: C
half A[16, 8] = ... // initialize A
half B[8, 16] = ... // initialize B
float C[16, 16] = dot(A, B); // uses Tensor Cores!
- *Automatic instruction predication*
Contrary to CUDA, Triton directly exposes predicated instruction through masked load/store instructions. This enables the Triton compiler to generate predicated instructions in PTX directly, resulting in sometimes better performance than I/O operations wrapped inside conditionals.
===========================
Vector Addition - Revisited
===========================
In light of these optimizations, it turns out that the GPU code generated by our Triton-C vector addition code is actually more analogous to the following:
.. code-block:: C
// CUDA
// launch on a grid of (N + TILE - 1) / TILE programs
__global__ void add(float *z, float *x, float *y, int N) {
int off[4];
#pragma unroll
for(int k = 0; k < 4; k++)
off[k] = blockIdx.x * TILE + threadIdx.x + k * blockDim.x;
#pragma unroll
for(int k = 0; k < 4; k++)
z[off[k]] = x[off[k]] + y[off[k]];
}
This code is actually more complicated when x, y and z have :code:`half` type, because then the Triton compiler automatically vectorizes data accesses using :code:`half2` to ensure memory transactions of 32-bits per thread.
============================
Auto-Tuning
============================
Now assume that you want to tune the above code for different data types, tile sizes and thread block sizes. This is doable in CUDA but would require you to write cumbersome machinery to handle different vector sizes and loop unrolling factors. In Triton, this can be trivially done by adjusting some compilation parameters. For example:
.. code-block:: python
kernel = triton.kernel(src, defines = {'TILE': [256, 512, 1024]}, num_warps = [2, 4, 8])
would benchmark our above triton source-code for tile sizes of 256, 512 and 1024 executed with 2, 4 or 8 warps -- and cache the fastest kernel.
=============================
Going Further
=============================
The benefits of Triton become more and more pronounced as compute kernels get more and more complex. In the next few tutorials, you will see how to implement transposition and tensor-core-compatible matrix multiplication routine on par with cuBLAS and CUTLASS without having to know anything about GPU micro-architecture!

View File

@@ -1,7 +0,0 @@
# Prepend the repository LICENSE to every C/C++ source and header file,
# skipping vendored third-party code and files that already carry a notice.
for i in $(find ../lib/ ../include/isaac/ ../python/src/bind -name '*.cpp' -or -name '*.hpp' -or -name '*.h' | grep -v "../lib/external" | grep -v "../include/isaac/driver/external/");
do
# only touch files that do not already mention "Copyright"
if ! grep -q Copyright $i
then
# write LICENSE + original to a temp file, then atomically replace the original
cat ../LICENSE $i >$i.new && mv $i.new $i
fi
done

Binary file not shown.

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 55 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 62 KiB

Binary file not shown.

View File

@@ -1,6 +0,0 @@
# Build each listed command-line tool from <name>.cpp and link it against
# the isaac library. Currently only isaac-tools.
foreach(PROG isaac-tools)
add_executable(${PROG} ${PROG}.cpp)
set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG})
# NOTE(review): hard-coded CUDA include path — presumably matches the
# original build environment; confirm before reusing elsewhere.
include_directories(/usr/local/cuda/include/)
target_link_libraries(${PROG} isaac)
endforeach(PROG)

View File

@@ -1,649 +0,0 @@
#include "opts.hpp"
#include "isaac/scalar.h"
#include "isaac/api.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/context.h"
#include "isaac/driver/stream.h"
#include "isaac/runtime/predict.h"
#include "isaac/templates/gemm.h"
#include "isaac/templates/error.hpp"
#include "isaac/tools/bench.hpp"
namespace sc = isaac;
namespace drv = sc::driver;
using sc::param_t;
// ANSI SGR (Select Graphic Rendition) codes used to colorize terminal output.
enum Code {
  RESET            = 0,
  BOLD             = 1,
  ITALIC           = 3,
  FG_RED           = 31,
  FG_GREEN         = 32,
  FG_YELLOW        = 33,
  FG_BLUE          = 34,
  FG_MAGENTA       = 35,
  FG_CYAN          = 36,
  FG_LIGHT_GRAY    = 37,
  FG_DARK_GRAY     = 90,
  FG_LIGHT_RED     = 91,
  FG_LIGHT_GREEN   = 92,
  FG_LIGHT_YELLOW  = 93,
  FG_LIGHT_BLUE    = 94,
  FG_LIGHT_MAGENTA = 95,
  FG_LIGHT_CYAN    = 96,
  FG_WHITE         = 97
};

// Tiny stream manipulator: `os << color_stream(FG_RED)` emits the escape
// sequence "\033[<code>m" that switches the terminal's text attributes.
class color_stream {
public:
  color_stream(Code pCode) : code(pCode) {}
  friend std::ostream& operator<<(std::ostream& os, const color_stream& mod) {
    return os << "\033[" << mod.code << "m";
  }
private:
  Code code;
};
/* Helpers for benchmarking */
// GEMM benchmark shape: (dtype, op(A), op(B), M, N, K) — presumably in that
// order, matching SC17::gemm's make_tuple calls; confirm against consumers.
typedef std::tuple<sc::DType, sc::IsaacOperation_t, sc::IsaacOperation_t, sc::param_t, sc::param_t, sc::param_t> gemm_params_t;
// Convolution benchmark shape: (dtype, D, W, H, C, N, K, T, R, S,
// pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) per SC17::conv's comment.
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_params_t;
// Pooling benchmark shape: same as conv minus the input-channel dimension.
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> pool_params_t;
// Benchmark shape suites (GEMM / convolution / pooling). Each method returns
// the list of problem shapes to run; many historical suites are kept around
// commented-out for reference.
struct SC17{
// GEMM
static std::vector<gemm_params_t> gemm(sc::DType dtype){
std::vector<gemm_params_t> shapes;
// LinPack
for(param_t N: std::vector<param_t>{512, 1024, 2048})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
// DeepBench
for(sc::IsaacOperation_t AT: std::vector<sc::IsaacOperation_t>{sc::ISAAC_OP_N, sc::ISAAC_OP_T})
for(param_t M: std::vector<param_t>{1760})
for(param_t N: std::vector<param_t>{16, 32, 64, 128})
shapes.push_back(std::make_tuple(dtype, AT, sc::ISAAC_OP_N, M, N, M));
// PCA/ICA
for(param_t N: std::vector<param_t>{16, 64, 256})
for(param_t K: std::vector<param_t>{64000})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
// LaPACK
for(param_t N: std::vector<param_t>{1024, 2048, 4096})
for(param_t K: std::vector<param_t>{32})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
return shapes;
}
// CONV
static std::vector<conv_params_t> conv(sc::DType dtype){
// Vector of (dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)
std::vector<conv_params_t> shapes;
// // DeepSpeech
// for(size_t N: std::vector<size_t>{8})
// shapes.push_back(std::make_tuple(dtype, 1, 700, 161, 1, N, 32, 1, 5, 20, 0, 0, 0, 1, 1, 1));
// for(size_t N: std::vector<size_t>{8})
// shapes.push_back(std::make_tuple(dtype, 1, 341, 79, 32, N, 32, 1, 5, 10, 0, 0, 0, 1, 1, 1));
// // OCR
// shapes.push_back(std::make_tuple(dtype, 1, 480, 48, 1, 16, 16, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 240, 24, 16, 16, 32, 1, 3, 3, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 120, 12, 32, 16, 64, 1, 3, 3, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 60, 6, 64, 16, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// // Face Recognition
// shapes.push_back(std::make_tuple(dtype, 1, 108, 108, 3, 8, 64, 1, 3, 3, 0, 1, 1, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 54, 54, 64, 8, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 27, 27, 128, 8, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 128, 8, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 256, 8, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// // Vision
// for(size_t N: std::vector<size_t>{8}){
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, N, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 128, N, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 256, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// }
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, 16, 64, 1, 7, 7, 0, 3, 3, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 32, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 48, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 192, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 128, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// // Speaker ID
// shapes.push_back(std::make_tuple(dtype, 1, 350, 80, 64, 16, 128, 1, 5, 5, 0, 1, 1, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 175, 40, 128, 16, 256, 1, 5, 5, 0, 1, 1, 1, 2, 2));
// // ResNET
// for(size_t N: std::vector<size_t>{8}){
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 64, N, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 128, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 128, N, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 128, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 256, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 256, N, 1024, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 1024, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 2048, 1, 1, 1, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 1, 1, 1, 2, 2));
// }
// 3D-Unet — the only suite currently active
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 4, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 24, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 72, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 216, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 648, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 648, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 432, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 216, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 144, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 72, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 48, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 4 , 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
return shapes;
}
// POOL
static std::vector<pool_params_t> pool(sc::DType dtype){
std::vector<pool_params_t> shapes;
// 3D-Unet (same spatial shapes as conv() above, without the C dimension)
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
return shapes;
}
};
/* Metrics for benchmarking */
// Abstract performance metric: converts a raw measured time (tsec) into a
// user-facing number (e.g. TFLOPS, ms) for each operation type, and supplies
// the comparator that orders "better" values first.
struct Metric{
  // BUGFIX: instances are handled polymorphically (e.g. as shared_ptr<Metric>
  // options in main); a virtual destructor makes delete-through-base safe.
  virtual ~Metric() = default;
  // Comparator ranking better results first (greater-than for throughput,
  // less-than for elapsed time).
  virtual std::function<bool(double, double)> cmp() const = 0;
  virtual double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const = 0;
  virtual double gemm(param_t M, param_t N, param_t K, double tsec) const = 0;
  virtual double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t, param_t, param_t, double tsec) const = 0;
};
class FLOPS: public Metric{
public:
FLOPS(double scale): scale_(scale){}
std::function<bool(double, double)> cmp() const { return std::greater<double>(); }
double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const
{ return sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec) * 1e12 / scale_; }
double gemm(param_t M, param_t N, param_t K, double tsec) const
{ return sc::templates::GEMM::tflops(M, N, K, tsec) * 1e12 / scale_; }
double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double tsec) const
{ return sc::templates::Pool::tflops(P, Q, M, K, N, T, R, S, tsec) * 1e12 / scale_;}
private:
double scale_;
};
class Time: public Metric{
public:
Time(double scale): scale_(scale){}
std::function<bool(double, double)> cmp() const { return std::less<double>(); }
double conv(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
double gemm(param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
double pool(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
private:
double scale_;
};
// Print the column titles as one bold/italic, tab-separated header row.
void print_results_header(std::vector<std::string> sections){
  std::cout << color_stream(ITALIC) << color_stream(BOLD);
  for(const auto & section: sections)
    std::cout << section << "\t";
  std::cout << color_stream(RESET) << std::endl;
}
// Print one tab-separated result row: the shape/layout prefix columns,
// followed by each timing converted through `fn` into metric units.
// The best result is highlighted (light blue) when it beats the runner-up
// by more than 5% under the metric's own comparator.
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<bool(double, double)> cmp, std::function<double(double)> fn){
  std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
  // convert raw timings into the metric's units
  std::vector<double> perf;
  std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
  // rank results, best first
  auto fastest = perf;
  std::sort(fastest.begin(), fastest.end(), cmp);
  for(auto x: perf){
    // BUGFIX: guard fastest[1] — with a single measurement (e.g. cuBLAS
    // unavailable) the original indexed past the end of the vector.
    if(fastest.size() > 1 && x == fastest[0] && x / fastest[1] > 1.05)
      std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
    else
      std::cout << x;
    std::cout << "\t";
  }
  std::cout << std::endl;
}
// Benchmark one GEMM shape with ISAAC (and cuBLAS when available) and print
// a result row through print_results using the chosen metric.
void benchmark_gemm(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType dtype, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, size_t M, size_t N, size_t K,
sc::templates::Generator* generator){
// Leading dimensions for column-major operands
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
size_t dtsize = sc::size_of(dtype);
sc::scalar alpha(1., dtype);
sc::scalar beta(0., dtype);
// Translate ISAAC transposition flags to cuBLAS 'N'/'T' characters
char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
// Device buffers (contents left uninitialized — timing only)
sc::driver::Buffer C(ctx, M*N*dtsize);
sc::driver::Buffer A(ctx, M*K*dtsize);
sc::driver::Buffer B(ctx, K*N*dtsize);
std::vector<double> times;
// ISAAC measurement
times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C, 1., 1., 1., NULL, (sc::templates::GEMM*)generator, 10); }, [&](){ stream.synchronize(); }, device));
// cuBLAS measurement: first query the fastest algorithm, then time it
if(sc::driver::dispatch::cublasinit()){
cublasGemmAlgo_t fastest;
sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, &fastest);
times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, NULL, fastest); }, [&](){ stream.synchronize(); }, device));
//times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device));
}
print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, metric.cmp(), [&](double tsec){ return metric.gemm(M, N, K, tsec);});
}
// Benchmark one convolution shape with ISAAC and print a result row.
// The cuDNN comparison is currently disabled (commented out below).
void benchmark_conv(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType in_dtype, sc::DType out_dtype, size_t D, size_t H, size_t W, size_t C, size_t N, size_t K, size_t T, size_t R, size_t S,
size_t pad_d, size_t pad_h, size_t pad_w,
size_t stride_d, size_t stride_h, size_t stride_w,
size_t upsample_d, size_t upsample_h, size_t upsample_w,
sc::templates::Generator* generator){
// No residual connection: all crop/Zk parameters zero
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
// Output spatial shape (M, P, Q) derived from input shape + filter + padding/stride
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
sc::ActivationType activation = sc::Linear;
// INT8X4 packs 4 channels per element, so buffer channel counts shrink by 4
size_t vect_c = (in_dtype==sc::INT8X4_TYPE)?4:1;
size_t vect_k = (out_dtype==sc::INT8X4_TYPE)?4:1;
sc::DType ab_dtype = (out_dtype==sc::INT8X4_TYPE)?sc::FLOAT_TYPE:out_dtype;
sc::scalar alpha(1., ab_dtype);
sc::scalar beta(0., ab_dtype);
// Device buffers: output O, input I, filters F (contents uninitialized — timing only)
sc::driver::Buffer O(ctx, N*K/vect_k*M*P*Q*sc::size_of(out_dtype));
sc::driver::Buffer I(ctx, C/vect_c*D*H*W*N*sc::size_of(in_dtype));
sc::driver::Buffer F(ctx, K*C/vect_c*T*R*S*sc::size_of(in_dtype));
std::vector<double> times;
times.push_back(bench([&](){ sc::CONV(device, stream, in_dtype, out_dtype, N, K, M, P, Q, C, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, I, F, &O, 1, NULL, activation, 0., 1., 1., {1.}, 1., sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, NULL, (sc::templates::Conv*)generator, 10); }, [&](){ stream.synchronize(); }, device));
// if(sc::driver::dispatch::cudnninit())
// times.push_back(bench([&](){ sc::driver::cudnnConv(out_dtype, stream, D, H, W, N, K, M, P, Q, C, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(C), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.conv(M, P, Q, K, N, C, T, R, S, tsec);});
}
// Benchmark one max-pooling shape with ISAAC (and cuDNN when available)
// and print a result row.
void benchmark_pool(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType dtype, size_t D, size_t H, size_t W, size_t N, size_t K, size_t T, size_t R, size_t S, size_t pad_d, size_t pad_h, size_t pad_w, size_t stride_d, size_t stride_h, size_t stride_w,
sc::templates::Generator* generator){
// Output spatial shape (M, P, Q); upsampling fixed at 1
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
size_t dtsize = sc::size_of(dtype);
sc::scalar alpha(1., dtype);
sc::scalar beta(0., dtype);
// Device buffers (contents uninitialized — timing only)
sc::driver::Buffer O(ctx, N*K*M*P*Q*dtsize);
sc::driver::Buffer I(ctx, K*D*H*W*N*dtsize);
std::vector<double> times;
times.push_back(bench([&](){ sc::POOL(device, stream, dtype, dtype, sc::MaxPool, K, M, P, Q, N, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, I, O, 1., 1., (sc::templates::Pool*)generator); }, [&](){ stream.synchronize(); }, device));
if(sc::driver::dispatch::cudnninit())
times.push_back(bench([&](){ sc::driver::cudnnPool(dtype, stream, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, beta, O); }, [&](){ stream.synchronize(); }, device));
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.pool(M, P, Q, K, N, T, R, S, tsec);});
}
/* ------------------------------- */
// Iterate over the full cartesian product [0, ranges[0]) x ... x
// [0, ranges[D-1]) in odometer order (last index varies fastest), invoking
// f with each index vector.
void loop_nest(std::vector<size_t> const & ranges, std::function<void(std::vector<size_t> const &)> const & f){
  size_t D = ranges.size();
  // BUGFIX: the original read values[D-1] with D == 0 (out-of-bounds) and
  // never terminated when some range was 0. A degenerate iteration space
  // has nothing to visit.
  if(D == 0 || std::find(ranges.begin(), ranges.end(), (size_t)0) != ranges.end())
    return;
  std::vector<size_t> values(D, 0);
  // Start with innermost loop
  size_t i = D - 1;
  while(true){
    //Execute function
    f(values);
    //Increment counters (post-increment: test old value, then bump)
    while(values[i]++ == ranges[i] - 1){
      if(i == 0)
        return;
      values[i--] = 0;
    }
    i = D - 1;
  }
}

// Overload: iterate over explicit candidate values per dimension, passing f
// one concrete combination at a time.
template<class T>
void loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f){
  //Ranges to iterate over
  std::vector<size_t> ranges;
  for(auto const & x: iterates)
    ranges.push_back(x.size());
  //Proxy function: map an index vector to the corresponding values
  auto proxy = [&](std::vector<size_t> const & idx){
    std::vector<T> x(iterates.size());
    for(size_t i = 0; i < x.size(); ++i)
      x[i] = iterates[i][idx[i]];
    f(x);
  };
  //Iterate
  loop_nest(ranges, proxy);
}
// Exhaustively search the convolution tuning-parameter space, benchmarking
// every valid kernel configuration and reporting the best throughput found.
void search_conv(int32_t D, int32_t H, int32_t W,
                 int32_t C, int32_t N, int32_t K,
                 int32_t T, int32_t R, int32_t S,
                 int32_t pad_d, int32_t pad_h, int32_t pad_w,
                 int32_t stride_d, int32_t stride_h, int32_t stride_w,
                 int32_t upsample_d, int32_t upsample_h, int32_t upsample_w,
                 sc::ActivationType activation, sc::DType in_dtype, sc::DType out_dtype)
{
  auto ctx = drv::backend::contexts::get_default();
  // Output spatial shape: ceil((in - filter + 1 + 2*pad) / stride)
  size_t P = (H - R + 1 + 2*pad_h + stride_h - 1)/stride_h;
  size_t Q = (W - S + 1 + 2*pad_w + stride_w - 1)/stride_w;
  size_t M = (D - T + 1 + 2*pad_d + stride_d - 1)/stride_d;
  // No residual connection
  size_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
  //Setup device buffers (contents uninitialized — timing only)
  drv::Buffer O(ctx, K*P*Q*M*N*sc::size_of(out_dtype));
  drv::Buffer I(ctx, C*H*W*D*N*sc::size_of(in_dtype));
  drv::Buffer F(ctx, C*R*S*T*K*sc::size_of(in_dtype));
  drv::Stream stream(ctx);
  //Exhaustive search: candidate values per tuning dimension
  std::vector<sc::param_t> r1 = {1};
  std::vector<sc::param_t> rv = {4};
  std::vector<sc::param_t> rr = {1, 2, 4, 8};
  std::vector<sc::param_t> rl = {4, 8, 16, 32};
  std::vector<sc::param_t> rs = {4, 8, 16};
  // BUGFIX: `best` was uninitialized but read by std::max below (UB);
  // initialize like search_gemm does.
  double best = 0;
  loop_nest<sc::param_t>({rv, rl, rl, rs, rs, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
    sc::templates::Conv generator(in_dtype, out_dtype,
                                  C, D, H, W, N, K, M, P, Q, T, R, S,
                                  pad_d, pad_h, pad_w,
                                  stride_d, stride_h, stride_w,
                                  upsample_d, upsample_h, upsample_w,
                                  activation, 1,
                                  sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1,
                                  x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]);
    //Compile and time; invalid configurations are simply skipped
    try{
      std::string src = generator.dump(ctx.device(), "conv");
      drv::Module program(ctx, src);
      drv::Kernel kernel(program, "conv");
      double tsec = bench([&](){ generator.enqueue(kernel, stream, I, F, &O); }, [&](){ stream.synchronize(); }, ctx.device());
      double tflops = sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec);
      best = std::max(tflops, best);
      std::cout << "//";
      std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
      std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
    }catch(isaac::templates::invalid_parameters const &){
      return;
    }catch(drv::exception::cuda::launch_out_of_resources const &){
      return;
    }
  });
  std::cout << "ISAAC: " << best << std::endl;
}
// Exhaustively search the GEMM tuning-parameter space, benchmarking every
// valid kernel configuration and reporting the best throughput found.
void search_gemm(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){
auto ctx = drv::backend::contexts::get_default();
size_t dtsize = sc::size_of(dtype);
// Setup: column-major leading dimensions and device buffers (uninitialized)
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
int32_t offc = 0, offa = 0, offb = 0;
drv::Buffer C(ctx, M*N*dtsize);
drv::Buffer A(ctx, M*K*dtsize);
drv::Buffer B(ctx, K*N*dtsize);
drv::Stream stream(ctx);
sc::scalar alpha(1., dtype), beta(0., dtype);
// Exhaustive search: candidate values per tuning dimension
std::vector<sc::param_t> r1 = {1};
std::vector<sc::param_t> rv = {4};
std::vector<sc::param_t> rr = {1, 2, 4, 8};
std::vector<sc::param_t> rl = {4, 8, 16, 32};
std::vector<sc::param_t> rs = {4, 8, 16};
double best = 0;
loop_nest<sc::param_t>({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
isaac::templates::GEMM generator(dtype, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
// Compile and time; invalid configurations are simply skipped
try{
std::string src = generator.dump(ctx.device(), "gemm");
drv::Module program(ctx, src);
drv::Kernel kernel(program, "gemm");
double time = bench([&](){ generator.enqueue(kernel, stream, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, ctx.device());
// 2*M*N*K flops; the 1e-3 factor implies `time` is in nanoseconds — TODO confirm against bench()
double tflops = 2*1e-3*M*N*K/time;
best = std::max(tflops, best);
std::cout << "//";
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
}catch(isaac::templates::invalid_parameters const &){
return;
}catch(drv::exception::cuda::launch_out_of_resources const &){
return;
}
});
std::cout << "ISAAC: " << best << std::endl;
}
/* Helpers for dumping source code */
// Emit the generator's output on stdout: the generated PTX when the
// "format" option is "ptx", otherwise its tuning parameters as a
// comma-separated list.
void dump_source(sc::driver::Device const & device, sc::templates::Generator& generator, opts::Options* options, std::string const & name){
  if(options->get<std::string>("format") == "ptx"){
    std::cout << generator.dump(device, name) << std::endl;
    return;
  }
  auto params = generator.tuning_params();
  std::cout << "Tuning parameters: " << std::flush;
  for(size_t i = 0; i < params.size(); ++i)
    std::cout << ((i > 0) ? ", " : "") << params[i] << std::flush;
  std::cout << std::endl;
}
/* Application code */
int main(int argc, char* argv[]){
opts::Application program("isaac-tools", "Command-line interface for ISAAC");
// Options
opts::Options* options = program.options();
options->add<size_t>("device", "Device to run on", 0);
options->add<sc::DType>("dtype", "Data-type to use for computations", "float32", {{"int8x4", sc::INT8X4_TYPE}, {"float32", sc::FLOAT_TYPE}, {"float64", sc::DOUBLE_TYPE}});
options->add<std::string>("name", "Name to give to the generated kernel", "kernel");
options->add_group("search", "Exhaustively search for best tuning parameters");
opts::Options* dump = options->add_group("dump", "Dump source-code generated by ISAAC");
dump->add("format", "Format to generate", "ptx", {"ptx", "params"});
dump->add("target", "Target GPU (sm_xx)", {"sm_50", "sm_52", "sm_60", "sm_61", "sm_70"});
opts::Options* bench = options->add_group("bench", "Benchmark source code generated by ISAAC");
bench->add("suite", "Benchmarking suite to run", "custom", {"custom", "deepbench"});
bench->add<std::shared_ptr<Metric>>("metric", "performance metric for the results", "tflops", {{"tflops", std::make_shared<FLOPS>(1e12)}, {"ms", std::make_shared<Time>(1e-3)}, {"us", std::make_shared<Time>(1e-6)}});
// Constraints
options->add_constraint(opts::OneOf({"bench", "dump", "search"}));
options->add_constraint(opts::OneOf({"gemm", "conv", "pool"}));
// GEMM
opts::Options* gemm = options->add_group("gemm", "Use matrix-multiplication");
gemm->add("layout", "Transposition layout for A and B", "NT", {"NN", "NT", "TN", "TT"});
gemm->add<std::vector<size_t>>("shape", "Matrix shapes (M,N,K)", {2048, 2048, 2048}, opts::SizeConstraint(3));
gemm->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(14));
// CONV
opts::Options* conv = options->add_group("conv", "Use convolutions");
conv->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 512, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(15));
conv->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(9));
// POOL
opts::Options* pool = options->add_group("pool", "Use pooling");
pool->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(14));
pool->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(4));
program.parse(argc, argv);
if(options->has("bench"))
std::cout << std::fixed << std::setprecision(2);
//Device
sc::driver::Device device = sc::driver::backend::devices()[options->get<size_t>("device")];
if(options->has("dump") && dump->has("target")){
std::string target = dump->get<std::string>("target");
char major = target[3];
char minor = target[4];
device.interpret_as(std::make_pair((size_t)std::atoi(&major), (size_t)std::atoi(&minor)));
}
static sc::driver::Context context(device);
sc::driver::Stream stream(context);
// Data-Type
sc::DType dtype = options->get<sc::DType>("dtype");
// Kernel name
std::string name = options->get<std::string>("name");
/* Get optimized kernel generator */
std::unique_ptr<sc::templates::Generator> generator;
// GEMM
if(options->has("gemm")){
std::string layout = gemm->get<std::string>("layout");
sc::IsaacOperation_t AT = layout[0]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
sc::IsaacOperation_t BT = layout[1]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
auto shape = gemm->get<std::vector<size_t>>("shape");
size_t M = shape[0], N = shape[1], K = shape[2];
//Get Source
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
if(options->has("search")){
search_gemm(M, N, K, AT, BT, dtype);
}
if(gemm->has("kernel")){
auto x = gemm->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::GEMM(dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]));
}
else{
sc::runtime::GEMMProfile* profile = (sc::runtime::GEMMProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::GEMM}).get();
generator.reset(new sc::templates::GEMM(profile->predict(stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc)));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"AT", "BT", "M", "N", "K", "ISAAC", "cuBLAS"});
std::vector<gemm_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, AT, BT, M, N, K)};
//SC17 paper shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::gemm(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, AT, BT, M, N, K) = x;
benchmark_gemm(*metric, context, device, stream, dtype, AT, BT, M, N, K, gemm->has("kernel")?generator.get():NULL);
}
}
}
// CONV
if(options->has("conv")){
sc::DType in_dtype = dtype;
sc::DType out_dtype = dtype;
auto x = conv->get<std::vector<size_t>>("shape");
param_t D = x[0], H = x[1], W = x[2], C = x[3], N = x[4], K = x[5], T = x[6], R = x[7], S = x[8], pad_d = x[9], pad_h = x[10], pad_w = x[11], stride_d = x[12], stride_h = x[13], stride_w = x[14];
param_t M, P, Q;
param_t upsample_d = 1, upsample_h = 1, upsample_w = 1;
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
sc::ActivationType activation = sc::Linear;
if(options->has("search"))
search_conv(D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, in_dtype, out_dtype);
if(conv->has("kernel")){
auto x = conv->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::Conv(in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]));
}
else{
sc::runtime::ConvProfile* profile = (sc::runtime::ConvProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::CONV}).get();
generator.reset(new sc::templates::Conv(profile->predict(stream, in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1)));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"N", "K", "M", "P", "Q", "C", "T", "R", "S", "ISAAC", "cuDNN"});
std::vector<conv_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
//SuperComputing17 shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::conv(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
benchmark_conv(*metric, context, device, stream, in_dtype, out_dtype, D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, conv->has("kernel")?generator.get():NULL);
}
}
}
// POOL
if(options->has("pool")){
auto x = pool->get<std::vector<size_t>>("shape");
param_t D = x[0], W = x[1], H = x[2], N = x[3], K = x[4], T = x[5], R = x[6], S = x[7], pad_d = x[8], pad_h = x[9], pad_w = x[10], stride_d = x[11], stride_h = x[12], stride_w = x[13];
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
if(pool->has("kernel")){
auto x = pool->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, x[0], x[1], x[2], x[3]));
}
else{
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"N", "K", "M", "P", "Q", "T", "R", "S", "ISAAC", "cuDNN"});
std::vector<pool_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
//SuperComputing17 shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::pool(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
benchmark_pool(*metric, context, device, stream, dtype, D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, pool->has("kernel")?generator.get():NULL);
}
}
}
}

View File

@@ -1,395 +0,0 @@
#ifndef OPTS_HPP
#define OPTS_HPP
#include <string>
#include <set>
#include <vector>
#include <iostream>
#include <sstream>
#include <memory>
#include <map>
#include <algorithm>
#include <type_traits>
#include <functional>
#include <sstream>
#include <stdexcept>
namespace opts{
/// Exception thrown whenever the command line violates the declared option
/// schema; the message is prefixed with "Invalid options: ".
class InvalidOptions: public std::exception{
public:
  InvalidOptions(std::string const & msg): msg_("Invalid options: " + msg){}
  /// Error description (std::exception interface).
  /// `throw()` is deprecated since C++11 and removed in C++20; std::exception
  /// declares what() as noexcept, so mark the override accordingly.
  const char* what() const noexcept override { return msg_.c_str(); }
private:
  std::string msg_;
};
/**
 * @class OptionBase
 * @brief Base class for command-line options (leaf options, switches and
 *        option groups all derive from it).
 */
class OptionBase{
protected:
  // Locates "--<name>" in [begin, end). Throws when a required option is
  // absent, or when the option appears before its parent group's own flag
  // (i.e. it is not properly nested on the command line).
  template<class ItType>
  std::vector<std::string>::const_iterator get_option(ItType const & begin, ItType const & end){
    auto it = std::find(begin, end, "--" + name_);
    if(it==end && required_)
      throw InvalidOptions("parameter '" + name_ + "' is mandatory");
    if(parent_ && parent_->parent_ && parent_->get_option(begin, it)==it)
      throw InvalidOptions("parameter '" + name_ + "' needs to be nested in group '" + parent_->name_ + "'");
    return it;
  }
public:
  OptionBase(std::string const & name, std::string const & desc, bool required = false, OptionBase* parent = NULL): name_(name), desc_(desc), required_(required), parent_(parent)
  {}
  // Instances are deleted through OptionBase* (Options stores them as
  // std::shared_ptr<OptionBase> and add(OptionBase*) wraps a raw pointer),
  // so the destructor must be virtual to avoid undefined behavior.
  virtual ~OptionBase() = default;
  // Prints "--name: description" (name in ANSI bold) at the given indent;
  // options with an empty description are hidden from the usage listing.
  virtual std::ostream& usage(std::ostream& os, size_t indent) const{
    if(!desc_.empty())
      os << std::string(indent, ' ') << "--" << "\033[1m" << name_ << "\033[0m" << ": " << desc_ << std::endl;
    return os;
  }
  // Reads this option from the argument list and records its value, keyed
  // by the option name, into `values`.
  virtual void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values) = 0;
  std::string const & name() const
  { return name_; }
protected:
  const std::string name_;
  const std::string desc_;
  bool required_;
  OptionBase* parent_;
};
/**
 * @class OptionHelp
 * @brief The --help flag automatically registered by Application.
 */
class OptionHelp: public OptionBase{
public:
  OptionHelp() : OptionBase("help", "Display this message", false){}
  // When --help is present, records a non-NULL marker under "help" so that
  // Options::has("help") reports true; writes nothing otherwise.
  void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
    auto where = get_option(args.begin(), args.end());
    if(where != args.end())
      values[name_] = (void*)this;
  }
};
/**
 * @class Option
 * @brief Standard, typed option: "--name <value>", converted to T and
 *        validated by a constraint.
 */
template<class T>
class Option: public OptionBase{
public:
// Converts the raw textual argument into a T (may throw InvalidOptions).
typedef std::function<T(std::string const &)> converter_t;
// Validates the converted value (may throw InvalidOptions).
typedef std::function<void(T const &)> constraint_t;
public:
// Optional option with a default value (never required).
Option(std::string const & name, std::string const & desc, T dft, converter_t convert, constraint_t constraint, OptionBase* parent):
OptionBase(name, desc, false, parent), default_(new T(dft)), convert_(convert), constraint_(constraint){}
// Option without a default; `required` makes omission an error.
Option(std::string const & name, std::string const & desc, bool required, converter_t convert, constraint_t constraint, OptionBase* parent):
OptionBase(name, desc, required, parent), convert_(convert), constraint_(constraint){}
// Looks up "--name <arg>"; converts and validates <arg> when present,
// otherwise falls back to the default. Registers the value pointer under
// the option's name (NULL when absent with no default, which is how
// Options::has() distinguishes unset options).
void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
// NOTE: value_ shares ownership with the default object here; it is only
// re-pointed at a freshly converted T when an explicit argument is given.
value_ = default_;
auto it = get_option(args.begin(), args.end());
if(it!=args.end()){
auto next = it + 1;
// The token following "--name" must exist and must not itself be a flag.
if(next==args.end() || next->compare(0, 2, "--")==0)
throw InvalidOptions("parameter " + name_ + " requires an argument");
else{
value_.reset(new T(convert_(*next)));
constraint_(*value_);
}
}
values[name_] = (void*)value_.get();
}
// Delegates to the base implementation (kept as an explicit override).
std::ostream& usage(std::ostream& os, size_t indent) const{
OptionBase::usage(os, indent);
return os;
}
private:
std::shared_ptr<T> default_;
std::shared_ptr<T> value_;
converter_t convert_;
constraint_t constraint_;
};
/**
 * @class SwitchOption
 * @brief Boolean option toggled with --flag (true) or --no-flag (false).
 */
class SwitchOption: public OptionBase{
public:
  SwitchOption(std::string const & name, std::string const & desc, bool dft, OptionBase* parent):
    OptionBase(name, desc, false, parent), default_(dft)
  {}
  // Starts from the default, then lets "--name" force true and "--no-name"
  // force false; when both flags are given, "--no-name" wins.
  void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
    bool state = default_;
    if(std::find(args.begin(), args.end(), "--" + name_) != args.end())
      state = true;
    if(std::find(args.begin(), args.end(), "--no-" + name_) != args.end())
      state = false;
    value_.reset(new bool(state));
    values[name_] = (void*)value_.get();
  }
private:
  bool default_;
  std::shared_ptr<bool> value_;
};
/* Pre-defined converters */
// Converter that maps a fixed vocabulary of strings to values of type T;
// any string outside the vocabulary raises InvalidOptions.
template<class T>
class MapConverter{
public:
  MapConverter(std::map<std::string, T> const & values): values_(values){}
  inline T operator()(std::string const & str){
    auto entry = values_.find(str);
    if(entry == values_.end())
      throw InvalidOptions("value " + str + " is invalid");
    return entry->second;
  }
private:
  std::map<std::string, T> values_;
};
// Converter that parses a value of type T from its textual representation
// using stream extraction (operator>>).
template<class T>
class StreamConverter{
public:
  T operator()(std::string const & str){
    std::istringstream source(str);
    T parsed{};
    source >> parsed;
    return parsed;
  }
};
// Specialization for vectors: splits the string on commas and converts each
// token with the scalar StreamConverter<T>.
template<class T>
class StreamConverter<std::vector<T>>{
public:
  std::vector<T> operator()(std::string const & str){
    std::vector<T> items;
    std::istringstream source(str);
    for(std::string piece; std::getline(source, piece, ','); )
      items.push_back(StreamConverter<T>()(piece));
    return items;
  }
};
// Specialization for tuples: reads one comma-separated token per tuple
// element, converting each with the scalar StreamConverter.
template<class... Args>
class StreamConverter<std::tuple<Args...>>{
// Recursive reader: TupleReader<I, T, U...> reads one element of type T,
// then recurses on the remaining U... (I counts the remaining tail).
template<size_t I, class T, class... U>
struct TupleReader{
static std::tuple<T, U...> get(std::istringstream& iss){
auto x = TupleReader<0,T>::get(iss);
auto y = TupleReader<I-1, U...>::get(iss);
return std::tuple_cat(x, y);
}
};
// Base case: consume a single comma-delimited token and convert it.
template<class T>
struct TupleReader<0, T>{
static std::tuple<T> get(std::istringstream& iss){
std::string token;
std::getline(iss, token, ',');
return std::make_tuple(StreamConverter<T>()(token));
}
};
public:
inline std::tuple<Args...> operator()(std::string const & str){
std::istringstream iss(str);
return TupleReader<sizeof...(Args) - 1, Args...>::get(iss);
}
};
/* Pre-defined constraints */
// Constraint that accepts any value without checking it; used as the
// default when an option places no restriction on its argument.
struct NoOp {
  template<class V>
  void operator()(V const &) {}
};
// Constraint that requires a vector-valued option to hold exactly `size`
// elements (e.g. a GEMM shape must be the 3 values M,N,K).
class SizeConstraint{
public:
  SizeConstraint(size_t size): size_(size){}
  template<class V>
  void operator()(std::vector<V> const & values) const {
    if(values.size() != size_)
      throw InvalidOptions("parameter must have size " + std::to_string(size_));
  }
private:
  size_t size_;
};
// Group-level constraint: exactly one of the listed option names must be
// present among the parsed values.
class OneOf{
public:
  OneOf(std::vector<std::string> keys): keys_(keys){}
  void operator()(std::map<std::string, void*> values){
    size_t matches = 0;
    for(auto const & key: keys_)
      if(values.find(key) != values.end())
        ++matches;
    if(matches != 1){
      std::string listing;
      for(size_t i = 0; i < keys_.size(); ++i)
        listing += (i>0 ? ", " : "") + keys_[i];
      throw InvalidOptions(std::string(matches<1 ? "At least" : "Only") + " one of the following flags must be specified: " + listing);
    }
  }
private:
  std::vector<std::string> keys_;
};
/**
 * @class Options
 * @brief Container for multiple options; also serves as a named group that
 *        can be nested inside another Options instance.
 */
class Options: public OptionBase{
public:
  typedef std::function<void(std::map<std::string, void*> const &)> constraint_t;
  // Builds an identity map from a set of allowed strings, so a plain value
  // enumeration can be fed to MapConverter.
  std::map<std::string, std::string> set_to_map(std::set<std::string> const & set){
    std::map<std::string, std::string> tmp;
    for(std::string x: set)
      tmp.insert(std::make_pair(x, x));
    return tmp;
  }
public:
  Options(std::string const & name, std::string const & desc, OptionBase* parent): OptionBase(name, desc, false, parent)
  {}
  // Prints this group's description, then its children. Children of nested
  // groups are indented two extra columns; the root (parent_==NULL) adds none.
  std::ostream& usage(std::ostream& os, size_t indent) const{
    OptionBase::usage(os, indent);
    for(auto& opt: opts_)
      // BUGFIX: the original `indent + (parent_==NULL)?0:2` parsed as
      // `(indent + (parent_==NULL)) ? 0 : 2` because `+` binds tighter than
      // `?:`, which discarded `indent` entirely.
      opt->usage(os, indent + (parent_==NULL ? 0 : 2));
    return os;
  }
  // Parses all children into this group's own values_ map, checks the
  // group-level constraints, and registers values_ under this group's name
  // in the caller-supplied map. Non-root groups only parse when their
  // "--name" flag appears on the command line.
  void parse(std::vector<std::string> const & args, std::map<std::string, void*>& values){
    if(parent_==NULL || get_option(args.begin(), args.end()) != args.end()){
      for(auto& opt: opts_)
        opt->parse(args, values_);
      for(auto& constraint: constraints_)
        constraint(values_);
      values[name_] = (void*)&values_;
    }
  }
  // Entry point: parses raw argv (argv[0] is harmless since options are
  // matched by their "--name" form).
  void parse(int argc, char* argv[]){
    std::vector<std::string> args(argv, argv + argc);
    parse(args, values_);
  }
  // Optional option of type T with a default value, parsed via stream extraction.
  template<class T>
  void add(std::string const & name, std::string const & desc, T dft, typename Option<T>::constraint_t constraint = NoOp())
  { opts_.push_back(std::make_shared<Option<T>>(name, desc, dft, StreamConverter<T>(), constraint, this));}
  // Optional option of type T without a default value.
  template<class T>
  void add(std::string const & name, std::string const & desc, typename Option<T>::constraint_t constraint = NoOp())
  { opts_.push_back(std::make_shared<Option<T>>(name, desc, false, StreamConverter<T>(), constraint, this));}
  // String option restricted to a fixed set of values, with a default.
  void add(std::string const & name, std::string const & desc, std::string dft, std::set<std::string> values)
  { add<std::string>(name, desc, dft, set_to_map(values)); }
  // String option restricted to a fixed set of values, without a default.
  void add(std::string const & name, std::string const & desc, std::set<std::string> values)
  { add<std::string>(name, desc, set_to_map(values)); }
  // Option whose textual value is translated to a T through a lookup table,
  // with a default (dft must be a key of `values`).
  template<class T>
  void add(std::string const & name, std::string const & desc, std::string dft, std::map<std::string, T> values, typename Option<T>::constraint_t constraint = NoOp())
  { opts_.push_back(std::make_shared<Option<T>>(name, desc, values.at(dft), MapConverter<T>(values), constraint, this)); }
  // Same as above, without a default.
  template<class T>
  void add(std::string const & name, std::string const & desc, std::map<std::string, T> values, typename Option<T>::constraint_t constraint = NoOp())
  { opts_.push_back(std::make_shared<Option<T>>(name, desc, false, MapConverter<T>(values), constraint, this)); }
  // Boolean flag togglable with --name / --no-name.
  void add_switch(std::string const & name, std::string const & desc, bool dft = true)
  { opts_.push_back(std::make_shared<SwitchOption>(name, desc, dft, this)); }
  // Takes ownership of an externally created option.
  void add(OptionBase* opt)
  { opts_.push_back(std::shared_ptr<OptionBase>(opt)); }
  // Creates, registers and returns a nested option group.
  Options* add_group(std::string const & name, std::string const & desc){
    opts_.push_back(std::make_shared<Options>(name, desc, this));
    return (Options*)opts_.back().get();
  }
  // Registers a constraint checked against this group's parsed values.
  void add_constraint(constraint_t const & constraint){
    constraints_.push_back(constraint);
  }
  // True when the option was given on the command line or has a default.
  bool has(std::string const & name)
  { return values_.find(name) != values_.end() && values_.at(name)!=NULL; }
  // Returns the parsed value; only meaningful when has(name) is true.
  template<class T>
  T get(std::string const & name)
  { return *((T*)values_[name]); }
private:
  std::vector<std::shared_ptr<OptionBase>> opts_;
  std::map<std::string, void*> values_;
  std::vector<constraint_t> constraints_;
};
/* Top-level command-line application: owns the root option group, installs
 * the automatic --help flag, and turns parse errors into a usage message. */
class Application{
private:
  // Writes the usage line, the description and the full option tree to stderr.
  void show_help() const{
    std::cerr << "Usage: " << name_ << " [OPTS]" << std::endl;
    std::cerr << "Description: " << desc_ << std::endl;
    opts_.usage(std::cerr, 0);
  }
public:
  Application(std::string const & name, std::string const & desc): name_(name), desc_(desc), opts_("root","",NULL)
  { opts_.add(new OptionHelp()); }
  // Parses argv. Invalid options print the error followed by the help text
  // and exit; an explicit --help prints the help text and exits as well.
  void parse(int argc, char* argv[]){
    try{
      opts_.parse(argc, argv);
    }
    catch(InvalidOptions const & err){
      std::cerr << err.what() << std::endl;
      show_help();
      exit(EXIT_FAILURE);
    }
    if(opts_.has("help")){
      show_help();
      exit(EXIT_FAILURE);
    }
  }
  Options* options()
  { return &opts_; }
private:
  std::string name_;
  std::string desc_;
  Options opts_;
};
}
#endif

View File

@@ -1,69 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <tuple>
#include "isaac/runtime/predict.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/context.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/tools/bench.hpp"
#include "isaac/tools/collections.hpp"
#include "isaac/templates/conv.h"
#include "isaac/templates/gemm.h"
#include "isaac/templates/pool.h"
namespace isaac{
// General matrix multiplication entry point (definitions in the .cpp).
// NOTE(review): offa/offb/offc look like element offsets into A/B/C and
// a_scale/b_scale/c_scale like quantization scales for int8 paths — confirm
// against the implementation before relying on them.
void GEMM(driver::Device const & device, driver::Stream & stream,
DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc,
scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C,
float a_scale, float b_scale, float c_scale,
driver::Buffer const *bias = NULL,
templates::GEMM* generator = NULL, size_t optimization_level = 1);
// 3D convolution with optional bias, activation, residual input Z and
// multiple quantized outputs; `generator` bypasses the predictive model.
void CONV(driver::Device const &, driver::Stream & stream,
DType in_dtype, DType out_dtype, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t C, param_t T, param_t R, param_t S,
param_t D, param_t H, param_t W, param_t pad_d, param_t pad_h, param_t pad_w,
param_t stride_d, param_t stride_h, param_t stride_w,
param_t upsample_d, param_t upsample_h, param_t upsample_w,
driver::Buffer const & I, driver::Buffer const & F, driver::Buffer *O, param_t num_outputs,
driver::Buffer const *bias = NULL, ActivationType activation = Linear, float alpha = 0, float iscale = 1, float fscale = 1, std::vector<float> const & oscale = {1}, float z_scale = 1,
ResidualType residual = NoResidual, param_t Zk = 0, param_t crop_z_m0 = 0, param_t crop_z_m1 = 0, param_t crop_z_p0 = 0, param_t crop_z_p1 = 0, param_t crop_z_q0 = 0, param_t crop_z_q1 = 0, driver::Buffer const *Z = NULL,
templates::Conv* generator = NULL, size_t optimization_level = 1);
// 3D pooling (pool_type selects the reduction, e.g. max pooling).
void POOL(driver::Device const & device, driver::Stream & stream,
DType in_dtype, DType out_dtype, PoolType pool_type, param_t C, param_t M, param_t P, param_t Q, param_t N, param_t T, param_t R, param_t S,
param_t D, param_t H, param_t W, param_t pad_d, param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w,
driver::Buffer const & I, driver::Buffer& O,
float iscale, float oscale,
templates::Pool* generator = NULL, size_t optimization_level = 1);
// Data-type/layout conversion of a 5D tensor from I to O.
void TRANSFORM(driver::Stream & stream,
DType in_dtype, DType out_dtype, param_t N, param_t C, param_t D, param_t H, param_t W,
driver::Buffer const & I, driver::Buffer& O);
}

View File

@@ -1,116 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_CL_QUEUES_H
#define ISAAC_CL_QUEUES_H
#include <map>
#include <list>
#include <vector>
namespace isaac
{
namespace driver
{
class Buffer;
class Stream;
class Device;
class Context;
class Platform;
class Module;
class Kernel;
// Global registry of driver objects: caches compiled modules, kernels,
// contexts and streams so they are created once and reused.
struct backend
{
// Cache of compiled modules, keyed by (stream, source name).
class modules
{
friend class backend;
public:
static void release();
static Module& get(Stream const & stream, std::string const & name, std::string const &src);
private:
static std::map<std::tuple<Stream, std::string>, Module * > cache_;
};
// Cache of kernel handles, keyed by (module, kernel name).
class kernels
{
friend class backend;
public:
static void release();
static Kernel & get(Module const & program, std::string const & name);
private:
static std::map<std::tuple<Module, std::string>, Kernel * > cache_;
};
// Registry of contexts; allows importing externally-created contexts.
class contexts
{
friend class backend;
private:
static void init(std::vector<Platform> const &);
static void release();
public:
static Context const & get_default();
// Wraps an external context handle without taking ownership; returns the
// cached wrapper when the same handle was imported before.
template<class T>
static Context const & import(T context)
{
for(driver::Context const * x: cache_)
if((T)*x==context)
return *x;
cache_.emplace_back(new Context(context, false));
return *cache_.back();
}
static void get(std::list<Context const *> &);
private:
static std::list<Context const *> cache_;
};
// Per-context stream pool.
class streams
{
friend class backend;
private:
static void init(std::list<Context const *> const &);
static void release();
public:
static void get(Context const &, std::vector<Stream *> &streams);
static Stream & get(Context const &, unsigned int id = 0);
static Stream & get_default();
private:
static std::map< Context, std::vector<Stream*> > cache_;
};
static void init();
static void release();
static std::vector<Device> devices();
static std::vector<Platform> platforms();
static void synchronize(Context const &);
// Index of the device used when none is specified explicitly.
static unsigned int default_device;
};
}
}
#endif

View File

@@ -1,54 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_BUFFER_H
#define ISAAC_DRIVER_BUFFER_H
#include "isaac/driver/handle.h"
#include "isaac/driver/context.h"
namespace isaac
{
namespace driver
{
class Stream;
// Buffer: wrapper around a CUdeviceptr device allocation.
class Buffer: public HandleInterface<Buffer, CUdeviceptr>
{
public:
// Allocates `size` bytes (presumably bytes — confirm in the .cpp) in `context`.
Buffer(Context const & context, size_t size);
// Wraps an existing device pointer, optionally taking ownership of it.
Buffer(Context const & context, CUdeviceptr cu, bool take_ownership);
// Zero-fills the first `size` bytes of the buffer on the given stream.
void set_zero(Stream const & queue, size_t size);
// Underlying CUDA handle (const and mutable access).
Handle<CUdeviceptr> const & cu() const;
Handle<CUdeviceptr> & cu();
private:
Context context_;
Handle<CUdeviceptr> cu_;
};
}
}
#endif

View File

@@ -1,66 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_CONTEXT_H
#define ISAAC_DRIVER_CONTEXT_H
#include "isaac/driver/device.h"
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
// Context: wrapper around a CUcontext handle, with an associated device and
// a filesystem path used for caching (see cache_path()).
class Context: public HandleInterface<Context, CUcontext>
{
private:
// Path used to cache data on disk (semantics defined in the .cpp).
static std::string get_cache_path();
// Retrieves the CUdevice behind a raw CUcontext.
static CUdevice device(CUcontext);
public:
//Constructors
// Wraps an existing CUcontext; when take_ownership is true the context is
// released together with this object.
explicit Context(CUcontext context, bool take_ownership = true);
// Creates a context on the given device (presumably via the driver API —
// confirm in the .cpp).
explicit Context(Device const & device);
//Accessors
Device const & device() const;
std::string const & cache_path() const;
Handle<CUcontext> const & cu() const;
private:
Handle<CUcontext> cu_;
Device device_;
std::string cache_path_;
};
// RAII helper tied to a context for the duration of a scope.
// NOTE(review): presumably makes `ctx` current on construction and restores
// the previous context on destruction — confirm in the .cpp.
class ContextSwitcher{
public:
ContextSwitcher(Context const & ctx);
~ContextSwitcher();
private:
Context const & ctx_;
};
}
}
#endif

View File

@@ -1,98 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_DEVICE_H
#define ISAAC_DRIVER_DEVICE_H
#include "isaac/driver/platform.h"
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
// Device: wrapper around a CUdevice handle exposing device properties.
class Device: public HandleInterface<Device, CUdevice>
{
public:
//Supported architectures
enum class Architecture{
//NVidia
SM_2_0,
SM_2_1,
SM_3_0,
SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
SM_7_0,
UNKNOWN
};
private:
//Metaprogramming helper to get cuda info from attribute
template<CUdevice_attribute attr>
int cuGetInfo() const;
// Maps an (SM major, SM minor) pair to the Architecture enum.
inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
// NVML handle for this device (presumably backing the clock queries below
// — confirm in the .cpp).
inline nvmlDevice_t nvml_device() const;
public:
Device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){}
//Accessors
Architecture architecture() const;
Handle<CUdevice> const & cu() const;
//Information
std::string infos() const;
size_t address_bits() const;
driver::Platform platform() const;
std::vector<size_t> max_block_dim() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t warp_size() const;
//Compute Capability
// Overrides the detected compute capability (used e.g. by the bench tool's
// --dump target flag to generate code for another SM target).
void interpret_as(std::pair<size_t, size_t> cc);
std::pair<size_t, size_t> compute_capability() const;
//Identifier
std::string name() const;
std::string pci_bus_id() const;
//Clocks
size_t current_sm_clock() const;
size_t current_mem_clock() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
private:
Handle<CUdevice> cu_;
// Non-null once the compute capability has been overridden via interpret_as().
std::shared_ptr<std::pair<size_t, size_t>> interpreted_as_;
};
}
}
#endif

View File

@@ -1,258 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_DISPATCHER_H
#define ISAAC_DRIVER_DISPATCHER_H
#include <type_traits>
#include <dlfcn.h>
//CUDA Backend
#include "isaac/external/CUDA/cuda.h"
#include "isaac/external/CUDA/nvrtc.h"
#include "isaac/external/CUDA/cublas_v2.h"
#include "isaac/external/CUDA/cudnn.h"
#include "isaac/external/CUDA/nvml.h"
//Exceptions
#include <iostream>
#include <stdexcept>
namespace isaac
{
namespace driver
{
class Context;
// Status-code validation for dispatched calls.
// The unconstrained template is a deliberate no-op: return types that carry
// no error status (or types we do not know how to check) pass through.
// The overloads below are defined in the corresponding .cpp and are expected
// to throw on non-success codes from each backend library.
template<class T> void check(T){}
void check(nvrtcResult err);
void check(CUresult err);
void check(cublasStatus_t err);
void check(cudnnStatus_t err);
// Lazy dynamic-loading dispatcher for the CUDA driver, NVRTC, NVML, cuBLAS
// and cuDNN shared libraries. Each public wrapper resolves its symbol with
// dlsym on first use and caches the raw function pointer in the matching
// private `*_` member, so the process links against none of these libraries
// at build time.
class dispatch
{
private:
  // Meta-function extracting R from a function-pointer type R(*)(A...).
  template <class F>
  struct return_type;

  template <class R, class... A>
  struct return_type<R (*)(A...)>
  { typedef R type; };

  typedef bool (*f_init_t)();

  // Generic trampoline shared by all wrappers:
  //  1. runs `initializer` (loads the owning library; its bool result is
  //     intentionally ignored here — a failed load surfaces as a dlsym
  //     failure below),
  //  2. resolves `name` once into `cache`,
  //  3. forwards the call and routes the returned status through check().
  // Throws std::runtime_error when the symbol cannot be resolved.
  template<f_init_t initializer, typename FunPtrT, typename... Args>
  static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
  {
    initializer();
    if(cache == nullptr){
      dlerror(); // clear any stale error state before resolving
      cache = dlsym(lib_h, name);
      if(cache == nullptr){
        // Prefer dlerror()'s message: it names the missing symbol/library,
        // unlike the previous generic "dlsym unable to load function".
        const char * err = dlerror();
        throw std::runtime_error(err ? err : "dlsym unable to load function");
      }
    }
    FunPtrT fptr;
    // Type-pun the cached void* back into the typed function pointer.
    *reinterpret_cast<void **>(&fptr) = cache;
    typename return_type<FunPtrT>::type res = (*fptr)(args...);
    check(res);
    return res;
  }

public:
  // Library initializers: load the corresponding shared object once and
  // report whether it is available.
  static bool nvrtcinit();
  static bool nvmlinit();
  static bool cuinit();
  static bool cublasinit();
  static bool cudnninit();
  // Unloads the libraries / releases cached state (see dispatch.cpp).
  static void release();

  //CUDA
  static CUresult cuCtxGetCurrent(CUcontext *pctx);
  static CUresult cuCtxSetCurrent(CUcontext ctx);
  static CUresult cuCtxDestroy_v2(CUcontext ctx);
  static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
  static CUresult cuDeviceGet(CUdevice *device, int ordinal);
  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
  static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
  static CUresult cuMemFree_v2(CUdeviceptr dptr);
  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
  static CUresult cuDriverGetVersion(int *driverVersion);
  static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
  static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
  static CUresult cuModuleLoad(CUmodule *module, const char *fname);
  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
  static CUresult cuModuleUnload(CUmodule hmod);
  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
  static CUresult cuDeviceGetCount(int *count);
  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
  static CUresult cuInit(unsigned int Flags);
  static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
  static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
  static CUresult cuStreamSynchronize(CUstream hStream);
  static CUresult cuStreamDestroy_v2(CUstream hStream);
  static CUresult cuEventDestroy_v2(CUevent hEvent);
  static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
  static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
  static CUresult cuCtxGetDevice(CUdevice* result);
  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
  //NVML
  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
  //NVRTC
  static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options);
  static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
  static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
  static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
  static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames);
  static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
  //cuBLAS
  static cublasHandle_t cublasHandle(Context const & ctx);
  static cublasStatus_t cublasCreate_v2(cublasHandle_t* h);
  static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId);
  static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId);
  static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc);
  static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc);
  static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc);
  static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo);
  //cuDNN
  static cudnnHandle_t cudnnHandle(Context const & ctx);
  static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
  static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc);
  static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
  static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
  static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle);
  static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w);
  static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w);
  static cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int dimA[]);
  static cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int filterDimA[]);
  static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode);
  static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType);
  static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]);
  static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo);
  static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes);
  static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
  static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);
  static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
  static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y);

private:
  // dlopen handles for each backend library (set by the *init() functions).
  static void* cuda_;
  static void* nvrtc_;
  static void* nvml_;
  static void* cublas_;
  static void* cudnn_;
  // Per-symbol dlsym caches, filled lazily by f_impl.
  //CUDA
  static void* cuCtxGetCurrent_;
  static void* cuCtxSetCurrent_;
  static void* cuCtxDestroy_v2_;
  static void* cuEventCreate_;
  static void* cuDeviceGet_;
  static void* cuMemcpyDtoH_v2_;
  static void* cuStreamCreate_;
  static void* cuEventElapsedTime_;
  static void* cuMemFree_v2_;
  static void* cuMemcpyDtoHAsync_v2_;
  static void* cuDriverGetVersion_;
  static void* cuDeviceGetName_;
  static void* cuDeviceGetPCIBusId_;
  static void* cuModuleGetGlobal_v2_;
  static void* cuMemcpyHtoDAsync_v2_;
  static void* cuModuleLoad_;
  static void* cuLaunchKernel_;
  static void* cuModuleUnload_;
  static void* cuModuleLoadDataEx_;
  static void* cuDeviceGetAttribute_;
  static void* cuDeviceGetCount_;
  static void* cuMemcpyHtoD_v2_;
  static void* cuInit_;
  static void* cuEventRecord_;
  static void* cuCtxCreate_v2_;
  static void* cuModuleGetFunction_;
  static void* cuStreamSynchronize_;
  static void* cuStreamDestroy_v2_;
  static void* cuEventDestroy_v2_;
  static void* cuMemAlloc_v2_;
  static void* cuPointerGetAttribute_;
  static void* cuCtxGetDevice_;
  static void* cuMemsetD8Async_;
  static void* cuCtxPushCurrent_v2_;
  static void* cuCtxPopCurrent_v2_;
  //NVML
  static void* nvmlInit_v2_;
  static void* nvmlDeviceGetHandleByPciBusId_v2_;
  static void* nvmlDeviceGetClockInfo_;
  static void* nvmlDeviceGetMaxClockInfo_;
  //NVRTC
  static void* nvrtcCompileProgram_;
  static void* nvrtcGetProgramLogSize_;
  static void* nvrtcGetPTX_;
  static void* nvrtcGetPTXSize_;
  static void* nvrtcCreateProgram_;
  static void* nvrtcGetProgramLog_;
  //cuBLAS
  static void* cublasCreate_v2_;
  static void* cublasGetStream_v2_;
  static void* cublasSetStream_v2_;
  static void* cublasHgemm_;
  static void* cublasSgemm_v2_;
  static void* cublasDgemm_v2_;
  static void* cublasGemmEx_;
  //cuDNN
  static void* cudnnCreateConvolutionDescriptor_;
  static void* cudnnCreatePoolingDescriptor_;
  static void* cudnnCreateTensorDescriptor_;
  static void* cudnnCreateFilterDescriptor_;
  static void* cudnnCreate_;
  static void* cudnnSetTensor4dDescriptor_;
  static void* cudnnSetFilter4dDescriptor_;
  static void* cudnnSetTensorNdDescriptorEx_;
  static void* cudnnSetFilterNdDescriptor_;
  static void* cudnnSetConvolution2dDescriptor_;
  static void* cudnnSetConvolutionNdDescriptor_;
  static void* cudnnSetPoolingNdDescriptor_;
  static void* cudnnGetConvolutionForwardAlgorithm_;
  static void* cudnnGetConvolutionForwardWorkspaceSize_;
  static void* cudnnConvolutionForward_;
  static void* cudnnPoolingForward_;
  static void* cudnnSetStream_;
  static void* cudnnTransformTensor_;
};
}
}
#endif

View File

@@ -1,49 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_EVENT_H
#define ISAAC_DRIVER_EVENT_H
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
// Event
// Wraps a pair of CUevents (cu_event_t). NOTE(review): elapsed_time()
// presumably returns the time between the two events via
// cuEventElapsedTime — confirm in the .cpp.
class Event: public HandleInterface<Event, cu_event_t>
{
public:
    float elapsed_time() const;
    Handle<cu_event_t> const & cu() const;
private:
    Handle<cu_event_t> cu_;
};
}
}
#endif

View File

@@ -1,82 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_HANDLE_H
#define ISAAC_DRIVER_HANDLE_H
#include <memory>
#include <iostream>
#include <functional>
#include <type_traits>
#include "isaac/driver/dispatch.h"
namespace isaac
{
namespace driver
{
// Pair of CUDA events delimiting a timed region (start/end).
struct cu_event_t{
    // Valid only when both events have been created.
    operator bool() const { return first && second; }
    CUevent first;
    CUevent second;
};
struct cu_platform{
cu_platform() : status_(dispatch::cuInit(0)) { }
operator bool() const { return status_; }
private:
CUresult status_;
};
// CRTP mixin giving any handle wrapper T (which must expose
// `Handle<CUType> const & cu() const`) an implicit conversion to the raw
// CUDA handle plus identity-based comparisons on that raw value.
template<class T, class CUType>
class HandleInterface{
public:
    //Accessors
    // Converts to the underlying raw handle stored in T's Handle<CUType>.
    operator CUType() const { return *(static_cast<const T*>(this)->cu().h_); }
    //Comparison
    // Marked const (they were not before), so const handles can be compared.
    bool operator==(HandleInterface const & y) const { return static_cast<CUType>(*this) == static_cast<CUType>(y); }
    bool operator!=(HandleInterface const & y) const { return static_cast<CUType>(*this) != static_cast<CUType>(y); }
    bool operator<(HandleInterface const & y) const { return static_cast<CUType>(*this) < static_cast<CUType>(y); }
};
// Reference-counted owner of a raw CUDA handle (CUcontext, CUstream, ...).
// The shared_ptr controls lifetime; has_ownership_ records whether the
// destructor should release the underlying driver object (ctor/dtor are
// defined out of line).
template<class CUType>
class Handle{
public:
    // HandleInterface needs direct access to h_ for its raw-handle conversion.
    template<class, class> friend class HandleInterface;
public:
    //Constructors
    // take_ownership=false wraps an externally-managed handle without
    // releasing it on destruction.
    Handle(CUType cu = CUType(), bool take_ownership = true);
    ~Handle();
    CUType& operator*() { return *h_; }
    CUType const & operator*() const { return *h_; }
    CUType* operator->() const { return h_.get(); }
protected:
    std::shared_ptr<CUType> h_;
    bool has_ownership_;
};
}
}
#endif

View File

@@ -1,68 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_KERNEL_H
#define ISAAC_DRIVER_KERNEL_H
#include "isaac/driver/module.h"
#include "isaac/driver/handle.h"
#include <memory>
namespace isaac
{
namespace driver
{
class Buffer;
// Kernel
// Kernel: wraps a CUfunction looked up in a Module and marshals its launch
// arguments into the pointer array expected by cuLaunchKernel.
class Kernel: public HandleInterface<Kernel, CUfunction>
{
public:
    //Constructors
    Kernel(Module const & program, const char * name);
    //Accessors
    Handle<CUfunction> const & cu() const;
    Module const & module() const;
    //Arguments setters
    void setArg(unsigned int index, std::size_t size, void* ptr);
    void setArg(unsigned int index, Buffer const &);
    // Passes `value` by address; the non-template overload is expected to
    // deep-copy the bytes (into cu_params_store_), since `value` dies on
    // return — confirm in kernel.cpp.
    template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), static_cast<void*>(&value)); }
    //Arguments getters
    // Pointer array suitable for cuLaunchKernel's kernelParams.
    void* const* cu_params() const;
private:
    Handle<CUfunction> cu_;
    Module program_;
    unsigned int address_bits_;
    std::vector<std::shared_ptr<void> > cu_params_store_;
    std::vector<void*> cu_params_;
};
}
}
#endif

View File

@@ -1,61 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_MODULE_H
#define ISAAC_DRIVER_MODULE_H
#include <map>
#include "isaac/driver/handle.h"
#include "isaac/driver/context.h"
#include "isaac/driver/buffer.h"
namespace isaac
{
namespace driver
{
class Context;
class Device;
// Compiled module: wraps a CUmodule built from CUDA-C source in a context.
class Module: public HandleInterface<Module, CUmodule>
{
    // Source prelude generated per device (defined in the .cpp).
    static std::string header(Device const & device);
public:
    Module(Context const & context, std::string const & source);
    Context const & context() const;
    Handle<CUmodule> const & cu() const;
    // Returns a Buffer for the module-level variable `name` — presumably
    // via cuModuleGetGlobal; confirm in module.cpp.
    Buffer symbol(const char * name) const;
private:
    Handle<CUmodule> cu_;
    Context context_;
    std::string source_;
};
}
}
#endif

View File

@@ -1,54 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_PLATFORM_H
#define ISAAC_DRIVER_PLATFORM_H
#include <vector>
#include <string>
#include "isaac/driver/handle.h"
namespace isaac
{
namespace driver
{
class Device;
// CUDA platform: initializes the driver (cu_platform's constructor runs
// cuInit) and enumerates the available devices.
class Platform
{
public:
    //Accessors
    std::string name() const;
    std::string version() const;
    std::vector<Device> devices() const;
private:
    Handle<cu_platform> cu_;
};
}
}
#endif

View File

@@ -1,82 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_STREAM_H
#define ISAAC_DRIVER_STREAM_H
#include <map>
#include "isaac/driver/context.h"
#include "isaac/driver/device.h"
#include "isaac/driver/handle.h"
#include "isaac/driver/buffer.h"
namespace isaac
{
namespace driver
{
class Kernel;
class Event;
class Range;
class Buffer;
// Command Queue
// Command queue: wraps a CUstream bound to a Context; provides kernel
// launches and host<->device transfers.
class Stream: public HandleInterface<Stream, CUstream>
{
public:
    //Constructors
    // Wrap an existing stream (optionally taking ownership) or create a new
    // one in `context`.
    Stream(CUstream stream, bool take_ownership);
    Stream(Context const & context);
    //Accessors
    Handle<CUstream> const & cu() const;
    Context const & context() const;
    //Synchronize
    // Blocks until all work queued on this stream has completed.
    void synchronize();
    //Enqueue
    // Launches `kernel` on the given grid/block; optionally records
    // completion into `event`. Defaults use nullptr instead of NULL, in
    // keeping with the C++11 features used elsewhere in this header.
    void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = nullptr, Event *event = nullptr);
    // Write: copy `size` bytes from host `ptr` into `buffer` at `offset`.
    void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
    template<class T> void write(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T> const & x)
    { write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
    // Read: copy `size` bytes from `buffer` at `offset` into host `ptr`.
    void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
    template<class T> void read(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T>& x)
    { read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
private:
    Context context_;
    Handle<CUstream> cu_;
};
}
}
#endif

View File

@@ -1,64 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "device_types.h"
#if !defined(__CUDACC_RTC__)
#define EXCLUDE_FROM_RTC
#include "driver_types.h"
#undef EXCLUDE_FROM_RTC
#endif /* !__CUDACC_RTC__ */
#include "surface_types.h"
#include "texture_types.h"
#include "vector_types.h"

View File

@@ -1,412 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
#include "cuda_runtime_api.h"
#include "host_defines.h"
#include "vector_types.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
* \brief \hl Returns a channel descriptor using the specified format
*
* Returns a channel descriptor with format \p f and number of bits of each
* component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
* defined as:
* \code
struct cudaChannelFormatDesc {
int x, y, z, w;
enum cudaChannelFormatKind f;
};
* \endcode
*
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
* ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
*
* \return
* Channel descriptor with format \p f
*
* \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
* ::cudaGetChannelDesc, ::cudaGetTextureReference,
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
* \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
* \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/
// Primary template: element types with no explicit specialization yield an
// empty descriptor (zero bits per channel, kind "None").
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
    return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}
// One 16-bit float channel; bit count derived from sizeof(unsigned short).
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
{
    int e = (int)sizeof(unsigned short) * 8;
    return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
// Single 16-bit float channel (same layout as cudaCreateChannelDescHalf).
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
{
    int e = (int)sizeof(unsigned short) * 8;
    return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
/* Channel descriptors for 8-bit integer texel types.  Channel width is
 * sizeof(char) * 8 == 8 bits; the kind follows the type's signedness. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
    const int bits = (int)sizeof(char) * 8;
#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
    /* plain char is unsigned on this platform */
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
{
    const int bits = (int)sizeof(signed char) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
{
    const int bits = (int)sizeof(unsigned char) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
/* Vector variants: one channel per vector component. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
{
    const int bits = (int)sizeof(signed char) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
{
    const int bits = (int)sizeof(unsigned char) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
{
    const int bits = (int)sizeof(signed char) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
{
    const int bits = (int)sizeof(unsigned char) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
{
    const int bits = (int)sizeof(signed char) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
{
    const int bits = (int)sizeof(unsigned char) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
/* Channel descriptors for 16-bit integer texel types (scalar and
 * vector).  Channel width is sizeof(short) * 8 == 16 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
{
    const int bits = (int)sizeof(short) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
{
    const int bits = (int)sizeof(unsigned short) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
{
    const int bits = (int)sizeof(short) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
{
    const int bits = (int)sizeof(unsigned short) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
{
    const int bits = (int)sizeof(short) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
{
    const int bits = (int)sizeof(unsigned short) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
{
    const int bits = (int)sizeof(short) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
{
    const int bits = (int)sizeof(unsigned short) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
/* Channel descriptors for 32-bit integer texel types (scalar and
 * vector).  Channel width is sizeof(int) * 8 == 32 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
{
    const int bits = (int)sizeof(int) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
{
    const int bits = (int)sizeof(unsigned int) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
{
    const int bits = (int)sizeof(int) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
{
    const int bits = (int)sizeof(unsigned int) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
{
    const int bits = (int)sizeof(int) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
{
    const int bits = (int)sizeof(unsigned int) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
{
    const int bits = (int)sizeof(int) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
{
    const int bits = (int)sizeof(unsigned int) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
#if !defined(__LP64__)
/* On ILP32 targets long is 32 bits wide, so it is usable as a texel
 * type.  On LP64 targets these specializations are omitted entirely:
 * 64-bit integer channels are not supported. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
{
    const int bits = (int)sizeof(long) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
{
    const int bits = (int)sizeof(unsigned long) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
{
    const int bits = (int)sizeof(long) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
{
    const int bits = (int)sizeof(unsigned long) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
{
    const int bits = (int)sizeof(long) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
{
    const int bits = (int)sizeof(unsigned long) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindUnsigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
{
    const int bits = (int)sizeof(long) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindSigned);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
{
    const int bits = (int)sizeof(unsigned long) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindUnsigned);
}
#endif /* !__LP64__ */
/* Channel descriptors for 32-bit floating-point texel types (scalar
 * and vector).  Channel width is sizeof(float) * 8 == 32 bits. */
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
{
    const int bits = (int)sizeof(float) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
{
    const int bits = (int)sizeof(float) * 8;
    return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
{
    const int bits = (int)sizeof(float) * 8;
    return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
    const int bits = (int)sizeof(float) * 8;
    return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
}
#endif /* __cplusplus */
/** @} */
/** @} */ /* END CUDART_TEXTURE_HL */
#endif /* !__CHANNEL_DESCRIPTOR_H__ */

View File

@@ -1,266 +0,0 @@
/*
* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_CONFIG_H__)
#define __HOST_CONFIG_H__
/*******************************************************************************
* Host-compiler configuration for nvcc: rejects unsupported host       *
* compilers and defines the glue macros (_CRTIMP, __THROW, ...) each   *
* supported toolchain expects.                                         *
*******************************************************************************/
#if defined(__CUDACC__)
#if defined(__CUDACC_RTC__)
#define _CRTIMP
#define __THROW
#else /* __CUDACC_RTC__ */
/* check for host compilers that are compatible with nvcc */
#if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! ---
#endif /* !__GNUC__ && !_WIN32 */
#if defined(__ICC)
#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, and ICC 17.0 on Linux x86_64 are supported!
#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !__GNUC__ || !__LP64__ */
#endif /* __ICC */
#if defined(__PGIC__)
#if (!(__PGIC__ == 17) && \
    !(__PGIC__ == 99 && __PGIC_MINOR__ == 99)) || \
    !defined(__GNUC__) || !defined(__LP64__)
#error -- unsupported pgc++ configuration! Only pgc++ 17 on Linux x86_64 is supported!
#endif /* (!(__PGIC__ == 17) &&
          !(__PGIC__ == 99 && __PGIC_MINOR__ == 99 )) ||
          !__GNUC__ || !__LP64__ */
#endif /* __PGIC__ */
#if defined(__powerpc__)
#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
/* FIX: the original condition used '&&', which can never be true
 * (a value cannot be both below 13.1 and at-or-above 13.2), so the
 * xlC version guard never fired.  '||' rejects everything outside
 * the supported [13.1, 13.2) range, as the error message states. */
#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
#error -- unsupported xlC version! only xlC 13.1 is supported
#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
#endif /* __powerpc__ */
#if defined(__GNUC__)
#if __GNUC__ > 6
#error -- unsupported GNU version! gcc versions later than 6 are not supported!
#endif /* __GNUC__ > 6 */
#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
#error -- clang and clang++ are the only supported host compilers on Mac OS X!
#endif /* __APPLE__ && __MACH__ && !__clang__ */
#endif /* __GNUC__ */
#if defined(_WIN32)
#if _MSC_VER < 1600 || _MSC_VER > 1911
#error -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!
#elif _MSC_VER == 1600 /* _MSC_VERSION == 1600 */
#pragma message("support for Microsoft Visual Studio 2010 has been deprecated!")
#endif /* _MSC_VER < 1600 || _MSC_VER > 1800 || _MSC_VERSION == 1600 */
#endif /* _WIN32 */
/* configure host compiler */
#if defined(__APPLE__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__ANDROID__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__QNX__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__HORIZON__)
#define _CRTIMP
#define _ACRTIMP
#define __THROW
#elif defined(__GNUC__)
#define _CRTIMP
#define _ACRTIMP
#include <features.h> /* for __THROW */
#elif defined(_WIN32)
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
        1
#endif /* _MSC_VER >= 1500 */
#if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* !_CRT_SECURE_NO_WARNINGS */
#if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */
#if _MSC_VER >= 1900
#include <corecrt.h> /* for _ACRTIMP */
#endif /* _MSC_VER >= 1900 */
#define __THROW
#endif /* __APPLE__ */
#endif /* __CUDACC_RTC__ */
/* Device-side varargs support: replace the host va_* macros with
 * compiler-builtin forms for PGI, NVRTC and MSVC host compilations. */
#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
#if __CUDACC_RTC__
typedef char *va_list;
#else /* !__CUDACC_RTC__ */
#include <cstdarg>
#endif /* __CUDACC_RTC__ */
#undef va_start
#undef va_end
#undef va_arg
#ifdef __PGIC__
#undef __builtin_va_end
#define va_start(v,l) __builtin_alt_va_start(v,l)
#define va_end(v) __builtin_va_end(v)
#define va_arg(v,l) __builtin_alt_va_arg(v,l)
#if (__cplusplus >= 201103L)
#undef va_copy
#define va_copy(d,s) __builtin_va_copy(d,s)
#endif
#else /* !__PGIC__ */
#define va_start(ap, x) (__cu_va_start(&ap, x))
#define va_end(ap) (__cu_va_end(&ap))
#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
#undef va_copy
#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
#endif /* __PGIC__ */
#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
#endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */

View File

@@ -1,216 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__
/* Maps CUDA execution-space and attribute keywords (__host__, __device__,
 * __align__, ...) onto the attribute syntax of the detected host compiler. */
/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */
#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC__)
#define __volatile__ volatile
#endif /* __CUDACC_RTC__ */
#define __no_return__ \
        __attribute__((noreturn))
#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__)
/* gcc allows users to define attributes with underscores,
   e.g., __attribute__((__noinline__)).
   Consider a non-CUDA source file (e.g. .cpp) that has the
   above attribute specification, and includes this header file. In that case,
   defining __noinline__ as below would cause a gcc compilation error.
   Hence, only define __noinline__ when the code is being processed
   by a CUDA compiler component.
*/
#define __noinline__ \
        __attribute__((noinline))
#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */
#define __forceinline__ \
        __inline__ __attribute__((always_inline))
#define __align__(n) \
        __attribute__((aligned(n)))
#define __thread__ \
        __thread
#define __import__
#define __export__
#define __cdecl
#define __annotate__(a) \
        __attribute__((a))
#define __location__(a) \
        __annotate__(a)
#define CUDARTAPI
#elif defined(_MSC_VER)
/* MSVC equivalents: __declspec-based attribute spellings. */
#if _MSC_VER >= 1400
#define __restrict__ \
        __restrict
#else /* _MSC_VER >= 1400 */
#define __restrict__
#endif /* _MSC_VER >= 1400 */
#define __inline__ \
        __inline
#define __no_return__ \
        __declspec(noreturn)
#define __noinline__ \
        __declspec(noinline)
#define __forceinline__ \
        __forceinline
#define __align__(n) \
        __declspec(align(n))
#define __thread__ \
        __declspec(thread)
#define __import__ \
        __declspec(dllimport)
#define __export__ \
        __declspec(dllexport)
#define __annotate__(a) \
        __declspec(a)
#define __location__(a) \
        __annotate__(__##a##__)
#define CUDARTAPI \
        __stdcall
#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
/* Unknown compiler: the build must supply these two definitions itself. */
#define __inline__
#if !defined(__align__)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! ---
#endif /* !__align__ */
#if !defined(CUDARTAPI)
#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! ---
#endif /* !CUDARTAPI */
#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */
/* Old compilers need 'static' on specialized template functions. */
#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \
    (defined(_MSC_VER) && _MSC_VER < 1900) || \
    (!defined(__GNUC__) && !defined(_MSC_VER))
#define __specialization_static \
        static
#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
         (_MSC_VER && _MSC_VER < 1900) ||
         (!__GNUC__ && !_MSC_VER) */
#define __specialization_static
#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) ||
         (_MSC_VER && _MSC_VER < 1900) ||
         (!__GNUC__ && !_MSC_VER) */
/* Outside CUDA compilation the annotations expand to nothing. */
#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#define __launch_bounds__(...) \
        __annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */
#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \
    defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
        __align__(a)
#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */
/* Execution-space and memory-space qualifiers. */
#define __host__ \
        __location__(host)
#define __device__ \
        __location__(device)
#define __global__ \
        __location__(global)
#define __shared__ \
        __location__(shared)
#define __constant__ \
        __location__(constant)
#define __managed__ \
        __location__(managed)
#if !defined(__CUDACC__)
#define __device_builtin__
#define __device_builtin_texture_type__
#define __device_builtin_surface_type__
#define __cudart_builtin__
#else /* defined(__CUDACC__) */
#define __device_builtin__ \
        __location__(device_builtin)
#define __device_builtin_texture_type__ \
        __location__(device_builtin_texture_type)
#define __device_builtin_surface_type__ \
        __location__(device_builtin_surface_type)
#define __cudart_builtin__ \
        __location__(cudart_builtin)
#endif /* !defined(__CUDACC__) */
#endif /* !__HOST_DEFINES_H__ */

View File

@@ -1,338 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_
/* When trying to include C header file in C++ Code extern "C" is required
 * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
 * extern "C" cannot be nested
 * Hence keep the header out of extern "C" block
 */
#include <math.h> /* import fabsf, sqrt */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "vector_types.h"
/* Single-precision complex number: .x holds the real part, .y the imaginary part. */
typedef float2 cuFloatComplex;
/* Accessor: real part of a single-precision complex number. */
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
    const float re = x.x;
    return re;
}
/* Accessor: imaginary part of a single-precision complex number. */
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
    const float im = x.y;
    return im;
}
/* Builds a single-precision complex number from real/imaginary parts. */
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
                                                             (float r, float i)
{
    cuFloatComplex c;
    c.x = r;
    c.y = i;
    return c;
}
/* Complex conjugate: negates the imaginary part. */
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
    const float re = cuCrealf(x);
    const float im = cuCimagf(x);
    return make_cuFloatComplex (re, -im);
}
/* Component-wise complex addition. */
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    const float re = cuCrealf(x) + cuCrealf(y);
    const float im = cuCimagf(x) + cuCimagf(y);
    return make_cuFloatComplex (re, im);
}
/* Component-wise complex subtraction. */
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    const float re = cuCrealf(x) - cuCrealf(y);
    const float im = cuCimagf(x) - cuCimagf(y);
    return make_cuFloatComplex (re, im);
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
/* Complex multiplication via the textbook (a+bi)(c+di) expansion.
 * Operation order is kept identical to the reference implementation so
 * results are bit-for-bit the same. */
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    const float re = (cuCrealf(x) * cuCrealf(y)) - (cuCimagf(x) * cuCimagf(y));
    const float im = (cuCrealf(x) * cuCimagf(y)) + (cuCimagf(x) * cuCrealf(y));
    return make_cuFloatComplex (re, im);
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
/* Complex division x / y, guarded against intermediate under/overflow
 * by pre-scaling both operands with 1 / (|Re(y)| + |Im(y)|). */
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    cuFloatComplex quot;
    /* scale factor: L1 norm of the denominator */
    float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
    float oos = 1.0f / s;
    /* both operands scaled into a safe range before the usual formula */
    float ars = cuCrealf(x) * oos;
    float ais = cuCimagf(x) * oos;
    float brs = cuCrealf(y) * oos;
    float bis = cuCimagf(y) * oos;
    /* |y_scaled|^2 — the common denominator of the division formula */
    s = (brs * brs) + (bis * bis);
    oos = 1.0f / s;
    quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
                                ((ais * brs) - (ars * bis)) * oos);
    return quot;
}
/*
* We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now chose the
* simplest and fastest solution, however this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant.
*/
/* Complex magnitude sqrt(a^2 + b^2), computed as a hypot-style scaled
 * form v * sqrt(1 + (w/v)^2) with v = max(|a|,|b|), w = min(|a|,|b|),
 * to avoid losing half the exponent range to intermediate overflow. */
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
    float a = cuCrealf(x);
    float b = cuCimagf(x);
    float v, w, t;
    a = fabsf(a);
    b = fabsf(b);
    /* order so that v >= w */
    if (a > b) {
        v = a;
        w = b;
    } else {
        v = b;
        w = a;
    }
    t = w / v;
    t = 1.0f + t * t;
    t = v * sqrtf(t);
    /* fallback for v == 0 (w/v is NaN) and for inputs near FLT_MAX,
     * where the scaled form itself could misbehave */
    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
        t = v + w;
    }
    return t;
}
/* Double precision */
/* Double-precision complex number: .x holds the real part, .y the imaginary part. */
typedef double2 cuDoubleComplex;
/* Accessor: real part of a double-precision complex number. */
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
    const double re = x.x;
    return re;
}
/* Accessor: imaginary part of a double-precision complex number. */
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
    const double im = x.y;
    return im;
}
/* Builds a double-precision complex number from real/imaginary parts. */
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
                                                           (double r, double i)
{
    cuDoubleComplex c;
    c.x = r;
    c.y = i;
    return c;
}
/* Complex conjugate: negates the imaginary part. */
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
    const double re = cuCreal(x);
    const double im = cuCimag(x);
    return make_cuDoubleComplex (re, -im);
}
/* Component-wise complex addition (double precision). */
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    const double re = cuCreal(x) + cuCreal(y);
    const double im = cuCimag(x) + cuCimag(y);
    return make_cuDoubleComplex (re, im);
}
/* Component-wise complex subtraction (double precision). */
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    const double re = cuCreal(x) - cuCreal(y);
    const double im = cuCimag(x) - cuCimag(y);
    return make_cuDoubleComplex (re, im);
}
/* This implementation could suffer from intermediate overflow even though
* the final result would be in range. However, various implementations do
* not guard against this (presumably to avoid losing performance), so we
* don't do it either to stay competitive.
*/
/* Complex multiplication via the textbook (a+bi)(c+di) expansion.
 * Operation order is kept identical to the reference implementation so
 * results are bit-for-bit the same. */
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    const double re = (cuCreal(x) * cuCreal(y)) - (cuCimag(x) * cuCimag(y));
    const double im = (cuCreal(x) * cuCimag(y)) + (cuCimag(x) * cuCreal(y));
    return make_cuDoubleComplex (re, im);
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded,
* faster version.
*/
/* Complex division x / y, guarded against intermediate under/overflow
 * by pre-scaling both operands with 1 / (|Re(y)| + |Im(y)|). */
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    cuDoubleComplex quot;
    /* scale factor: L1 norm of the denominator */
    double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
    double oos = 1.0 / s;
    /* both operands scaled into a safe range before the usual formula */
    double ars = cuCreal(x) * oos;
    double ais = cuCimag(x) * oos;
    double brs = cuCreal(y) * oos;
    double bis = cuCimag(y) * oos;
    /* |y_scaled|^2 — the common denominator of the division formula */
    s = (brs * brs) + (bis * bis);
    oos = 1.0 / s;
    quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
                                 ((ais * brs) - (ars * bis)) * oos);
    return quot;
}
/* This implementation guards against intermediate underflow and overflow
* by scaling. Otherwise we would lose half the exponent range. There are
* various ways of doing guarded computation. For now chose the simplest
* and fastest solution, however this may suffer from inaccuracies if sqrt
* and division are not IEEE compliant.
*/
/* Complex magnitude sqrt(a^2 + b^2), computed as a hypot-style scaled
 * form v * sqrt(1 + (w/v)^2) with v = max(|a|,|b|), w = min(|a|,|b|),
 * to avoid losing half the exponent range to intermediate overflow. */
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
    double a = cuCreal(x);
    double b = cuCimag(x);
    double v, w, t;
    a = fabs(a);
    b = fabs(b);
    /* order so that v >= w */
    if (a > b) {
        v = a;
        w = b;
    } else {
        v = b;
        w = a;
    }
    t = w / v;
    t = 1.0 + t * t;
    t = v * sqrt(t);
    /* fallback for v == 0 (w/v is NaN) and for inputs near DBL_MAX,
     * where the scaled form itself could misbehave */
    if ((v == 0.0) ||
        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
        t = v + w;
    }
    return t;
}
#if defined(__cplusplus)
} /* end of the extern "C" block opened at the top of this header */
#endif /* __cplusplus */
/* aliases */
/* cuComplex is the historical name for single-precision cuFloatComplex. */
typedef cuFloatComplex cuComplex;
/* Convenience constructor for the cuComplex alias; simply forwards to
 * make_cuFloatComplex. */
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
                                                                float y)
{
    const cuComplex c = make_cuFloatComplex (x, y);
    return c;
}
/* float-to-double promotion */
/* Widens a single-precision complex value to double precision
 * (exact: every float is representable as a double). */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
                                                      (cuFloatComplex c)
{
    const double re = (double)cuCrealf(c);
    const double im = (double)cuCimagf(c);
    return make_cuDoubleComplex (re, im);
}
/* Narrows a double-precision complex value to single precision
 * (rounds each component to the nearest float). */
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c)
{
    const float re = (float)cuCreal(c);
    const float im = (float)cuCimag(c);
    return make_cuFloatComplex (re, im);
}
/* Complex fused multiply-add: returns x * y + d.  The products are
 * accumulated in a specific two-step order (real*real first, then the
 * negated imag*imag added separately) so the compiler can map each
 * step onto a hardware FMA; do not reassociate. */
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
    float real_res;
    float imag_res;
    real_res = (cuCrealf(x) *  cuCrealf(y)) + cuCrealf(d);
    imag_res = (cuCrealf(x) *  cuCimagf(y)) + cuCimagf(d);
    real_res = -(cuCimagf(x) * cuCimagf(y))  + real_res;
    imag_res =  (cuCimagf(x) *  cuCrealf(y)) + imag_res;
    return make_cuComplex(real_res, imag_res);
}
/* Double-precision complex fused multiply-add: returns x * y + d.
 * Same two-step accumulation order as cuCfmaf so each step can map
 * onto a hardware FMA; do not reassociate. */
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
    double real_res;
    double imag_res;
    real_res = (cuCreal(x) *  cuCreal(y)) + cuCreal(d);
    imag_res = (cuCreal(x) *  cuCimag(y)) + cuCimag(d);
    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;
    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;
    return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */

View File

@@ -1,565 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the CUBLAS library, defining the API
*
* CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
* on top of the CUDA runtime.
*/
#if !defined(CUBLAS_H_)
#define CUBLAS_H_
#include <cuda_runtime.h>
/* CUBLASWINAPI: calling-convention decoration for the exported entry points
 * (__stdcall on Windows, empty elsewhere). */
#ifndef CUBLASWINAPI
#ifdef _WIN32
#define CUBLASWINAPI __stdcall
#else
#define CUBLASWINAPI
#endif
#endif
/* CUBLASAPI: qualifier applied when the header is compiled by nvcc
 * (__host__), empty for a plain host C/C++ compiler. */
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* CUBLAS data types */
#define cublasStatus cublasStatus_t
/* Legacy helper routines: library setup/teardown, error/version query,
 * device memory management and kernel-stream selection. */
cublasStatus CUBLASWINAPI cublasInit (void);
cublasStatus CUBLASWINAPI cublasShutdown (void);
cublasStatus CUBLASWINAPI cublasGetError (void);
cublasStatus CUBLASWINAPI cublasGetVersion(int *version);
cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr);
cublasStatus CUBLASWINAPI cublasFree (void *devicePtr);
cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream);
/* ---------------- CUBLAS BLAS1 functions ---------------- */
/* Prototypes follow the BLAS naming scheme: S/D = real single/double,
 * C/Z = complex single/double; incx/incy are element strides. */
/* NRM2 */
float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx);
double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx);
float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* DOT */
float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y,
int incy);
double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SCAL */
void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx);
void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx);
void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx);
void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AXPY */
void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx,
float *y, int incy);
void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x,
int incx, double *y, int incy);
void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x,
int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* COPY */
void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y,
int incy);
void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y,
int incy);
void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* SWAP */
void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy);
void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy);
void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy);
void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* AMAX */
int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* AMIN */
int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx);
int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx);
int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx);
int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ASUM */
float CUBLASWINAPI cublasSasum (int n, const float *x, int incx);
double CUBLASWINAPI cublasDasum (int n, const double *x, int incx);
float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx);
double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* ROT */
void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss);
void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss);
void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, cuComplex s);
void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y,
int incy, float c, float s);
void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double s);
/*------------------------------------------------------------------------*/
/* ROTG */
void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss);
void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss);
void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc,
cuComplex *cs);
void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc,
cuDoubleComplex *cs);
/*------------------------------------------------------------------------*/
/* ROTM */
void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam);
void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam);
/*------------------------------------------------------------------------*/
/* ROTMG */
void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1,
const float *sy1, float* sparam);
void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1,
const double *sy1, double* sparam);
/* --------------- CUBLAS BLAS2 functions ---------------- */
/* Matrix arguments are column-major; trans/uplo/diag/side are single-char
 * mode flags in the style of the reference Fortran BLAS. */
/* GEMV */
void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GBMV */
void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku,
float alpha, const float *A, int lda,
const float *x, int incx, float beta, float *y,
int incy);
void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
double alpha, const double *A, int lda,
const double *x, int incx, double beta, double *y,
int incy);
void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy);
void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y,
int incy);
/*------------------------------------------------------------------------*/
/* TRMV */
void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBMV */
void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k,
const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPMV */
void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx);
void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TRSV */
void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx);
void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx);
void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TPSV */
void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP,
float *x, int incx);
void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx);
void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP,
cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* TBSV */
void CUBLASWINAPI cublasStbsv(char uplo, char trans,
char diag, int n, int k, const float *A,
int lda, float *x, int incx);
void CUBLASWINAPI cublasDtbsv(char uplo, char trans,
char diag, int n, int k, const double *A,
int lda, double *x, int incx);
void CUBLASWINAPI cublasCtbsv(char uplo, char trans,
char diag, int n, int k, const cuComplex *A,
int lda, cuComplex *x, int incx);
void CUBLASWINAPI cublasZtbsv(char uplo, char trans,
char diag, int n, int k, const cuDoubleComplex *A,
int lda, cuDoubleComplex *x, int incx);
/*------------------------------------------------------------------------*/
/* SYMV/HEMV */
void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A,
int lda, const float *x, int incx, float beta,
float *y, int incy);
void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A,
int lda, const double *x, int incx, double beta,
double *y, int incy);
void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A,
int lda, const cuComplex *x, int incx, cuComplex beta,
cuComplex *y, int incy);
void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SBMV/HBMV */
void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha,
const float *A, int lda, const float *x, int incx,
float beta, float *y, int incy);
void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha,
const double *A, int lda, const double *x, int incx,
double beta, double *y, int incy);
void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* SPMV/HPMV */
void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha,
const float *AP, const float *x,
int incx, float beta, float *y, int incy);
void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha,
const double *AP, const double *x,
int incx, double beta, double *y, int incy);
void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha,
const cuComplex *AP, const cuComplex *x,
int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *AP, const cuDoubleComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*------------------------------------------------------------------------*/
/* GER */
void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx,
const float *y, int incy, float *A, int lda);
void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx,
const double *y, int incy, double *A, int lda);
void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy,
cuComplex *A, int lda);
void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy,
cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SYR/HER */
void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x,
int incx, float *A, int lda);
void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x,
int incx, double *A, int lda);
void CUBLASWINAPI cublasCher (char uplo, int n, float alpha,
const cuComplex *x, int incx, cuComplex *A, int lda);
void CUBLASWINAPI cublasZher (char uplo, int n, double alpha,
const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda);
/*------------------------------------------------------------------------*/
/* SPR/HPR */
void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x,
int incx, float *AP);
void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x,
int incx, double *AP);
void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x,
int incx, cuComplex *AP);
void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x,
int incx, cuDoubleComplex *AP);
/*------------------------------------------------------------------------*/
/* SYR2/HER2 */
void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *A,
int lda);
void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x,
int incx, const double *y, int incy, double *A,
int lda);
void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x,
int incx, const cuComplex *y, int incy, cuComplex *A,
int lda);
void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x,
int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A,
int lda);
/*------------------------------------------------------------------------*/
/* SPR2/HPR2 */
void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x,
int incx, const float *y, int incy, float *AP);
void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *AP);
void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
const cuComplex *x, int incx, const cuComplex *y,
int incy, cuComplex *AP);
void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDoubleComplex *y,
int incy, cuDoubleComplex *AP);
/* ------------------------BLAS3 Functions ------------------------------- */
/* GEMM */
void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k,
float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta, double *C,
int ldc);
void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C,
int ldc);
/* -------------------------------------------------------*/
/* SYRK */
void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, float beta, float *C,
int ldc);
void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k,
cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HERK */
void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k,
float alpha, const cuComplex *A, int lda,
float beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* SYR2K */
void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
const double *B, int ldb, double beta,
double *C, int ldc);
void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/* ------------------------------------------------------- */
/* HER2K */
void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, double beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* SYMM*/
void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc);
void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc);
void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
const cuComplex *A, int lda, const cuComplex *B, int ldb,
cuComplex beta, cuComplex *C, int ldc);
void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* HEMM*/
void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n,
cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta,
cuComplex *C, int ldc);
void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComplex beta,
cuDoubleComplex *C, int ldc);
/*------------------------------------------------------------------------*/
/* TRSM*/
void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb);
/*------------------------------------------------------------------------*/
/* TRMM*/
void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag,
int m, int n, float alpha, const float *A, int lda,
float *B, int ldb);
void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa,
char diag, int m, int n, double alpha,
const double *A, int lda, double *B,
int ldb);
void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb);
void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComplex *B,
int ldb);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */

File diff suppressed because it is too large Load Diff

View File

@@ -1,274 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
/*
* This is the public header file for the new CUBLAS library API, it mapped the generic
* Cublas name functions to the actual _v2 implementations.
*/
#if !defined(CUBLAS_V2_H_)
#define CUBLAS_V2_H_
#undef CUBLASAPI
#ifdef __CUDACC__
#define CUBLASAPI __host__ __device__
#else
#define CUBLASAPI
#endif
#include "cublas_api.h"
#define cublasCreate cublasCreate_v2
#define cublasDestroy cublasDestroy_v2
#define cublasGetVersion cublasGetVersion_v2
#define cublasSetStream cublasSetStream_v2
#define cublasGetStream cublasGetStream_v2
#define cublasGetPointerMode cublasGetPointerMode_v2
#define cublasSetPointerMode cublasSetPointerMode_v2
/* Blas3 Routines */
#define cublasSnrm2 cublasSnrm2_v2
#define cublasDnrm2 cublasDnrm2_v2
#define cublasScnrm2 cublasScnrm2_v2
#define cublasDznrm2 cublasDznrm2_v2
#define cublasSdot cublasSdot_v2
#define cublasDdot cublasDdot_v2
#define cublasCdotu cublasCdotu_v2
#define cublasCdotc cublasCdotc_v2
#define cublasZdotu cublasZdotu_v2
#define cublasZdotc cublasZdotc_v2
#define cublasSscal cublasSscal_v2
#define cublasDscal cublasDscal_v2
#define cublasCscal cublasCscal_v2
#define cublasCsscal cublasCsscal_v2
#define cublasZscal cublasZscal_v2
#define cublasZdscal cublasZdscal_v2
#define cublasSaxpy cublasSaxpy_v2
#define cublasDaxpy cublasDaxpy_v2
#define cublasCaxpy cublasCaxpy_v2
#define cublasZaxpy cublasZaxpy_v2
#define cublasScopy cublasScopy_v2
#define cublasDcopy cublasDcopy_v2
#define cublasCcopy cublasCcopy_v2
#define cublasZcopy cublasZcopy_v2
#define cublasSswap cublasSswap_v2
#define cublasDswap cublasDswap_v2
#define cublasCswap cublasCswap_v2
#define cublasZswap cublasZswap_v2
#define cublasIsamax cublasIsamax_v2
#define cublasIdamax cublasIdamax_v2
#define cublasIcamax cublasIcamax_v2
#define cublasIzamax cublasIzamax_v2
#define cublasIsamin cublasIsamin_v2
#define cublasIdamin cublasIdamin_v2
#define cublasIcamin cublasIcamin_v2
#define cublasIzamin cublasIzamin_v2
#define cublasSasum cublasSasum_v2
#define cublasDasum cublasDasum_v2
#define cublasScasum cublasScasum_v2
#define cublasDzasum cublasDzasum_v2
#define cublasSrot cublasSrot_v2
#define cublasDrot cublasDrot_v2
#define cublasCrot cublasCrot_v2
#define cublasCsrot cublasCsrot_v2
#define cublasZrot cublasZrot_v2
#define cublasZdrot cublasZdrot_v2
#define cublasSrotg cublasSrotg_v2
#define cublasDrotg cublasDrotg_v2
#define cublasCrotg cublasCrotg_v2
#define cublasZrotg cublasZrotg_v2
#define cublasSrotm cublasSrotm_v2
#define cublasDrotm cublasDrotm_v2
#define cublasSrotmg cublasSrotmg_v2
#define cublasDrotmg cublasDrotmg_v2
/* Blas2 Routines */
#define cublasSgemv cublasSgemv_v2
#define cublasDgemv cublasDgemv_v2
#define cublasCgemv cublasCgemv_v2
#define cublasZgemv cublasZgemv_v2
#define cublasSgbmv cublasSgbmv_v2
#define cublasDgbmv cublasDgbmv_v2
#define cublasCgbmv cublasCgbmv_v2
#define cublasZgbmv cublasZgbmv_v2
#define cublasStrmv cublasStrmv_v2
#define cublasDtrmv cublasDtrmv_v2
#define cublasCtrmv cublasCtrmv_v2
#define cublasZtrmv cublasZtrmv_v2
#define cublasStbmv cublasStbmv_v2
#define cublasDtbmv cublasDtbmv_v2
#define cublasCtbmv cublasCtbmv_v2
#define cublasZtbmv cublasZtbmv_v2
#define cublasStpmv cublasStpmv_v2
#define cublasDtpmv cublasDtpmv_v2
#define cublasCtpmv cublasCtpmv_v2
#define cublasZtpmv cublasZtpmv_v2
#define cublasStrsv cublasStrsv_v2
#define cublasDtrsv cublasDtrsv_v2
#define cublasCtrsv cublasCtrsv_v2
#define cublasZtrsv cublasZtrsv_v2
#define cublasStpsv cublasStpsv_v2
#define cublasDtpsv cublasDtpsv_v2
#define cublasCtpsv cublasCtpsv_v2
#define cublasZtpsv cublasZtpsv_v2
#define cublasStbsv cublasStbsv_v2
#define cublasDtbsv cublasDtbsv_v2
#define cublasCtbsv cublasCtbsv_v2
#define cublasZtbsv cublasZtbsv_v2
#define cublasSsymv cublasSsymv_v2
#define cublasDsymv cublasDsymv_v2
#define cublasCsymv cublasCsymv_v2
#define cublasZsymv cublasZsymv_v2
#define cublasChemv cublasChemv_v2
#define cublasZhemv cublasZhemv_v2
#define cublasSsbmv cublasSsbmv_v2
#define cublasDsbmv cublasDsbmv_v2
#define cublasChbmv cublasChbmv_v2
#define cublasZhbmv cublasZhbmv_v2
#define cublasSspmv cublasSspmv_v2
#define cublasDspmv cublasDspmv_v2
#define cublasChpmv cublasChpmv_v2
#define cublasZhpmv cublasZhpmv_v2
#define cublasSger cublasSger_v2
#define cublasDger cublasDger_v2
#define cublasCgeru cublasCgeru_v2
#define cublasCgerc cublasCgerc_v2
#define cublasZgeru cublasZgeru_v2
#define cublasZgerc cublasZgerc_v2
#define cublasSsyr cublasSsyr_v2
#define cublasDsyr cublasDsyr_v2
#define cublasCsyr cublasCsyr_v2
#define cublasZsyr cublasZsyr_v2
#define cublasCher cublasCher_v2
#define cublasZher cublasZher_v2
#define cublasSspr cublasSspr_v2
#define cublasDspr cublasDspr_v2
#define cublasChpr cublasChpr_v2
#define cublasZhpr cublasZhpr_v2
#define cublasSsyr2 cublasSsyr2_v2
#define cublasDsyr2 cublasDsyr2_v2
#define cublasCsyr2 cublasCsyr2_v2
#define cublasZsyr2 cublasZsyr2_v2
#define cublasCher2 cublasCher2_v2
#define cublasZher2 cublasZher2_v2
#define cublasSspr2 cublasSspr2_v2
#define cublasDspr2 cublasDspr2_v2
#define cublasChpr2 cublasChpr2_v2
#define cublasZhpr2 cublasZhpr2_v2
/* Blas3 Routines */
#define cublasSgemm cublasSgemm_v2
#define cublasDgemm cublasDgemm_v2
#define cublasCgemm cublasCgemm_v2
#define cublasZgemm cublasZgemm_v2
#define cublasSsyrk cublasSsyrk_v2
#define cublasDsyrk cublasDsyrk_v2
#define cublasCsyrk cublasCsyrk_v2
#define cublasZsyrk cublasZsyrk_v2
#define cublasCherk cublasCherk_v2
#define cublasZherk cublasZherk_v2
#define cublasSsyr2k cublasSsyr2k_v2
#define cublasDsyr2k cublasDsyr2k_v2
#define cublasCsyr2k cublasCsyr2k_v2
#define cublasZsyr2k cublasZsyr2k_v2
#define cublasCher2k cublasCher2k_v2
#define cublasZher2k cublasZher2k_v2
#define cublasSsymm cublasSsymm_v2
#define cublasDsymm cublasDsymm_v2
#define cublasCsymm cublasCsymm_v2
#define cublasZsymm cublasZsymm_v2
#define cublasChemm cublasChemm_v2
#define cublasZhemm cublasZhemm_v2
#define cublasStrsm cublasStrsm_v2
#define cublasDtrsm cublasDtrsm_v2
#define cublasCtrsm cublasCtrsm_v2
#define cublasZtrsm cublasZtrsm_v2
#define cublasStrmm cublasStrmm_v2
#define cublasDtrmm cublasDtrmm_v2
#define cublasCtrmm cublasCtrmm_v2
#define cublasZtrmm cublasZtrmm_v2
#endif /* !defined(CUBLAS_V2_H_) */

View File

@@ -1,248 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
#define __CUDA_DEVICE_RUNTIME_API_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__CUDACC_RTC__)
#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
#if defined(__cplusplus)
extern "C" {
#endif
struct cudaFuncAttributes;
#if defined(_WIN32)
#define __NV_WEAK__ __declspec(nv_weak)
#else
#define __NV_WEAK__ __attribute__((nv_weak))
#endif
/* Weak device-side stub for cudaMalloc: unconditionally fails with
   cudaErrorUnknown. Declared nv_weak so a real implementation (presumably
   supplied when the CUDA device runtime library is linked) overrides it --
   NOTE(review): confirm nv_weak link-time override semantics. */
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
{
return cudaErrorUnknown;
}
/* Weak device-side stub for cudaFuncGetAttributes: always returns
   cudaErrorUnknown; intended to be replaced by a strong definition at link
   time (nv_weak) -- TODO confirm. */
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
{
return cudaErrorUnknown;
}
/* Weak device-side stub for cudaDeviceGetAttribute: always reports failure
   (cudaErrorUnknown); nv_weak allows a real device-runtime implementation
   to take precedence -- TODO confirm. */
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
{
return cudaErrorUnknown;
}
/* Weak device-side stub for cudaGetDevice: always fails with
   cudaErrorUnknown; meant to be overridden by the linked device runtime
   (nv_weak) -- TODO confirm. */
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
{
return cudaErrorUnknown;
}
/* Weak device-side stub for cudaOccupancyMaxActiveBlocksPerMultiprocessor:
   always returns cudaErrorUnknown; overridable via nv_weak linkage --
   TODO confirm. */
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}
/* Weak device-side stub for the flagged occupancy query: always returns
   cudaErrorUnknown; overridable via nv_weak linkage -- TODO confirm. */
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}
#undef __NV_WEAK__
#if defined(__cplusplus)
}
#endif
#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
#endif /* !defined(__CUDACC_RTC__) */
#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only
#include "driver_types.h"
#include "host_defines.h"
extern "C"
{
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
/**
* \ingroup CUDART_EXECUTION
* \brief Obtains a parameter buffer
*
* Obtains a parameter buffer which can be filled with parameters for a kernel launch.
* Parameters passed to ::cudaLaunchDevice must be allocated via this function.
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch kernels.
*
* \param alignment - Specifies alignment requirement of the parameter buffer
* \param size - Specifies size requirement in bytes
*
* \return
* Returns pointer to the allocated parameterBuffer
* \notefnerr
*
* \sa cudaLaunchDevice
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
/**
* \ingroup CUDART_EXECUTION
* \brief Launches a specified kernel
*
* Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
* by calling ::cudaGetParameterBuffer().
*
* This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
* CUDA user code should use <<< >>> to launch the kernels.
*
* \param func - Pointer to the kernel to be launched
* \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
* \param gridDimension - Specifies grid dimensions
* \param blockDimension - Specifies block dimensions
* \param sharedMemSize - Specifies size of shared memory
* \param stream - Specifies the stream to be used
*
* \return
* ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
* ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
* \notefnerr
* \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
* Guide for the detailed descriptions of launch configuration and parameter layout respectively.
*
* \sa cudaGetParameterBuffer
*/
extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
// When compiling for the device and per thread default stream is enabled, add
// a static inline redirect to the per thread stream entry points.
/* Per-thread-default-stream build: cudaLaunchDevice is a thin inline
   redirect to the _ptsz entry point (all arguments forwarded unchanged). */
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
{
return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
}
/* Per-thread-default-stream build: cudaLaunchDeviceV2 is a thin inline
   redirect to the _ptsz entry point (arguments forwarded unchanged). */
static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
{
return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
}
#else
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
#endif
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
}
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
#endif /* defined(__cplusplus) && defined(__CUDACC__) */
#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,69 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DEVICE_TYPES_H__)
#define __DEVICE_TYPES_H__
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* Floating-point rounding-direction selector. Enumerator names indicate the
   IEEE-style rounding mode; the consuming APIs are not visible from this
   header -- NOTE(review): confirm against the CUDA math/conversion docs. */
enum __device_builtin__ cudaRoundMode
{
cudaRoundNearest, /* round to nearest */
cudaRoundZero, /* round toward zero */
cudaRoundPosInf, /* round toward +infinity */
cudaRoundMinInf /* round toward -infinity */
};
#endif /* !__DEVICE_TYPES_H__ */

View File

@@ -1,145 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__
#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"
/**
* \addtogroup CUDART_MEMORY
*
* @{
*/
/**
 * \brief Builds a ::cudaPitchedPtr from its four components
 *
 * Packs the allocation pointer \p d, its row pitch \p p, and the logical
 * allocation dimensions \p xsz / \p ysz into a ::cudaPitchedPtr value.
 *
 * \param d - Pointer to allocated memory
 * \param p - Pitch of allocated memory in bytes
 * \param xsz - Logical width of allocation in elements
 * \param ysz - Logical height of allocation in elements
 *
 * \return
 * ::cudaPitchedPtr populated from \p d, \p p, \p xsz, and \p ysz
 *
 * \sa make_cudaExtent, make_cudaPos
 */
static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
{
  struct cudaPitchedPtr pitched;
  pitched.ysize = ysz;
  pitched.xsize = xsz;
  pitched.pitch = p;
  pitched.ptr   = d;
  return pitched;
}
/**
 * \brief Builds a ::cudaPos from individual coordinates
 *
 * Packs the coordinates \p x, \p y, and \p z into a ::cudaPos value.
 *
 * \param x - X position
 * \param y - Y position
 * \param z - Z position
 *
 * \return
 * ::cudaPos holding \p x, \p y, and \p z
 *
 * \sa make_cudaExtent, make_cudaPitchedPtr
 */
static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
{
  struct cudaPos pos;
  pos.z = z;
  pos.y = y;
  pos.x = x;
  return pos;
}
/**
 * \brief Builds a ::cudaExtent from individual dimensions
 *
 * Packs \p w, \p h, and \p d into a ::cudaExtent value.
 *
 * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
 * \param h - Height in elements
 * \param d - Depth in elements
 *
 * \return
 * ::cudaExtent holding \p w, \p h, and \p d
 *
 * \sa make_cudaPitchedPtr, make_cudaPos
 */
static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
{
  struct cudaExtent ext;
  ext.depth  = d;
  ext.height = h;
  ext.width  = w;
  return ext;
}
/** @} */ /* END CUDART_MEMORY */
#endif /* !__DRIVER_FUNCTIONS_H__ */

File diff suppressed because it is too large Load Diff

View File

@@ -1,50 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_config.h"

View File

@@ -1,50 +0,0 @@
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#include "crt/host_defines.h"

View File

@@ -1,80 +0,0 @@
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__LIBRARY_TYPES_H__)
#define __LIBRARY_TYPES_H__
/* Scalar/complex element-type tags used by CUDA libraries to describe the
 * data type of array elements.  The numeric values are explicit and
 * non-contiguous, which suggests they are ABI-stable -- do not renumber. */
typedef enum cudaDataType_t
{
CUDA_R_16F= 2, /* real as a half */
CUDA_C_16F= 6, /* complex as a pair of half numbers */
CUDA_R_32F= 0, /* real as a float */
CUDA_C_32F= 4, /* complex as a pair of float numbers */
CUDA_R_64F= 1, /* real as a double */
CUDA_C_64F= 5, /* complex as a pair of double numbers */
CUDA_R_8I = 3, /* real as a signed char */
CUDA_C_8I = 7, /* complex as a pair of signed char numbers */
CUDA_R_8U = 8, /* real as a unsigned char */
CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */
CUDA_R_32I= 10, /* real as a signed int */
CUDA_C_32I= 11, /* complex as a pair of signed int numbers */
CUDA_R_32U= 12, /* real as a unsigned int */
CUDA_C_32U= 13 /* complex as a pair of unsigned int numbers */
} cudaDataType;
/* Selects which component of a CUDA library's version number is being
 * queried (presumably by the libraries' "GetProperty"-style APIs --
 * TODO confirm against the callers). */
typedef enum libraryPropertyType_t
{
MAJOR_VERSION, /* major version number */
MINOR_VERSION, /* minor version number */
PATCH_LEVEL    /* patch level */
} libraryPropertyType;
#endif /* !__LIBRARY_TYPES_H__ */

View File

@@ -1,525 +0,0 @@
//
// NVIDIA_COPYRIGHT_BEGIN
//
// Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
//
// NVIDIA_COPYRIGHT_END
//
#ifndef __NVRTC_H__
#define __NVRTC_H__
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
/*************************************************************************//**
*
* \defgroup error Error Handling
*
* NVRTC defines the following enumeration type and function for API call
* error handling.
*
****************************************************************************/
/**
* \ingroup error
* \brief The enumerated type nvrtcResult defines API call result codes.
* NVRTC API functions return nvrtcResult to indicate the call
* result.
*/
typedef enum {
NVRTC_SUCCESS = 0,
NVRTC_ERROR_OUT_OF_MEMORY = 1,
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
NVRTC_ERROR_INVALID_INPUT = 3,
NVRTC_ERROR_INVALID_PROGRAM = 4,
NVRTC_ERROR_INVALID_OPTION = 5,
NVRTC_ERROR_COMPILATION = 6,
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, /* see ::nvrtcAddNameExpression */
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, /* see ::nvrtcGetLoweredName */
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, /* see ::nvrtcGetLoweredName */
NVRTC_ERROR_INTERNAL_ERROR = 11 /* unexpected failure inside NVRTC itself */
} nvrtcResult;
/**
* \ingroup error
* \brief nvrtcGetErrorString is a helper function that returns a string
* describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
* \c "NVRTC_SUCCESS".
* For unrecognized enumeration values, it returns
* \c "NVRTC_ERROR unknown".
*
* \param [in] result CUDA Runtime Compilation API result code.
* \return Message string for the given #nvrtcResult code.
*/
const char *nvrtcGetErrorString(nvrtcResult result);
/*************************************************************************//**
*
* \defgroup query General Information Query
*
* NVRTC defines the following function for general information query.
*
****************************************************************************/
/**
* \ingroup query
* \brief nvrtcVersion sets the output parameters \p major and \p minor
* with the CUDA Runtime Compilation version number.
*
* \param [out] major CUDA Runtime Compilation major version number.
* \param [out] minor CUDA Runtime Compilation minor version number.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
*
*/
nvrtcResult nvrtcVersion(int *major, int *minor);
/*************************************************************************//**
*
* \defgroup compilation Compilation
*
* NVRTC defines the following type and functions for actual compilation.
*
****************************************************************************/
/**
* \ingroup compilation
* \brief nvrtcProgram is the unit of compilation, and an opaque handle for
* a program.
*
* To compile a CUDA program string, an instance of nvrtcProgram must be
* created first with ::nvrtcCreateProgram, then compiled with
* ::nvrtcCompileProgram.
*/
typedef struct _nvrtcProgram *nvrtcProgram;
/**
* \ingroup compilation
* \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
* given input parameters, and sets the output parameter \p prog with
* it.
*
* \param [out] prog CUDA Runtime Compilation program.
* \param [in] src CUDA program source.
* \param [in] name CUDA program name.\n
* \p name can be \c NULL; \c "default_program" is
* used when \p name is \c NULL.
* \param [in] numHeaders Number of headers used.\n
* \p numHeaders must be greater than or equal to 0.
* \param [in] headers Sources of the headers.\n
* \p headers can be \c NULL when \p numHeaders is
* 0.
* \param [in] includeNames Name of each header by which they can be
* included in the CUDA program source.\n
* \p includeNames can be \c NULL when \p numHeaders
* is 0.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcDestroyProgram
*/
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
const char *src,
const char *name,
int numHeaders,
const char * const *headers,
const char * const *includeNames);
/**
* \ingroup compilation
* \brief nvrtcDestroyProgram destroys the given program.
*
* \param [in] prog CUDA Runtime Compilation program.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcCreateProgram
*/
nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
/**
* \ingroup compilation
* \brief nvrtcCompileProgram compiles the given program.
*
* It supports compile options listed in \ref options.
*/
nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
int numOptions, const char * const *options);
/**
* \ingroup compilation
* \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
* generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptxSizeRet Size of the generated PTX (including the trailing
* \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTX
*/
nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetPTX stores the PTX generated by the previous compilation
* of \p prog in the memory pointed by \p ptx.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] ptx Compiled result.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetPTXSize
*/
nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
* log generated by the previous compilation of \p prog (including the
* trailing \c NULL).
*
* Note that compilation log may be generated with warnings and informative
* messages, even when the compilation of \p prog succeeds.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] logSizeRet Size of the compilation log
* (including the trailing \c NULL).
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLog
*/
nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
/**
* \ingroup compilation
* \brief nvrtcGetProgramLog stores the log generated by the previous
* compilation of \p prog in the memory pointed by \p log.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [out] log Compilation log.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
*
* \see ::nvrtcGetProgramLogSize
*/
nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
/**
* \ingroup compilation
* \brief nvrtcAddNameExpression notes the given name expression
* denoting a __global__ function or function template
* instantiation.
*
* The identical name expression string must be provided on a subsequent
* call to nvrtcGetLoweredName to extract the lowered name.
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
*
* \see ::nvrtcGetLoweredName
*/
nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
const char * const name_expression);
/**
* \ingroup compilation
* \brief nvrtcGetLoweredName extracts the lowered (mangled) name
* for a __global__ function or function template instantiation,
* and updates *lowered_name to point to it. The memory containing
* the name is released when the NVRTC program is destroyed by
* nvrtcDestroyProgram.
* The identical name expression must have been previously
* provided to nvrtcAddNameExpression.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] name_expression constant expression denoting a __global__
* function or function template instantiation.
* \param [out] lowered_name initialized by the function to point to a
* C string containing the lowered (mangled)
* name corresponding to the provided name expression.
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
*
* \see ::nvrtcAddNameExpression
*/
nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
const char *const name_expression,
const char** lowered_name);
/**
* \defgroup options Supported Compile Options
*
* NVRTC supports the compile options below.
 * Option names with two preceding dashes (\c --) are long option names and
* option names with one preceding dash (\c -) are short option names.
* Short option names can be used instead of long option names.
* When a compile option takes an argument, an assignment operator (\c =)
* is used to separate the compile option argument from the compile option
* name, e.g., \c "--gpu-architecture=compute_30".
* Alternatively, the compile option name and the argument can be specified in
 * separate strings without an assignment operator, e.g.,
* \c "--gpu-architecture" \c "compute_30".
* Single-character short option names, such as \c -D, \c -U, and \c -I, do
* not require an assignment operator, and the compile option name and the
* argument can be present in the same string with or without spaces between
* them.
* For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
* supported.
*
* The valid compiler options are:
*
* - Compilation targets
* - \c --gpu-architecture=\<arch\> (\c -arch)\n
* Specify the name of the class of GPU architectures for which the
* input must be compiled.\n
* - Valid <c>\<arch\></c>s:
* - \c compute_30
* - \c compute_32
* - \c compute_35
* - \c compute_37
* - \c compute_50
* - \c compute_52
* - \c compute_53
* - \c compute_60
* - \c compute_61
* - \c compute_62
* - \c compute_70
* - \c compute_72
* - Default: \c compute_30
* - Separate compilation / whole-program compilation
* - \c --device-c (\c -dc)\n
* Generate relocatable code that can be linked with other relocatable
* device code. It is equivalent to --relocatable-device-code=true.
* - \c --device-w (\c -dw)\n
* Generate non-relocatable code. It is equivalent to
* \c --relocatable-device-code=false.
* - \c --relocatable-device-code={true|false} (\c -rdc)\n
* Enable (disable) the generation of relocatable device code.
* - Default: \c false
* - Debugging support
* - \c --device-debug (\c -G)\n
* Generate debug information.
* - \c --generate-line-info (\c -lineinfo)\n
* Generate line-number information.
* - Code generation
* - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
* Specify the maximum amount of registers that GPU functions can use.
* Until a function-specific limit, a higher value will generally
* increase the performance of individual GPU threads that execute this
* function. However, because thread registers are allocated from a
* global register pool on each GPU, a higher value of this option will
* also reduce the maximum thread block size, thereby reducing the amount
* of thread parallelism. Hence, a good maxrregcount value is the result
* of a trade-off. If this option is not specified, then no maximum is
* assumed. Value less than the minimum registers required by ABI will
* be bumped up by the compiler to ABI minimum limit.
* - \c --ftz={true|false} (\c -ftz)\n
* When performing single-precision floating-point operations, flush
* denormal values to zero or preserve denormal values.
* \c --use_fast_math implies \c --ftz=true.
* - Default: \c false
* - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
* For single-precision floating-point square root, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-sqrt=false.
* - Default: \c true
* - \c --prec-div={true|false} (\c -prec-div)\n
* For single-precision floating-point division and reciprocals, use IEEE
* round-to-nearest mode or use a faster approximation.
* \c --use_fast_math implies \c --prec-div=false.
* - Default: \c true
* - \c --fmad={true|false} (\c -fmad)\n
* Enables (disables) the contraction of floating-point multiplies and
* adds/subtracts into floating-point multiply-add operations (FMAD,
* FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
* - Default: \c true
* - \c --use_fast_math (\c -use_fast_math)\n
* Make use of fast math operations.
* \c --use_fast_math implies \c --ftz=true \c --prec-div=false
* \c --prec-sqrt=false \c --fmad=true.
* - Preprocessing
* - \c --define-macro=\<def\> (\c -D)\n
* \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
* - \c \<name\> \n
* Predefine \c \<name\> as a macro with definition \c 1.
* - \c \<name\>=\<definition\> \n
* The contents of \c \<definition\> are tokenized and preprocessed
* as if they appeared during translation phase three in a \c \#define
* directive. In particular, the definition will be truncated by
* embedded new line characters.
* - \c --undefine-macro=\<def\> (\c -U)\n
* Cancel any previous definition of \c \<def\>.
* - \c --include-path=\<dir\> (\c -I)\n
* Add the directory \c \<dir\> to the list of directories to be
* searched for headers. These paths are searched after the list of
* headers given to ::nvrtcCreateProgram.
* - \c --pre-include=\<header\> (\c -include)\n
* Preinclude \c \<header\> during preprocessing.
* - Language Dialect
* - \c --std={c++11|c++14} (\c -std={c++11|c++14})\n
* Set language dialect to C++11 or C++14.
* - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
* Provide builtin definitions of \c std::move and \c std::forward,
* when C++11 language dialect is selected.
* - Default: \c true
* - \c --builtin-initializer-list={true|false}
* (\c -builtin-initializer-list)\n
* Provide builtin definitions of \c std::initializer_list class and
* member functions when C++11 language dialect is selected.
* - Default: \c true
* - Misc.
* - \c --disable-warnings (\c -w)\n
* Inhibit all warning messages.
* - \c --restrict (\c -restrict)\n
* Programmer assertion that all kernel pointer parameters are restrict
* pointers.
* - \c --device-as-default-execution-space
* (\c -default-device)\n
* Treat entities with no execution space annotation as \c __device__
* entities.
*
* \param [in] prog CUDA Runtime Compilation program.
* \param [in] numOptions Number of compiler options passed.
* \param [in] options Compiler options in the form of C string array.\n
* \p options can be \c NULL when \p numOptions is 0.
*
* \return
* - \link #nvrtcResult NVRTC_SUCCESS \endlink
* - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
* - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
* - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
* - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
*/
#ifdef __cplusplus
}
#endif /* __cplusplus */
/* The utility function 'nvrtcGetTypeName' is not available by default. Define
the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
*/
#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
#include <cxxabi.h>
#include <cstdlib>
#elif defined(_WIN32)
#include <Windows.h>
#include <DbgHelp.h>
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
#include <string>
#include <typeinfo>
/*************************************************************************//**
*
* \defgroup hosthelper Host Helper
*
* NVRTC defines the following functions for easier interaction with host code.
*
****************************************************************************/
/**
 * \ingroup hosthelper
 * \brief nvrtcGetTypeName stores the source level name of the template type argument
 *        T in the given std::string location.
 *
 * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
 * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
 * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
 * respectively. If the name extraction fails, it will return
 * NVRTC_ERROR_INTERNAL_ERROR and *result is left unmodified; otherwise
 * *result is initialized with the extracted name.
 *
 * \param [in] result: pointer to std::string in which to store the type name.
 * \return
 *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
 *   - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
 *
 */
template <typename T>
nvrtcResult nvrtcGetTypeName(std::string *result)
{
const char *name = typeid(T).name();
/* Bugfix: this condition previously tested the never-defined macro
   USE_CXXABI while the matching #include block above tests
   NVRTC_USE_CXXABI, so defining NVRTC_USE_CXXABI alone pulled in
   <cxxabi.h> but skipped this demangling branch.  Keep the two
   conditions in sync. */
#if NVRTC_USE_CXXABI || __clang__ || __GNUC__
int status;
/* __cxa_demangle returns a malloc()'d buffer on success (status == 0)
   and NULL on failure, so free() is only needed on the success path. */
char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
if (status == 0) {
*result = undecorated_name;
free(undecorated_name);
return NVRTC_SUCCESS;
}
#elif defined(_WIN32)
char undecorated_name[4096];
if(UnDecorateSymbolName(name, undecorated_name,
sizeof(undecorated_name) / sizeof(*undecorated_name),
UNDNAME_COMPLETE) ) {
*result = undecorated_name;
return NVRTC_SUCCESS;
}
#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
return NVRTC_ERROR_INTERNAL_ERROR;
}
#endif /* NVRTC_GET_TYPE_NAME */
#endif /* __NVRTC_H__ */

View File

@@ -1,119 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__SURFACE_TYPES_H__)
#define __SURFACE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaSurfaceType1D 0x01
#define cudaSurfaceType2D 0x02
#define cudaSurfaceType3D 0x03
#define cudaSurfaceTypeCubemap 0x0C
#define cudaSurfaceType1DLayered 0xF1
#define cudaSurfaceType2DLayered 0xF2
#define cudaSurfaceTypeCubemapLayered 0xFC
/**
 * CUDA Surface boundary modes: how out-of-range surface accesses are
 * handled (zero-fill, clamp to edge, or trap -- per the value names;
 * see the CUDA Runtime API documentation for exact semantics).
 */
enum __device_builtin__ cudaSurfaceBoundaryMode
{
cudaBoundaryModeZero = 0, /**< Zero boundary mode */
cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */
cudaBoundaryModeTrap = 2 /**< Trap boundary mode */
};
/**
 * CUDA Surface format modes: whether the surface format is taken as
 * explicitly specified (forced) or derived automatically -- per the
 * value names; TODO confirm against the CUDA Runtime API documentation.
 */
enum __device_builtin__ cudaSurfaceFormatMode
{
cudaFormatModeForced = 0, /**< Forced format mode */
cudaFormatModeAuto = 1 /**< Auto format mode */
};
/**
 * CUDA Surface reference.  Only the channel format descriptor is stored
 * here; unlike ::textureReference there is no sampling state.
 */
struct __device_builtin__ surfaceReference
{
/**
 * Channel descriptor for surface reference
 */
struct cudaChannelFormatDesc channelDesc;
};
/**
* An opaque value that represents a CUDA Surface object
*/
typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__SURFACE_TYPES_H__ */

View File

@@ -1,217 +0,0 @@
/*
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "driver_types.h"
/**
* \addtogroup CUDART_TYPES
*
* @{
*/
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#define cudaTextureType1D 0x01
#define cudaTextureType2D 0x02
#define cudaTextureType3D 0x03
#define cudaTextureTypeCubemap 0x0C
#define cudaTextureType1DLayered 0xF1
#define cudaTextureType2DLayered 0xF2
#define cudaTextureTypeCubemapLayered 0xFC
/**
 * CUDA texture address modes: behavior for texture coordinates falling
 * outside the texture extent (wrap around, clamp to edge, mirror, or
 * use the border color -- per the value names; see the CUDA Runtime API
 * documentation for exact semantics).
 */
enum __device_builtin__ cudaTextureAddressMode
{
cudaAddressModeWrap = 0, /**< Wrapping address mode */
cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
cudaAddressModeMirror = 2, /**< Mirror address mode */
cudaAddressModeBorder = 3 /**< Border address mode */
};
/**
 * CUDA texture filter modes: nearest-point sampling versus linear
 * interpolation between texels (per the value names).
 */
enum __device_builtin__ cudaTextureFilterMode
{
cudaFilterModePoint = 0, /**< Point filter mode */
cudaFilterModeLinear = 1 /**< Linear filter mode */
};
/**
 * CUDA texture read modes: return texels as the stored element type, or
 * converted to a normalized float (per the value names).
 */
enum __device_builtin__ cudaTextureReadMode
{
cudaReadModeElementType = 0, /**< Read texture as specified element type */
cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
};
/**
 * CUDA texture reference: filtering, addressing, mipmap-clamp and channel
 * format state for a texture binding.
 */
struct __device_builtin__ textureReference
{
/**
 * Indicates whether texture reads are normalized or not
 */
int normalized;
/**
 * Texture filter mode
 */
enum cudaTextureFilterMode filterMode;
/**
 * Texture address mode for up to 3 dimensions
 */
enum cudaTextureAddressMode addressMode[3];
/**
 * Channel descriptor for the texture reference
 */
struct cudaChannelFormatDesc channelDesc;
/**
 * Perform sRGB->linear conversion during texture read
 */
int sRGB;
/**
 * Limit to the anisotropy ratio
 */
unsigned int maxAnisotropy;
/**
 * Mipmap filter mode
 */
enum cudaTextureFilterMode mipmapFilterMode;
/**
 * Offset applied to the supplied mipmap level
 */
float mipmapLevelBias;
/**
 * Lower end of the mipmap level range to clamp access to
 */
float minMipmapLevelClamp;
/**
 * Upper end of the mipmap level range to clamp access to
 */
float maxMipmapLevelClamp;
/* Reserved -- presumably padding to keep the struct size stable across
   CUDA versions; not for application use. */
int __cudaReserved[15];
};
/**
 * CUDA texture descriptor: per-object sampling parameters.  Carries the
 * same sampling fields as ::textureReference above, minus the channel
 * descriptor and reserved words.
 */
struct __device_builtin__ cudaTextureDesc
{
/**
 * Texture address mode for up to 3 dimensions
 */
enum cudaTextureAddressMode addressMode[3];
/**
 * Texture filter mode
 */
enum cudaTextureFilterMode filterMode;
/**
 * Texture read mode
 */
enum cudaTextureReadMode readMode;
/**
 * Perform sRGB->linear conversion during texture read
 */
int sRGB;
/**
 * Texture Border Color
 */
float borderColor[4];
/**
 * Indicates whether texture reads are normalized or not
 */
int normalizedCoords;
/**
 * Limit to the anisotropy ratio
 */
unsigned int maxAnisotropy;
/**
 * Mipmap filter mode
 */
enum cudaTextureFilterMode mipmapFilterMode;
/**
 * Offset applied to the supplied mipmap level
 */
float mipmapLevelBias;
/**
 * Lower end of the mipmap level range to clamp access to
 */
float minMipmapLevelClamp;
/**
 * Upper end of the mipmap level range to clamp access to
 */
float maxMipmapLevelClamp;
};
/**
* An opaque value that represents a CUDA texture object
*/
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
/** @} */
/** @} */ /* END CUDART_TYPES */
#endif /* !__TEXTURE_TYPES_H__ */

View File

@@ -1,177 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* Prototypes for the make_<type><n>() vector "constructors".  Each returns a
 * <type><n> value whose .x/.y/.z/.w components are set from the arguments.
 * Definitions live in vector_functions.hpp, which this header includes at the
 * end for non-NVRTC compilation. */
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
#undef __VECTOR_FUNCTIONS_DECL__
#if !defined(__CUDACC_RTC__)
#include "vector_functions.hpp"
#endif /* !__CUDACC_RTC__ */
#endif /* !__VECTOR_FUNCTIONS_H__ */

View File

@@ -1,318 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_FUNCTIONS_HPP__)
#define __VECTOR_FUNCTIONS_HPP__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h"
#include "vector_types.h"
#if defined(__CUDACC_RTC__)
#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
#else /* !__CUDACC_RTC__ */
#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
#endif /* __CUDACC_RTC__ */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* Definitions of the make_<type><n>() vector constructors declared in
 * vector_functions.h: each default-constructs a <type><n>, assigns the
 * components from the arguments and returns the value.  Vendor CUDA header -
 * code kept byte-for-byte; only comments added. */
__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
{
  char1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
{
  uchar1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
{
  char2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
{
  uchar2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
{
  char3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
{
  uchar3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
{
  char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
{
  uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
{
  short1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
{
  ushort1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
{
  short2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
{
  ushort2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
{
  short3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
{
  ushort3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
{
  short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
{
  ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
{
  int1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
{
  uint1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
{
  int2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
{
  uint2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
{
  int3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
{
  uint3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
{
  int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
{
  uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
{
  long1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
{
  ulong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
{
  long2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{
  ulong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
{
  long3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{
  ulong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
{
  long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{
  ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
{
  float1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
{
  float2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
{
  float3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
{
  float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
{
  longlong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
{
  ulonglong1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
{
  longlong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
{
  ulonglong2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
{
  longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
{
  ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
{
  longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
{
  ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
{
  double1 t; t.x = x; return t;
}
__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
{
  double2 t; t.x = x; t.y = y; return t;
}
__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
{
  double3 t; t.x = x; t.y = y; t.z = z; return t;
}
__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
{
  double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#undef __VECTOR_FUNCTIONS_DECL__
#endif /* !__VECTOR_FUNCTIONS_HPP__ */

View File

@@ -1,425 +0,0 @@
/*
* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#include "host_defines.h"
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* Helper macro that defines an 8-byte-aligned vector struct.  For host-only
 * 32-bit MSVC builds (no CUDA compiler), the alignment is obtained through an
 * anonymous union with a long long bit-field member instead of __align__;
 * MSVC warnings 4201/4408 are suppressed around that form.  Comments are kept
 * outside the macro bodies because of the backslash line continuations. */
#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
    defined(_WIN32) && !defined(_WIN64)
#pragma warning(push)
#pragma warning(disable: 4201 4408)
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ tag                      \
{                                                  \
    union                                          \
    {                                              \
        struct { members };                        \
        struct { long long int :1,:0; };           \
    };                                             \
}
#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, members) \
struct __device_builtin__ __align__(8) tag         \
{                                                  \
    members                                        \
}
#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
/* Built-in CUDA vector struct definitions.  Alignment rules visible below:
 *  - 2- and 4-byte pairs/quads carry explicit __align__(2)/__align__(4);
 *  - 8-byte types go through the __cuda_builtin_vector_align8 helper;
 *  - 16-byte types use __builtin_align__(16);
 *  - 3-component types have no explicit alignment.
 * Vendor CUDA header - code kept byte-for-byte; only comments added. */
struct __device_builtin__ char1
{
    signed char x;
};
struct __device_builtin__ uchar1
{
    unsigned char x;
};
struct __device_builtin__ __align__(2) char2
{
    signed char x, y;
};
struct __device_builtin__ __align__(2) uchar2
{
    unsigned char x, y;
};
struct __device_builtin__ char3
{
    signed char x, y, z;
};
struct __device_builtin__ uchar3
{
    unsigned char x, y, z;
};
struct __device_builtin__ __align__(4) char4
{
    signed char x, y, z, w;
};
struct __device_builtin__ __align__(4) uchar4
{
    unsigned char x, y, z, w;
};
struct __device_builtin__ short1
{
    short x;
};
struct __device_builtin__ ushort1
{
    unsigned short x;
};
struct __device_builtin__ __align__(4) short2
{
    short x, y;
};
struct __device_builtin__ __align__(4) ushort2
{
    unsigned short x, y;
};
struct __device_builtin__ short3
{
    short x, y, z;
};
struct __device_builtin__ ushort3
{
    unsigned short x, y, z;
};
/* 8-byte-aligned types built via the helper macro above. */
__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
struct __device_builtin__ int1
{
    int x;
};
struct __device_builtin__ uint1
{
    unsigned int x;
};
__cuda_builtin_vector_align8(int2, int x; int y;);
__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
struct __device_builtin__ int3
{
    int x, y, z;
};
struct __device_builtin__ uint3
{
    unsigned int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) int4
{
    int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) uint4
{
    unsigned int x, y, z, w;
};
struct __device_builtin__ long1
{
    long int x;
};
struct __device_builtin__ ulong1
{
    unsigned long x;
};
/* long is 4 bytes on Win32/Win64 and pointer-sized elsewhere, hence the two
 * forms: fixed 8-byte alignment on Windows, 2*sizeof(long) otherwise. */
#if defined(_WIN32)
__cuda_builtin_vector_align8(long2, long int x; long int y;);
__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* !_WIN32 */
struct __device_builtin__ __align__(2*sizeof(long int)) long2
{
    long int x, y;
};
struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
{
    unsigned long int x, y;
};
#endif /* _WIN32 */
struct __device_builtin__ long3
{
    long int x, y, z;
};
struct __device_builtin__ ulong3
{
    unsigned long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) long4
{
    long int x, y, z, w;
};
struct __device_builtin__ __builtin_align__(16) ulong4
{
    unsigned long int x, y, z, w;
};
struct __device_builtin__ float1
{
    float x;
};
/* float2: special-cased for host GCC 4.6 ARM hard-float builds; the member
 * name suggests an internal-compiler-error workaround, and the zero-length
 * array is poisoned so user code cannot reference it. */
#if !defined(__CUDACC__) && defined(__arm__) && \
    defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-pedantic"
struct __device_builtin__ __attribute__((aligned(8))) float2
{
    float x; float y; float __cuda_gnu_arm_ice_workaround[0];
};
#pragma GCC poison __cuda_gnu_arm_ice_workaround
#pragma GCC diagnostic pop
#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
         __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
__cuda_builtin_vector_align8(float2, float x; float y;);
#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
          __GNUC__ == 4&& __GNUC_MINOR__ == 6 */
struct __device_builtin__ float3
{
    float x, y, z;
};
struct __device_builtin__ __builtin_align__(16) float4
{
    float x, y, z, w;
};
struct __device_builtin__ longlong1
{
    long long int x;
};
struct __device_builtin__ ulonglong1
{
    unsigned long long int x;
};
struct __device_builtin__ __builtin_align__(16) longlong2
{
    long long int x, y;
};
struct __device_builtin__ __builtin_align__(16) ulonglong2
{
    unsigned long long int x, y;
};
struct __device_builtin__ longlong3
{
    long long int x, y, z;
};
struct __device_builtin__ ulonglong3
{
    unsigned long long int x, y, z;
};
struct __device_builtin__ __builtin_align__(16) longlong4
{
    long long int x, y, z ,w;
};
struct __device_builtin__ __builtin_align__(16) ulonglong4
{
    unsigned long long int x, y, z, w;
};
struct __device_builtin__ double1
{
    double x;
};
struct __device_builtin__ __builtin_align__(16) double2
{
    double x, y;
};
struct __device_builtin__ double3
{
    double x, y, z;
};
struct __device_builtin__ __builtin_align__(16) double4
{
    double x, y, z, w;
};
#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
#pragma warning(pop)
#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* C-compatible typedefs so the vector types can be named without the
 * `struct` keyword. */
typedef __device_builtin__ struct char1 char1;
typedef __device_builtin__ struct uchar1 uchar1;
typedef __device_builtin__ struct char2 char2;
typedef __device_builtin__ struct uchar2 uchar2;
typedef __device_builtin__ struct char3 char3;
typedef __device_builtin__ struct uchar3 uchar3;
typedef __device_builtin__ struct char4 char4;
typedef __device_builtin__ struct uchar4 uchar4;
typedef __device_builtin__ struct short1 short1;
typedef __device_builtin__ struct ushort1 ushort1;
typedef __device_builtin__ struct short2 short2;
typedef __device_builtin__ struct ushort2 ushort2;
typedef __device_builtin__ struct short3 short3;
typedef __device_builtin__ struct ushort3 ushort3;
typedef __device_builtin__ struct short4 short4;
typedef __device_builtin__ struct ushort4 ushort4;
typedef __device_builtin__ struct int1 int1;
typedef __device_builtin__ struct uint1 uint1;
typedef __device_builtin__ struct int2 int2;
typedef __device_builtin__ struct uint2 uint2;
typedef __device_builtin__ struct int3 int3;
typedef __device_builtin__ struct uint3 uint3;
typedef __device_builtin__ struct int4 int4;
typedef __device_builtin__ struct uint4 uint4;
typedef __device_builtin__ struct long1 long1;
typedef __device_builtin__ struct ulong1 ulong1;
typedef __device_builtin__ struct long2 long2;
typedef __device_builtin__ struct ulong2 ulong2;
typedef __device_builtin__ struct long3 long3;
typedef __device_builtin__ struct ulong3 ulong3;
typedef __device_builtin__ struct long4 long4;
typedef __device_builtin__ struct ulong4 ulong4;
typedef __device_builtin__ struct float1 float1;
typedef __device_builtin__ struct float2 float2;
typedef __device_builtin__ struct float3 float3;
typedef __device_builtin__ struct float4 float4;
typedef __device_builtin__ struct longlong1 longlong1;
typedef __device_builtin__ struct ulonglong1 ulonglong1;
typedef __device_builtin__ struct longlong2 longlong2;
typedef __device_builtin__ struct ulonglong2 ulonglong2;
typedef __device_builtin__ struct longlong3 longlong3;
typedef __device_builtin__ struct ulonglong3 ulonglong3;
typedef __device_builtin__ struct longlong4 longlong4;
typedef __device_builtin__ struct ulonglong4 ulonglong4;
typedef __device_builtin__ struct double1 double1;
typedef __device_builtin__ struct double2 double2;
typedef __device_builtin__ struct double3 double3;
typedef __device_builtin__ struct double4 double4;
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
/* Launch-dimension type: three unsigned extents, each defaulting to 1.
 * When compiled as C++ it is implicitly convertible to and from uint3. */
struct __device_builtin__ dim3
{
    unsigned int x, y, z;
#if defined(__cplusplus)
    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
    __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus */
};
typedef __device_builtin__ struct dim3 dim3;
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */

File diff suppressed because it is too large Load Diff

View File

@@ -1,148 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_RUNTIME_PREDICT_H_
#define ISAAC_RUNTIME_PREDICT_H_
#include <fstream>
#include <vector>
#include <memory>
#include <iostream>
#include <cstring>
#include <algorithm>
#include "isaac/tools/matrix.hpp"
#include "isaac/driver/device.h"
#include "isaac/templates/common.hpp"
#include "isaac/templates/pool.h"
#include "isaac/templates/conv.h"
#include "isaac/templates/gemm.h"
#include <map>
namespace isaac{
namespace runtime{
// Layers
//
// Abstract base class for the layers of the small performance-prediction
// network.  Layers are deserialized from a binary blob via Layer::read and
// owned polymorphically (Network stores std::vector<std::shared_ptr<Layer>>).
class Layer{
public:
  // Virtual destructor: layers are deleted through Layer* (see Network's
  // shared_ptr<Layer> members), which is undefined behavior without it.
  virtual ~Layer() = default;
  // Deserializes the next layer from `current`, advancing the pointer past
  // the bytes consumed.  Returns a raw owning pointer; NOTE(review): callers
  // appear to wrap it in a shared_ptr - confirm ownership convention.
  static Layer* read(u_char*& current);
  // Computes the layer output Y from input X.
  virtual void forward(matrix<float> const & X, matrix<float> & Y) = 0;
  // Number of outputs given the previous layer's output count.
  virtual size_t n_outs(size_t n_outs_prev) = 0;
};
// Base class for activation layers.  NOTE(review): n_outs presumably returns
// its input unchanged (activations are elementwise) - defined elsewhere,
// confirm in the implementation.
class Activation: public Layer{
public:
  // Serialization tag read during deserialization.  NOTE(review): codes are
  // reused across categories (ReLU is also 0, Dense and Linear are both 1),
  // so they are presumably scoped per layer category - confirm against
  // Layer::read.
  static const int BINARY_CODE = 0;
  size_t n_outs(size_t n_outs_prev);
private:
};
// Rectified-linear activation.  NOTE(review): Y = max(X, 0) elementwise is
// inferred from the name; forward() is defined elsewhere - confirm.
class ReLU: public Activation{
public:
  static const int BINARY_CODE = 0;  // serialization tag among activations
  void forward(matrix<float> const & X, matrix<float> & Y);
};
// Identity ("linear") activation.  NOTE(review): Y = X is inferred from the
// name; forward() is defined elsewhere - confirm.
class Linear: public Activation{
public:
  static const int BINARY_CODE = 1;  // serialization tag among activations
  void forward(matrix<float> const & X, matrix<float> & Y);
};
// Dense
// Fully-connected layer with weights W_ and bias b_.  NOTE(review): the
// exact orientation of W_ in the product is not visible here - confirm in
// the implementation of forward().
class Dense: public Layer{
public:
  static const int BINARY_CODE = 1;  // serialization tag among layers
  // Deserializes W_ and b_ from the blob, advancing `data` past the bytes
  // consumed.
  Dense(u_char*& data);
  size_t n_outs(size_t);
  void forward(matrix<float> const & X, matrix<float> & Y);
private:
  matrix<float> W_;        // weight matrix
  std::vector<float> b_;   // bias vector
};
// Network
// A sequential stack of layers deserialized from a binary blob.
// NOTE(review): predict() presumably runs each layer's forward() in order -
// confirm in the implementation.
class Network{
public:
  Network(u_char* data);
  void predict(const matrix<float>& X, matrix<float>& Y);
private:
  std::vector<std::shared_ptr<Layer>> layers_;  // layers owned polymorphically
};
// Kind of operation a Profile predicts kernel parameters for; used as part
// of the key into the `database` map below.
enum OperationType{
  GEMM,
  CONV,
  POOL
};
//Profile
// One database entry: a table of candidate kernel configurations plus a
// small neural network (predictor_) used to rank them for a given device
// and problem shape.
class Profile{
protected:
  // Reference-to-function type that validates a candidate configuration for
  // a device.  NOTE(review): the meaning of the size_t and uint8_t* arguments
  // is not visible here - confirm in the implementation.
  typedef void (&validator_t)(driver::Device const &, size_t, param_t*, uint8_t*);
  // Callback measuring the runtime of one parameter vector.
  typedef std::function<double(std::vector<param_t> const&)> benchmark_t;
public:
  // Deserializes the kernel table and predictor from `data`.  NOTE(review):
  // nshapes is presumably the number of shape parameters per entry - confirm.
  Profile(u_char* data, size_t nshapes);
  // Selects kernel parameters for `shapes`, filtering with `validator` and
  // re-timing the top num_re_evaluate candidates with `benchmark`.
  // NOTE(review): exact selection procedure inferred from the signature -
  // confirm in the implementation.
  std::vector<param_t> predict(driver::Device const & device, std::vector<param_t> const & shapes, validator_t const & validator, benchmark_t const & benchmark, size_t num_re_evaluate);
  // Read-only access to the candidate kernel table.
  matrix<param_t> const & kernels() const;
private:
  matrix<param_t> kernels_;   // rows of candidate kernel parameters
  driver::Device device_;
  Network predictor_;         // ranks candidate kernels
};
// Profile specialization for convolutions: predicts kernel parameters and
// constructs the matching templates::Conv for the given problem.
// NOTE(review): parameter naming presumably follows the conv template
// (C,D,H,W,N input; K,M,P,Q output; T,R,S filter) - confirm against
// templates/conv.h.
class ConvProfile: public Profile{
public:
  ConvProfile(u_char* data);
  templates::Conv predict(driver::Stream& stream, DType in_dtype, DType out_dtype, param_t C, param_t D, param_t H, param_t W, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
                          param_t pad_d, param_t pad_h, param_t pad_w,
                          param_t stride_d, param_t stride_h, param_t stride_w,
                          param_t upsample_d, param_t upsample_h, param_t upsample_w,
                          ActivationType activation, size_t num_outputs,
                          ResidualType residual, param_t Zk, param_t crop_z_m0, param_t crop_z_m1, param_t crop_z_p0, param_t crop_z_p1, param_t crop_z_q0, param_t crop_z_q1, size_t num_re_evaluate = 1);
};
// Profile specialization for pooling: predicts kernel parameters and
// constructs the matching templates::Pool for the given problem.
class PoolProfile: public Profile{
public:
  PoolProfile(u_char* data);
  templates::Pool predict(driver::Stream& stream, DType in_dtype, DType out_dtype, PoolType pool_type, param_t C, param_t D, param_t H, param_t W, param_t N, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
                          param_t pad_d, param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w, size_t num_re_evaluate = 1);
};
// Profile specialization for matrix multiplication: predicts kernel
// parameters and constructs the matching templates::GEMM.  AT/BT are the
// transposition modes of the two operands.
class GEMMProfile: public Profile{
public:
  GEMMProfile(u_char* data);
  templates::GEMM predict(driver::Stream& stream, DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
                          param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc, size_t num_re_evaluate = 1);
};
//Database
// Global table of pre-trained profiles, keyed by (GPU architecture,
// operation type); defined in a generated/compiled translation unit.
extern const std::map<std::pair<driver::Device::Architecture, OperationType>, std::shared_ptr<Profile> > database;
}
}
#endif

View File

@@ -1,95 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_SCALAR_H
#define ISAAC_SCALAR_H
#include "isaac/external/half.hpp"
namespace isaac{
// Element data types supported by the library.  INT8X4 is presumably four
// int8 lanes packed into one 32-bit word (its element size below is 4).
enum DType{
  INT8X4_TYPE = 1,
  INT32_TYPE,
  FLOAT_TYPE,
  DOUBLE_TYPE,
};
// Size in bytes of a single element of the given type.  Any other enum value
// reaches a bare `throw;`, which calls std::terminate outside of a handler.
inline size_t size_of(DType dtype){
  if(dtype == DOUBLE_TYPE)
    return 8;
  if(dtype == INT8X4_TYPE || dtype == INT32_TYPE || dtype == FLOAT_TYPE)
    return 4;
  throw;
}
// Compile-time C++ type -> DType mapping used for constructor defaults.
template<class T> struct to_DType;
// NOTE(review): int32_t maps to INT8X4_TYPE, not INT32_TYPE - presumably
// because packed int8x4 data is carried in 32-bit words, and no C++ type
// maps to INT32_TYPE at all.  Confirm this is intentional before relying
// on it.
template<> struct to_DType<int32_t>{ static const DType value = INT8X4_TYPE; };
template<> struct to_DType<float>{ static const DType value = FLOAT_TYPE; };
template<> struct to_DType<double>{ static const DType value = DOUBLE_TYPE; };
// Type-erased host-side scalar: one value stored in a tagged union together
// with its runtime DType.
class scalar{
private:
  // Converts x into the union member selected by dtype_.  For any other tag
  // (e.g. INT8X4_TYPE) the bare `throw;` outside a handler calls
  // std::terminate.
  template<class T>
  void init(T const & x){
    switch(dtype_){
      case INT32_TYPE: value_.int32 = (int32_t)x; break;
      case FLOAT_TYPE: value_.float32 = (float)x; break;
      case DOUBLE_TYPE: value_.float64 = (double)x; break;
      default: throw;
    }
  }
public:
  // Constructors generated for float and double; the dtype may be overridden
  // explicitly (e.g. scalar(1.f, INT32_TYPE) stores a converted int32).
#define ISAAC_INSTANTIATE(TYPE) scalar(TYPE value, DType dtype = to_DType<TYPE>::value) : dtype_(dtype) { init(value); }
  ISAAC_INSTANTIATE(float)
  ISAAC_INSTANTIATE(double)
#undef ISAAC_INSTANTIATE
  // Pointer to the raw bytes of the active union member (e.g. for passing
  // to a kernel launch).
  void* data() const{
    switch(dtype_){
      case INT32_TYPE: return (void*)&value_.int32;
      case FLOAT_TYPE: return (void*)&value_.float32;
      case DOUBLE_TYPE: return (void*)&value_.float64;
      default: throw;
    }
  }
  // Runtime type tag of the stored value.
  DType dtype() const{
    return dtype_;
  }
private:
  DType dtype_;
  union{
    int32_t int32;
    float float32;
    double float64;
  }value_;
};
}
#endif

View File

@@ -1,89 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_COMMON_HPP_
#define ISAAC_TEMPLATES_COMMON_HPP_
#include <cstddef>
#include <cstdint>
#include <vector>
#include "isaac/scalar.h"
namespace isaac{
// Integer ceiling division: smallest q with q*d >= n (for positive inputs).
inline int32_t ceil(int32_t n, int32_t d){
return (n + d - 1)/d;
}
// floor(log2(x)); returns 0 for x <= 1.
inline size_t log2(size_t x){
size_t bits = 0;
for(x >>= 1; x > 0; x >>= 1)
++bits;
return bits;
}
// Smallest power of two >= N (returns 1 for N <= 1).
inline size_t next_pow2(size_t N){
size_t p = 1;
while(p < N)
p <<= 1;
return p;
}
// Arithmetic-type suffix for `dtype` (presumably PTX mnemonics: s32/f32/f64).
// NOTE(review): INT32_TYPE falls through to the terminating default here and
// in io_str -- confirm it is intentionally unsupported by the generators.
inline std::string arith_str(DType dtype){
if(dtype == INT8X4_TYPE) return "s32";
if(dtype == FLOAT_TYPE) return "f32";
if(dtype == DOUBLE_TYPE) return "f64";
throw; // unsupported dtype: terminates (no active exception)
}
// Load/store width suffix for `dtype` (untyped bit widths: b32/b64).
inline std::string io_str(DType dtype){
if(dtype == INT8X4_TYPE) return "b32";
if(dtype == FLOAT_TYPE) return "b32";
if(dtype == DOUBLE_TYPE) return "b64";
throw; // unsupported dtype: terminates (no active exception)
}
// Width type used for kernel shape/tuning parameters throughout the templates.
typedef uint32_t param_t;
namespace driver{
class Device;
class Stream;
class Kernel;
class Buffer;
}
namespace templates{
// Abstract base class for all kernel generators (GEMM, Conv, Pool, ...).
class Generator{
public:
Generator(){}
// Fix: this is a polymorphic base (pure virtual members) -- a virtual
// destructor is required so deleting a derived generator through a
// Generator* runs the derived destructor.
virtual ~Generator(){}
// Emit the device source code for `device` under kernel name `name`.
virtual std::string dump(driver::Device const & device, std::string const & name) = 0;
// The tuning parameters this instance was configured with.
virtual std::vector<param_t> tuning_params() const = 0;
};
}
}
#endif

View File

@@ -1,155 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_CONV_H_
#define ISAAC_TEMPLATES_CONV_H_
#include <cstddef>
#include <string>
#include "isaac/templates/common.hpp"
namespace isaac{
// Pointwise non-linearity fused into the convolution epilogue.
enum ActivationType{
Linear,
ReLU,
ELU,
Sigmoid
};
// How the optional residual tensor Z is combined with the output.
enum ResidualType{
NoResidual,
CatResidual,
AddResidual
};
namespace templates{
// Code generator for fused 3-D convolution kernels.
// Dimension naming (inferred from output_shapes below -- confirm):
//   N batch, C input channels, K output channels;
//   D/H/W input depth/height/width, M/P/Q output depth/height/width,
//   T/R/S filter depth/height/width.
class Conv: public Generator{
public:
// Registry name of this template.
static const std::string id;
// Number of shape / tuning / total parameters expected by check_valid.
static const size_t Nshapes;
static const size_t Ntune;
static const size_t Nparams;
private:
// Build the look-up tables (pointer deltas and boundary masks) the
// generated kernel reads from constant memory.
void init_constant_memory(std::vector<int32_t>& delta, std::vector<uint32_t> &masks, size_t nlut, int32_t strideIc, int32_t strideIw, int32_t strideIh, int32_t strideId);
public:
// NOTE(review): the tuning-parameter names in this signature (vec, bpqn,
// bk, pqns, ks, crs_l, cs, bc, gridc) do not match the member names below
// (vec_, bc0_, bc1_, cs0_, cs1_, bf_n_, u_, ...); confirm the mapping.
Conv(DType in_dtype, DType out_dtype, param_t C, param_t D, param_t H, param_t W, param_t N, param_t K, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
param_t pad_h, param_t pad_w, param_t pad_d, param_t stride_h, param_t stride_w, param_t stride_d, param_t upsample_d, param_t upsample_h, param_t upsample_w,
ActivationType activation, size_t num_outputs,
ResidualType residual_type, param_t Zk, param_t z_crop_m0, param_t z_crop_m1, param_t z_crop_p0, param_t z_crop_p1, param_t z_crop_q0, param_t z_crop_q1,
param_t vec, param_t bpqn, param_t bk, param_t pqns, param_t ks, param_t crs_l, param_t cs, param_t bc, param_t gridc);
// Execution
// Emit the device source code for this configuration.
std::string dump(driver::Device const & device, std::string const & name);
std::vector<param_t> tuning_params() const;
// Launch: I = input, F = filters, O = output(s), optional bias and residual
// tensor Z; the *scale arguments are presumably quantization scales --
// confirm against the implementation.
void enqueue(driver::Kernel& kernel, driver::Stream& queue, driver::Buffer const & I, driver::Buffer const & F, driver::Buffer *O, driver::Buffer const * bias = NULL, float alpha = 0, float iscale = 1, float fscale = 1, std::vector<float> oscale = {1}, float z_scale = 1, driver::Buffer const *Z = NULL);
// Validity
// Compute output sizes M/P/Q from input sizes, filter sizes, padding,
// striding and upsampling.
static void output_shapes(param_t D, param_t H, param_t W, param_t T, param_t R, param_t S, param_t pad_d,
param_t pad_h, param_t pad_w, param_t stride_d, param_t stride_h, param_t stride_w,
param_t upsample_d, param_t upsample_h, param_t upsample_w,
param_t& M, param_t& P, param_t& Q);
// Fill `valid` with 0/1 flags for each candidate parameter set in `params`.
static void check_valid(driver::Device const & device, size_t M, param_t* params, uint8_t* valid);
// Benchmark
// Achieved TFLOPs for the given problem size and runtime.
static double tflops(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double time);
private:
// data types
DType in_dtype_;
DType out_dtype_;
// activation type
ActivationType activation_;
size_t num_outputs_;
// residual
ResidualType residual_type_;
param_t Zk_;
param_t z_crop_m0_;
param_t z_crop_m1_;
param_t z_crop_p0_;
param_t z_crop_p1_;
param_t z_crop_q0_;
param_t z_crop_q1_;
param_t Zm_;
param_t Zp_;
param_t Zq_;
//input shapes
param_t C_;
param_t N_;
param_t K_;
param_t Kout_;
// Input dimensions
param_t D_;
param_t H_;
param_t W_;
// Output Dimensions
param_t M_;
param_t P_;
param_t Q_;
// Filter Dimensions
param_t T_;
param_t R_;
param_t S_;
// Pad
param_t pad_d_;
param_t pad_h_;
param_t pad_w_;
// stride
param_t stride_d_;
param_t stride_h_;
param_t stride_w_;
// upsample
param_t upsample_d_;
param_t upsample_h_;
param_t upsample_w_;
//parameters
param_t vec_;
param_t bc0_;
param_t bc1_;
param_t cs0_;
param_t cs1_;
param_t bf_n_;
param_t u_;
param_t us_;
param_t zs_;
param_t bz_;
param_t gridz_;
// constant memory
std::vector<int32_t> cLUT;
std::vector<uint32_t> masks_;
};
}
}
#endif

View File

@@ -1,39 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_ERROR_HPP_
#define ISAAC_TEMPLATES_ERROR_HPP_
#include <exception>
namespace isaac{
namespace templates{
// Exception type for rejected generator configurations -- presumably raised
// when a parameter set fails validation (see the check_valid routines);
// confirm against the implementation.
class invalid_parameters: public std::exception {
public:
// Modernization fix: the dynamic exception specification `throw()` was
// deprecated in C++11 and removed in C++17; std::exception::what() is
// declared noexcept, so we match it and mark the override explicitly.
const char * what() const noexcept override { return "Invalid parameters";}
};
}
}
#endif

View File

@@ -1,102 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_GEMM_H_
#define ISAAC_TEMPLATES_GEMM_H_
#include <cstddef>
#include <string>
#include "isaac/templates/common.hpp"
#include "isaac/scalar.h"
namespace isaac{
namespace driver{
class Device;
class Stream;
class Kernel;
class Buffer;
}
// Transposition flag for a GEMM operand (N = as-is, T = transposed).
enum IsaacOperation_t{
ISAAC_OP_N = 1,
ISAAC_OP_T = 2
};
namespace templates{
// Code generator for GEMM kernels on M x N x K problems with the usual
// alpha/beta scaling (inferred from enqueue's signature -- confirm).
class GEMM: public Generator{
public:
// Registry name of this template.
static const std::string id;
// Number of shape / tuning / total parameters expected by check_valid.
static const size_t Nshapes;
static const size_t Ntune;
static const size_t Nparams;
public:
// Problem description (transpositions, shapes, offsets, leading
// dimensions) followed by the tuning parameters shaping the kernel.
GEMM(DType in_dtype, DType out_dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K, param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc,
param_t vec, param_t bm, param_t u, param_t bn, param_t ms, param_t us, param_t ns, param_t ba0, param_t ba1, param_t bb0, param_t bb1,
param_t ks, param_t bk, param_t kg);
// Emit the device source code for this configuration.
std::string dump(driver::Device const & device, std::string const & name);
std::vector<param_t> tuning_params() const;
// Launch the kernel; the *_scale arguments are presumably quantization
// scales and `bias` an optional addend -- confirm.
void enqueue(driver::Kernel& kernel, driver::Stream& queue, scalar const & alpha, driver::Buffer const & A, driver::Buffer const & B, scalar const & beta, driver::Buffer& C, float a_scale = 1, float b_scale = 1, float c_scale = 1, const driver::Buffer *bias = NULL);
// Fill `valid` with 0/1 flags for each candidate parameter set in `params`.
static void check_valid(driver::Device const & device, size_t M, param_t* params, uint8_t* valid);
// Achieved TFLOPs for the given problem size and runtime.
static double tflops(param_t M, param_t N, param_t K, double time);
private:
DType in_dtype_;
DType out_dtype_;
//transposition
IsaacOperation_t AT_;
IsaacOperation_t BT_;
//input shapes
param_t M_;
param_t N_;
param_t K_;
param_t offa_;
param_t lda_;
param_t offb_;
param_t ldb_;
param_t offc_;
param_t ldc_;
//parameters
param_t vec_;
param_t bc0_;
param_t bc1_;
param_t cs0_;
param_t cs1_;
param_t u_;
param_t us_;
param_t ba0_;
param_t ba1_;
param_t bb0_;
param_t bb1_;
param_t zs_;
param_t bz_;
param_t gridz_;
param_t stn_;
};
}
}
#endif

View File

@@ -1,100 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TEMPLATES_POOL_H_
#define ISAAC_TEMPLATES_POOL_H_
#include <cstddef>
#include <string>
#include "isaac/templates/common.hpp"
namespace isaac{
// Reduction applied over each pooling window.
enum PoolType{
MaxPool,
AvgPool
};
namespace templates{
// Code generator for 3-D pooling kernels.
// Dimension naming presumably mirrors Conv (C channels, D/H/W input sizes,
// M/P/Q output sizes, T/R/S window sizes) -- confirm.
class Pool: public Generator{
private:
// Build the constant-memory look-up tables (pointer deltas and masks)
// used by the generated kernel.
void init_constant_memory(std::vector<int32_t>& delta, std::vector<uint32_t> &masks, size_t nlut, int32_t strideIc, int32_t strideIw, int32_t strideIh, int32_t strideId);
public:
// Registry name of this template.
static const std::string id;
// Number of shape / tuning / total parameters expected by check_valid.
static const size_t Nshapes;
static const size_t Ntune;
static const size_t Nparams;
public:
Pool(DType in_dtype, DType out_dtype, PoolType pool_type,
param_t C, param_t D, param_t H, param_t W, param_t N, param_t M, param_t P, param_t Q, param_t T, param_t R, param_t S,
param_t pad_d, param_t pad_h, param_t pad_w,
param_t stride_d, param_t stride_h, param_t stride_w,
param_t vec = 1, param_t bc0 = 32, param_t cs0 = 4, param_t u = 1);
// Execution
// Emit the device source code for this configuration.
std::string dump(driver::Device const & device, std::string const & name);
// Fill `valid` with 0/1 flags for each candidate parameter set in `params`.
static void check_valid(driver::Device const & device, size_t M, param_t* params, uint8_t* valid);
// Launch: I = input, O = output; i_scale/o_scale are presumably
// quantization scales -- confirm.
void enqueue(driver::Kernel& kernel, driver::Stream& queue, driver::Buffer const & I, driver::Buffer &O, float i_scale = 1, float o_scale = 1);
// NOTE(review): spelled std::vector<unsigned int> here while the Generator
// interface uses std::vector<param_t> (= uint32_t); same underlying type
// on supported platforms, but the inconsistent spelling is worth unifying.
std::vector<unsigned int> tuning_params() const;
static double tflops(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double time);
private:
DType in_dtype_;
DType out_dtype_;
PoolType pool_type_;
// Shapes
param_t Cin_;
param_t Cout_;
param_t D_;
param_t H_;
param_t W_;
param_t N_;
param_t M_;
param_t P_;
param_t Q_;
param_t T_;
param_t R_;
param_t S_;
param_t pad_d_;
param_t pad_h_;
param_t pad_w_;
param_t stride_d_;
param_t stride_h_;
param_t stride_w_;
// Tuning params
param_t vec_;
param_t bc0_;
param_t cs0_;
param_t u_;
// Constant buffer
std::vector<int32_t> cLUT;
std::vector<uint32_t> masks_;
};
}
}
#endif

View File

@@ -1,80 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef BENCH_HPP
#define BENCH_HPP
#include <chrono>
#include <algorithm>
#include <isaac/driver/device.h>
#include <iostream>
#include <iomanip>
#include <iterator>
// Simple wall-clock stopwatch with nanosecond resolution.
class Timer
{
typedef std::chrono::high_resolution_clock clock_type;
typedef std::chrono::nanoseconds ns;
public:
// When `run` is true the stopwatch starts immediately.
explicit Timer(bool run = false)
{
if (run)
start();
}
// (Re)set the reference point.
void start()
{
tstart_ = clock_type::now();
}
// Elapsed time since the last start().
ns get() const
{
return std::chrono::duration_cast<ns>(clock_type::now() - tstart_);
}
private:
clock_type::time_point tstart_;
};
// Smallest element of a non-empty vector.
// Fix: take the vector by const reference -- the previous version copied the
// whole vector on every call. Precondition: x is non-empty (min_element on an
// empty range returns end(), which must not be dereferenced).
template<class T>
T min(std::vector<T> const & x)
{ return *std::min_element(x.begin(), x.end()); }
// Best-case runtime of `op` in (clock-normalized) nanoseconds.
// `op` enqueues the work; `sync` must block until the device is idle.
// Warms up once, then samples until >= 0.1 s of accumulated measured time and
// returns the fastest sample.
template<class OP, class SYNC>
double bench(OP const & op, SYNC const & sync, isaac::driver::Device const & device)
{
Timer tmr;
// Samples are integer nanoseconds: the float product below is truncated
// on push_back.
std::vector<size_t> times;
double total_time = 0;
// Untimed warm-up run.
op();
sync();
while(total_time*1e-9 < 1e-1){
// Scale each sample by current/max SM clock -- presumably to compensate
// for frequency throttling so samples approximate peak-clock time; confirm.
float norm = (float)device.current_sm_clock()/device.max_sm_clock();
tmr.start();
op();
sync();
times.push_back(norm*tmr.get().count());
total_time+=times.back();
}
return min(times);
}
// Stringify any value accepted by std::to_string.
template<class T>
std::string str(T const & value)
{
return std::to_string(value);
}
#endif

View File

@@ -1,286 +0,0 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef ISAAC_CPP_COLLECTIONS_HPP
#define ISAAC_CPP_COLLECTIONS_HPP
#include <vector>
#include <iostream>
#include <sstream>
#include <iterator>
#include <algorithm>
#include <numeric>
#include <memory>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <type_traits>
#include <deque>
namespace isaac
{
namespace cpp
{
/* ---- Cached Map ----- */
// Memoizing map: get(key) computes the value with the supplied factory on
// first lookup and returns the cached copy on every later one.
// NOTE(review): uses std::function but this header does not include
// <functional>; it currently relies on a transitive include -- confirm.
template<class K, class V>
class CachedMap{
public:
CachedMap(std::function<V(K const &)> value_maker) : value_maker_(value_maker)
{ }
// Return the cached value for `key`, creating it on first access.
V const & get(K const & key){
auto found = cache_.find(key);
if(found != cache_.end())
return found->second;
return cache_.emplace(key, value_maker_(key)).first->second;
}
private:
std::map<K, V> cache_;
std::function<V(K const &)> value_maker_;
};
/* ---- Cartesian ---- */
// Cartesian product of the input axes: one output vector per combination,
// taking exactly one element from each inner vector, in input order.
inline std::vector<std::vector<int>> cartesian(const std::vector<std::vector<int>>& v) {
std::vector<std::vector<int>> out{{}};
for (const auto& axis : v){
std::vector<std::vector<int>> expanded;
for (const auto& prefix : out)
for (int value : axis){
expanded.push_back(prefix);
expanded.back().push_back(value);
}
out.swap(expanded);
}
return out;
}
/* ---- Tuple ----- */
// Thin value-semantic wrapper around std::vector<T>, used to carry small
// shape/index lists.
template<class T>
class tuple
{
// The stream operator (defined below in this header) reads data_ directly.
template<class U>
friend std::ostream& operator<<(std::ostream & oss, tuple<U> const &);
public:
tuple() {}
tuple(std::vector<T> const & list): data_(list){}
tuple(std::initializer_list<T> const & list) : data_(list){}
tuple(T a) : data_{a} {}
tuple(T a, T b) : data_{a, b} {}
tuple(tuple const & other) = default;
tuple(tuple&& other) = default;
tuple& operator=(tuple const & other) = default;
tuple& operator=(tuple && other) = default;
typename std::vector<T>::iterator begin() { return data_.begin(); }
typename std::vector<T>::const_iterator begin() const { return data_.begin(); }
typename std::vector<T>::iterator end() { return data_.end(); }
typename std::vector<T>::const_iterator end() const { return data_.end(); }
size_t size() const { return data_.size(); }
// front()/back() carry the usual vector precondition: non-empty.
T front() const { return data_.front(); }
T back() const { return data_.back(); }
// Erase the i-th element, shifting later elements down.
void remove_index(size_t i) { data_.erase(std::next(data_.begin(), i)); }
T& operator[](size_t i) { return data_[i]; }
T operator[](size_t i) const { return data_[i]; }
bool operator==(tuple const & other) const { return data_==other.data_; }
// Implicit conversion back to a plain vector (copies).
operator std::vector<T>() const { return data_; }
private:
std::vector<T> data_;
};
// Print a tuple as "(a,b,c)"; one-element tuples print as "(a,)".
// Fix: the previous version evaluated data_.end() - 1 and back() even for an
// empty tuple, which is undefined behavior; an empty tuple now prints "()".
// Rewritten against the public API, so friendship is no longer relied upon.
template<class T>
inline std::ostream& operator<<(std::ostream & oss, tuple<T> const &tp)
{
oss << "(";
if(tp.size() > 0){
auto it = tp.begin();
oss << *it;
while(++it != tp.end())
oss << "," << *it;
}
// Trailing comma disambiguates a 1-tuple from a parenthesized scalar.
if(tp.size()==1)
oss << ",";
oss << ")";
return oss;
}
// Render a tuple to a std::string via its stream operator.
template<class T>
inline std::string to_string(tuple<T> const & tp)
{
std::ostringstream stream;
stream << tp;
return stream.str();
}
// Erase the i-th element of a vector, shifting later elements down.
template<class T>
inline void remove_index(std::vector<T>& tp, size_t i)
{ tp.erase(std::next(tp.begin(), i)); }
// Largest element; numeric_limits<T>::min() is the empty-vector sentinel.
// NOTE(review): for floating T that sentinel is the smallest *positive*
// value, not the most negative -- confirm callers only use integral T.
template<class T>
inline T max(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), std::numeric_limits<T>::min(), [](T a, T b){ return std::max(a, b); }); }
// Smallest element; numeric_limits<T>::max() is the empty-vector sentinel.
template<class T>
inline T min(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), std::numeric_limits<T>::max(), [](T a, T b){ return std::min(a, b); }); }
// Product of all elements (1 for an empty vector).
// Fix: std::accumulate's result type is the type of its init argument, so the
// previous literal `1` (int) silently truncated the running product whenever
// T is floating-point; T(1) keeps the accumulation in T.
template<class T>
inline T prod(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), T(1), std::multiplies<T>()); }
// Number of entries strictly greater than 1 (size_t(0) init for the same
// accumulator-type reason as prod above).
template<class T>
inline size_t numgt1(std::vector<T> const & tp)
{ return std::accumulate(tp.begin(), tp.end(), size_t(0), [](size_t a, size_t b){ return a + (b>1); }); }
/* ----- Set/Map ----- */
// Hash functors that transparently dereference pointers and shared_ptrs, so
// containers of (smart) pointers hash the pointees. The pointee type must
// provide a hash() member.
template<class T>
struct deref_hash
{ size_t operator()(T const & x) const { return x.hash();} };
template<class T>
struct deref_hash<T*>
{ size_t operator()(T const * x) const { return x->hash();} };
template<class T>
struct deref_hash<std::shared_ptr<T>>
{ size_t operator()(std::shared_ptr<T> const & x) const { return x->hash();} };
// Equality functors with the same dereferencing behavior.
// NOTE(review): these return size_t where bool is conventional for an
// equality predicate; harmless, but worth tidying.
template<class T>
struct deref_eq
{ size_t operator()(T const & x, T const & y) const { return x == y;} };
template<class T>
struct deref_eq<T*>
{ size_t operator()(T const * x, T const * y) const { return *x == *y;} };
template<class T>
struct deref_eq<std::shared_ptr<T>>
{ size_t operator()(std::shared_ptr<T> const & x, std::shared_ptr<T> const & y) const { return *x == *y;} };
template<class KEY>
using deref_unordered_set = std::unordered_set<KEY, deref_hash<KEY>, deref_eq<KEY>>;
// Adjacency-map aliases: key -> set of keys.
template<class U>
using set_map = std::map<U, std::set<U>>;
template<class U, class H = std::hash<U>, class E = std::equal_to<U>>
using unordered_set_map = std::unordered_map<U, std::unordered_set<U,H,E>, H, E>;
// Trait gating the graph-like algorithms below to the two aliases above.
template<class T>
struct is_set_map
{ static const bool value = false; };
template<class U>
struct is_set_map<set_map<U>> { static const bool value = true; };
template<class U, class H, class E>
struct is_set_map<unordered_set_map<U,H,E>> { static const bool value = true; };
/* ---- Transformations ---- */
//Pairs
// Flatten an adjacency map into an explicit list of (source, target) edges.
template<class T, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
std::deque<std::pair<typename T::key_type, typename T::key_type>> pairs(T const & map)
{
typedef typename T::key_type K;
std::deque<std::pair<K,K>> edges;
for(auto const & entry: map)
for(auto const & target: entry.second)
edges.emplace_back(entry.first, target);
return edges;
}
//Invert
// Reverse every edge of an adjacency map. Every key of the input appears in
// the output, even when it ends up with no incoming edges.
template<class T, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
static T invert(T const & in)
{
typedef typename T::key_type U;
typedef typename T::mapped_type V;
T out;
for(auto const & kv: in){
// Ensure the source key exists in the output regardless of edges.
out.insert({kv.first, V()});
for(U succ: kv.second)
out[succ].insert(kv.first);
}
return out;
}
//Intersect
// Set intersection of two unordered_sets; iterates the smaller operand and
// probes the larger one.
template<class T, class H, class E>
std::unordered_set<T,H,E> intersection(std::unordered_set<T,H,E> const & x,
std::unordered_set<T,H,E> const & y)
{
std::unordered_set<T,H,E> const * small = &x;
std::unordered_set<T,H,E> const * large = &y;
if(large->size() < small->size())
std::swap(small, large);
std::unordered_set<T,H,E> result;
for(auto const & e: *small)
if(large->find(e) != large->end())
result.insert(e);
return result;
}
//Merge
// In-place union: add y's elements to x and return x.
// Fix: the previous implementation called std::merge writing through an
// inserter into `x` while also reading [x.begin(), x.end()) -- std::merge
// forbids the output range overlapping an input range, so that was undefined
// behavior. For the set containers this overload is used with (the mapped
// sets of a set_map -- confirm no other callers), range-insert computes the
// same union safely.
template<class T>
typename std::enable_if<!is_set_map<T>::value, T&>::type merge(T& x, T const & y)
{
x.insert(y.begin(), y.end());
return x;
}
// Key-wise union of two adjacency maps.
template<class T>
typename std::enable_if<is_set_map<T>::value, T&>::type merge(T& x, T const & y)
{
for(auto const & p: y) merge(x[p.first], p.second);
return x;
}
//Transfer
// Collapse node v into node u: v's adjacency set (minus `exclude`) is merged
// into u's, then every reference to v -- including v's own entry -- is
// removed from the map.
template<class T, class U, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
void transfer(T& map, U u, U v, typename T::mapped_type const & exclude)
{
for(auto const & x: exclude)
map[v].erase(x);
merge(map[u], map[v]);
for(auto& x: map)
x.second.erase(v);
map.erase(v);
}
//subset
// Restrict `map` to the keys listed in `include`; each kept adjacency set is
// additionally intersected with `include`.
template<class T, class Enable = typename std::enable_if<is_set_map<T>::value>::type>
T subset(T& map, typename T::mapped_type const & include)
{
T out;
for(auto const & entry: map){
if(include.find(entry.first) == include.end())
continue;
out[entry.first] = cpp::intersection(entry.second, include);
}
return out;
}
}
}
#endif

View File

@@ -1,84 +0,0 @@
/*
* Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
*
* This file is part of ISAAC.
*
* ISAAC is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
#ifndef ISAAC_CPP_FUNCTIONAL_HPP
#define ISAAC_CPP_FUNCTIONAL_HPP
#include <type_traits>
#include <tuple>
namespace isaac
{
namespace cpp
{
// Deduce the signature of a callable (lambda / functor) from its operator().
template <typename T>
struct function_traits
: public function_traits<decltype(&T::operator())>
{};
// For generic types, directly use the result of the signature of its 'operator()'
template <typename ClassType, typename ReturnType, typename... Args>
struct function_traits<ReturnType(ClassType::*)(Args...) const>
// we specialize for pointers to member function
{
enum { arity = sizeof...(Args) };
// arity is the number of arguments.
typedef ReturnType result_type;
template <size_t i>
struct arg
{
typedef typename std::tuple_element<i, std::tuple<Args...>>::type type;
// the i-th argument is equivalent to the i-th tuple element of a tuple
// composed of those arguments.
};
};
// Call `fn` on x when x's dynamic type matches fn's (deduced) first parameter
// type; otherwise return `backup` untouched.
template<class U, class FN, class V>
V forward_dyncast(U const & x, FN const & fn, V const &backup)
{
typedef typename function_traits<FN>::template arg<0>::type RT;
typedef typename std::remove_reference<RT>::type T;
if(T const * p = dynamic_cast<T const *>(&x))
return fn(*p);
return backup;
}
// Same, but discards the result and does nothing on a type mismatch.
template<class U, class FN>
void forward_dyncast(U const & x, FN const & fn)
{
typedef typename function_traits<FN>::template arg<0>::type RT;
typedef typename std::remove_reference<RT>::type T;
if(T const * p = dynamic_cast<T const *>(&x))
fn(*p);
}
// True iff `base`'s dynamic type matches f's parameter type AND f(base)
// returns true; false (the backup value) on a type mismatch.
template<class U, class FN>
bool compare_if_same(U const & base, FN const & f)
{ return cpp::forward_dyncast(base, f, false); }
}
}
#endif

View File

@@ -1,92 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_TOOLS_MATRIX_HPP_
#define ISAAC_TOOLS_MATRIX_HPP_
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
// Copy `nbytes` from the byte cursor `data` into `dst`, then advance the
// cursor past what was read.
// NOTE(review): u_char is a POSIX typedef, not standard C++ -- consider
// unsigned char for portability.
inline void read_inc(void* dst, u_char*& data, size_t nbytes){
std::memcpy(dst, (void*)data, nbytes);
data += nbytes;
}
// Reference (naive) row-major GEMM with a per-column bias:
//   C[i][j] = alpha * sum_k A[i][k]*B[k][j] + bias[j]
// The unnamed T parameter is accepted for signature compatibility but never
// used (C is overwritten, not accumulated).
template<class T>
void gemm(uint32_t M, uint32_t N, uint32_t K, T alpha, T* A, uint32_t lda, T* B, uint32_t ldb, T /*unused*/, T* C, uint32_t ldc, T* bias){
for(uint32_t row = 0; row < M ; ++row){
T const * a = A + row*lda;
for(uint32_t col = 0; col < N ; ++col){
T dot = 0;
for(uint32_t k = 0; k < K; ++k)
dot += a[k] * B[k*ldb + col];
C[row*ldc + col] = alpha*dot + bias[col];
}
}
}
// Minimal row-major 2-D matrix with a leading dimension `ld_`; either owns
// its storage (values_) or aliases external memory through data_.
template<class T>
class matrix{
typedef std::array<uint32_t, 2> shapes_t;
public:
// Deserializing constructor: reads two uint32 shapes (8 bytes) from the
// byte cursor, then rows*cols elements at 4 bytes apiece.
// NOTE(review): the `*4` hard-codes sizeof(T)==4 (float/int32); this would
// mis-read for 8-byte T -- confirm the intended element types.
matrix(u_char*& data){
read_inc((void*)shapes_.data(), data, 8);
values_.resize(shapes_[0]*shapes_[1]);
ld_ = shapes_[1];
read_inc((void*)values_.data(), data, values_.size()*4);
data_ = values_.data();
}
// Non-owning view over `data` with an explicit leading dimension.
matrix(shapes_t const & shapes, size_t ld, T* data): shapes_(shapes), ld_(ld), data_(data){}
// Owning, value-initialized (zeroed for arithmetic T) matrix.
matrix(shapes_t const & shapes): shapes_(shapes), ld_(shapes.back()), values_(shapes[0]*shapes[1]), data_(values_.data()){}
shapes_t const & shapes() const
{ return shapes_; }
// Element access: row i, column j (no bounds checking).
T const & operator()(size_t i, size_t j) const
{ return data_[i*ld_ + j]; }
T & operator ()(size_t i, size_t j)
{ return data_[i*ld_ + j]; }
T* data() const
{ return data_; }
T* data()
{ return data_; }
uint32_t ld() const
{ return ld_; }
private:
shapes_t shapes_;
size_t ld_;
std::vector<T> values_;
T* data_;
};
// Return a copy of `in` widened by `npad` value-initialized (zero) columns
// on the left; the original data occupies columns [npad, npad+N).
template<class T>
matrix<T> pad_left(matrix<T> const & in, uint32_t npad){
uint32_t rows = in.shapes()[0];
uint32_t cols = in.shapes()[1];
matrix<T> out({rows, cols + npad});
for(size_t r = 0; r < rows; ++r)
for(size_t c = 0; c < cols; ++c)
out(r, npad + c) = in(r, c);
return out;
}
#endif

View File

@@ -0,0 +1,77 @@
#ifndef TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H
#define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H
#include <map>
#include <vector>
namespace triton {
namespace ir {
class value;
class module;
class phi_node;
class splat_inst;
class reshape_inst;
class broadcast_inst;
class binary_operator;
class getelementptr_inst;
}
namespace codegen{
namespace analysis{
// Alignment/contiguity analysis over the IR. For each value it derives,
// per axis: is_constant_ (runs of known-constant elements), max_contiguous_
// (longest provably contiguous run) and starting_multiple_ (a known divisor
// of the starting element). Semantics inferred from the member names --
// confirm against the implementation. Each property has one populate_* rule
// per relevant instruction kind plus a default.
class align {
private:
// Constant-run descriptor: `num_cst` consecutive elements of value `value`.
struct cst_info {
unsigned num_cst;
unsigned value;
};
// helpers
std::vector<unsigned> get_shapes(ir::value *v);
// populate is_constant
std::vector<cst_info> populate_is_constant_phi(ir::phi_node* x);
std::vector<cst_info> populate_is_constant_splat(ir::splat_inst* x);
std::vector<cst_info> populate_is_constant_reshape(ir::reshape_inst* x);
std::vector<cst_info> populate_is_constant_broadcast(ir::broadcast_inst* x);
std::vector<cst_info> populate_is_constant_binop(ir::binary_operator* x);
std::vector<cst_info> populate_is_constant_gep(ir::getelementptr_inst* x);
std::vector<cst_info> populate_is_constant_default(ir::value* v);
std::vector<cst_info> populate_is_constant(ir::value *v);
// populate max_contiguous
std::vector<unsigned> populate_max_contiguous_phi(ir::phi_node* x);
std::vector<unsigned> populate_max_contiguous_splat(ir::splat_inst* x);
std::vector<unsigned> populate_max_contiguous_reshape(ir::reshape_inst* x);
std::vector<unsigned> populate_max_contiguous_broadcast(ir::broadcast_inst* x);
std::vector<unsigned> populate_max_contiguous_binop(ir::binary_operator* x);
std::vector<unsigned> populate_max_contiguous_gep(ir::getelementptr_inst* x);
std::vector<unsigned> populate_max_contiguous_default(ir::value* v);
std::vector<unsigned> populate_max_contiguous(ir::value *v);
// populate starting_multiple
std::vector<unsigned> populate_starting_multiple_phi(ir::phi_node* x);
std::vector<unsigned> populate_starting_multiple_splat(ir::splat_inst* x);
std::vector<unsigned> populate_starting_multiple_reshape(ir::reshape_inst* x);
std::vector<unsigned> populate_starting_multiple_broadcast(ir::broadcast_inst* x);
std::vector<unsigned> populate_starting_multiple_binop(ir::binary_operator* x);
std::vector<unsigned> populate_starting_multiple_gep(ir::getelementptr_inst* x);
std::vector<unsigned> populate_starting_multiple_default(ir::value* v);
std::vector<unsigned> populate_starting_multiple(ir::value *v);
// populate all maps
void populate(ir::value *v);
public:
// Run the analysis over every value in `mod`.
void run(ir::module &mod);
// Starting-multiple of `v` along axis `ax` (read from starting_multiple_).
unsigned get(ir::value* v, unsigned ax) const;
// Per-axis max-contiguity vector for `v`.
std::vector<unsigned> contiguous(ir::value* v) const;
private:
std::map<ir::value*, std::vector<cst_info>> is_constant_;
std::map<ir::value*, std::vector<unsigned>> max_contiguous_;
std::map<ir::value*, std::vector<unsigned>> starting_multiple_;
};
}
}
}
#endif

View File

@@ -0,0 +1,47 @@
#ifndef TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H
#define TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H
#include <map>
#include <set>
#include <iostream>
#include "triton/codegen/analysis/liveness.h"
namespace triton{
namespace ir{
class value;
class function;
class module;
}
namespace codegen{
namespace analysis{
class tiles;
class liveness;
class cts;
// Assigns an offset to every data layout that requires backing storage,
// using the liveness intervals computed by the `liveness` analysis, and
// records the total size required.
class allocation {
public:
  allocation(liveness *live)
    : liveness_(live) { }
  // accessors
  // whether run() assigned an offset to x
  bool has_offset(const data_layout *x) const { return offsets_.find(x) != offsets_.end(); }
  // offset assigned to x; throws std::out_of_range if x has no offset
  unsigned offset(const data_layout *x) const { return offsets_.at(x); }
  // total storage required by all layouts (valid after run())
  unsigned allocated_size() const { return allocated_size_; }
  // run
  void run(ir::module& mod);
private:
  // offset assigned to each layout
  std::map<const data_layout*, unsigned> offsets_;
  // NOTE(review): only meaningful after run(); not initialized in the ctor
  size_t allocated_size_;
  // dependences
  liveness *liveness_;
};
}
}
}
#endif

View File

@@ -0,0 +1,51 @@
#ifndef _TRITON_CODEGEN_ANALYSIS_AXES_H_
#define _TRITON_CODEGEN_ANALYSIS_AXES_H_
#include "triton/tools/graph.h"
#include <map>
#include <vector>
namespace triton{
namespace ir{
class value;
class module;
class instruction;
}
namespace codegen{
namespace analysis{
// Assigns an integer axis id to every (value, dimension) pair. The
// update_graph_* rules connect pairs whose distribution must agree;
// ids are then derived from graph_ (presumably one id per connected
// component — confirm in the implementation).
class axes {
  // a node is one dimension of one IR value
  typedef std::pair<ir::value*, unsigned> node_t;
private:
  // update graph
  // one rule per instruction kind, describing which input/output
  // dimensions must share an axis
  void update_graph_store(ir::instruction *i);
  void update_graph_reduce(ir::instruction *i);
  void update_graph_reshape(ir::instruction *i);
  void update_graph_trans(ir::instruction *i);
  void update_graph_broadcast(ir::instruction *i);
  void update_graph_dot(ir::instruction *i);
  void update_graph_elementwise(ir::instruction *i);
  void update_graph_no_edge(ir::instruction *i);
  // dispatches to the rule matching i's opcode
  void update_graph(ir::instruction *i);
public:
  axes();
  void run(ir::module &mod);
  // accessors
  // axis id of dimension dim of value
  int get(ir::value *value, unsigned dim);
  // axis ids of all dimensions of value
  std::vector<int> get(ir::value *value);
private:
  tools::graph<node_t> graph_;
  std::map<node_t, size_t> axes_;
};
}
}
}
#endif

View File

@@ -0,0 +1,205 @@
#ifndef _TRITON_CODEGEN_ANALYSIS_GRID_H_
#define _TRITON_CODEGEN_ANALYSIS_GRID_H_
#include <map>
#include <set>
#include <vector>
#include <memory>
#include "triton/tools/graph.h"
namespace triton{
namespace ir{
class value;
class type;
class module;
class instruction;
class phi_node;
}
namespace codegen{
namespace analysis{
class axes;
class align;
class layout_visitor;
class data_layout;
class mma884_layout;
class scanline_layout;
class shared_layout;
// Visitor over the concrete data_layout subclasses. visit_layout()
// provides the generic dispatch entry point; one pure method per
// concrete layout kind must be implemented by subclasses.
class layout_visitor {
public:
  virtual void visit_layout(data_layout *);
  virtual void visit_layout_hmma_884(mma884_layout*) = 0;
  virtual void visit_layout_scanline(scanline_layout*) = 0;
  virtual void visit_layout_shared(shared_layout*) = 0;
};
// Base class describing how a group of IR values is laid out on the
// hardware (HMMA tile, scanline distribution, or shared memory).
// Concrete subclasses are dispatched through layout_visitor.
class data_layout {
protected:
  // discriminator used by downcast() as a lightweight RTTI
  enum id_t {
    HMMA_884,
    SCANLINE,
    SHARED
  };
  typedef std::vector<int> axes_t;
  typedef std::vector<unsigned> shape_t;
  typedef std::vector<int> order_t;
  typedef std::vector<ir::value*> values_t;
private:
  // returns this as T* when the stored id matches, nullptr otherwise
  template<typename T>
  T* downcast(id_t id) {
    if(id_ == id)
      return static_cast<T*>(this);
    return nullptr;
  }
public:
  data_layout(id_t id,
              const std::vector<int>& axes,
              const std::vector<unsigned> &shape,
              const std::vector<ir::value *> &values,
              analysis::align* align);
  // virtual destructor: instances are held and handed out as
  // data_layout* (see analysis::layouts), so destruction through a
  // base pointer must dispatch to the derived class
  virtual ~data_layout() { }
  // visitor
  virtual void accept(layout_visitor* vst) = 0;
  // downcast
  mma884_layout* to_mma884() { return downcast<mma884_layout>(HMMA_884); }
  scanline_layout* to_scanline() { return downcast<scanline_layout>(SCANLINE); }
  shared_layout* to_shared() { return downcast<shared_layout>(SHARED); }
  // accessors
  size_t get_rank() { return shape_.size(); }
  const shape_t& get_shape() const { return shape_; }
  const order_t& get_order() const { return order_; }
  const values_t& get_values() const { return values_;}
  int get_axis(size_t k) const { return axes_.at(k); }
  // (dropped the meaningless `const` on the by-value return type)
  int get_order(size_t k) const { return order_.at(k); }
  // find the position of given axis
  size_t find_axis(int to_find) const;
private:
  id_t id_;
  axes_t axes_;
  values_t values_;
protected:
  order_t order_;
  shape_t shape_;
};
// Layout for NVIDIA mma.884 (Volta tensor-core) tiles.
class mma884_layout: public data_layout {
public:
  mma884_layout(size_t num_warps,
                const std::vector<int>& axes,
                const std::vector<unsigned>& shapes,
                const std::vector<ir::value *> &values,
                analysis::align* align);
  void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); }
  // accessor
  // fragments-per-warp along dimension k
  int fpw(size_t k) { return fpw_.at(k); }
  // warps-per-tile along dimension k
  int wpt(size_t k) { return wpt_.at(k); }
private:
  std::vector<int> fpw_;
  std::vector<int> wpt_;
};
// Layout distributing elements across threads in a scanline
// (thread-contiguous) pattern.
struct scanline_layout: public data_layout {
  scanline_layout(size_t num_warps,
                  const std::vector<int>& axes,
                  const std::vector<unsigned>& shape,
                  const std::vector<ir::value *> &values,
                  analysis::align* align);
  void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); }
  // accessor
  // threads along dimension k
  int mts(size_t k) { return mts_.at(k); }
  // contiguous elements per thread along dimension k
  int nts(size_t k) { return nts_.at(k); }
public:
  std::vector<int> mts_;
  std::vector<int> nts_;
};
// Bookkeeping for double-buffered shared memory: the phi that selects
// between the two buffers, its first incoming value, and the value fed
// back from the loop latch.
struct double_buffer_info_t {
  ir::value* first;
  ir::value* latch;
  ir::phi_node* phi;
};
// Layout for values materialized in shared memory; detects when a
// loop-carried phi allows double buffering.
class shared_layout: public data_layout {
private:
  // whether phi receives a value from the block ending in terminator
  static bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator);
  // fills res when v participates in a double-bufferable pattern
  static void extract_double_bufferable(ir::value *v, std::shared_ptr<double_buffer_info_t>& res);
public:
  shared_layout(const data_layout *arg,
                const std::vector<int>& axes,
                const std::vector<unsigned>& shapes,
                const std::vector<ir::value *> &values_,
                ir::type *ty,
                analysis::align* align);
  void accept(layout_visitor* vst) { vst->visit_layout_shared(this); }
  // accessors
  // storage size (presumably bytes — confirm against allocation)
  size_t get_size() { return size_; }
  ir::type* get_type() { return ty_; }
  // non-null iff this layout is double-buffered
  double_buffer_info_t* get_double_buffer() { return double_buffer_.get(); }
private:
  size_t size_;
  ir::type *ty_;
  std::shared_ptr<double_buffer_info_t> double_buffer_;
};
// Groups IR values into layout classes (via graph_) and materializes a
// data_layout (HMMA, scanline or shared) for each group.
class layouts {
  typedef ir::value* node_t;
  typedef std::map <node_t, std::set<node_t>> graph_t;
private:
  // graph creation
  void connect(ir::value *x, ir::value *y);
  void make_graph(ir::instruction *i);
  void init_hmma_tile(data_layout& layouts);
  void init_scanline_tile(data_layout &layouts);
  // builds the data_layout for group `id` containing `values`
  void create(size_t id, const std::vector<ir::value*>& values);
public:
  // constructor
  layouts(analysis::axes *axes, analysis::align *align, size_t num_warps);
  // accessors
  // id of the layout group that value belongs to
  unsigned layout_of(ir::value *value) const { return groups_.at(value); }
  const std::vector<ir::value*>& values_of(unsigned id) const { return values_.at(id); }
  size_t num_layouts() const { return values_.size();}
  data_layout* get(size_t id) { return layouts_.at(id); }
  data_layout* get(ir::value *v) { return get(layout_of(v));}
  std::map<size_t, data_layout*> &get_all() { return layouts_; }
  // scratch layout id recorded for instruction i
  // (exact usage defined in the implementation)
  size_t tmp(ir::instruction* i) { return tmp_.at((ir::value*)i);}
  // execution
  void run(ir::module &mod);
private:
  analysis::axes* axes_;
  analysis::align* align_;
  size_t num_warps_;
  tools::graph<ir::value*> graph_;
  std::map<ir::value*, size_t> groups_;
  std::map<size_t, std::vector<ir::value*>> values_;
  std::map<size_t, data_layout*> layouts_;
  std::map<ir::value*, size_t> tmp_;
};
}
}
}
#endif

View File

@@ -0,0 +1,67 @@
#ifndef TDL_INCLUDE_IR_CODEGEN_LIVENESS_H
#define TDL_INCLUDE_IR_CODEGEN_LIVENESS_H
#include <map>
#include <set>
#include <vector>
#include "triton/codegen/analysis/layout.h"
#include "triton/tools/graph.h"
namespace triton{
namespace ir{
class value;
class phi_node;
class function;
class module;
class instruction;
}
namespace codegen{
namespace analysis{
typedef unsigned slot_index;
class tiles;
class layouts;
class data_layout;
struct segment {
slot_index start;
slot_index end;
bool contains(slot_index idx) const {
return start <= idx && idx < end;
}
bool intersect(const segment &Other){
return contains(Other.start) || Other.contains(start);
}
};
// Computes, for every shared layout, the segment of instruction slots
// during which its buffer must remain allocated.
class liveness {
private:
  typedef std::map<shared_layout*, segment> intervals_map_t;
public:
  // constructor
  liveness(layouts *l): layouts_(l){ }
  // accessors
  // all computed liveness intervals (valid after run())
  const intervals_map_t& get() const { return intervals_; }
  // interval of v; throws std::out_of_range if v was not analyzed
  segment get(shared_layout* v) const { return intervals_.at(v); }
  // run
  void run(ir::module &mod);
private:
  // analysis
  layouts *layouts_;
  intervals_map_t intervals_;
};
}
}
}
#endif

View File

@@ -0,0 +1,30 @@
#ifndef _TRITON_CODEGEN_PASS_H_
#define _TRITON_CODEGEN_PASS_H_

#include <list>

namespace triton{

namespace ir{
  class module;
}

namespace codegen{

// Base class for code-generation passes operating on an IR module.
class pass {
public:
  // virtual destructor: passes are stored and used as pass*
  // (see pass_manager), so delete-through-base must dispatch
  // to the derived class
  virtual ~pass() { }
  // runs the pass on m
  virtual void run(ir::module& m);
};

// Runs a sequence of passes over a module, in registration order.
class pass_manager {
public:
  // appends p to the pipeline; does not take ownership of p
  void add(pass* p);
  // runs every registered pass on m, in the order added
  void run(ir::module& m);
private:
  std::list<pass*> passes;
};

}
}

// NOTE: the original header was missing this #endif, leaving the
// include guard unterminated (a compile error in any including TU)
#endif

View File

@@ -0,0 +1,173 @@
#pragma once
#ifndef _TRITON_SELECTION_GENERATOR_H_
#define _TRITON_SELECTION_GENERATOR_H_
#include "triton/ir/visitor.h"
#include "triton/codegen/analysis/layout.h"
#include "triton/codegen/selection/machine_value.h"
#include <functional>
// forward
namespace llvm{
class Type;
class Value;
class Instruction;
class Constant;
class LLVMContext;
class Module;
class ConstantFolder;
class IRBuilderDefaultInserter;
template <typename T, typename Inserter>
class IRBuilder;
class ArrayType;
class Function;
}
namespace triton{
namespace codegen{
// forward
namespace analysis{
class liveness;
class tiles;
class align;
class allocation;
class cts;
class axes;
class layouts;
}
// typedef
typedef llvm::IRBuilder<llvm::ConstantFolder,
llvm::IRBuilderDefaultInserter> Builder;
typedef llvm::LLVMContext LLVMContext;
typedef llvm::Type Type;
typedef llvm::Value Value;
typedef llvm::Module Module;
typedef llvm::Instruction Instruction;
typedef llvm::Constant Constant;
typedef llvm::ArrayType ArrayType;
typedef llvm::Function Function;
typedef std::vector<Value*> indices_t;
// forward
class machine_data_layout;
class tile;
class shared_tile;
class distributed_tile;
class target;
}
}
namespace triton{
namespace codegen{
// Lowers Triton IR to LLVM IR. Implements ir::visitor (one visit_*
// method per instruction kind) and analysis::layout_visitor (one
// method per layout kind), using the results of the axes / layouts /
// alignment / allocation analyses.
class generator: public ir::visitor, public analysis::layout_visitor {
private:
  // applies fn to every index of the tile associated with x
  void for_each(ir::value *x, const std::function<void(indices_t)>& fn);
  // LLVM value stored at index idx of x's tile
  Value* get_value(ir::value *x, const indices_t& idx);
  void set_value(ir::value *x, const indices_t& idx, Value* v);
  // dot-product lowering strategies (tensor-core, scanline, outer product)
  void visit_hmma_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK);
  void visit_scanline_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add);
  void visit_outer_dot(ir::dot_inst*, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK,
                       Type *c_ty, Function *f_mul_add);
  void finalize_shared_layout(analysis::shared_layout*);
  void finalize_function(ir::function*);
  // wires up phi incoming values after all blocks are emitted
  void finalize_phi_node(ir::phi_node*);
public:
  generator(analysis::axes *a_axes,
            analysis::layouts *layouts,
            analysis::align *alignment,
            analysis::allocation *alloc,
            target *tgt,
            unsigned num_warps);
  void visit_value(ir::value* v);
  // --- ir::visitor: one method per instruction kind ---
  void visit_phi_node(ir::phi_node*);
  void visit_binary_operator(ir::binary_operator*);
  void visit_getelementptr_inst(ir::getelementptr_inst*);
  void visit_icmp_inst(ir::icmp_inst*);
  void visit_fcmp_inst(ir::fcmp_inst*);
  void visit_cast_inst(ir::cast_inst*);
  void visit_return_inst(ir::return_inst*);
  void visit_cond_branch_inst(ir::cond_branch_inst*);
  void visit_uncond_branch_inst(ir::uncond_branch_inst*);
  void visit_unmasked_load_inst(ir::unmasked_load_inst*);
  void visit_masked_load_inst(ir::masked_load_inst*);
  void visit_unmasked_store_inst(ir::unmasked_store_inst*);
  void visit_masked_store_inst(ir::masked_store_inst*);
  void visit_reshape_inst(ir::reshape_inst*);
  void visit_splat_inst(ir::splat_inst*);
  void visit_broadcast_inst(ir::broadcast_inst*);
  void visit_downcast_inst(ir::downcast_inst*);
  void visit_exp_inst(ir::exp_inst*);
  void visit_get_program_id_inst(ir::get_program_id_inst*);
  void visit_get_num_program_inst(ir::get_num_program_inst*);
  void visit_atomic_cas_inst(ir::atomic_cas_inst*);
  void visit_atomic_exch_inst(ir::atomic_exch_inst*);
  void visit_atomic_add_inst(ir::atomic_add_inst*);
  void visit_dot_inst(ir::dot_inst*);
  void visit_trans_inst(ir::trans_inst*);
  void visit_sqrt_inst(ir::sqrt_inst*);
  void visit_reduce_inst(ir::reduce_inst*);
  void visit_select_inst(ir::select_inst*);
  void visit_recoalesce_inst(ir::recoalesce_inst*);
  void visit_copy_to_shared_inst(ir::copy_to_shared_inst*);
  void visit_copy_from_shared_inst(ir::copy_from_shared_inst*);
  void visit_barrier_inst(ir::barrier_inst*);
  void visit_make_range_dyn(ir::make_range_dyn*);
  void visit_make_range(ir::make_range*);
  void visit_make_range_sta(ir::make_range_sta*);
  void visit_undef_value(ir::undef_value*);
  void visit_constant_int(ir::constant_int*);
  void visit_constant_fp(ir::constant_fp*);
  void visit_alloc_const(ir::alloc_const*);
  void visit_function(ir::function*);
  void visit_basic_block(ir::basic_block*);
  void visit_argument(ir::argument*);
  // --- layout_visitor: builds the machine layout for each kind ---
  void visit_layout_hmma_884(analysis::mma884_layout*);
  void visit_layout_scanline(analysis::scanline_layout*);
  void visit_layout_shared(analysis::shared_layout*);
  // entry point: lowers src into dst
  void visit(ir::module &, llvm::Module &);
private:
  LLVMContext *ctx_;
  Builder* builder_;
  Module *mod_;
  // machine counterpart of every analyzed layout
  std::map<const analysis::data_layout*, machine_data_layout*> machine_layouts_;
  analysis::axes *a_axes_;
  std::map<unsigned, distributed_axis> axes_;
  // IR value -> lowered LLVM value
  std::map<ir::value *, Value *> vmap_;
  // IR value -> lowered tile
  std::map<ir::value *, tile *> tmap_;
  target *tgt_;
  analysis::layouts *layouts_;
  analysis::align *alignment_;
  analysis::allocation *alloc_;
  // base pointer of the shared-memory arena
  Value *sh_mem_ptr_;
  unsigned num_warps_;
  // values already visited (guards against re-emission)
  std::set<ir::value*> seen_;
};
}
}
#endif

View File

@@ -0,0 +1,138 @@
#pragma once
#ifndef _TRITON_SELECTION_MACHINE_LAYOUT_H_
#define _TRITON_SELECTION_MACHINE_LAYOUT_H_
#include <map>
#include "triton/codegen/analysis/layout.h"
namespace llvm{
class Type;
class Value;
class Instruction;
class Constant;
class LLVMContext;
class Module;
class ConstantFolder;
class IRBuilderDefaultInserter;
template <typename T, typename Inserter>
class IRBuilder;
class ArrayType;
class Function;
}
namespace triton{
namespace ir{
class value;
}
namespace codegen{
namespace analysis{
class liveness;
class tiles;
class align;
class allocation;
class cts;
class axes;
class layouts;
}
typedef llvm::IRBuilder<llvm::ConstantFolder,
llvm::IRBuilderDefaultInserter> Builder;
typedef llvm::LLVMContext LLVMContext;
typedef llvm::Type Type;
typedef llvm::Value Value;
typedef llvm::Module Module;
typedef llvm::Instruction Instruction;
typedef llvm::Constant Constant;
typedef llvm::ArrayType ArrayType;
typedef llvm::Function Function;
class distributed_axis;
class machine_data_layout;
class tile;
class shared_tile;
class distributed_tile;
class target;
}
}
namespace triton{
namespace codegen{
// Abstract factory producing the LLVM-level tile for an IR value under
// a specific data layout.
class machine_data_layout {
public:
  // virtual destructor: concrete machine layouts are held through
  // machine_data_layout* (see generator::machine_layouts_), so
  // delete-through-base must dispatch to the derived class
  virtual ~machine_data_layout() { }
  virtual tile* create(ir::value *v) = 0;
};
// Machine counterpart of analysis::shared_layout: creates shared-memory
// tiles rooted at the shared-memory base pointer, with extra pointers
// for double buffering.
class machine_shared_layout: public machine_data_layout {
public:
  machine_shared_layout(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr,
                        analysis::shared_layout* layout,
                        std::map<ir::value *, Value *>& vmap,
                        std::map<ir::value *, tile *>& tmap);
  tile* create(ir::value *v);
  Module *mod_;
  Builder *builder_;
  target *tgt_;
  analysis::allocation* alloc_;
  // reference to the generator's shared-memory base pointer
  Value *&sh_mem_ptr_;
  analysis::shared_layout* layout_;
  std::map<ir::value *, Value *>& vmap_;
  std::map<ir::value *, tile *>& tmap_;
  Value *offset_;
  Value *ptr_;
  // previous/next buffer pointers (double buffering)
  Value *pre_ptr_;
  Value *next_ptr_;
};
// Machine counterpart of register-distributed layouts: creates
// distributed tiles from per-dimension axis information.
class machine_distributed_layout: public machine_data_layout {
public:
  machine_distributed_layout(Module *mod, Builder *builder, target *tgt,
                             analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
                             analysis::data_layout* layout);
  tile* create(ir::value *v);
  Module *mod_;
  Builder *builder_;
  target *tgt_;
  analysis::axes *a_axes_;
  // shared axis table owned by the generator
  std::map<unsigned, distributed_axis>& axes_;
  analysis::data_layout* layout_;
};
// Machine counterpart of analysis::mma884_layout (Volta tensor cores):
// precomputed offsets and packing factors for the A/B operands.
class machine_mma884_layout: public machine_distributed_layout {
public:
  machine_mma884_layout(Module *mod, Builder *builder,
                        target *tgt,
                        analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
                        analysis::mma884_layout* layout);
  // per-operand offsets along the i/j and k dimensions
  Value *offset_a_i_, *offset_a_k_;
  Value *offset_b_j_, *offset_b_k_;
  unsigned pack_size_0_;
  unsigned pack_size_1_;
  unsigned num_packs_0_;
  unsigned num_packs_1_;
};
// Machine counterpart of analysis::scanline_layout; adds no state over
// the distributed base class.
class machine_scanline_layout: public machine_distributed_layout {
public:
  machine_scanline_layout(Module *mod, Builder *builder,
                          target *tgt,
                          analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
                          analysis::scanline_layout* layout);
};
}
}
#endif

View File

@@ -0,0 +1,152 @@
#pragma once
#ifndef _TRITON_SELECTION_MACHINE_VALUE_H_
#define _TRITON_SELECTION_MACHINE_VALUE_H_
#include <vector>
#include <map>
#include <functional>
namespace llvm{
class Type;
class Value;
class Instruction;
class Constant;
class LLVMContext;
class Module;
class ConstantFolder;
class IRBuilderDefaultInserter;
template <typename T, typename Inserter>
class IRBuilder;
class ArrayType;
class Function;
}
namespace triton{
namespace codegen{
typedef llvm::IRBuilder<llvm::ConstantFolder,
llvm::IRBuilderDefaultInserter> Builder;
typedef llvm::LLVMContext LLVMContext;
typedef llvm::Type Type;
typedef llvm::Value Value;
typedef llvm::Module Module;
typedef llvm::Instruction Instruction;
typedef llvm::Constant Constant;
typedef llvm::ArrayType ArrayType;
typedef llvm::Function Function;
}
}
namespace triton{
namespace codegen{
namespace analysis{
class liveness;
class tiles;
class align;
class allocation;
class cts;
class axes;
class layouts;
}
class distributed_axis;
class machine_data_layout;
class tile;
class shared_tile;
class distributed_tile;
class target;
typedef std::vector<Value*> indices_t;
}
}
namespace triton{
namespace codegen{
// One dimension of a distributed tile: the per-thread index values
// along that dimension, their contiguity, and the owning thread id.
struct distributed_axis {
  int contiguous;
  std::vector<Value*> values;
  Value* thread_id;
};
// Abstract n-dimensional tile of LLVM values with element type ty_ and
// logical shape shapes_. Concrete storage is provided by shared_tile
// (shared memory) and distributed_tile (registers).
class tile {
protected:
  typedef std::vector<unsigned> shapes_t;
public:
  tile(Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ }
  // virtual destructor: tiles are handled polymorphically through
  // tile* (e.g. the value->tile maps in selection), so deleting
  // through the base pointer must dispatch to the derived class
  virtual ~tile() { }
  // stores v at position idx
  virtual void set_value(indices_t idx, Value *v) = 0;
  // loads the value at position idx
  virtual Value* get_value(indices_t idx) = 0;
  Type *get_ty() const { return ty_; }
  shapes_t get_shapes() const { return shapes_; }
protected:
  Type *ty_;
  shapes_t shapes_;
};
// Tile backed by shared memory. Indices are split into a constant and a
// non-constant part so pointers for the constant part can be cached.
class shared_tile: public tile {
private:
  // splits arg into its constant and non-constant addends
  void extract_constant(Value *arg, Value *&non_cst, Value *&cst);
  void extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx);
public:
  shared_tile(Type* ty, const shapes_t &shapes, const std::vector<int> &order, Value* ptr, Builder &builder, Value* offset = nullptr, const std::vector<int>& perm = {});
  void set_vector_size(unsigned vector_size);
  // whether get_value returns whole vectors or scalar elements
  void set_return_mode(bool return_vector);
  void set_value(indices_t, Value *);
  // address of the element at idx (no load performed)
  Value* get_ptr_to(indices_t idx);
  Value* get_value(indices_t idx);
  Value* get_pointer() { return ptr_; }
  Value* get_offset() { return offset_; }
  const std::vector<int>& get_perm() { return perm_; }
  const std::vector<int>& get_order() { return order_; }
  // linear offset of idx given shape, permutation and dimension order
  static Value* shared_offset(Builder& builder, const shapes_t& shapes, const std::vector<int>& perm, const std::vector<int>& order, indices_t idx);
private:
  Value *ptr_;
  bool return_vector_;
  Builder &builder_;
  Value *offset_;
  // cache of pointers keyed by (constant part of) index
  std::map<indices_t, Value*> ptr_cache_;
  unsigned vector_size_;
  std::vector<int> order_;
  std::vector<int> perm_;
};
// Distributed tile: elements live in registers, spread across threads
// according to one distributed_axis per dimension.
class distributed_tile: public tile{
  typedef std::vector<distributed_axis> axes_t;
  typedef std::vector<indices_t> ordered_indices_vec_t;
  typedef std::map<indices_t, unsigned> indices_map_t;
  typedef std::map<indices_t, Value*> values_map_t;
private:
  // precomputes the index set owned by the calling thread
  void init_indices();
public:
  distributed_tile(Type *ty, const shapes_t& shapes, const std::vector<int>& order, const axes_t &axes, Builder &builder);
  void set_value(indices_t idx, Value *v);
  Value* get_value(indices_t idx);
  const std::vector<int>& get_order() { return order_; }
  // position of idx in the linearized (ordered) index sequence
  unsigned get_linear_index(indices_t idx);
  indices_t get_ordered_indices(unsigned id);
  // applies fn to the thread-local indices in [start, end)
  // (end == -1 means "to the last index")
  void for_each(std::function<void(indices_t)> fn, int start = 0, int end = -1);
  void for_each(std::function<void(indices_t)> fn, std::vector<int> start, std::vector<int> size);
  const distributed_axis &axis(unsigned dim) { return axes_.at(dim); }
private:
  axes_t axes_;
  std::vector<int> order_;
  indices_map_t indices_;
  values_map_t values_;
  ordered_indices_vec_t ordered_indices_;
  Builder &builder_;
};
}
}
#endif

View File

@@ -0,0 +1,98 @@
#ifndef TDL_INCLUDE_IR_CODEGEN_TARGET_H
#define TDL_INCLUDE_IR_CODEGEN_TARGET_H
namespace llvm{
class Type;
class Value;
class Instruction;
class Constant;
class LLVMContext;
class Module;
class ConstantFolder;
class IRBuilderDefaultInserter;
template <typename T, typename Inserter>
class IRBuilder;
class ArrayType;
class Function;
}
// typedefs
namespace triton{
namespace codegen{
typedef llvm::IRBuilder<llvm::ConstantFolder,
llvm::IRBuilderDefaultInserter> Builder;
typedef llvm::LLVMContext LLVMContext;
typedef llvm::Type Type;
typedef llvm::Value Value;
typedef llvm::Module Module;
typedef llvm::Instruction Instruction;
typedef llvm::Constant Constant;
typedef llvm::ArrayType ArrayType;
typedef llvm::Function Function;
}
}
namespace triton{
namespace codegen{
// Abstraction of the compilation target: emits the target-specific
// LLVM IR for kernel setup, barriers, and id/offset queries.
class target {
public:
  target(bool is_gpu): is_gpu_(is_gpu){}
  virtual ~target() {}
  // marks fn as a kernel entry point for this target
  virtual void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn) = 0;
  virtual Instruction* add_barrier(Module *module, Builder& builder) = 0;
  virtual Instruction* add_memfence(Module *module, Builder& builder) = 0;
  // global offset of the current block along axis ax, scaled by stride
  virtual Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax) = 0;
  virtual Value* get_local_id(Module *module, Builder& builder, unsigned ax) = 0;
  virtual Value* get_block_id(Module *module, Builder& builder, unsigned ax) = 0;
  virtual Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) = 0;
  // minimum pointer alignment the target guarantees for kernel args
  virtual unsigned guaranteed_alignment() = 0;
  bool is_gpu() const;
private:
  bool is_gpu_;
};
class amd_cl_target: public target {
public:
amd_cl_target(): target(true){}
void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn);
Instruction* add_barrier(Module *module, Builder& builder);
Instruction* add_memfence(Module *module, Builder& builder);
Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax);
Value* get_local_id(Module *module, Builder& builder, unsigned ax);
Value* get_block_id(Module *module, Builder& builder, unsigned ax);
Value* get_num_blocks(Module *module, Builder& builder, unsigned ax);
unsigned guaranteed_alignment() { return 16; }
};
class nvidia_cu_target: public target {
public:
nvidia_cu_target(): target(true){}
void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn);
Instruction* add_barrier(Module *module, Builder& builder);
Instruction* add_memfence(Module *module, Builder& builder);
Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax);
Value* get_local_id(Module *module, Builder& builder, unsigned ax);
Value* get_block_id(Module *module, Builder& builder, unsigned ax);
Value* get_num_blocks(Module *module, Builder& builder, unsigned ax);
unsigned guaranteed_alignment() { return 16; }
};
class cpu_target: public target {
public:
cpu_target(): target(false){}
void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn);
Instruction* add_barrier(Module *module, Builder& builder);
Instruction* add_memfence(Module *module, Builder& builder);
Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax);
Value* get_local_id(Module *module, Builder& builder, unsigned ax);
Value* get_block_id(Module *module, Builder& builder, unsigned ax);
Value* get_num_blocks(Module *module, Builder& builder, unsigned ax);
unsigned guaranteed_alignment() { return 1; }
};
}
}
#endif

View File

@@ -0,0 +1,47 @@
#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H
#define TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H
#include <map>
#include <set>
#include <vector>
namespace triton {
namespace ir {
class module;
class value;
class io_inst;
class instruction;
class builder;
}
namespace codegen{
namespace analysis{
class align;
class layouts;
class cts;
}
namespace transform{
// Transform pass that rewrites memory accesses so they can be coalesced,
// rematerializing values under a more favorable layout where needed.
class coalesce {
private:
  // collects the io instructions reachable from v into result
  void extract_io_use(ir::value *v, std::set<ir::io_inst*>& result);
  void extract_ld(ir::io_inst *i, std::map<int, std::vector<triton::ir::io_inst *> > &result);
  // recreates the computation of v (memoized via seen)
  ir::value* rematerialize(ir::value *v, ir::builder& builder, std::map<ir::value*, ir::value*>& seen);
public:
  coalesce(analysis::align* align, triton::codegen::analysis::layouts *layouts);
  void run(ir::module &mod);
private:
  analysis::align* align_;
  analysis::layouts* layout_;
};
}
}
}
#endif

View File

@@ -0,0 +1,28 @@
#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
#include <set>
#include <map>
namespace triton {
namespace ir {
class module;
class value;
class phi_node;
class instruction;
}
namespace codegen{
namespace transform{
// "Copy to shared" transform pass: stateless; all work happens in run()
// (defined in the implementation file).
class cts {
public:
  void run(ir::module &mod);
};
}
}
}
#endif

View File

@@ -0,0 +1,24 @@
#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H
#define TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H
namespace triton {
namespace ir {
class module;
}
namespace codegen{
namespace transform{
// Dead-code-elimination transform pass; stateless.
class dce {
public:
  dce() {}
  void run(ir::module &mod);
};
}
}
}
#endif

View File

@@ -0,0 +1,22 @@
#ifndef _TRITON_SELECTION_TRANSFORM_DISASSOCIATE_H_
#define _TRITON_SELECTION_TRANSFORM_DISASSOCIATE_H_
namespace triton {
namespace ir {
class module;
}
namespace codegen{
namespace transform{
// Transform pass that disassociates values (see implementation for the
// exact rewrite); stateless.
class disassociate {
public:
  void run(ir::module &mod);
};
}
}
}
#endif

View File

@@ -0,0 +1,59 @@
#ifndef TDL_INCLUDE_CODEGEN_BARRIERS_H
#define TDL_INCLUDE_CODEGEN_BARRIERS_H

// required for std::pair / std::vector / std::set used below
// (the original header declared none and relied on transitive includes)
#include <set>
#include <utility>
#include <vector>

namespace triton {

namespace ir {
  class module;
  class basic_block;
  class instruction;
  class value;
  class builder;
}

namespace codegen{

namespace analysis{
  class allocation;
  class liveness;
  class layouts;
  class cts;
}

namespace transform{

// Decides where memory barriers must be inserted, via a dataflow
// analysis over read/written shared-memory intervals.
class membar {
private:
  // [begin, end) interval of shared-memory offsets
  typedef std::pair<unsigned, unsigned> interval_t;
  typedef std::vector<interval_t> interval_vec_t;

private:
  // union of the given interval sets
  interval_vec_t join(const std::vector<interval_vec_t>& intervals);
  // inserts a barrier immediately before instr
  void insert_barrier(ir::instruction *instr, ir::builder &builder);
  bool intersect(const interval_vec_t &X, interval_t x);
  bool intersect(const interval_vec_t &X, const interval_vec_t &Y);
  // appends the interval referenced by v (if any) to res
  void add_reference(ir::value *v, interval_vec_t &res);
  void get_read_intervals(ir::instruction *i, interval_vec_t &res);
  void get_written_intervals(ir::instruction *i, interval_vec_t &res);
  // transfer function over one basic block: returns the updated
  // (written, read) interval sets and records barrier insertion points
  std::pair<interval_vec_t, interval_vec_t> transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from,
                                                     std::set<ir::instruction *> &insert_loc, std::set<triton::ir::value *> &safe_war);

public:
  membar(analysis::liveness *liveness, analysis::layouts *layouts, analysis::allocation *alloc):
    liveness_(liveness), layouts_(layouts), alloc_(alloc) {}
  void run(ir::module &mod);

private:
  analysis::liveness *liveness_;
  analysis::layouts *layouts_;
  analysis::allocation *alloc_;
};

}
}
}

#endif

View File

@@ -0,0 +1,43 @@
#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H
#define TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H
namespace triton {
namespace ir {
class module;
class value;
class instruction;
class trans_inst;
class builder;
class constant_int;
class dot_inst;
}
namespace codegen{
namespace transform{
// Peephole transform pass: each rewrite_* rule attempts one local
// rewrite on an instruction and returns whether it fired.
class peephole {
private:
  bool rewrite_cts_cfs(ir::instruction *value, ir::builder &builder);
  bool rewrite_trans_phi(ir::instruction* value, ir::builder &builder);
  bool rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
  bool rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
  bool rewrite_dot(ir::instruction *value, ir::builder& builder);
  bool rewrite_mult(ir::instruction *value, ir::builder& builder);
  // reductions over a unit dimension
  bool rewrite_unit_red(ir::instruction *value, ir::builder& builder);
  bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder);
private:
public:
  peephole() {}
  void run(ir::module &mod);
};
}
}
}
#endif

View File

@@ -0,0 +1,49 @@
#ifndef TDL_INCLUDE_IR_CODEGEN_REASSOCIATE_H
#define TDL_INCLUDE_IR_CODEGEN_REASSOCIATE_H
#include <map>
#include <set>
#include <vector>
namespace triton {
// forward declaration
namespace ir {
class module;
class value;
class builder;
class instruction;
class getelementptr_inst;
}
namespace codegen{
namespace analysis{
class tiles;
class align;
}
namespace transform{
// Transform pass that reassociates pointer arithmetic so constant
// offsets can be separated from dynamic ones.
class reassociate {
  // for a pointer, its dynamic part and the static (constant-offset) GEP
  struct cst_info {
    ir::value* dyn_ptr;
    ir::getelementptr_inst* sta_ptr;
  };
private:
  // returns x as an instruction if it is a binary add, else null
  // (exact contract defined in the implementation)
  ir::instruction* is_bin_add(ir::value *x);
  ir::value *reassociate_idx(ir::value *value, ir::builder &builder, ir::value *&noncst, ir::value *&cst);
  ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map<ir::value*, cst_info> &offsets);
public:
  void run(ir::module& module);
};
}
}
}
#endif

137
include/triton/driver/backend.h Executable file
View File

@@ -0,0 +1,137 @@
#pragma once
#ifndef _TRITON_DRIVER_BACKEND_H_
#define _TRITON_DRIVER_BACKEND_H_
#include <map>
#include <list>
#include <vector>
#include "triton/driver/context.h"
namespace llvm
{
class Module;
}
namespace triton
{
namespace driver
{
class buffer;
class stream;
class device;
class context;
class platform;
class module;
class kernel;
// Global registry of driver objects (platforms, devices, contexts,
// streams, modules, kernels), each memoized in a static cache_.
struct backend
{
  // platforms
  class platforms
  {
    friend class backend;
  private:
    static void init();
  public:
    static void get(std::vector<driver::platform*> &results);
  private:
    static std::vector<driver::platform*> cache_;
  };
  // devices
  class devices
  {
    friend class backend;
  private:
    static void init(const std::vector<platform *> &platforms);
  public:
    static void get(std::vector<driver::device*>& devs);
  private:
    static std::vector<driver::device*> cache_;
  };
  // modules
  class modules
  {
    friend class backend;
  public:
    static void release();
  private:
    // keyed by (stream, source) pair
    static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
  };
  // kernels
  class kernels
  {
    friend class backend;
  public:
    static void release();
    // kernel `name` in mod, created on first use and cached
    static driver::kernel* get(driver::module* mod, const std::string & name);
  private:
    static std::map<std::tuple<module*, std::string>, driver::kernel*> cache_;
  };
  // contexts
  class contexts
  {
    friend class backend;
  private:
    static void init(const std::vector<device *> &);
    static void release();
  public:
    static driver::context* get_default();
    // wraps an externally-created CUcontext, reusing a cached wrapper
    // when the same context was imported before; the new wrapper does
    // not take ownership of ctx
    static driver::context* import(CUcontext ctx)
    {
      for(driver::context* x: cache_){
        driver::cu_context* cu_x = (driver::cu_context*)x;
        if(*cu_x->cu()==ctx)
          return x;
      }
      cache_.emplace_back(new driver::cu_context(ctx, false));
      return cache_.back();
    }
    static void get(std::list<driver::context*> &);
  private:
    static std::list<driver::context*> cache_;
  };
  // streams
  class streams
  {
    friend class backend;
  private:
    static void init(std::list<context*> const &);
    static void release();
  public:
    static void get(driver::context*, std::vector<driver::stream *> &streams);
    static driver::stream* get(driver::context*, unsigned int id = 0);
    static driver::stream* get_default();
  private:
    // streams cached per context
    static std::map<driver::context*, std::vector<driver::stream*> > cache_;
  };
  static void init();
  static void release();
  // waits for all work on the given context to finish
  static void synchronize(triton::driver::context *);
  static unsigned int default_device;
};
}
}
#endif

57
include/triton/driver/buffer.h Executable file
View File

@@ -0,0 +1,57 @@
#pragma once
#ifndef _TRITON_DRIVER_BUFFER_H_
#define _TRITON_DRIVER_BUFFER_H_
#include "triton/driver/handle.h"
#include "triton/driver/context.h"
namespace triton
{
namespace driver
{
class stream;
// Base
// Base device-buffer abstraction; wraps one of a CUDA device pointer,
// an OpenCL cl_mem, or a host allocation via polymorphic_resource.
class buffer : public polymorphic_resource<CUdeviceptr, cl_mem, host_buffer_t> {
public:
  buffer(driver::context* ctx, size_t size, CUdeviceptr cl, bool take_ownership);
  buffer(driver::context* ctx, size_t size, cl_mem cl, bool take_ownership);
  buffer(driver::context* ctx, size_t size, host_buffer_t hst, bool take_ownership);
  // allocates a buffer of `size` bytes appropriate for ctx's backend
  static buffer* create(driver::context* ctx, size_t size);
  driver::context* context();
  size_t size();
protected:
  driver::context* context_;
  // size in bytes
  size_t size_;
};
// CPU
// CPU (host-memory) buffer.
class host_buffer: public buffer
{
public:
  host_buffer(driver::context* context, size_t size);
};
// OpenCL
// OpenCL buffer.
class ocl_buffer: public buffer
{
public:
  ocl_buffer(driver::context* context, size_t size);
};
// CUDA
// CUDA buffer.
class cu_buffer: public buffer
{
public:
  cu_buffer(driver::context* context, size_t size);
  // wraps an existing device pointer, optionally taking ownership
  cu_buffer(driver::context* context, size_t size, CUdeviceptr cu, bool take_ownership);
  // zero-fills the first `size` bytes on the given stream
  void set_zero(triton::driver::stream *queue, size_t size);
};
}
}
#endif

70
include/triton/driver/context.h Executable file
View File

@@ -0,0 +1,70 @@
#pragma once
#ifndef _TRITON_DRIVER_CONTEXT_H_
#define _TRITON_DRIVER_CONTEXT_H_
#include "triton/driver/device.h"
#include "triton/driver/handle.h"
namespace triton
{
namespace driver
{
// Base context: wraps a CUDA, OpenCL or host context handle and remembers
// the device it was created on plus an on-disk cache directory.
class context: public polymorphic_resource<CUcontext, cl_context, host_context_t>{
protected:
// Computes the directory used for on-disk caching — semantics live in the
// .cpp; presumably compiled-kernel caching.
static std::string get_cache_path();
public:
// Wrap an existing backend handle; `take_ownership` controls whether the
// handle is released when this object is destroyed.
context(driver::device *dev, CUcontext cu, bool take_ownership);
context(driver::device *dev, cl_context cl, bool take_ownership);
context(driver::device *dev, host_context_t hst, bool take_ownership);
// Device this context was created on (non-owning).
driver::device* device() const;
// Cache directory computed at construction (stored in cache_path_).
std::string const & cache_path() const;
// factory methods
// Instantiates the backend-appropriate subclass for `dev`.
static context* create(driver::device *dev);
protected:
driver::device* dev_;
std::string cache_path_;
};
// Host
// Context for the CPU execution path.
class host_context: public context {
public:
host_context(driver::device* dev);
};
// CUDA
class cu_context: public context {
public:
// Scoped guard holding a reference to a CUDA context for the duration of a
// scope — looks like an RAII push/pop of the thread's current CUDA context;
// confirm the exact cuCtx* calls in the .cpp.
class context_switcher{
public:
context_switcher(driver::context const & ctx);
~context_switcher();
private:
// Context made current while this guard is alive (non-owning reference).
driver::cu_context const & ctx_;
};
private:
// Looks up the CUdevice that a given CUcontext belongs to.
static CUdevice get_device_of(CUcontext);
public:
//Constructors
// Wrap an existing CUcontext; owns (and destroys) it by default.
cu_context(CUcontext cu, bool take_ownership = true);
// Create a new context on the given device.
cu_context(driver::device* dev);
};
// OpenCL
// Context for the OpenCL execution path.
class ocl_context: public context {
public:
ocl_context(driver::device* dev);
};
}
}
#endif

View File

@@ -20,19 +20,19 @@
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef ISAAC_DRIVER_CUBLAS_H
#define ISAAC_DRIVER_CUBLAS_H
#ifndef TDL_INCLUDE_DRIVER_CUBLAS_H
#define TDL_INCLUDE_DRIVER_CUBLAS_H
#include "isaac/templates/common.hpp"
#include "isaac/driver/dispatch.h"
#include "isaac/driver/buffer.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/error.h"
#include "isaac/tools/bench.hpp"
#include "isaac/tools/collections.hpp"
#include "triton/driver/dispatch.h"
#include "triton/driver/buffer.h"
#include "triton/driver/stream.h"
#include "triton/driver/backend.h"
#include "triton/driver/error.h"
#include "triton/tools/bench.hpp"
#include "triton/tools/collections.hpp"
namespace isaac
namespace triton
{
namespace driver
{
@@ -51,7 +51,7 @@ static const std::vector<cublasGemmAlgo_t> cublasAlgorithms = {
static const std::map<DType, cudaDataType> cudtype = {{FLOAT_TYPE, CUDA_R_32F}, {DOUBLE_TYPE,CUDA_R_64F}};
static const std::map<char, cublasOperation_t> cuop = {{'N', CUBLAS_OP_N}, {'T', CUBLAS_OP_T}};
inline cublasGemmAlgo_t cublasGemmFastest(Stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
inline cublasGemmAlgo_t cublasGemmFastest(stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
void* beta, CUdeviceptr C, int32_t ldc){
@@ -84,7 +84,7 @@ inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperati
/* Simplified API for default GEMM */
inline void cublasGemm(DType dtype, Stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){
inline void cublasGemm(DType dtype, stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, cu_buffer const & A, int32_t lda, cu_buffer const & B, int32_t ldb, scalar beta, cu_buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){
ContextSwitcher ctx_switch(stream.context());
cublasHandle_t handle = dispatch::cublasHandle(stream.context());
dispatch::cublasSetStream_v2(handle, (CUstream)stream);
@@ -111,9 +111,9 @@ inline cudnnTensorFormat_t format(cudnnDataType_t cutype){
}
}
inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
inline void cudnnConv(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, cu_buffer const & I, cu_buffer const & F, scalar beta, cu_buffer const & O){
driver::driver::context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
@@ -154,16 +154,16 @@ inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t
size_t workspace_size;
dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size);
static Buffer work(ctx, 1024*1024*64);
static cu_buffer work(ctx, 1024*1024*64);
CUdeviceptr twork = work;
CUdeviceptr pI = I, pF = F, pO = O;
dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO);
}
inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, scalar beta, Buffer const & O){
driver::Context const & ctx = stream.context();
inline void cudnnPool(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, cu_buffer const & I, scalar beta, cu_buffer const & O){
driver::driver::context const & ctx = stream.context();
ContextSwitcher switch_ctx(ctx);
std::vector<int> pad = {pad_d, pad_h, pad_w};
@@ -200,11 +200,11 @@ inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t
dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}
inline void cudnnTransformTensor(driver::Stream & stream,
inline void cudnnTransformTensor(driver::cu_stream & stream,
DType in_dtype, DType out_dtype,
cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout,
int32_t N, int32_t C, int32_t D, int32_t H, int32_t W,
scalar alpha, driver::Buffer const & I, scalar beta, driver::Buffer& O)
scalar alpha, driver::cu_buffer const & I, scalar beta, driver::cu_buffer& O)
{
cudnnHandle_t handle = dispatch::cudnnHandle(stream.context());
dispatch::cudnnSetStream(handle, (CUstream)stream);

110
include/triton/driver/device.h Executable file
View File

@@ -0,0 +1,110 @@
#pragma once
#ifndef _TRITON_DRIVER_DEVICE_H_
#define _TRITON_DRIVER_DEVICE_H_
#include "triton/driver/platform.h"
#include "triton/driver/handle.h"
namespace triton
{
namespace codegen
{
class target;
}
namespace driver
{
class context;
// Base device
// Abstract device handle; concrete subclasses wrap CUDA, OpenCL or host
// devices. NOTE(review): instances appear to be used polymorphically —
// confirm polymorphic_resource declares a virtual destructor, otherwise one
// should be added here to avoid UB on delete-through-base.
class device: public polymorphic_resource<CUdevice, cl_device_id, host_device_t>{
public:
using polymorphic_resource::polymorphic_resource;
// Maximum number of threads a single block may contain on this device.
virtual size_t max_threads_per_block() const = 0;
// Bytes of shared/scratchpad memory available per block (0 if none).
virtual size_t max_shared_memory() const = 0;
// Creates the code-generation target matching this device.
virtual std::unique_ptr<codegen::target> make_target() const = 0;
};
// Host device
class host_device: public device {
public:
host_device(): device(host_device_t(), true){ }
size_t max_threads_per_block() const { return 1; }
size_t max_shared_memory() const { return 0; }
std::unique_ptr<codegen::target> make_target() const;
};
// OpenCL device
class ocl_device: public device {
public:
ocl_device(cl_device_id cl, bool take_ownership = true): device(cl, take_ownership) { }
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
std::unique_ptr<codegen::target> make_target() const;
};
// CUDA device
class cu_device: public device {
public:
//Supported architectures
enum class Architecture{
//NVidia
SM_2_0,
SM_2_1,
SM_3_0,
SM_3_5,
SM_3_7,
SM_5_0,
SM_5_2,
SM_6_0,
SM_6_1,
SM_7_0,
UNKNOWN
};
private:
//Metaprogramming elper to get cuda info from attribute
template<CUdevice_attribute attr>
int cuGetInfo() const;
inline Architecture nv_arch(std::pair<unsigned int, unsigned int> sm) const;
inline nvmlDevice_t nvml_device() const;
public:
cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){}
// Accessors
Architecture architecture() const;
// Informations
std::string infos() const;
size_t address_bits() const;
std::vector<size_t> max_block_dim() const;
size_t warp_size() const;
// Compute Capability
void interpret_as(std::pair<size_t, size_t> cc);
std::pair<size_t, size_t> compute_capability() const;
// Identifier
std::string name() const;
std::string pci_bus_id() const;
// Clocks
size_t current_sm_clock() const;
size_t current_mem_clock() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
void set_max_clock();
// Target
std::unique_ptr<codegen::target> make_target() const;
private:
std::shared_ptr<std::pair<size_t, size_t>> interpreted_as_;
};
}
}
#endif

Some files were not shown because too many files have changed in this diff Show More