From b6af06910da8f1433ae8d28ac97b1587862a4cc2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 25 Apr 2019 16:24:16 -0400 Subject: [PATCH] [examples] deleted placeholders for not implemented examples --- examples/cpp/blocksparse.cpp | 153 ----------------------------------- 1 file changed, 153 deletions(-) delete mode 100644 examples/cpp/blocksparse.cpp diff --git a/examples/cpp/blocksparse.cpp b/examples/cpp/blocksparse.cpp deleted file mode 100644 index 5a816aff1..000000000 --- a/examples/cpp/blocksparse.cpp +++ /dev/null @@ -1,153 +0,0 @@ -#include -#include -#include "common.hpp" -#include "triton/jit.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" - -const char* src = -R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {8}; -const tunable int32 TK = {8}; - -void blocksparse(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, - int32 M, int32 N, int32 K, int32 bound){ - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 C[TM, TN] = 0; - fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis]; - fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis]; - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; - for(int32 k = K; k > 0;){ - C = dot(a, trans(b), C); - pa = pa + TK*M; - pb = pb + TK*N; - k = k - TK; - int1 checka[TM, TK] = k > bound; - int1 checkb[TN, TK] = k > bound; - @checka a = *pa; - @checkb b = *pb; - if(k > bound) - continue; - int1 checka0[TM] = rxa < M; - int1 checka1[TK] = rka < k; - int1 checkb0[TN] = ryb < N; - int1 checkb1[TK] = rkb < k; - checka = checka0[:, newaxis] && checka1[newaxis, :]; - checkb = checkb0[:, newaxis] && checkb1[newaxis, :]; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -} -)"; - -std::vector make_deltas(std::vector mask, int K, int N){ - std::vector>> pairs(N); - unsigned int current = 0; - for(int k = 0; k < K; k++) - for(int n = 0; n < N; n++){ - if(mask[k + n*K]) - pairs[n].push_back({current, k}); - } -} - -int main() { - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::jit jit(context); - - - // matrix multiplication parameters - int32_t M = 512, N = 32, K = 2048; - std::vector hc(M*N); - std::vector rc(M*N); - std::vector ha(M*K); - std::vector hb(K*N); - srand(0); - for(size_t i = 0; i < ha.size(); i++) - ha[i] = (float)rand()/RAND_MAX; - for(size_t i = 0; i < hb.size(); i++) - hb[i] = (float)rand()/RAND_MAX; - for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); - triton::driver::stream* stream = triton::driver::stream::create(context); - stream->write(da, true, 0, ha); - stream->write(db, true, 0, hb); - stream->write(dc, true, 0, hc); - stream->synchronize(); - - - // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; - // fast bounds-checking - unsigned TK = jit.get_int("TK"); - unsigned lasti = (grid[0]*TM - 1)*TM + TM - 1; - unsigned lastj = (grid[1]*TN - 1)*TN + TN - 1; - unsigned lastk = TK - 1; - bool AT = false; - bool BT = true; - unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk; - unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; - int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); - // set argument - kernel->setArg(0, da); - kernel->setArg(1, db); - kernel->setArg(2, dc); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, bound); - // dry run - stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); - // benchmark - double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, *context->device()); - ts = ts * 1e-9; - double tflops = 2.*M*N*K / ts * 1e-12; - return tflops; - }; - - - // just-in-time compile source-code - std::vector params = { - 16, 2, 64, - 32, 2, 64, - 16, 8, 2, 2, - 8, 8, - 4 - }; - jit.autotune("matmul",src, benchmark); - jit.add_module("matmul", src, params); - triton::driver::kernel* kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; - stream->read(dc, true, 0, hc); - simple_gemm(rc, ha, hb, M, N, K); - for(size_t i = 0; i < M*N; i++) - if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << "Pass!" << std::endl; -}