#include #include #include #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/tools/bench.hpp" #include "triton/external/half.hpp" #include "triton/runtime/function.h" #include "cuda.h" template void diff(const std::vector& x, const std::vector& y){ for(size_t i = 0; i < x.size(); i++) if(std::isnan(x[i]) || std::abs(x[i] - y[i])/std::max(x[i], y[i]) > 1e-4){ std::cout << i << " " << x[i] << " " << y[i] << std::endl; exit(EXIT_FAILURE); } std::cout << "Pass!" << std::endl; } template static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ for(size_t m = 0; m < M; m++) for(size_t n = 0; n < N; n++){ float acc = 0; for(size_t k = 0; k < K; k++) acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? b[n + k*N] : b[k + n*K]); c[m + n*M] = static_cast(acc); } } template void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, std::vector &c, const std::vector &a, const std::vector &b) { if(AT_ && BT_) cpu_ref(c, a, b, M, N, K); else if(AT_ && !BT_) cpu_ref(c, a, b, M, N, K); else if(!AT_ && BT_) cpu_ref(c, a, b, M, N, K); else cpu_ref(c, a, b, M, N, K); } std::string src = R"( #ifdef AT #define USEA ^a #else #define USEA a #endif #ifdef BT #define USEB ^b #else #define USEB b #endif void dot(TYPE * A __noalias __readonly __aligned(16), TYPE * B __noalias __readonly __aligned(16), TYPE * C __noalias __readonly __aligned(16), int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc) { int ridx = get_program_id(0); int ridy = get_program_id(1); int rxa[TM] = ridx * TM + 0 ... TM; int ryb[TN] = ridy * TN + 0 ... TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; float xc[TM, TN] = 0; #ifdef AT TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; TYPE a[TK, TM] = *pa; #else TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; TYPE a[TM, TK] = *pa; #endif #ifdef BT TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; TYPE b[TN, TK] = *pb; #else TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; TYPE b[TK, TN] = *pb; #endif for(int k = K; k > 0; k = k - TK){ xc = USEA @ USEB + xc; #ifdef AT pa = pa + TK; #else pa = pa + TK*lda; #endif #ifdef BT pb = pb + TK*ldb; #else pb = pb + TK; #endif a = *pa; b = *pb; } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; TYPE c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; *pc = c; } )"; struct perf_t { double triton; double cublas; }; namespace drv = triton::driver; namespace rt = triton::runtime; perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef half NumericT; std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); std::vector hc(M*N); std::vector ha(M*K); std::vector hb(K*N); int32_t lda = AT ? K : M; int32_t ldb = BT ? N : K; int32_t ldc = M; srand(0); for(size_t i = 0; i < ha.size(); i++) ha[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hb.size(); i++) hb[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) hc[i] = static_cast((double)0); drv::buffer* dc = drv::buffer::create(context, hc.size()*dt_nbytes); drv::buffer* da = drv::buffer::create(context, ha.size()*dt_nbytes); drv::buffer* db = drv::buffer::create(context, hb.size()*dt_nbytes); stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); // run rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); if(AT) opt.defines.push_back({"AT", {""}}); if(BT) opt.defines.push_back({"BT", {""}}); opt.defines.push_back({"TM", {"32"}}); opt.defines.push_back({"TN", {"32"}}); opt.defines.push_back({"TK", {"32"}}); opt.num_warps = {1, 2, 4, 8}; rt::function function(src, opt); auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; auto grid = [&](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN")), 1}; }; auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; perf_t res; res.triton = tflops(triton::tools::bench([&]() { function({da, db, dc, M, N, K, lda, ldb, ldc}, grid, stream);}, stream)); res.cublas = 0; // test stream->synchronize(); stream->read(dc, true, 0, hc); std::vector rc(hc.size()); cpu_ref(AT, BT, M, N, K, rc, ha, hb); for(size_t i = 0; i < M*N; i++) if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } std::cout << hc[0] << " " << std::endl; std::cout << "Pass!" << std::endl; // clean-up delete dc; delete da; delete db; return res; } int main() { struct config_t{ bool AT; bool BT; int32_t M; int32_t N; int32_t K; std::string repr() { std::ostringstream oss; oss << AT << " " << BT << " " << M << " " << N << " " << K; return oss.str(); } perf_t perf(triton::driver::stream *stream){ return do_bench(stream, AT, BT, M, N, K); } }; // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, {false, true, 128, 128, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, // {true, true, 128, 128, 128} // {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} }; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); // does the work for(config_t c: configs){ perf_t perf = c.perf(stream); std::cout << "// " << c.repr() << ", " << perf.triton << ", " << perf.cublas << std::endl; } }