From f4f216b88a695223b7a651b14374c7f5dd6f2403 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 8 Aug 2020 14:51:29 -0400 Subject: [PATCH] [EXAMPLES] Added C++ example for Conv2d --- tests/bench/conv.cc | 39 ++++++++++++ tests/common/conv.h | 124 +++++++++++++++++++++++++++++++++++++ tests/common/src/conv.h | 132 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 295 insertions(+) create mode 100644 tests/bench/conv.cc create mode 100644 tests/common/conv.h create mode 100644 tests/common/src/conv.h diff --git a/tests/bench/conv.cc b/tests/bench/conv.cc new file mode 100644 index 000000000..93596708d --- /dev/null +++ b/tests/bench/conv.cc @@ -0,0 +1,39 @@ +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "conv.h" + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple config_t; + std::vector configs = { +// {1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1}, +// {1, 56, 56, 128, 128, 3, 3, 1, 1, 1, 1}, +// {1, 56, 56, 256, 256, 3, 3, 1, 1, 1, 1}, +// {1, 56, 56, 384, 384, 3, 3, 1, 1, 1, 1}, +// {1, 56, 56, 512, 512, 3, 3, 1, 1, 1, 1}, +// {1, 56, 56, 768, 768, 3, 3, 1, 1, 1, 1}, +// {1, 56, 56, 1024, 1024, 3, 3, 1, 1, 1, 1}, + +// {1, 8, 8, 256, 256, 3, 3, 1, 1, 1, 1}, +// {1, 16, 16, 256, 256, 3, 3, 1, 1, 1, 1}, +// {1, 32, 32, 256, 256, 3, 3, 1, 1, 1, 1}, +// {1, 64, 64, 256, 256, 3, 3, 1, 1, 1, 1}, + {1, 64, 64, 4096, 4096, 1, 1, 0, 0, 1, 1}, +// {1, 256, 256, 256, 256, 3, 3, 1, 1, 1, 1} + + + + + }; + int Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w; + for(const auto& c: configs){ + std::tie(Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w) = c; + std::cout << "// " << c ; + for(auto perf: bench_conv(stream, HALF, Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w)) + std::cout << ", " << perf << std::flush; + std::cout << std::endl; + } +} diff --git a/tests/common/conv.h b/tests/common/conv.h new file mode 100644 index 000000000..b6166bb94 --- /dev/null +++ b/tests/common/conv.h @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" +#include "triton/runtime/function.h" +#include "src/conv.h" +#include "cuda/cublas.h" +#include "util.h" + + + +enum run_mode_t { + BENCH, + TEST +}; + +enum dtype_t { + FLOAT, + HALF, + DOUBLE +}; + +template +struct to_string; + +template<> struct to_string{ + static constexpr const char* value = "half"; +}; + +template<> struct to_string{ + static constexpr const char* value = "float"; +}; + +template<> struct to_string{ + static constexpr const char* value = "double"; +}; + +template +void triton_conv(drv::stream* stream, + int Z, int CI, int H, int W, int CO, int R, int S, + int pad_h, int pad_w, int stride_h, int stride_w, + run_mode_t mode, std::vector& bench, bool &test){ + std::string ty = to_string::value; + size_t dt_nbytes = sizeof(T); + drv::context* context = stream->context(); + + int P = (H + 2*pad_h - R)/stride_h + 1; + int Q = (W + 2*pad_w - S)/stride_w + 1; + + // inputs + auto dc = std::shared_ptr(drv::buffer::create(context, Z*CO*P*Q*dt_nbytes)); + auto da = std::shared_ptr(drv::buffer::create(context, Z*CI*H*W*dt_nbytes)); + auto db = std::shared_ptr(drv::buffer::create(context, CI*R*S*CO*dt_nbytes)); + auto ddelta = std::shared_ptr(drv::buffer::create(context, CI*R*S*4)); + auto dlocks = std::shared_ptr(drv::buffer::create(context, 1024*1024*2*4)); + ((drv::cu_buffer*)dlocks.get())->set_zero(stream, dlocks->size()); + + std::vector hdelta(CI*R*S); + int TK = 16; + for(int i = 0; i < hdelta.size(); i++){ + int s = i % S; + int cr = i / S; + int r = cr % R; + int c = cr / R; + int nexti = i + TK; + int nexts = nexti % S; + int nextcr = nexti / S; + int nextr = nextcr % R; + int nextc = nextcr / R; + hdelta[i] = (nextc - c)*W*H + (nextr - r)*W + (nexts - s); + } + stream->write(&*ddelta, true, 0, hdelta); + + // macros + rt::function::options_space_t opt; + opt.defines.push_back({"TYPE", {ty}}); + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TK", {std::to_string(TK)}}); + opt.defines.push_back({"TZ", {"1"}}); + opt.defines.push_back({"RR", {std::to_string(R)}}); + opt.defines.push_back({"SS", {std::to_string(S)}}); + opt.defines.push_back({"PP", {std::to_string(P)}}); + opt.defines.push_back({"QQ", {std::to_string(Q)}}); + opt.defines.push_back({"HH", {std::to_string(H)}}); + opt.defines.push_back({"WW", {std::to_string(W)}}); + + opt.num_warps = {2, 4}; + + // kernels + rt::function function(src::conv, opt); + std::vector args = {&*da, &*db, &*dc, (float)1, Z*P*Q, CO, CI*R*S, + pad_h, pad_w, stride_h, stride_w, + &*ddelta, + W*H*CI, W*H, W, 1, + CO*S*R , CO*S, CO, 1, + Q*P*CO, Q*P, Q, 1}; + auto grid = [Z,P,Q,CO](const rt::function::options_t& x) { + return rt::grid_t{ceil(Z*P*Q, x.D("TM")), + ceil(CO , x.D("TN")), + (size_t)x.D("TZ")}; + }; + auto tflops = [&](double nanosec) { return 2.*Z*P*Q*CI*CO*R*S / nanosec * 1e-3; }; + double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); + bench.push_back(tflops(triton_ns)); +} + +std::vector bench_conv(drv::stream* stream, dtype_t dtype, + int32_t Z, int32_t H, int32_t W, int32_t CO, int32_t CI, int32_t R, int32_t S, + int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w) { + std::vector bench; + bool test; + switch(dtype){ + case HALF: triton_conv(stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break; + case FLOAT: triton_conv(stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break; + case DOUBLE: triton_conv(stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break; + default: break; + } + return bench; +} diff --git a/tests/common/src/conv.h b/tests/common/src/conv.h new file mode 100644 index 000000000..c0786cb6f --- /dev/null +++ b/tests/common/src/conv.h @@ -0,0 +1,132 @@ +namespace src { + + const char *conv = +R"( + __global__ void conv(TYPE *A __noalias __readonly __aligned(16), + TYPE *B __noalias __readonly __aligned(16), + TYPE *C __noalias __aligned(16), + float alpha, + // equivalent matmul + int M, int N, int K, + // convolution properties + int pad_h, int pad_w, int stride_h, int stride_w, + // pointer increment + int *ADELTA, + // memory strides + int lda_z __multipleof(8), int lda_ci __multipleof(8), int lda_h __multipleof(8), int lda_w __multipleof(8), + int ldb_ci __multipleof(8), int ldb_r __multipleof(8), int ldb_s __multipleof(8), int ldb_co __multipleof(8), + int ldc_z __multipleof(8), int ldc_co __multipleof(8), int ldc_p __multipleof(8), int ldc_q __multipleof(8)) { + // prologue + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int ridz = get_program_id(2); + int rm[TM] = ridx * TM + 0 ... TM; + int rn[TN] = ridy * TN + 0 ... TN; + // reduction splitting + K = K / TZ; + int rk[TK] = ridz * K + 0 ... TK; + + // unpack aggregate rows + // m = (z, p, q) + int rq[TM] = rm % QQ; + int rzp[TM] = rm / QQ; + int rp[TM] = rzp % PP; + int rz[TM] = rzp / PP; + // unpack aggregate reduction + // k = (ci, r, s) + int rs [TK] = rk % SS; + int rcir[TK] = rk / SS; + int rr [TK] = rcir % RR; + int rci [TK] = rcir / RR; + + // padding / striding + int rh_0[TM] = rp * stride_h - pad_h; + int rw_0[TM] = rq * stride_w - pad_w; + int rh[TM, TK] = rh_0[:, newaxis] + rr[newaxis, :]; + int rw[TM, TK] = rw_0[:, newaxis] + rs[newaxis, :]; + + // pointers to lhs + int offa[TM, TK] = rz [:, newaxis] * lda_z + + rci[newaxis, :] * lda_ci + + rh * lda_h + + rw * 1; + TYPE* pa[TM, TK] = A + offa; + // pointers to rhs + int offb[TK, TN] = rci[:, newaxis] * ldb_ci + + rr [:, newaxis] * ldb_r + + rs [:, newaxis] * ldb_s + + rn [newaxis, :] * 1; + TYPE* pb[TK, TN] = B + offb; + // pointers to delta + int* padelta[TK] = ADELTA + rk; + int adelta[TK] = *padelta; + + // prefetches operands + bool checkam[TM, TK] = rm[:, newaxis] < M; + bool checka[TM, TK] = checkam && rh >= 0 && rh < HH && rw >= 0 && rw < WW; + bool checkbn[TK, TN] = rn[newaxis, :] < N; + bool checkbk[TK, TN] = rk[:, newaxis] < K; + bool checkb[TK, TN] = checkbn && checkbk; + TYPE a[TM, TK] = checka ? *pa : 0; + TYPE b[TK, TN] = checkb ? *pb : 0; + + // reduction loop + float acc[TM, TN] = 0; + for(int k = K; k > 0; k -= TK){ + acc += a @ b; + // increment A + pa += adelta[newaxis, :]; + // bounds-checking A + rk += TK; + rs = rk % SS; + rcir = rk / SS; + rr = rcir % RR; + rh = rh_0[:, newaxis] + rr[newaxis, :]; + rw = rw_0[:, newaxis] + rs[newaxis, :]; + bool checka[TM, TK] = checkam && rh >= 0 && rh < HH && rw >= 0 && rw < WW; + // increment B + pb += TK * ldb_s; + // bounds-checking B + bool checkbk = k > TK; + bool checkb[TK, TN] = checkbn && checkbk; + a = checka ? *pa : 0; + b = *?(checkb)pb; + padelta += TK; + adelta = *padelta; + } + acc = acc * alpha; + TYPE c[TM, TN] = acc; + + // epilogue + rm = ridx * TM + 0 ... TM; + rn = ridy * TN + 0 ... TN; + rq = rm % QQ; + rzp = rm / QQ; + rp = rzp % PP; + rz = rzp / PP; + int offc[TM, TN] = rz [:, newaxis] * ldc_z + + rn [newaxis, :] * ldc_co + + rp [:, newaxis] * ldc_p + + rm [:, newaxis] * 1; + TYPE* pc[TM, TN] = C + offc; + bool checkc[TM, TN] = rm[:, newaxis] < M && rn[newaxis, :] < N; + +#if (TZ==1) + *?(checkc) pc = c; +#else + // accumulate partial result using spin-locks + int *plock = locks + rid; + int *pcount = plock + get_num_programs(0) * get_num_programs(1); + for(int repeat = 1; repeat == 1; repeat = atomic_cas(plock, 0, 1)); + int count = *pcount; + if(count == 0) + *?(checkc) pc = c; + else + *?(checkc) pc = c + *?(checkc)pc; + atomic_xchg(pcount, (count + 1) % TZ); + atomic_xchg(plock, 0); +#endif +} +)"; + +}