diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp
index 0d37d382d..38b335689 100644
--- a/examples/python/tensorflow/blocksparse.cpp
+++ b/examples/python/tensorflow/blocksparse.cpp
@@ -130,7 +130,7 @@ public:
     // create profile
     triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp16", params_.bsize, params_.locks, params_.blocks, OP);
     // blocksparse matmul
-    triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING);
+    triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING);
     triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks();
     Tensor *tmp = nullptr;
     TensorShape tmp_shapes;
diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h
index ab8af7d32..d9bd01e75 100644
--- a/include/triton/dnn/heuristics.h
+++ b/include/triton/dnn/heuristics.h
@@ -7,10 +7,12 @@
 namespace triton{
 namespace dnn{
 
+/* Dense matrix multiplication */
+
 typedef std::vector<unsigned> params_t;
 typedef std::tuple<bool, bool> trans_key_t;
 typedef std::tuple<size_t, size_t> size_key_t;
-static const std::map<trans_key_t, std::map<size_key_t, params_t>> params = {
+static const std::map<trans_key_t, std::map<size_key_t, params_t>> dot_params = {
   /* NN */
   {trans_key_t(false, false), std::map<size_key_t, params_t>{
     {size_key_t(16, 16), {2, 8, 16, 4, 16, 2, 2, 1, 1, 16, 32, 8, 4, 1}},
@@ -108,7 +110,7 @@ static const std::map<trans_key_t, std::map<size_key_t, params_t>> params = {
 // small search space for partial auto-tuning
 inline std::vector<params_t> dot_search_space(bool AT, bool BT) {
   std::vector<params_t> result;
-  for(auto x: params.at(trans_key_t{AT, BT}))
+  for(auto x: dot_params.at(trans_key_t{AT, BT}))
     result.push_back(x.second);
   return result;
 }
@@ -118,9 +120,41 @@ inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) {
   size_t TM = 128;
   size_t TN = 128;
 //  return {4, 4, 128, 8, 4, 128, 2, 2, 2, 2, 32, 32, 16, 1};
-  return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN});
+  return dot_params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN});
 }
 
+
+/* Block-sparse matrix multiplication */
+
+static const std::map<std::pair<bool, size_t>, std::map<size_t, params_t>> bsdot_params = {
+  /* 32x32 */
+  {{true, 32}, std::map<size_t, params_t>{
+    {32, {2, 2, 32, 32, 2, 2, 4, 8, 32, 32, 8, 4, 16}},
+    {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 2, 4}},
+    {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 8, 4, 16}}
+  }},
+  {{false, 32}, std::map<size_t, params_t>{
+    {32, {2, 2, 32, 32, 1, 1, 8, 4, 4, 32, 8, 4, 8}},
+    {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 4, 8}},
+    {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 32, 4, 8}}
+  }}
+};
+
+// small search space for partial auto-tuning
+inline std::vector<params_t> bsdot_search_space(bool is_fprop, size_t block_size) {
+  std::vector<params_t> result;
+  for(auto x: bsdot_params.at({is_fprop, block_size}))
+    result.push_back(x.second);
+  return result;
+}
+
+// simple parameter heuristics
+inline params_t bsdot_heuristics(bool is_fprop, size_t block_size, size_t N, size_t S) {
+  return bsdot_params.at({is_fprop,block_size}).at(128);
+}
+
+
 }
 }
+
 #endif
diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp
index 87df925df..ed20e01fc 100644
--- a/lib/codegen/alignment_info.cpp
+++ b/lib/codegen/alignment_info.cpp
@@ -303,6 +303,7 @@ void alignment_info::run(ir::module &mod) {
   for(ir::basic_block *block: fn->blocks())
   for(ir::instruction *i: block->get_inst_list()){
     populate_max_contiguous(i);
+//    std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << " " << max_contiguous_.at(i) << std::endl;
   }
 }
diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/optimize_dce.cpp
index 9508cfa2e..ec42729ec 100644
--- a/lib/codegen/optimize_dce.cpp
+++ b/lib/codegen/optimize_dce.cpp
@@ -20,7 +20,8 @@ void optimize_dce::run(ir::module &mod) {
   for(ir::instruction *i: block->get_inst_list()){
     if(dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i)
        || dynamic_cast(i) || dynamic_cast(i)
-       || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) ){
+       || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i)
+       || dynamic_cast(i)){
       work_list.push_back(i);
       marked.insert(i);
     }
diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp
index ad7e395b1..a57713f38 100644
--- a/lib/codegen/selection.cpp
+++ b/lib/codegen/selection.cpp
@@ -368,6 +368,8 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function
+    tgt_->add_memfence(module, builder);
+    tgt_->add_barrier(module, builder);
     return (Instruction*)res;
   }
   if(ir::atomic_add_inst* ii = dynamic_cast<ir::atomic_add_inst*>(inst)){
diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp
index 1bb2701bc..a28fd827e 100644
--- a/lib/codegen/tune.cpp
+++ b/lib/codegen/tune.cpp
@@ -247,14 +247,14 @@ void tune::run(ir::module &mod) {
       size_t addr_space = ptr_ty->get_pointer_address_space();
       if(addr_space < 4){
         ir::type *ty = mod.get_builder().get_int32_ty();
-        std::unique_ptr<ir::metaparameter> tmp(ir::metaparameter::create(ctx, ty, 2, 2));
+        std::unique_ptr<ir::metaparameter> tmp(ir::metaparameter::create(ctx, ty, 2, 8));
         *params_.at(i).at("nts.d0") = *tmp;
       }
     }
     if(dynamic_cast(i) && i->get_type()->is_tile_ty()){
       ir::type *ty = mod.get_builder().get_int32_ty();
-      std::unique_ptr<ir::metaparameter> tmp1(ir::metaparameter::create(ctx, ty, 2, 2));
-      std::unique_ptr<ir::metaparameter> tmp2(ir::metaparameter::create(ctx, ty, 2, 2));
+      std::unique_ptr<ir::metaparameter> tmp1(ir::metaparameter::create(ctx, ty, 2, 8));
+      std::unique_ptr<ir::metaparameter> tmp2(ir::metaparameter::create(ctx, ty, 2, 8));
       *params_.at(i).at("nts.d0") = *tmp1;
       *params_.at(i).at("nts.d1") = *tmp2;
     }
diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp
index a75334b90..033e2497c 100644
--- a/lib/dnn/base.cpp
+++ b/lib/dnn/base.cpp
@@ -61,8 +61,8 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v
     jit->add_module(name_.c_str(), src.c_str(), best.params);
   }
   else {
-//    params_t params = heuristics();
-    params_t params = jit->get_valid(name_.c_str(), src.c_str());
+    params_t params = heuristics();
+//    params_t params = jit->get_valid(name_.c_str(), src.c_str());
     jit->add_module(name_.c_str(), src.c_str(), params);
   }
   triton::driver::kernel* kernel = jit->get_function(name_.c_str());
diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp
index ff021cca8..f38030366 100644
--- a/lib/dnn/blocksparse/dot.cpp
+++ b/lib/dnn/blocksparse/dot.cpp
@@ -1,3 +1,4 @@
+#include "triton/dnn/heuristics.h"
 #include "triton/dnn/blocksparse/dot.h"
 
 namespace triton{
@@ -18,11 +19,11 @@ bool dot::operator <(const base& other) const {
 }
 
 std::vector<params_t> dot::search_space() const {
-  throw std::runtime_error("not implemented");
+  return bsdot_search_space(op_ == FPROP, BS_);
 }
 
 params_t dot::heuristics() const {
-  throw std::runtime_error("not implemented");
+  return bsdot_heuristics(op_ == FPROP, BS_, N_, S_);
 }
 
 base * dot::clone() const {
@@ -116,7 +117,8 @@ void dot::triton_c_src(std::ostream &os) const {
   int32 column = *(header + 2);
   int32 lockid = *(header + 3);
   int32 *plut = lut + offset * 2;
-  for(int32 k = K; k > 0; k = k - 1){
+  for(int32 k = K; k > 0; k = k - 1)
+  {
     int32 ak = *(plut + 0);
     int32 bk = *(plut + 1);
 )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda;
@@ -133,16 +135,19 @@ void dot::triton_c_src(std::ostream &os) const {
   int1 checkc[TM, TN] = (rxc < N)[:, newaxis];
   if(lockid == 0)
     @checkc *pc = c;
-  else {
+  else
+  {
     int32 *plock = locks + ridx*nlocks + lockid - 1;
     int32 *pcount = plock + get_num_program(0)*nlocks;
     while(__atomic_cas(plock, 0, 1));
     int32 count = *pcount;
-    if(count == 0)
+    if(count == 0){
       @checkc *pc = c;
-    else
+      __atomic_exch(pcount, 1);
+    }
+    else{
       @checkc *pc = c + *pc;
-      *pcount = 1;
+    }
     __atomic_exch(plock, 0);
   }
 })";
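
Not part of the patch, for review context only: a minimal standalone sketch of the lookup scheme the new bsdot_heuristics introduces, keying a pre-tuned parameter table on {is_fprop, block_size} and, for now, always returning the 128-keyed entry regardless of N and S. The concrete types here (params_t as std::vector<unsigned>, the nested map shape, the name `table`) are inferred from usage in heuristics.h and should be read as assumptions rather than the header's exact declarations.

// Sketch only: mirrors the assumed shape of bsdot_params/bsdot_heuristics.
#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

typedef std::vector<unsigned> params_t;  // assumed element type

// {is_fprop, block_size} -> {tile width -> tuned launch parameters}
static const std::map<std::pair<bool, std::size_t>,
                      std::map<std::size_t, params_t>> table = {
  {{true,  32}, {{128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 8, 4, 16}}}},
  {{false, 32}, {{128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 32, 4, 8}}}},
};

// N and S are accepted so the signature can later grow size-dependent
// dispatch, but every problem currently maps to the single 128-keyed entry.
params_t heuristics(bool is_fprop, std::size_t block_size,
                    std::size_t /*N*/, std::size_t /*S*/) {
  return table.at({is_fprop, block_size}).at(128);
}

int main() {
  for (unsigned v : heuristics(true, 32, 4096, 512))  // FPROP, 32x32 blocks
    std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}

This is also why the NO_TUNING change in blocksparse.cpp and the heuristics() swap in lib/dnn/base.cpp go together: with a table entry available for every supported {direction, block size}, the profile can skip auto-tuning entirely and still get a known-good configuration.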