From 7578c27d3d7e5cf6117ef79ea2ab169ac3a89176 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 7 Aug 2019 21:15:54 -0700 Subject: [PATCH] [general][filesystem] added structure and namespace to code generation files --- examples/cpp/dot.cpp | 4 +- examples/python/tensorflow/blocksparse.cpp | 2 +- .../alignment.h} | 2 + .../shmem/allocation.h} | 20 +++++--- .../{shmem_info.h => analysis/shmem/info.h} | 7 ++- .../shmem/liveness.h} | 13 +++-- include/triton/codegen/{ => analysis}/tune.h | 2 + .../codegen/{ => selection}/selection.h | 26 ++++++---- .../triton/codegen/{ => selection}/target.h | 0 .../{optimize_dce.h => transform/dce.h} | 4 +- .../{optimize_dot.h => transform/dot.h} | 10 ++-- .../codegen/{ => transform}/reassociate.h | 12 +++-- .../shmem/barriers.h} | 19 +++++-- .../{optimize_trans.h => transform/trans.h} | 2 + .../codegen/{ => transform}/vectorize.h | 12 +++-- include/triton/runtime/jit.h | 50 ++++++++++--------- include/triton/tools/bench.hpp | 4 +- .../alignment.cpp} | 4 +- .../shmem/allocation.cpp} | 18 ++++--- .../shmem/info.cpp} | 19 ++++--- .../shmem/liveness.cpp} | 11 ++-- lib/codegen/{ => analysis}/tune.cpp | 9 ++-- lib/codegen/{ => selection}/selection.cpp | 39 ++++++++------- lib/codegen/{ => selection}/target.cpp | 2 +- .../{optimize_dce.cpp => transform/dce.cpp} | 4 +- .../{optimize_dot.cpp => transform/dot.cpp} | 6 ++- lib/codegen/{ => transform}/reassociate.cpp | 10 ++-- .../shmem/barriers.cpp} | 8 +-- .../trans.cpp} | 4 +- lib/codegen/{ => transform}/vectorize.cpp | 6 ++- lib/dnn/base.cpp | 3 +- lib/dnn/blocksparse/dot.cpp | 11 ++-- lib/dnn/dot.cpp | 24 ++++----- lib/driver/device.cpp | 2 +- lib/runtime/jit.cpp | 2 +- 35 files changed, 224 insertions(+), 147 deletions(-) rename include/triton/codegen/{alignment_info.h => analysis/alignment.h} (97%) rename include/triton/codegen/{shmem_allocation.h => analysis/shmem/allocation.h} (76%) rename include/triton/codegen/{shmem_info.h => analysis/shmem/info.h} (92%) rename include/triton/codegen/{shmem_liveness.h => analysis/shmem/liveness.h} (90%) rename include/triton/codegen/{ => analysis}/tune.h (99%) rename include/triton/codegen/{ => selection}/selection.h (92%) rename include/triton/codegen/{ => selection}/target.h (100%) rename include/triton/codegen/{optimize_dce.h => transform/dce.h} (93%) rename include/triton/codegen/{optimize_dot.h => transform/dot.h} (70%) rename include/triton/codegen/{ => transform}/reassociate.h (82%) rename include/triton/codegen/{shmem_barriers.h => transform/shmem/barriers.h} (78%) rename include/triton/codegen/{optimize_trans.h => transform/trans.h} (95%) rename include/triton/codegen/{ => transform}/vectorize.h (62%) rename lib/codegen/{alignment_info.cpp => analysis/alignment.cpp} (99%) rename lib/codegen/{shmem_allocation.cpp => analysis/shmem/allocation.cpp} (93%) rename lib/codegen/{shmem_info.cpp => analysis/shmem/info.cpp} (91%) rename lib/codegen/{shmem_liveness.cpp => analysis/shmem/liveness.cpp} (84%) rename lib/codegen/{ => analysis}/tune.cpp (98%) rename lib/codegen/{ => selection}/selection.cpp (98%) rename lib/codegen/{ => selection}/target.cpp (99%) rename lib/codegen/{optimize_dce.cpp => transform/dce.cpp} (96%) rename lib/codegen/{optimize_dot.cpp => transform/dot.cpp} (97%) rename lib/codegen/{ => transform}/reassociate.cpp (97%) rename lib/codegen/{shmem_barriers.cpp => transform/shmem/barriers.cpp} (96%) rename lib/codegen/{optimize_trans.cpp => transform/trans.cpp} (97%) rename lib/codegen/{ => transform}/vectorize.cpp (91%) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 50e59e0fa..ef73a7581 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -40,7 +40,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int hb[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) hc[i] = static_cast((double)0); - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); + triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); stream->write(da, true, 0, ha); @@ -49,7 +49,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 3a6a2505c..e2a0b5144 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -130,7 +130,7 @@ public: // create profile triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); // blocksparse matmul - triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::PARTIAL_TUNING); triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); Tensor *tmp = nullptr; TensorShape tmp_shapes; diff --git a/include/triton/codegen/alignment_info.h b/include/triton/codegen/analysis/alignment.h similarity index 97% rename from include/triton/codegen/alignment_info.h rename to include/triton/codegen/analysis/alignment.h index 92a15efeb..1d0c4b191 100644 --- a/include/triton/codegen/alignment_info.h +++ b/include/triton/codegen/analysis/alignment.h @@ -12,6 +12,7 @@ namespace ir { } namespace codegen{ +namespace analysis{ class alignment_info { struct cst_info { @@ -41,6 +42,7 @@ private: }; +} } } diff --git a/include/triton/codegen/shmem_allocation.h b/include/triton/codegen/analysis/shmem/allocation.h similarity index 76% rename from include/triton/codegen/shmem_allocation.h rename to include/triton/codegen/analysis/shmem/allocation.h index 0f36ec154..024c3cf68 100644 --- a/include/triton/codegen/shmem_allocation.h +++ b/include/triton/codegen/analysis/shmem/allocation.h @@ -13,16 +13,18 @@ namespace ir{ } namespace codegen{ +namespace analysis{ -class layout; -class target_tuner; -class shmem_liveness; -class shmem_info; class tune; -class shmem_allocation { +namespace shmem{ + +class liveness; +class info; + +class allocation { public: - shmem_allocation(shmem_liveness *live, shmem_info *buffer_info, tune *params) + allocation(liveness *live, info *buffer_info, tune *params) : liveness_(live), buffer_info_(buffer_info), params_(params){ } // utilities @@ -41,11 +43,13 @@ private: std::map num_bytes_; size_t allocated_size_; // dependences - shmem_liveness *liveness_; - shmem_info *buffer_info_; + liveness *liveness_; + info *buffer_info_; tune *params_; }; +} +} } } diff --git a/include/triton/codegen/shmem_info.h b/include/triton/codegen/analysis/shmem/info.h similarity index 92% rename from include/triton/codegen/shmem_info.h rename to include/triton/codegen/analysis/shmem/info.h index f8325d00b..689516cb2 100644 --- a/include/triton/codegen/shmem_info.h +++ b/include/triton/codegen/analysis/shmem/info.h @@ -14,8 +14,10 @@ namespace ir { } namespace codegen{ +namespace analysis{ +namespace shmem{ -class shmem_info { +class info { public: void run(ir::module &mod); // queries @@ -33,7 +35,8 @@ private: std::map refs_; }; - +} +} } } diff --git a/include/triton/codegen/shmem_liveness.h b/include/triton/codegen/analysis/shmem/liveness.h similarity index 90% rename from include/triton/codegen/shmem_liveness.h rename to include/triton/codegen/analysis/shmem/liveness.h index 69210d03f..bec0303c0 100644 --- a/include/triton/codegen/shmem_liveness.h +++ b/include/triton/codegen/analysis/shmem/liveness.h @@ -12,10 +12,12 @@ namespace ir{ } namespace codegen{ +namespace analysis{ +namespace shmem{ typedef unsigned slot_index; -class shmem_info; +class info; struct segment { slot_index start; @@ -30,7 +32,7 @@ struct segment { } }; -class shmem_liveness { +class liveness { private: typedef std::map indices_map_t; typedef std::map intervals_map_t; @@ -43,7 +45,7 @@ public: public: // constructor - shmem_liveness(shmem_info *info): info_(info){ } + liveness(info *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } @@ -53,7 +55,7 @@ public: void run(ir::module &mod); private: - shmem_info *info_; + info *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; @@ -61,5 +63,8 @@ private: } } +} +} + #endif diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/analysis/tune.h similarity index 99% rename from include/triton/codegen/tune.h rename to include/triton/codegen/analysis/tune.h index 7f393a3a0..54dae5524 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -16,6 +16,7 @@ namespace ir{ } namespace codegen{ +namespace analysis{ class tune { typedef std::pair node_t; @@ -67,6 +68,7 @@ private: }; +} } } diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection/selection.h similarity index 92% rename from include/triton/codegen/selection.h rename to include/triton/codegen/selection/selection.h index b480da5f0..2bc49f72e 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -7,7 +7,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/analysis/shmem/info.h" namespace llvm{ @@ -21,12 +21,20 @@ namespace llvm{ namespace triton{ namespace codegen{ -class shmem_allocation; +namespace analysis{ + class tune; -class shmem_info; -class target; class alignment_info; +namespace shmem{ + +class allocation; +class info; + +} +} +class target; + typedef std::vector indices_t; struct distributed_axis { @@ -138,7 +146,7 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: - selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, alignment_info *ax_info, target *tgt) + selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *ax_info, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), axis_info_(ax_info), tgt_(tgt){ } void run(ir::module &src, llvm::Module &dst); @@ -146,11 +154,11 @@ public: private: vmap_t vmap_; tmap_t tmap_; - shmem_allocation *alloc_; - tune *params_; + analysis::shmem::allocation *alloc_; + analysis::tune *params_; target *tgt_; - shmem_info *buffer_info_; - alignment_info *axis_info_; + analysis::shmem::info *buffer_info_; + analysis::alignment_info *axis_info_; std::map axes_; llvm::Value *sh_mem_ptr_; llvm::Value *offset_a_i_, *offset_a_k_; diff --git a/include/triton/codegen/target.h b/include/triton/codegen/selection/target.h similarity index 100% rename from include/triton/codegen/target.h rename to include/triton/codegen/selection/target.h diff --git a/include/triton/codegen/optimize_dce.h b/include/triton/codegen/transform/dce.h similarity index 93% rename from include/triton/codegen/optimize_dce.h rename to include/triton/codegen/transform/dce.h index e40bafef5..169363752 100644 --- a/include/triton/codegen/optimize_dce.h +++ b/include/triton/codegen/transform/dce.h @@ -12,7 +12,7 @@ namespace ir { } namespace codegen{ -class tune; +namespace transform{ class optimize_dce { public: @@ -20,7 +20,7 @@ public: void run(ir::module &mod); }; - +} } } diff --git a/include/triton/codegen/optimize_dot.h b/include/triton/codegen/transform/dot.h similarity index 70% rename from include/triton/codegen/optimize_dot.h rename to include/triton/codegen/transform/dot.h index 76d8368dc..15612e2f0 100644 --- a/include/triton/codegen/optimize_dot.h +++ b/include/triton/codegen/transform/dot.h @@ -13,18 +13,22 @@ namespace ir { namespace codegen{ +namespace analysis{ class tune; +} + +namespace transform{ class optimize_dot { public: - optimize_dot(tune* params): params_(params) {} + optimize_dot(analysis::tune* params): params_(params) {} void run(ir::module &mod); private: - tune* params_; + analysis::tune* params_; }; - +} } } diff --git a/include/triton/codegen/reassociate.h b/include/triton/codegen/transform/reassociate.h similarity index 82% rename from include/triton/codegen/reassociate.h rename to include/triton/codegen/transform/reassociate.h index 3c9cc813b..66d95eb44 100644 --- a/include/triton/codegen/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -19,8 +19,12 @@ class getelementptr_inst; namespace codegen{ +namespace analysis{ class tune; class alignment_info; +} + +namespace transform{ class reassociate { struct cst_info { @@ -34,16 +38,18 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(tune *params, alignment_info *align); + reassociate(analysis::tune *params, analysis::alignment_info *align); void run(ir::module& module); private: - tune* params_; - alignment_info* align_; + analysis::tune* params_; + analysis::alignment_info* align_; }; } } +} + #endif diff --git a/include/triton/codegen/shmem_barriers.h b/include/triton/codegen/transform/shmem/barriers.h similarity index 78% rename from include/triton/codegen/shmem_barriers.h rename to include/triton/codegen/transform/shmem/barriers.h index 271b745cc..d03360690 100644 --- a/include/triton/codegen/shmem_barriers.h +++ b/include/triton/codegen/transform/shmem/barriers.h @@ -17,8 +17,16 @@ namespace ir { namespace codegen{ -class shmem_allocation; -class shmem_info; +namespace analysis{ +namespace shmem{ + +class allocation; +class info; + +} +} + +namespace transform{ class shmem_barriers { private: @@ -36,15 +44,16 @@ private: std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); public: - shmem_barriers(shmem_allocation *alloc, shmem_info *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + shmem_barriers(analysis::shmem::allocation *alloc, analysis::shmem::info *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} void run(ir::module &mod); private: - shmem_allocation *alloc_; - shmem_info *buffer_info_; + analysis::shmem::allocation *alloc_; + analysis::shmem::info *buffer_info_; }; +} } } diff --git a/include/triton/codegen/optimize_trans.h b/include/triton/codegen/transform/trans.h similarity index 95% rename from include/triton/codegen/optimize_trans.h rename to include/triton/codegen/transform/trans.h index 8af45205d..4bdb62157 100644 --- a/include/triton/codegen/optimize_trans.h +++ b/include/triton/codegen/transform/trans.h @@ -17,6 +17,7 @@ namespace ir { } namespace codegen{ +namespace transform{ class optimize_trans { private: @@ -28,6 +29,7 @@ public: }; +} } } diff --git a/include/triton/codegen/vectorize.h b/include/triton/codegen/transform/vectorize.h similarity index 62% rename from include/triton/codegen/vectorize.h rename to include/triton/codegen/transform/vectorize.h index fe6df9dcf..09fb48000 100644 --- a/include/triton/codegen/vectorize.h +++ b/include/triton/codegen/transform/vectorize.h @@ -9,18 +9,22 @@ namespace ir { namespace codegen{ -class tune; +namespace analysis{ + class tune; +} + +namespace transform{ class vectorize { public: - vectorize(tune *params): params_(params){} + vectorize(analysis::tune *params): params_(params){} void run(ir::module &mod); private: - tune *params_; + analysis::tune *params_; }; - +} } } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 563d5863a..1cc8c929a 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -8,19 +8,19 @@ #include "triton/ir/print.h" #include "triton/driver/module.h" #include "triton/driver/kernel.h" -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/optimize_dot.h" -#include "triton/codegen/optimize_dce.h" -#include "triton/codegen/optimize_trans.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" -#include "triton/codegen/shmem_barriers.h" -#include "triton/codegen/alignment_info.h" -#include "triton/codegen/reassociate.h" -#include "triton/codegen/target.h" -#include "triton/codegen/vectorize.h" +#include "triton/codegen/selection/selection.h" +#include "triton/codegen/selection/target.h" +#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/transform/dot.h" +#include "triton/codegen/transform/dce.h" +#include "triton/codegen/transform/trans.h" +#include "triton/codegen/transform/shmem/barriers.h" +#include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/transform/vectorize.h" #include "triton/runtime/launch_info.h" #include @@ -35,8 +35,10 @@ class translation_unit; } namespace codegen{ +namespace analysis{ class tune; } +} namespace ir { class module; @@ -90,18 +92,18 @@ public: // ir::print(module, std::cout); } - codegen::tune tune; - codegen::shmem_info shmem_info; - codegen::shmem_liveness shmem_liveness; - codegen::shmem_allocation shmem_allocation; - codegen::shmem_barriers shmem_barriers; - codegen::vectorize vectorize; codegen::selection selection; - codegen::optimize_dot optimize_dot; - codegen::optimize_dce optimize_dce; - codegen::optimize_trans optimize_trans; - codegen::alignment_info alignment_info; - codegen::reassociate reassociate; + codegen::analysis::tune tune; + codegen::analysis::shmem::info shmem_info; + codegen::analysis::shmem::liveness shmem_liveness; + codegen::analysis::shmem::allocation shmem_allocation; + codegen::analysis::alignment_info alignment_info; + codegen::transform::shmem_barriers shmem_barriers; + codegen::transform::vectorize vectorize; + codegen::transform::optimize_dot optimize_dot; + codegen::transform::optimize_dce optimize_dce; + codegen::transform::optimize_trans optimize_trans; + codegen::transform::reassociate reassociate; codegen::target* target_; }; diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 0ebf7f360..74053b717 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,8 +38,8 @@ inline double bench(std::function const & op, driver::stream * stream) while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning - if(auto cu_device = dynamic_cast(device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); +// if(auto cu_device = dynamic_cast(device)) +// norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); stream->synchronize(); diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/analysis/alignment.cpp similarity index 99% rename from lib/codegen/alignment_info.cpp rename to lib/codegen/analysis/alignment.cpp index ed20e01fc..3ed74f7a3 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/analysis/alignment.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/alignment_info.h" +#include "triton/codegen/analysis/alignment.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -7,6 +7,7 @@ namespace triton { namespace codegen{ +namespace analysis{ inline int gcd(int a, int b) { @@ -310,3 +311,4 @@ void alignment_info::run(ir::module &mod) { } } +} diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/analysis/shmem/allocation.cpp similarity index 93% rename from lib/codegen/shmem_allocation.cpp rename to lib/codegen/analysis/shmem/allocation.cpp index 042f9a19d..1a2d69536 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/analysis/shmem/allocation.cpp @@ -1,7 +1,7 @@ -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" -#include "triton/codegen/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" @@ -10,8 +10,10 @@ namespace triton{ namespace codegen{ +namespace analysis{ +namespace shmem{ -unsigned shmem_allocation::is_ld_padded(ir::value *x) { +unsigned allocation::is_ld_padded(ir::value *x) { if(auto *trans = dynamic_cast(x)){ if(trans->get_perm()[0]->get_value() != 0) return 4; @@ -43,7 +45,7 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { return 0; } -unsigned shmem_allocation::get_num_bytes(ir::value *x) { +unsigned allocation::get_num_bytes(ir::value *x) { if(auto *red = dynamic_cast(x)){ unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; size_t axis = red->get_axis(); @@ -71,7 +73,7 @@ unsigned shmem_allocation::get_num_bytes(ir::value *x) { return num_bytes; } -void shmem_allocation::run(){ +void allocation::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; @@ -174,3 +176,5 @@ void shmem_allocation::run(){ } } +} +} diff --git a/lib/codegen/shmem_info.cpp b/lib/codegen/analysis/shmem/info.cpp similarity index 91% rename from lib/codegen/shmem_info.cpp rename to lib/codegen/analysis/shmem/info.cpp index 659afaa4a..63dbfb93f 100644 --- a/lib/codegen/shmem_info.cpp +++ b/lib/codegen/analysis/shmem/info.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -8,10 +8,11 @@ namespace triton { namespace codegen{ - +namespace analysis{ +namespace shmem{ // run pass on module -bool shmem_info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ +bool info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ if(phi->get_parent() != terminator->get_parent()) return false; if(auto *br = dynamic_cast(terminator)) @@ -23,7 +24,7 @@ bool shmem_info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ throw std::runtime_error("unreachable"); } -void shmem_info::replace(ir::value* before, ir::value *after) { +void info::replace(ir::value* before, ir::value *after) { shared_.erase(before); shared_.insert(after); if(refs_.find(before) != refs_.end()){ @@ -70,7 +71,7 @@ void add_copy(ir::value *x, ir::builder &builder) { } } -void shmem_info::run(ir::module &mod) { +void info::run(ir::module &mod) { // Add shared copies for(ir::function *fn: mod.get_function_list()){ ir::builder builder(mod.get_context()); @@ -120,18 +121,20 @@ void shmem_info::run(ir::module &mod) { } // query double-buffered status -bool shmem_info::is_double(ir::value *x) +bool info::is_double(ir::value *x) { return double_.find(x) != double_.end(); } // query shared status -bool shmem_info::is_shared(ir::value *x) +bool info::is_shared(ir::value *x) { return shared_.find(x) != shared_.end(); } // get reference if any -ir::value *shmem_info::get_reference(ir::value *x) +ir::value *info::get_reference(ir::value *x) { return refs_[x]; } +} +} } } diff --git a/lib/codegen/shmem_liveness.cpp b/lib/codegen/analysis/shmem/liveness.cpp similarity index 84% rename from lib/codegen/shmem_liveness.cpp rename to lib/codegen/analysis/shmem/liveness.cpp index 4d8e9c66b..617a764ed 100644 --- a/lib/codegen/shmem_liveness.cpp +++ b/lib/codegen/analysis/shmem/liveness.cpp @@ -1,5 +1,5 @@ -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" #include "triton/ir/module.h" @@ -8,10 +8,11 @@ namespace triton{ namespace codegen{ - +namespace analysis{ +namespace shmem{ // Entry point -void shmem_liveness::run(ir::module &mod) { +void liveness::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ // Assigns index to each instruction slot_index index = 0; @@ -39,3 +40,5 @@ void shmem_liveness::run(ir::module &mod) { } } +} +} diff --git a/lib/codegen/tune.cpp b/lib/codegen/analysis/tune.cpp similarity index 98% rename from lib/codegen/tune.cpp rename to lib/codegen/analysis/tune.cpp index 39dea9f12..7fba702cc 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/tune.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include "triton/ir/module.h" @@ -12,6 +12,7 @@ namespace triton{ namespace codegen{ +namespace analysis{ tune::tune(): num_global_ranges_(0){ } @@ -257,7 +258,7 @@ void tune::run(ir::module &mod) { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); if(node.second == 2) fpw->set_value(1); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -270,7 +271,8 @@ void tune::run(ir::module &mod) { continue; auto shapes = i->get_type()->get_tile_shapes(); - if(auto *x = dynamic_cast(i)){ + if(auto *x = dynamic_cast(i)) + if(fragments_.at({i, 0}) == STRIDED_SCAN){ ir::type *ptr_ty = x->get_pointer_operand()->get_type()->get_scalar_ty(); size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ @@ -452,3 +454,4 @@ unsigned tune::get_num_threads() { } } +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection/selection.cpp similarity index 98% rename from lib/codegen/selection.cpp rename to lib/codegen/selection/selection.cpp index 86bee2932..491ff870f 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -1,15 +1,15 @@ -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/target.h" -#include "triton/codegen/alignment_info.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/IRBuilder.h" +#include "triton/codegen/selection/selection.h" +#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/selection/target.h" +#include "triton/codegen/analysis/alignment.h" #include "triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -485,7 +485,7 @@ inline void to_warps(const std::vector &bs, std::vector &nw, void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); - if(params_->get_fragment(v, 0) == tune::STRIDED_SCAN){ + if(params_->get_fragment(v, 0) == analysis::tune::STRIDED_SCAN){ std::vector contiguous(dim); std::vector block_size(dim); std::vector warp_size(dim); @@ -1022,7 +1022,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(ins, 0) == tune::STRIDED_SCAN) { + if(params_->get_fragment(ins, 0) == analysis::tune::STRIDED_SCAN) { TA->set_vector_size(TC->axis(0).contiguous); TB->set_vector_size(TC->axis(1).contiguous); result->for_each([&](indices_t idx){ @@ -1047,8 +1047,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & a = builder.CreateFPCast(a, c_ty); if(b->getType() != c_ty) b = builder.CreateFPCast(b, c_ty); -// a = ConstantFP::get(builder.getFloatTy(), 1); -// b = ConstantFP::get(builder.getFloatTy(), 1); res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); @@ -1060,13 +1058,14 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & TA->set_return_mode(true); TB->set_return_mode(true); - std::map> fcs; + std::map, std::vector> fcs; result->for_each([&](indices_t idx){ - fcs[{builder.getInt32(0)}].push_back(TC->get_value(idx)); + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + fcs[key].push_back(TC->get_value(idx)); }); - Type *fp32_ty = builder.getFloatTy(); Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); @@ -1122,8 +1121,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(idx_a[0], idx_a[1]); if(!dot->is_b_trans()) std::swap(idx_b[0], idx_b[1]); -// idx_a.push_back(builder.getInt32(0)); -// idx_b.push_back(builder.getInt32(0)); + idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); + idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); Value *ha = TA->get_value(idx_a); Value *hb = TB->get_value(idx_b); for(unsigned ii = 0; ii < pack_size_0_; ii++) @@ -1159,9 +1158,11 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // write back unsigned i = 0; result->for_each([&](indices_t idx){ - if(i >= fcs.at({builder.getInt32(0)}).size()) + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + if(i >= fcs.at(key).size()) i = 0; - result->set_value(idx, fcs.at({builder.getInt32(0)})[i++]); + result->set_value(idx, fcs.at(key)[i++]); }); TA->set_return_mode(false); diff --git a/lib/codegen/target.cpp b/lib/codegen/selection/target.cpp similarity index 99% rename from lib/codegen/target.cpp rename to lib/codegen/selection/target.cpp index 4116bcca7..3a5e35aa1 100644 --- a/lib/codegen/target.cpp +++ b/lib/codegen/selection/target.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/target.h" +#include "triton/codegen/selection/target.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/transform/dce.cpp similarity index 96% rename from lib/codegen/optimize_dce.cpp rename to lib/codegen/transform/dce.cpp index 8caf22a62..d11caf55f 100644 --- a/lib/codegen/optimize_dce.cpp +++ b/lib/codegen/transform/dce.cpp @@ -2,10 +2,11 @@ #include "triton/ir/basic_block.h" #include "triton/ir/module.h" #include "triton/ir/cfg.h" -#include "triton/codegen/optimize_dce.h" +#include "triton/codegen/transform/dce.h" namespace triton { namespace codegen{ +namespace transform{ void optimize_dce::run(ir::module &mod) { @@ -60,3 +61,4 @@ void optimize_dce::run(ir::module &mod) { } } +} diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/transform/dot.cpp similarity index 97% rename from lib/codegen/optimize_dot.cpp rename to lib/codegen/transform/dot.cpp index 00893c1a5..fa1a542f0 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/transform/dot.cpp @@ -1,11 +1,12 @@ #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/module.h" -#include "triton/codegen/optimize_dot.h" -#include "triton/codegen/tune.h" +#include "triton/codegen/transform/dot.h" +#include "triton/codegen/analysis/tune.h" namespace triton { namespace codegen{ +namespace transform{ inline bool is_trans(ir::value *v){ auto *x = dynamic_cast(v); @@ -109,3 +110,4 @@ void optimize_dot::run(ir::module &mod) { } } +} diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/transform/reassociate.cpp similarity index 97% rename from lib/codegen/reassociate.cpp rename to lib/codegen/transform/reassociate.cpp index d0a54ec31..a1594fb17 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -1,15 +1,16 @@ #include -#include "triton/codegen/reassociate.h" -#include "triton/codegen/alignment_info.h" +#include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/cfg.h" -#include "triton/codegen/tune.h" namespace triton { namespace codegen{ +namespace transform{ //inline Constant *get_gep_cst_offset(GetElementPtrInst *gep){ // std::vector idx_vals; @@ -154,7 +155,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(tune* params, alignment_info* align) +reassociate::reassociate(analysis::tune* params, analysis::alignment_info* align) : params_(params), align_(align) { } @@ -280,3 +281,4 @@ void reassociate::run(ir::module &mod) { } } +} diff --git a/lib/codegen/shmem_barriers.cpp b/lib/codegen/transform/shmem/barriers.cpp similarity index 96% rename from lib/codegen/shmem_barriers.cpp rename to lib/codegen/transform/shmem/barriers.cpp index 717b927fd..be0875b96 100644 --- a/lib/codegen/shmem_barriers.cpp +++ b/lib/codegen/transform/shmem/barriers.cpp @@ -1,7 +1,7 @@ #include -#include "triton/codegen/shmem_barriers.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/transform/shmem/barriers.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -11,6 +11,7 @@ namespace triton { namespace codegen{ +namespace transform{ bool shmem_barriers::intersect(const interval_vec_t &X, interval_t x) { return std::any_of(X.begin(), X.end(), [&](const interval_t &y){ @@ -137,3 +138,4 @@ void shmem_barriers::run(ir::module &mod) { } } +} diff --git a/lib/codegen/optimize_trans.cpp b/lib/codegen/transform/trans.cpp similarity index 97% rename from lib/codegen/optimize_trans.cpp rename to lib/codegen/transform/trans.cpp index b1e0dc4b9..43cba99b7 100644 --- a/lib/codegen/optimize_trans.cpp +++ b/lib/codegen/transform/trans.cpp @@ -1,9 +1,10 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" -#include "triton/codegen/optimize_trans.h" +#include "triton/codegen/transform/trans.h" namespace triton { namespace codegen{ +namespace transform{ ir::value* optimize_trans::replace_phi(ir::value* value, @@ -73,3 +74,4 @@ void optimize_trans::run(ir::module &mod) { } } +} diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/transform/vectorize.cpp similarity index 91% rename from lib/codegen/vectorize.cpp rename to lib/codegen/transform/vectorize.cpp index 8ec5a99f6..dbf7ee7f1 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/transform/vectorize.cpp @@ -1,5 +1,5 @@ -#include "triton/codegen/vectorize.h" -#include "triton/codegen/tune.h" +#include "triton/codegen/transform/vectorize.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -8,6 +8,7 @@ namespace triton { namespace codegen{ +namespace transform{ void vectorize::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); @@ -39,3 +40,4 @@ void vectorize::run(ir::module &mod) { } } +} diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 86d031564..dae023eef 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,7 +62,8 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ - params_t params = heuristics(); +// params_t params = heuristics(); + params_t params = {4, 2, 16, 4, 4, 16, 2, 2, 1, 1, 1, 8, 64, 8, 8, 1, 4, 2, 1}; jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 9c7fd95d9..054904f27 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -96,7 +96,7 @@ void dot::triton_c_src(std::ostream &os) const { restrict read_only align(16) )" + ab_ty_ + R"( *B, )" + c_ty_ + R"(* C, int lda, int ldc, int N, - int* lut, int* locks, int nlocks){ + int* lut, int* locks, int nlocks) { int ridx = get_range_id(0); int ridy = get_range_id(1); float acc[TM, TN] = 0; @@ -129,10 +129,10 @@ void dot::triton_c_src(std::ostream &os) const { )" + c_ty_ + R"(" c[TM, TN] = acc; )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; bool checkc[TM, TN] = (rxc < N)[:, newaxis]; - if(lockid == 0) + if(lockid == 0) { @checkc *pc = c; - else - { + } + else { int *plock = locks + ridx*nlocks + lockid - 1; int *pcount = plock + get_num_program(0)*nlocks; while(__atomic_cas(plock, 0, 1)); @@ -147,10 +147,11 @@ void dot::triton_c_src(std::ostream &os) const { __atomic_exch(plock, 0); } })"; - os << result; } + + } } } diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 5465b26b9..22198b7af 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -75,14 +75,14 @@ void dot::triton_c_src(std::ostream &os) const { std::string ZS = "1"; std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK", XAS2 = ZS; - std::string XBS0 = "TK", XBS1 = ZS, XBS2 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; + std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(xa, 2, 0, 1)" : "xa"; - std::string useb = BT_ ? "trans(xb, 1, 0, 2)" : "trans(xb, 0, 2, 1)"; + std::string usea = AT_ ? "trans(a)" : "a"; + std::string useb = BT_ ? "trans(b)" : "b"; if(AT_){ std::swap(AS0, AS1); std::swap(XAS0, XAS1); @@ -99,15 +99,15 @@ void dot::triton_c_src(std::ostream &os) const { } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; - std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; - std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; - std::string XCS = "TM, TN, " + ZS; +// std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; +// std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; + std::string XCS = "TM, TN"; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int TM = {16, 32, 64, 128}; -const tunable int TN = {16, 32, 64, 128}; +const tunable int TM = {32}; +const tunable int TN = {32}; const tunable int TK = {32}; const tunable int GZ = {1}; @@ -131,8 +131,6 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, bool checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; - )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); - )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); for(int k = K; k > 0; k = k - TK){ xc = dot()" + usea + ", " + useb + R"(, xc); pa = pa + TK)" + lda0 + R"(; @@ -141,13 +139,11 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, bool checkb[)" + BS + R"(] = k > TK; a = checka ? *pa : 0; b = checkb ? *pb : 0; - xa = __reshape(a, )" + XAS + R"(); - xb = __reshape(b, )" + XBS + R"(); } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); )" + c_ty_ + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty_ + R"( c[TM, TN] = __sum(xc, 2); + )" + c_ty_ + R"( c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index ae66c50c8..41a9561eb 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -28,7 +28,7 @@ #include "triton/driver/helpers/CL/infos.hpp" #include "triton/driver/device.h" #include "triton/driver/context.h" -#include "triton/codegen/target.h" +#include "triton/codegen/selection/target.h" namespace triton { diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 1f6a60ccd..141bb8054 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -1,6 +1,6 @@ #include #include "triton/lang/lang.h" -#include "triton/codegen/target.h" +#include "triton/codegen/selection/target.h" #include "triton/ir/context.h" #include "triton/ir/context_impl.h" #include "triton/driver/device.h"