[BACKEND] Compiler improvements (#557)
This PR adds several optimization capabilities in the compiler backend: - Now using inline PTX for `tl.store`, making it possible to use things like evict_last - For A100, mma layout can be directly converted to shared memory - For A100, an additional "transpose" argument in `dot` allows tensors to be loaded once and used both row- and col- major. - Fixed liveness analysis; this was broken. - Now can load/store directly mma layout without converting. Useful for when tl.dot accumulator is initialized with DRAM data inside of an inner loop. - `tl.dot` can now take LHS inputs in registers when it comes from a previous `tl.dot` instruction. Useful for e.g. fused attention.
This commit is contained in:
@@ -12,8 +12,8 @@ namespace triton {
|
||||
namespace codegen{
|
||||
namespace transform{
|
||||
|
||||
coalesce::coalesce(analysis::align* align, analysis::layouts *layouts)
|
||||
: align_(align), layout_(layouts) { }
|
||||
coalesce::coalesce(analysis::align* align, analysis::layouts *layouts, bool has_sm80)
|
||||
: align_(align), layout_(layouts), has_sm80_(has_sm80) { }
|
||||
|
||||
|
||||
// simplify layout conversions using the following simple rules:
|
||||
@@ -64,15 +64,18 @@ void coalesce::run(ir::module &mod) {
|
||||
if(op->get_type()->is_block_ty())
|
||||
if(op->get_type()->get_tile_rank() == 2)
|
||||
if(invalidated.find(layout_->get(op)) == invalidated.end())
|
||||
if(layout_->get(op)->to_mma()){
|
||||
if(layout_->get(op)->to_mma())
|
||||
if(dynamic_cast<ir::io_inst*>(i)->get_eviction_policy()==ir::io_inst::NORMAL){
|
||||
ir::instruction* new_op = ir::cvt_layout_inst::create(op);
|
||||
builder.set_insert_point(i);
|
||||
builder.insert(new_op);
|
||||
i->replace_uses_of_with(op, new_op);
|
||||
}
|
||||
// coalesce before copy_to_shared
|
||||
// It's dirty, but the backend is being rewritten from scratch. :)
|
||||
if(dynamic_cast<ir::copy_to_shared_inst*>(i))
|
||||
// only necessary for sm < 80 as Ampere+ can handle reduction
|
||||
// on MMA layout
|
||||
if(!has_sm80_)
|
||||
if(dynamic_cast<ir::copy_to_shared_inst*>(i) || dynamic_cast<ir::reduce_inst*>(i))
|
||||
if(ir::value* op = i->get_operand(0))
|
||||
if(op->get_type()->is_block_ty())
|
||||
if(op->get_type()->get_tile_rank() == 2)
|
||||
@@ -89,7 +92,8 @@ void coalesce::run(ir::module &mod) {
|
||||
if(auto x = dynamic_cast<ir::load_inst*>(i))
|
||||
if(x->get_type()->is_block_ty())
|
||||
if(x->get_type()->get_tile_rank()==2)
|
||||
if(layout_->get(x)->to_mma()){
|
||||
if(layout_->get(x)->to_mma())
|
||||
if(!has_sm80_ || dynamic_cast<ir::io_inst*>(i)->get_eviction_policy()==ir::io_inst::NORMAL){
|
||||
builder.set_insert_point_after(x);
|
||||
ir::instruction* new_x = ir::cvt_layout_inst::create(x);
|
||||
builder.insert(new_x);
|
||||
|
Reference in New Issue
Block a user