triton/lib/codegen/analysis/swizzle.cc

#include "triton/codegen/analysis/swizzle.h"
#include "triton/codegen/analysis/layout.h"
#include "triton/codegen/target.h"
#include "triton/ir/type.h"
#include <iostream>

namespace triton{
namespace codegen{
namespace analysis{


// Computes the swizzling parameters (per_phase, max_phase, vec) for every
// `shared_layout` returned by `layouts_->get_all()` and stores them in
// `per_phase_`, `max_phase_` and `vec_`, keyed by layout pointer.
//
// - Entries that are not `shared_layout`s are skipped.
// - Shared layouts that feed neither operand of an hmma dot get (1, 1, 1).
// - Otherwise the parameters depend on the target: NVIDIA targets with
//   sm < 80 use the first branch, everything else uses the values exposed by
//   the layout itself (or (1, 1, 1) when the layout disallows swizzling).
//
// The module argument is unused.
void swizzle::run(ir::module &) {
    // Reset all three result maps so that nothing from a previous run survives.
    per_phase_.clear();
    max_phase_.clear();
    vec_.clear();

    for(auto &x: layouts_->get_all()){
      shared_layout* layout = dynamic_cast<shared_layout*>(x.second);
      if(!layout)
        continue;
      ir::value* mma_dot_a = layout->hmma_dot_a();
      ir::value* mma_dot_b = layout->hmma_dot_b();

      // Not an operand of an mma dot: no swizzling.
      if(!mma_dot_a && !mma_dot_b){
        per_phase_[layout] = 1;
        max_phase_[layout] = 1;
        vec_[layout] = 1;
        continue;
      }

      const auto& ord = layout->get_order();
      scanline_layout* in_layout = dynamic_cast<scanline_layout*>(layout->get_arg_layout());
      // Element size in bytes.
      int dtsize = layout->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8;
      // per_phase stays 1 unless the argument layout is a scanline layout, in
      // which case it is 128 divided by (mts * nts * dtsize) along ord[0],
      // clamped to at least 1.
      // NOTE(review): dtsize is 0 for element types narrower than 8 bits, which
      // would make the divisor 0 -- presumably such types never reach this point; confirm.
      int per_phase = 1;
      if(in_layout)
        per_phase = std::max<int>(128 / (in_layout->mts(ord[0])*in_layout->nts(ord[0])*dtsize), 1);

      auto* nvidia = tgt_->as_nvidia();
      if(nvidia && nvidia->sm() < 80){
        // Operand A is indexed along dimension 0, operand B along dimension 1.
        int inner = mma_dot_a ? 0 : 1;
        per_phase_[layout] = per_phase;
        // NOTE(review): integer division -- the result is 0 whenever per_phase
        // exceeds the numerator (8 or 4); confirm consumers tolerate that.
        max_phase_[layout] = (ord[inner] == 1 ? 8 : 4) / per_phase;
        if(mma_dot_a)
          vec_[layout] = 2*layouts_->get(mma_dot_a)->to_mma()->rep(0);
        else
          vec_[layout] = 2*layouts_->get(mma_dot_b)->to_mma()->rep(1);
      }
      else {
        if (!layout->allow_swizzle()) {
          per_phase_[layout] = 1;
          max_phase_[layout] = 1;
          vec_[layout] = 1;
        } else {
          per_phase_[layout] = per_phase;
          max_phase_[layout] = layout->get_mma_strided() / per_phase;
          vec_[layout]       = layout->get_mma_vec();
        }
      }
    }
}

}
}
}
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`#include "triton/codegen/analysis/swizzle.h"`
			`#include "triton/codegen/analysis/layout.h"`
			`#include "triton/codegen/target.h"`
			`#include "triton/ir/type.h"`
			`#include <iostream>`

			`namespace triton{`
			`namespace codegen{`
			`namespace analysis{`


			`void swizzle::run(ir::module &) {`
			`per_phase_.clear();`
			`max_phase_.clear();`

			`for(auto &x: layouts_->get_all()){`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`shared_layout* layout = dynamic_cast<shared_layout*>(x.second);`
			`if(!layout)`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`continue;`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`ir::value* mma_dot_a = layout->hmma_dot_a();`
			`ir::value* mma_dot_b = layout->hmma_dot_b();`
[BACKEND] Add bf16 & tf32 mma supports (on A100) (#426) 2022-01-12 02:20:31 +08:00
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`if(!mma_dot_a && !mma_dot_b){`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`per_phase_[layout] = 1;`
			`max_phase_[layout] = 1;`
			`vec_[layout] = 1;`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`continue;`
			`}`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`auto ord = layout->get_order();`
			`scanline_layout* in_layout = dynamic_cast<scanline_layout*>(layout->get_arg_layout());`
[BACKEND] Compiler improvements (#557) This PR adds several optimization capabilities in the compiler backend: - Now using inline PTX for `tl.store`, making it possible to use things like evict_last - For A100, mma layout can be directly converted to shared memory - For A100, an additional "transpose" argument in `dot` allows tensors to be loaded once and used both row- and col- major. - Fixed liveness analysis; this was broken. - Now can load/store directly mma layout without converting. Useful for when tl.dot accumulator is initialized with DRAM data inside of an inner loop. - `tl.dot` can now take LHS inputs in registers when it comes from a previous `tl.dot` instruction. Useful for e.g. fused attention. 2022-06-27 11:49:19 -07:00			`int per_phase = 1;`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`int dtsize = layout->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8;`
[BACKEND] Compiler improvements (#557) This PR adds several optimization capabilities in the compiler backend: - Now using inline PTX for `tl.store`, making it possible to use things like evict_last - For A100, mma layout can be directly converted to shared memory - For A100, an additional "transpose" argument in `dot` allows tensors to be loaded once and used both row- and col- major. - Fixed liveness analysis; this was broken. - Now can load/store directly mma layout without converting. Useful for when tl.dot accumulator is initialized with DRAM data inside of an inner loop. - `tl.dot` can now take LHS inputs in registers when it comes from a previous `tl.dot` instruction. Useful for e.g. fused attention. 2022-06-27 11:49:19 -07:00			`if(in_layout)`
			`per_phase = std::max<int>(128 / (in_layout->mts(ord[0])*in_layout->nts(ord[0])*dtsize), 1);`
			`else`
			`per_phase = 1;`
[FRONTEND] Removed circular import that broke Python 3.6 support (#272) 2021-09-09 13:46:55 -07:00			`if(tgt_->as_nvidia() && tgt_->as_nvidia()->sm() < 80){`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`int inner = mma_dot_a ? 0 : 1;`
[BACKEND] Compiler improvements (#557) This PR adds several optimization capabilities in the compiler backend: - Now using inline PTX for `tl.store`, making it possible to use things like evict_last - For A100, mma layout can be directly converted to shared memory - For A100, an additional "transpose" argument in `dot` allows tensors to be loaded once and used both row- and col- major. - Fixed liveness analysis; this was broken. - Now can load/store directly mma layout without converting. Useful for when tl.dot accumulator is initialized with DRAM data inside of an inner loop. - `tl.dot` can now take LHS inputs in registers when it comes from a previous `tl.dot` instruction. Useful for e.g. fused attention. 2022-06-27 11:49:19 -07:00			`per_phase_[layout] = per_phase;`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`max_phase_[layout] = (ord[inner] == 1 ? 8 : 4) / per_phase_[layout];`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`if(mma_dot_a)`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`vec_[layout] = 2*layouts_->get(mma_dot_a)->to_mma()->rep(0);`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`else`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`vec_[layout] = 2*layouts_->get(mma_dot_b)->to_mma()->rep(1);`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`}`
[BACKEND] Add bf16 & tf32 mma supports (on A100) (#426) 2022-01-12 02:20:31 +08:00			`else {`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`if (!layout->allow_swizzle()) {`
			`per_phase_[layout] = 1;`
			`max_phase_[layout] = 1;`
			`vec_[layout] = 1;`
[BACKEND] Added Int8 mma (#440) 2022-01-28 01:12:44 +08:00			`} else {`
[BACKEND] Compiler improvements (#557) This PR adds several optimization capabilities in the compiler backend: - Now using inline PTX for `tl.store`, making it possible to use things like evict_last - For A100, mma layout can be directly converted to shared memory - For A100, an additional "transpose" argument in `dot` allows tensors to be loaded once and used both row- and col- major. - Fixed liveness analysis; this was broken. - Now can load/store directly mma layout without converting. Useful for when tl.dot accumulator is initialized with DRAM data inside of an inner loop. - `tl.dot` can now take LHS inputs in registers when it comes from a previous `tl.dot` instruction. Useful for e.g. fused attention. 2022-06-27 11:49:19 -07:00			`per_phase_[layout] = per_phase;`
[CODEGEN] Reverted some changes from previous PR; fixed vectorization characteristics of mma layout (#469) 2022-03-04 01:53:31 -08:00			`max_phase_[layout] = layout->get_mma_strided() / per_phase_[layout];`
			`vec_[layout] = layout->get_mma_vec();`
[BACKEND] Added Int8 mma (#440) 2022-01-28 01:12:44 +08:00			`}`
[GENERAL] Merged v1.0alpha into master. Added features are: - A100 support via mma.16816 - Thread swizzling for conflict-free shared memory accesses without padding - Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering - Added debugging capabilities in the Python binding - Compilation error for kernels that spill 2021-01-11 19:20:34 -05:00			`}`
			`}`
			`}`

			`}`
			`}`
			`}`