Improved codegen for the Ampere GPUs.
* Make the layout pass recognize the multistage pipelined pattern.
* Now the pipeline pass can automate the multistage pipelining transformation.
* Remove extra barriers (from the prefetch pass & WAR) on Ampere.
* Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores.
31 lines
557 B
C++
31 lines
557 B
C++
#ifndef TRITON_INCLUDE_IR_CODEGEN_PIPELINE_H
#define TRITON_INCLUDE_IR_CODEGEN_PIPELINE_H

// forward declaration
// Only a reference to ir::module appears in this header, so a forward
// declaration is sufficient and the full IR headers are not pulled in.
namespace triton {
namespace ir {
class module;
} // namespace ir
} // namespace triton

namespace triton {
namespace codegen {
namespace transform {

/// Pipelining transformation pass over an IR module.
///
/// The object only stores its two configuration values; the transformation
/// itself lives in run(), which is declared here and defined elsewhere.
class pipeline {
public:
  /// Stores the pass configuration.
  ///
  /// @param has_copy_async presumably whether the target supports
  ///        asynchronous copies -- TODO confirm against the implementation.
  /// @param num_stages presumably the number of pipeline stages to
  ///        generate -- TODO confirm against the implementation.
  pipeline(bool has_copy_async, int num_stages)
      : has_copy_async_(has_copy_async), num_stages_(num_stages) {}

  /// Runs the pass on `module`. The module is taken by non-const reference;
  /// NOTE(review): it is presumably rewritten in place -- confirm in the
  /// definition of run().
  void run(ir::module &module);

private:
  bool has_copy_async_; // value of the constructor's has_copy_async argument
  int num_stages_;      // value of the constructor's num_stages argument
};

} // namespace transform
} // namespace codegen
} // namespace triton

#endif // TRITON_INCLUDE_IR_CODEGEN_PIPELINE_H