Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores.
32 lines
505 B
C++
32 lines
505 B
C++
#ifndef _TRITON_CODEGEN_PASS_H_
|
|
#define _TRITON_CODEGEN_PASS_H_
|
|
|
|
|
|
#include <cstddef>
#include <memory>
|
|
|
|
// Forward declarations only: these types are referenced by pointer or
// reference in the declarations of this header, so their full definitions
// are not required here.
namespace triton{

namespace ir{
class module;
}

namespace driver{
class device;
class module;
class kernel;
}

}
|
|
|
|
namespace triton{
|
|
namespace codegen{
|
|
|
|
// TODO:
|
|
// There should be a proper pass manager there!
|
|
void add_passes_to_emit_bin(ir::module &ir, driver::device* dev, int num_warps, int num_stages,
|
|
driver::module*& mod, driver::kernel*& ker, size_t& shared_mem);
|
|
|
|
|
|
}
|
|
}
|
|
|
|
#endif
|