This PR adds several optimization capabilities to the compiler backend (see the sketch after this list):
- `tl.store` is now lowered through inline PTX, making it possible to use things like `evict_last`.
- On A100, the mma layout can be converted directly to shared memory.
- On A100, an additional "transpose" argument to `dot` allows tensors to be loaded once and used both row- and column-major.
- Fixed liveness analysis, which was broken.
- The mma layout can now be loaded/stored directly, without a layout conversion. Useful when the `tl.dot` accumulator is initialized with DRAM data inside an inner loop.
- `tl.dot` can now take its LHS input in registers when it comes from a previous `tl.dot` instruction. Useful for e.g. fused attention.
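A minimal sketch of the kernel patterns these changes target, written against the Triton Python frontend. This is not this PR's test code: the kernel name, pointer layout, and parameters are illustrative, and the `eviction_policy` keyword follows recent Triton releases; how `evict_last` is exposed at the time of this PR may differ.

```python
import triton
import triton.language as tl

@triton.jit
def chained_dot(Q, K, V, Out,
                N_CTX: tl.constexpr,
                BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_D: tl.constexpr):
    offs_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_d = tl.arange(0, BLOCK_D)
    # LHS tile, loaded once
    q = tl.load(Q + offs_m[:, None] * BLOCK_D + offs_d[None, :])
    # the accumulator could equally be initialized from DRAM inside the loop,
    # which the backend now handles without a layout conversion
    acc = tl.zeros((BLOCK_M, BLOCK_D), dtype=tl.float32)
    for start in range(0, N_CTX, BLOCK_N):
        offs_n = start + tl.arange(0, BLOCK_N)
        k = tl.load(K + offs_d[:, None] * N_CTX + offs_n[None, :])
        s = tl.dot(q, k)                 # result lives in registers (mma layout)
        v = tl.load(V + offs_n[:, None] * BLOCK_D + offs_d[None, :])
        acc += tl.dot(s.to(v.dtype), v)  # LHS taken directly from the previous dot
    # cache-eviction hint on the store, lowered via inline PTX
    tl.store(Out + offs_m[:, None] * BLOCK_D + offs_d[None, :], acc,
             eviction_policy="evict_last")
```

The second `tl.dot` consuming the first one's output in registers is the fused-attention-style chain mentioned above; previously the intermediate would round-trip through shared memory.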
```c++
#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H

#include <set>
#include <map>

namespace triton {

namespace ir {
  class module;
  class value;
  class phi_node;
  class instruction;
  class builder;
}

namespace codegen {

namespace analysis {
  class layouts;
}

namespace transform {

// Copy-to-shared pass: inserts copies so that values consumed as
// shared-memory operands are materialized in shared memory.
class cts {
private:
  bool is_shmem_op(ir::instruction* i, int op);
  bool is_shmem_res(ir::value* i);
  void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder,
                bool to_shared, std::map<ir::value*, ir::value*>& copies);

public:
  // has_sm80 enables the A100-specific behavior described above,
  // where some conversions through shared memory can be avoided
  cts(analysis::layouts* layouts, bool has_sm80 = false)
    : layouts_(layouts), has_sm80_(has_sm80) {}
  void run(ir::module &mod);

private:
  bool has_sm80_;
  analysis::layouts* layouts_;
};

}
}
}

#endif
```