- A100 support via mma.16816
- Thread swizzling for conflict-free shared memory accesses without padding
- Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering
- Added debugging capabilities in the Python binding
- Compilation error for kernels that spill
37 lines
548 B
C++
37 lines
548 B
C++
#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
|
|
#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
|
|
|
|
#include <set>
|
|
#include <map>
|
|
|
|
namespace triton {

// Forward declarations of the IR types named in this header. Only pointers
// and references to them are used below, so their full definitions are not
// required here.
namespace ir {
class module;
class value;
class phi_node;
class instruction;
class builder;
}

namespace codegen{
namespace transform{

/// Transformation pass that is applied to an ir::module through run().
///
/// Only the declaration is visible in this header; the member functions are
/// defined out of line.
/// NOTE(review): the class name and the `to_shared` flag of add_copy()
/// suggest that this pass inserts copies to/from shared memory -- confirm
/// against the implementation file.
class cts {
private:
  /// Internal helper, defined out of line.
  /// NOTE(review): presumably inserts a copy of `x` for its user `parent`
  /// using `builder`, with `to_shared` selecting the copy direction -- verify
  /// against the definition.
  void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder, bool to_shared);

public:
  /// Builds the pass.
  /// @param use_async value stored in use_async_; false when omitted.
  ///
  /// Declared `explicit` so that a plain bool is never implicitly converted
  /// into a pass object; direct construction (`cts pass;`, `cts pass(flag);`)
  /// is unaffected.
  explicit cts(bool use_async = false): use_async_(use_async) {}

  /// Entry point of the pass, defined out of line. The module is taken by
  /// non-const reference.
  void run(ir::module &mod);

private:
  /// Flag captured at construction time.
  /// NOTE(review): presumably selects asynchronous copies -- confirm where it
  /// is read in the implementation.
  bool use_async_;
};

}
}
}
|
|
|
|
#endif
|