This PR merges the `triton-mlir` branch, in which we have been quietly rewriting the Triton backend from scratch to increase maintainability, stability, and ultimately performance. Changes to the runtime are minimal, and this new version aims to remain backward-compatible with the previous commit. The legacy backend is now officially deprecated, but can still be accessed via the `legacy-backend` tag.

Co-authored-by: Keren Zhou <kerenzhou@openai.com>
Co-authored-by: Yan Chunwei <yanchunwei@outlook.com>
Co-authored-by: goostavz <109190422+goostavz@users.noreply.github.com>
Co-authored-by: Shintaro Iwasaki <siwasaki@fb.com>
Co-authored-by: Yan Da <dyanab@connect.ust.hk>
Co-authored-by: Jun Yang <yangjunpro@gmail.com>
Co-authored-by: Ian Bearman <ianb@microsoft.com>
Co-authored-by: Jason Ansel <jansel@jansel.net>
Co-authored-by: Qingyi Liu <qingyil@nvidia.com>
Co-authored-by: ben-zhang-609 <110140741+ben-zhang-609@users.noreply.github.com>
Co-authored-by: Chenggang Zhao <lyricz@yeah.net>
Co-authored-by: ben-zhang-609 <benzh609@gmail.com>
Co-authored-by: dongdongl <dongdongl@nvidia.com>
#include "triton/Analysis/Membar.h"
|
|
#include "triton/Analysis/Alias.h"
|
|
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
|
|
|
|
#include "mlir/Dialect/GPU/GPUDialect.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
|
|
namespace mlir {
|
|
|
|
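// Membar analysis entry point: starting from the operation that owns the
// given allocation, walk the IR in program order and insert gpu.barrier ops
// wherever a shared memory access may conflict with an earlier access that
// has not been synchronized yet. RegionInfo accumulates the buffers read and
// written since the last barrier.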
void MembarAnalysis::run() {
  auto *operation = allocation->getOperation();
  RegionInfo regionInfo;
  OpBuilder builder(operation);
  dfsOperation(operation, &regionInfo, &builder);
}

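// Recursively visit `operation` and its nested regions in program order.
// `parentRegionInfo` carries the pending (not yet synchronized) reads and
// writes accumulated along the path leading to this operation.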
void MembarAnalysis::dfsOperation(Operation *operation,
                                  RegionInfo *parentRegionInfo,
                                  OpBuilder *builder) {
  transfer(operation, parentRegionInfo, builder);
  if (operation->getNumRegions()) {
    // If there are any nested regions, we need to visit them.
    // scf.if and scf.else: two regions
    // scf.if only: two regions
    // scf.for: one region
    RegionInfo curRegionInfo;
    auto traverseRegions = [&]() -> auto {
      for (auto &region : operation->getRegions()) {
        // Copy the parent info as the current info.
        RegionInfo regionInfo = *parentRegionInfo;
        for (auto &block : region.getBlocks()) {
          assert(region.getBlocks().size() == 1 &&
                 "Multiple blocks in a region are not supported");
          for (auto &op : block.getOperations()) {
            // Traverse the nested operation.
            dfsOperation(&op, &regionInfo, builder);
          }
        }
        curRegionInfo.join(regionInfo);
      }
      // Set the parent region info as the union of the nested region info.
      *parentRegionInfo = curRegionInfo;
    };

    traverseRegions();
    if (isa<scf::ForOp>(operation)) {
      // scf.for can have two possible inputs: the init value and the
      // previous iteration's result. Although we've applied alias analysis,
      // there could be unsynced memory accesses on reused memories.
      // For example, consider the following code:
      // %1 = convert_layout %0 : blocked -> shared
      // ...
      // gpu.barrier
      // ...
      // %5 = convert_layout %4 : shared -> dot
      // %6 = tt.dot %2, %5
      // scf.yield
      //
      // Though %5 could be released before scf.yield, it may share the same
      // memory with %1. So we actually have to insert a barrier before %1 to
      // make sure the memory is synced. Traversing the regions a second time
      // propagates the accesses at the end of an iteration into the start of
      // the next, so such loop-carried conflicts are detected.
      traverseRegions();
    }
  }
}

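// Transfer function of the analysis: model the effect of a single operation
// on the set of pending shared memory reads/writes, and insert a barrier
// right before the operation if its accesses conflict with them.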
void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
                              OpBuilder *builder) {
  if (isa<scf::ForOp>(op) || isa<scf::IfOp>(op) || isa<scf::YieldOp>(op) ||
      isa<tensor::ExtractSliceOp>(op) || isa<triton::gpu::AllocTensorOp>(op)) {
    // Do not insert barriers before control flow operations and
    // alloc/extract/insert.
    // alloc is an allocation op without a memory write.
    // FIXME(Keren): extract_slice is always an alias for now
    return;
  }

  if (isa<gpu::BarrierOp>(op)) {
    // If the current op is a barrier, we sync previous reads and writes.
    regionInfo->sync();
    return;
  }

  if (isa<triton::gpu::AsyncWaitOp>(op) &&
      !isa<gpu::BarrierOp>(op->getNextNode())) {
    // If the current op is an async wait and the next op is not a barrier,
    // we insert a barrier after it and sync: the wait only guarantees that
    // the issuing thread's async copies have completed, so a barrier is
    // still needed before other threads may read the copied data.
    regionInfo->sync();
    OpBuilder::InsertionGuard g(*builder);
    builder->setInsertionPointAfter(op);
    builder->create<gpu::BarrierOp>(op->getLoc());
    regionInfo->sync();
    return;
  }

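  // For any other op, collect the shared memory buffers it reads (operands)
  // and writes (results and scratch space), then check them against the
  // pending accesses recorded in regionInfo.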
  RegionInfo curRegionInfo;
  for (Value value : op->getOperands()) {
    for (auto bufferId : allocation->getBufferIds(value)) {
      if (bufferId != Allocation::InvalidBufferId) {
        if (isa<triton::gpu::InsertSliceAsyncOp>(op) ||
            isa<tensor::InsertSliceOp>(op)) {
          // FIXME(Keren): insert_slice and insert_slice_async are always
          // aliases for now
          curRegionInfo.syncWriteBuffers.insert(bufferId);
        } else {
          // ConvertLayoutOp: shared memory -> registers
          curRegionInfo.syncReadBuffers.insert(bufferId);
        }
      }
    }
  }
  for (Value value : op->getResults()) {
    // ConvertLayoutOp: registers -> shared memory
    auto bufferId = allocation->getBufferId(value);
    if (bufferId != Allocation::InvalidBufferId) {
      curRegionInfo.syncWriteBuffers.insert(bufferId);
    }
  }
  // A scratch buffer is considered both a shared memory write and a read.
  auto bufferId = allocation->getBufferId(op);
  if (bufferId != Allocation::InvalidBufferId) {
    curRegionInfo.syncWriteBuffers.insert(bufferId);
    curRegionInfo.syncReadBuffers.insert(bufferId);
  }

  if (regionInfo->isIntersected(curRegionInfo, allocation)) {
    // The op's accesses conflict with pending ones: insert a barrier right
    // before the op and clear the pending sets.
    OpBuilder::InsertionGuard g(*builder);
    builder->setInsertionPoint(op);
    builder->create<gpu::BarrierOp>(op->getLoc());
    regionInfo->sync();
  }
  // Update the region info. Even if a barrier is inserted, we have to
  // maintain the current op's read/write buffers.
  regionInfo->join(curRegionInfo);
}

} // namespace mlir
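For context on how this file is exercised: below is a minimal, hypothetical driver sketch showing how a conversion pass might run the analysis. The `Allocation` type, its header path, and the `MembarAnalysis` constructor signature are assumptions inferred from this file's use of the `allocation` member, not a definitive API.

#include "triton/Analysis/Allocation.h"
#include "triton/Analysis/Membar.h"

// Hypothetical usage: compute shared memory buffer assignments first, then
// let the membar analysis mutate the IR, inserting gpu.barrier ops where
// unsynchronized accesses would conflict.
static void runMembar(mlir::Operation *funcOp) {
  mlir::Allocation allocation(funcOp);      // assumed constructor
  mlir::MembarAnalysis membar(&allocation); // assumed constructor
  membar.run();
}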