[Triton-MLIR][BACKEND] insert_slice_async on GPUs < sm80 (#908)
`insert_slice_async` is decomposed into `load + insert_slice` in the backend. It is unclear whether V100 performance can match the master branch with this approach. Performance might improve if the instructions are arranged in the following form:

```
%0 = load
%1 = load
%2 = load
...
insert_slice %0
insert_slice %1
insert_slice %2
```

Tested on A100 by manually enabling this decomposition. Tests on V100 haven't been integrated yet; we can divide the tests into two phases:

1. Test only load, insert_slice, and insert_slice_async, given TritonGPU IRs in `test_backend.py`.
2. End-to-end GEMM tests on V100.
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
#include "triton/Analysis/Membar.h"
|
||||
#include "triton/Analysis/Alias.h"
|
||||
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
|
||||
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
@@ -71,11 +72,17 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
|
||||
|
||||
RegionInfo curRegionInfo;
|
||||
for (Value value : op->getOperands()) {
|
||||
// ConvertLayoutOp: shared memory -> registers
|
||||
// Need to consider all alias buffers
|
||||
for (auto bufferId : allocation->getBufferIds(value)) {
|
||||
if (bufferId != Allocation::InvalidBufferId) {
|
||||
curRegionInfo.syncReadBuffers.insert(bufferId);
|
||||
if (isa<triton::gpu::InsertSliceAsyncOp>(op) ||
|
||||
isa<tensor::InsertSliceOp>(op)) {
|
||||
// FIXME(Keren): insert_slice and insert_slice_async are always alias
|
||||
// for now
|
||||
curRegionInfo.syncWriteBuffers.insert(bufferId);
|
||||
} else {
|
||||
// ConvertLayoutOp: shared memory -> registers
|
||||
curRegionInfo.syncReadBuffers.insert(bufferId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user