[Triton-MLIR][BACKEND] insert_slice_async on GPUs < sm80 (#908)

`insert_slice_async` is decomposed into `load + insert_slice` in the
backend.

Not sure if V100 perf can match the master branch though in this way.
Maybe the performance can be improved if instructions are arranged in
the following form:

```
%0 = load
%1 = load 
%2 = load 
...
insert_slice %0
insert_slice %1
insert_slice %2
```

Tested on A100 when manually enabling this decomposition.
Tests on V100 haven't been integrated yet, we can divide the tests into
two phases:
1. Test only load, insert_slice, and insert_slice_async, given TritonGPU
IRs in `test_backend.py`.
2. End to end gemm tests on V100.
This commit is contained in:
Keren Zhou
2022-11-24 14:05:54 -08:00
committed by GitHub
parent f98aed1258
commit 153aecb339
16 changed files with 351 additions and 137 deletions

View File

@@ -26,13 +26,14 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
// These ops may allocate a new shared memory buffer.
auto result = op->getResult(0);
// FIXME(Keren): extract and insert are always alias for now
if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
if (isa<tensor::ExtractSliceOp>(op)) {
// extract_slice %src
aliasInfo = AliasInfo(operands[0]->getValue());
pessimistic = false;
} else if (auto insertSliceOp =
dyn_cast<triton::gpu::InsertSliceAsyncOp>(op)) {
} else if (isa<tensor::InsertSliceOp>(op) ||
isa<triton::gpu::InsertSliceAsyncOp>(op)) {
// insert_slice_async %src, %dst, %index
// insert_slice %src into %dst[%offsets]
aliasInfo = AliasInfo(operands[1]->getValue());
pessimistic = false;
} else if (isSharedEncoding(result)) {

View File

@@ -28,7 +28,7 @@ namespace mlir {
namespace triton {
// Bitwidth of pointers
constexpr int kPtrBitWidth = 64;
constexpr int kPtrBitWidth = 64;
static std::pair<SmallVector<unsigned>, SmallVector<unsigned>>
getCvtOrder(const Attribute &srcLayout, const Attribute &dstLayout) {
@@ -155,8 +155,7 @@ private:
// For example: %a = scf.if -> yield
// %a must be allocated elsewhere by other operations.
// FIXME(Keren): extract and insert are always alias for now
if (!maybeSharedAllocationOp(op) || isa<tensor::ExtractSliceOp>(op) ||
isa<triton::gpu::InsertSliceAsyncOp>(op)) {
if (!maybeSharedAllocationOp(op) || maybeAliasOp(op)) {
return;
}
@@ -210,9 +209,9 @@ private:
auto smemShape = getScratchConfigForCvtLayout(cvtLayout, inVec, outVec);
unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1,
std::multiplies{});
auto bytes = srcTy.getElementType().isa<triton::PointerType>()?
elems * kPtrBitWidth / 8 :
elems * srcTy.getElementTypeBitWidth() / 8;
auto bytes = srcTy.getElementType().isa<triton::PointerType>()
? elems * kPtrBitWidth / 8
: elems * srcTy.getElementTypeBitWidth() / 8;
allocation->addBuffer<BufferT::BufferKind::Scratch>(op, bytes);
} else if (auto atomicRMWOp = dyn_cast<triton::AtomicRMWOp>(op)) {
auto value = op->getOperand(0);

View File

@@ -1,4 +1,5 @@
#include "triton/Analysis/Membar.h"
#include "triton/Analysis/Alias.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
@@ -71,11 +72,17 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
RegionInfo curRegionInfo;
for (Value value : op->getOperands()) {
// ConvertLayoutOp: shared memory -> registers
// Need to consider all alias buffers
for (auto bufferId : allocation->getBufferIds(value)) {
if (bufferId != Allocation::InvalidBufferId) {
curRegionInfo.syncReadBuffers.insert(bufferId);
if (isa<triton::gpu::InsertSliceAsyncOp>(op) ||
isa<tensor::InsertSliceOp>(op)) {
// FIXME(Keren): insert_slice and insert_slice_async are always alias
// for now
curRegionInfo.syncWriteBuffers.insert(bufferId);
} else {
// ConvertLayoutOp: shared memory -> registers
curRegionInfo.syncReadBuffers.insert(bufferId);
}
}
}
}

View File

@@ -28,6 +28,12 @@ bool maybeSharedAllocationOp(Operation *op) {
dialect->getTypeID() == mlir::TypeID::get<tensor::TensorDialect>());
}
bool maybeAliasOp(Operation *op) {
return isa<tensor::ExtractSliceOp>(op) ||
isa<triton::gpu::InsertSliceAsyncOp>(op) ||
isa<tensor::InsertSliceOp>(op);
}
std::string getValueOperandName(Value value, AsmState &state) {
std::string opName;
llvm::raw_string_ostream ss(opName);