[Triton-MLIR][BACKEND] insert_slice_async on GPUs < sm80 (#908)

`insert_slice_async` is decomposed into `load + insert_slice` in the backend. Not sure if V100 perf can match the master branch though in this way. Maybe the performance can be improved if instructions are arranged in the following form: ``` %0 = load %1 = load %2 = load ... insert_slice %0 insert_slice %1 insert_slice %2 ``` Tested on A100 when manually enabling this decomposition. Tests on V100 haven't been integrated yet, we can divide the tests into two phases: 1. Test only load, insert_slice, and insert_slice_async, given TritonGPU IRs in `test_backend.py`. 2. End to end gemm tests on V100.
2022-11-24 14:05:54 -08:00
parent f98aed1258
commit 153aecb339
16 changed files with 351 additions and 137 deletions
--- a/lib/Analysis/Alias.cpp
+++ b/lib/Analysis/Alias.cpp
@@ -26,13 +26,14 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
    // These ops may allocate a new shared memory buffer.
    auto result = op->getResult(0);
    // FIXME(Keren): extract and insert are always alias for now
-    if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(op)) {
+    if (isa<tensor::ExtractSliceOp>(op)) {
      // extract_slice %src
      aliasInfo = AliasInfo(operands[0]->getValue());
      pessimistic = false;
-    } else if (auto insertSliceOp =
-                   dyn_cast<triton::gpu::InsertSliceAsyncOp>(op)) {
+    } else if (isa<tensor::InsertSliceOp>(op) ||
+               isa<triton::gpu::InsertSliceAsyncOp>(op)) {
      // insert_slice_async %src, %dst, %index
+      // insert_slice %src into %dst[%offsets]
      aliasInfo = AliasInfo(operands[1]->getValue());
      pessimistic = false;
    } else if (isSharedEncoding(result)) {
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -28,7 +28,7 @@ namespace mlir {
 namespace triton {

 // Bitwidth of pointers
-constexpr int kPtrBitWidth = 64; 
+constexpr int kPtrBitWidth = 64;

 static std::pair<SmallVector<unsigned>, SmallVector<unsigned>>
 getCvtOrder(const Attribute &srcLayout, const Attribute &dstLayout) {
@@ -155,8 +155,7 @@ private:
    // For example: %a = scf.if -> yield
    // %a must be allocated elsewhere by other operations.
    // FIXME(Keren): extract and insert are always alias for now
-    if (!maybeSharedAllocationOp(op) || isa<tensor::ExtractSliceOp>(op) ||
-        isa<triton::gpu::InsertSliceAsyncOp>(op)) {
+    if (!maybeSharedAllocationOp(op) || maybeAliasOp(op)) {
      return;
    }

@@ -210,9 +209,9 @@ private:
      auto smemShape = getScratchConfigForCvtLayout(cvtLayout, inVec, outVec);
      unsigned elems = std::accumulate(smemShape.begin(), smemShape.end(), 1,
                                       std::multiplies{});
-      auto bytes = srcTy.getElementType().isa<triton::PointerType>()? 
-                   elems * kPtrBitWidth / 8 :
-                   elems * srcTy.getElementTypeBitWidth() / 8;
+      auto bytes = srcTy.getElementType().isa<triton::PointerType>()
+                       ? elems * kPtrBitWidth / 8
+                       : elems * srcTy.getElementTypeBitWidth() / 8;
      allocation->addBuffer<BufferT::BufferKind::Scratch>(op, bytes);
    } else if (auto atomicRMWOp = dyn_cast<triton::AtomicRMWOp>(op)) {
      auto value = op->getOperand(0);
--- a/lib/Analysis/Membar.cpp
+++ b/lib/Analysis/Membar.cpp
@@ -1,4 +1,5 @@
 #include "triton/Analysis/Membar.h"
+#include "triton/Analysis/Alias.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"

 #include "mlir/Dialect/GPU/GPUDialect.h"
@@ -71,11 +72,17 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,

  RegionInfo curRegionInfo;
  for (Value value : op->getOperands()) {
-    // ConvertLayoutOp: shared memory -> registers
-    // Need to consider all alias buffers
    for (auto bufferId : allocation->getBufferIds(value)) {
      if (bufferId != Allocation::InvalidBufferId) {
-        curRegionInfo.syncReadBuffers.insert(bufferId);
+        if (isa<triton::gpu::InsertSliceAsyncOp>(op) ||
+            isa<tensor::InsertSliceOp>(op)) {
+          // FIXME(Keren): insert_slice and insert_slice_async are always alias
+          // for now
+          curRegionInfo.syncWriteBuffers.insert(bufferId);
+        } else {
+          // ConvertLayoutOp: shared memory -> registers
+          curRegionInfo.syncReadBuffers.insert(bufferId);
+        }
      }
    }
  }
--- a/lib/Analysis/Utility.cpp
+++ b/lib/Analysis/Utility.cpp
@@ -28,6 +28,12 @@ bool maybeSharedAllocationOp(Operation *op) {
          dialect->getTypeID() == mlir::TypeID::get<tensor::TensorDialect>());
 }

+bool maybeAliasOp(Operation *op) {
+  return isa<tensor::ExtractSliceOp>(op) ||
+         isa<triton::gpu::InsertSliceAsyncOp>(op) ||
+         isa<tensor::InsertSliceOp>(op);
+}
+
 std::string getValueOperandName(Value value, AsmState &state) {
  std::string opName;
  llvm::raw_string_ostream ss(opName);