[Analysis/Allocation] Allocation passes now assume that slices always alias (#108)

The code in this branch assumes that the `src` operand of
`insert_slice_async` always aliases the result. This doesn't hold in
general; it is only a workaround to make the pipeline pass work.
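
Concretely, the passes now treat both slice ops as views of an existing buffer rather than new allocations: per the alias-analysis change below, the result of `extract_slice` aliases its `%src` (operand 0) and the result of `insert_slice_async` aliases its `%dst` (operand 1). A minimal standalone sketch of that rule (illustrative only, not Triton's actual API):

    #include <cassert>
    #include <string>
    #include <vector>

    // Simplified model: track which shared-memory buffer a value refers to.
    struct Value { int buffer; };

    Value visitSliceOp(const std::string &opName,
                       const std::vector<Value> &operands, int freshBuffer) {
      if (opName == "extract_slice")
        return {operands[0].buffer};   // result aliases %src
      if (opName == "insert_slice_async")
        return {operands[1].buffer};   // result aliases %dst
      return {freshBuffer};            // e.g. alloc_tensor: defines a new buffer
    }

    int main() {
      Value dst = visitSliceOp("alloc_tensor", {}, /*freshBuffer=*/0);
      Value ptrs{-1}, index{-1};       // register values, no shared buffer
      Value inserted = visitSliceOp("insert_slice_async", {ptrs, dst, index}, 1);
      Value slice = visitSliceOp("extract_slice", {inserted, index}, 2);
      // Under the workaround, both slice results share %dst's buffer, so the
      // allocation pass never gives them a buffer of their own.
      assert(inserted.buffer == dst.buffer && slice.buffer == dst.buffer);
      return 0;
    }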

I'm also working on the complete analysis in another
[branch](https://github.com/openai/triton-mlir/tree/keren/analyze-slice).
Keren Zhou
2022-09-09 12:03:41 -07:00
committed by GitHub
parent 9bd5a3dcd2
commit 16aed94ff5
14 changed files with 299 additions and 195 deletions


@@ -22,25 +22,24 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
AliasInfo aliasInfo;
bool pessimistic = true;
if (maybeSharedAllocationOp(op)) {
// These ops will allocate a new shared memory buffer.
// These ops may allocate a new shared memory buffer.
auto result = op->getResult(0);
if (isSharedEncoding(result)) {
aliasInfo.insert(result);
// FIXME(Keren): extract and insert are always alias for now
if (auto extractSliceOp = dyn_cast<triton::gpu::ExtractSliceOp>(op)) {
// extract_slice %src, %index
aliasInfo = AliasInfo(operands[0]->getValue());
} else if (auto insertSliceOp =
dyn_cast<triton::gpu::InsertSliceAsyncOp>(op)) {
// insert_slice_async %src, %dst, %index
aliasInfo = AliasInfo(operands[1]->getValue());
} else {
aliasInfo.insert(result);
}
pessimistic = false;
} else {
llvm::errs() << "op: " << op->getName() << "\n";
}
}
// XXX(Keren): triton ops don't support aliasing yet.
// else if (auto viewOp = dyn_cast<triton::ViewOp>(op) ||
// dyn_cast<triton::ExpandDimsOp>(op)) {
// These ops will create a new view of the same shared memory buffer.
// auto result = op->getResult(0);
// if (isSharedEncoding(result)) {
// aliasInfo = AliasInfo(operands[0]->getValue());
// pessimistic = false;
// }
//}
if (pessimistic) {
return markAllPessimisticFixpoint(op->getResults());
}


@@ -39,11 +39,13 @@ private:
/// Initializes explicitly defined shared memory values for a given operation.
void getExplicitValueSize(Operation *op) {
/// Values returned from scf.yield will not be allocated even though they
/// have the shared encoding.
/// For example: %a = scf.if -> yield
/// %a must be allocated elsewhere by other operations.
if (!maybeSharedAllocationOp(op)) {
// Values returned from scf.yield will not be allocated even though they
// have the shared encoding.
// For example: %a = scf.if -> yield
// %a must be allocated elsewhere by other operations.
// FIXME(Keren): extract and insert are always alias for now
if (!maybeSharedAllocationOp(op) || isa<triton::gpu::ExtractSliceOp>(op) ||
isa<triton::gpu::InsertSliceAsyncOp>(op)) {
return;
}
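
Since the slice ops are now modeled as aliases, the allocation pass only assigns sizes to ops that genuinely create a shared buffer. A rough sketch of that filtering (hypothetical helper, not the Allocation class itself):

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    // Simplified stand-in for the check above: which ops get an explicit
    // shared-memory allocation.
    bool needsExplicitBuffer(const std::string &opName) {
      if (opName == "extract_slice" || opName == "insert_slice_async")
        return false;  // FIXME workaround: these alias an existing buffer
      if (opName == "scf.yield")
        return false;  // yielded values are allocated by other operations
      return true;     // e.g. alloc_tensor, arith.constant with shared encoding
    }

    int main() {
      std::map<std::string, int> bufferBytes;
      std::vector<std::string> ops = {"alloc_tensor", "insert_slice_async",
                                      "extract_slice"};
      for (const auto &op : ops)
        if (needsExplicitBuffer(op))
          bufferBytes[op] = 2 * 16 * 16 * 2;  // illustrative size in bytes
      // Only the alloc_tensor receives its own buffer.
      assert(bufferBytes.size() == 1 && bufferBytes.count("alloc_tensor") == 1);
      return 0;
    }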


@@ -45,19 +45,25 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
if (op->getNumResults() < 1)
return;
if (dyn_cast<scf::ForOp>(op) || dyn_cast<scf::IfOp>(op) ||
dyn_cast<scf::YieldOp>(op)) {
// Do not insert barriers before control flow operations.
if (isa<scf::ForOp>(op) || isa<scf::IfOp>(op) || isa<scf::YieldOp>(op) ||
isa<triton::gpu::ExtractSliceOp>(op) ||
isa<triton::gpu::InsertSliceAsyncOp>(op) ||
isa<triton::gpu::AllocTensorOp>(op)) {
// Do not insert barriers before control flow operations and
// alloc/extract/insert ops.
// alloc is an allocation op without a memory write.
// In contrast, arith.constant is an allocation op with a memory write.
// FIXME(Keren): extract and insert are always alias for now
return;
}
if (dyn_cast<gpu::BarrierOp>(op)) {
if (isa<gpu::BarrierOp>(op)) {
// If the current op is a barrier, we sync previous reads and writes
regionInfo->sync();
return;
}
if (dyn_cast<triton::gpu::AsyncWaitOp>(op)) {
if (isa<triton::gpu::AsyncWaitOp>(op)) {
// If the current op is an async wait, we insert a barrier op and sync
// previous reads and writes.
OpBuilder::InsertionGuard g(*builder);
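
The barrier-placement rule above can be summarized with a heavily simplified model (the flags below are illustrative, not the analysis' RegionInfo): control flow and alloc/extract/insert ops are skipped, gpu.barrier synchronizes pending accesses, and async_wait both inserts a barrier and synchronizes.

    #include <cassert>
    #include <string>
    #include <vector>

    // Toy model of the transfer function: count barriers we would insert and
    // whether shared-memory accesses are still outstanding.
    struct Region {
      bool pendingAccess = false;
      int insertedBarriers = 0;
      void sync() { pendingAccess = false; }
    };

    void transfer(const std::string &opName, Region &region) {
      if (opName == "scf.for" || opName == "scf.if" || opName == "scf.yield" ||
          opName == "extract_slice" || opName == "insert_slice_async" ||
          opName == "alloc_tensor")
        return;                       // never force a barrier before these ops
      if (opName == "gpu.barrier") {  // an explicit barrier already synchronizes
        region.sync();
        return;
      }
      if (opName == "async_wait") {   // insert a barrier, then synchronize
        ++region.insertedBarriers;
        region.sync();
        return;
      }
      region.pendingAccess = true;    // any other shared-memory read/write
    }

    int main() {
      Region region;
      std::vector<std::string> ops = {"alloc_tensor", "insert_slice_async",
                                      "async_wait", "extract_slice"};
      for (const auto &op : ops)
        transfer(op, region);
      // Exactly one barrier (at the async_wait); the slice ops add none.
      assert(region.insertedBarriers == 1 && !region.pendingAccess);
      return 0;
    }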


@@ -292,44 +292,6 @@ void SharedEncodingAttr::print(AsmPrinter &printer) const {
<< "}>";
}
//===----------------------------------------------------------------------===//
// CopyAsyncOp
//===----------------------------------------------------------------------===//
ParseResult parseCopyAsyncOp(OpAsmParser &parser, OperationState &result) {
SmallVector<OpAsmParser::OperandType, 4> allOperands;
Type resultTypes[1], ptrType;
SMLoc allOperandLoc = parser.getCurrentLocation();
if (parser.parseOperandList(allOperands) ||
parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() ||
parser.parseCustomTypeWithFallback(ptrType) || parser.parseArrow() ||
parser.parseCustomTypeWithFallback(resultTypes[0]))
return failure();
result.addTypes(resultTypes);
SmallVector<Type> operandTypes;
operandTypes.push_back(ptrType); // ptr
if (allOperands.size() >= 2)
operandTypes.push_back(triton::getI1SameShape(ptrType)); // mask
if (allOperands.size() >= 3)
operandTypes.push_back(triton::getPointeeType(ptrType)); // other
if (parser.resolveOperands(allOperands, operandTypes, allOperandLoc,
result.operands))
return failure();
return success();
}
void printCopyAsyncOp(OpAsmPrinter &printer, CopyAsyncOp copyAsyncOp) {
printer << " ";
printer << copyAsyncOp.getOperation()->getOperands();
printer.printOptionalAttrDict(copyAsyncOp->getAttrs(), /*elidedAttrs=*/{});
printer << " : ";
printer.printStrippedAttrOrType(copyAsyncOp.ptr().getType());
printer << " -> ";
printer.printStrippedAttrOrType(copyAsyncOp.result().getType());
}
//===----------------------------------------------------------------------===//
// InsertSliceAsyncOp
//===----------------------------------------------------------------------===//
@@ -350,7 +312,7 @@ ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
operandTypes.push_back(srcType); // src
operandTypes.push_back(dstType); // dst
operandTypes.push_back(
IntegerType::get(parser.getBuilder().getContext(), 32)); // offset
IntegerType::get(parser.getBuilder().getContext(), 32)); // index
if (allOperands.size() >= 4)
operandTypes.push_back(triton::getI1SameShape(srcType)); // mask
if (allOperands.size() >= 5)
@@ -389,6 +351,8 @@ mlir::LogicalResult ExtractSliceOp::inferReturnTypes(
auto axis = attributes.get("axis").cast<IntegerAttr>().getInt();
if (axis < 0 || axis > srcShape.size())
return failure();
// Since we only extract a slice from a certain index on the axis,
// the dims before the axis can be dropped.
auto dstShape = srcShape.drop_front(axis + 1);
auto returnType =
RankedTensorType::get(dstShape, srcType.getElementType(), encoding);
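
A worked instance of the shape rule above (shapes are illustrative): with a 2x16x16 source and axis = 0, drop_front(axis + 1) keeps only the trailing dimensions, so the result shape is 16x16.

    #include <cassert>
    #include <vector>

    // Mirrors srcShape.drop_front(axis + 1): keep only the dims after `axis`.
    std::vector<long> sliceResultShape(const std::vector<long> &srcShape, int axis) {
      return std::vector<long>(srcShape.begin() + axis + 1, srcShape.end());
    }

    int main() {
      assert((sliceResultShape({2, 16, 16}, 0) == std::vector<long>{16, 16}));
      assert((sliceResultShape({4, 32, 64}, 1) == std::vector<long>{64}));
      return 0;
    }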
@@ -438,16 +402,10 @@ void TritonGPUDialect::initialize() {
// Verification
//===----------------------------------------------------------------------===//
static LogicalResult verify(CopyAsyncOp op) {
if (!isSharedEncoding(op.getResult())) {
return op.emitOpError("copy_async should return a shared memory tensor");
}
return success();
}
static LogicalResult verify(InsertSliceAsyncOp op) {
if (!isSharedEncoding(op.getResult())) {
return op.emitOpError("copy_async should return a shared memory tensor");
return op.emitOpError(
"insert_slice_async should return a shared memory tensor");
}
return success();
}


@@ -69,7 +69,7 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
// convert output types
SmallVector<Type, 4> newTypes;
for (auto t : op->getResultTypes()) {
bool is_async = std::is_same<T, triton::gpu::CopyAsyncOp>::value;
bool is_async = std::is_same<T, triton::gpu::InsertSliceAsyncOp>::value;
newTypes.push_back(is_async ? t : convertType(t));
}
// construct new op with the new encoding
@@ -106,9 +106,9 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
builder.setInsertionPoint(curr);
if (auto load = dyn_cast<triton::LoadOp>(curr))
coalesceOp<triton::LoadOp>(axisInfo, curr, load.ptr(), builder);
if (auto load = dyn_cast<triton::gpu::CopyAsyncOp>(curr))
coalesceOp<triton::gpu::CopyAsyncOp>(axisInfo, curr, load.ptr(),
builder);
if (auto load = dyn_cast<triton::gpu::InsertSliceAsyncOp>(curr))
coalesceOp<triton::gpu::InsertSliceAsyncOp>(axisInfo, curr, load.src(),
builder);
if (auto store = dyn_cast<triton::StoreOp>(curr))
coalesceOp<triton::StoreOp>(axisInfo, curr, store.ptr(), builder);
});


@@ -119,7 +119,9 @@ public:
return mlir::failure();
auto blacklist = [](Operation *op) {
if (isa<triton::gpu::CopyAsyncOp, triton::LoadOp, triton::StoreOp>(op))
if (isa<triton::gpu::ExtractSliceOp, triton::gpu::AllocTensorOp,
triton::gpu::InsertSliceAsyncOp, triton::LoadOp, triton::StoreOp>(
op))
return true;
if (isa<scf::YieldOp, scf::ForOp>(op))
return true;


@@ -275,7 +275,7 @@ void LoopPipeliner::emitPrologue() {
loadStageBuffer[op->getResult(0)] = {loadsBuffer[op->getResult(0)]};
}
// load => copy async
// TODO: check if the hardware supports copyasync
// TODO: check if the hardware supports async copy
if (auto loadOp = llvm::dyn_cast<triton::LoadOp>(op)) {
newOp = builder.create<triton::gpu::InsertSliceAsyncOp>(
op->getLoc(), loadsBuffer[loadOp].getType(),