[Analysis/Allocation] Allocation passes now assume that slices always alias (#108)

The code in this branch assumes that the `src` operand of `insert_slice_async` always aliases the result. This does not hold in the general case; it is only a workaround to make the pipeline pass work. I'm also working on the complete analysis in another [branch](https://github.com/openai/triton-mlir/tree/keren/analyze-slice).
2022-09-09 12:03:41 -07:00
parent 9bd5a3dcd2
commit 16aed94ff5
14 changed files with 299 additions and 195 deletions
--- a/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
@@ -69,7 +69,7 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
    // convert output types
    SmallVector<Type, 4> newTypes;
    for (auto t : op->getResultTypes()) {
-      bool is_async = std::is_same<T, triton::gpu::CopyAsyncOp>::value;
+      bool is_async = std::is_same<T, triton::gpu::InsertSliceAsyncOp>::value;
      newTypes.push_back(is_async ? t : convertType(t));
    }
    // construct new op with the new encoding
@@ -106,9 +106,9 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
      builder.setInsertionPoint(curr);
      if (auto load = dyn_cast<triton::LoadOp>(curr))
        coalesceOp<triton::LoadOp>(axisInfo, curr, load.ptr(), builder);
-      if (auto load = dyn_cast<triton::gpu::CopyAsyncOp>(curr))
-        coalesceOp<triton::gpu::CopyAsyncOp>(axisInfo, curr, load.ptr(),
-                                             builder);
+      if (auto load = dyn_cast<triton::gpu::InsertSliceAsyncOp>(curr))
+        coalesceOp<triton::gpu::InsertSliceAsyncOp>(axisInfo, curr, load.src(),
+                                                    builder);
      if (auto store = dyn_cast<triton::StoreOp>(curr))
        coalesceOp<triton::StoreOp>(axisInfo, curr, store.ptr(), builder);
    });
--- a/lib/Dialect/TritonGPU/Transforms/Combine.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Combine.cpp
@@ -119,7 +119,9 @@ public:
      return mlir::failure();

    auto blacklist = [](Operation *op) {
-      if (isa<triton::gpu::CopyAsyncOp, triton::LoadOp, triton::StoreOp>(op))
+      if (isa<triton::gpu::ExtractSliceOp, triton::gpu::AllocTensorOp,
+              triton::gpu::InsertSliceAsyncOp, triton::LoadOp, triton::StoreOp>(
+              op))
        return true;
      if (isa<scf::YieldOp, scf::ForOp>(op))
        return true;
--- a/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
@@ -275,7 +275,7 @@ void LoopPipeliner::emitPrologue() {
          loadStageBuffer[op->getResult(0)] = {loadsBuffer[op->getResult(0)]};
        }
        // load => copy async
-        // TODO: check if the hardware supports copyasync
+        // TODO: check if the hardware supports async copy
        if (auto loadOp = llvm::dyn_cast<triton::LoadOp>(op)) {
          newOp = builder.create<triton::gpu::InsertSliceAsyncOp>(
              op->getLoc(), loadsBuffer[loadOp].getType(),