[Triton-MLIR] Replace triton.extract_slice with tensor.extract_slice and support more general tensor slicing (#837)

## Features - Allow taking a block slice of a tensor, as long as each dimension is contiguous (unit stride). - Fix some problems in `insert_slice_async`'s semantics. - More general verification for ops that return shared layout encoding. ## Known Limitations - `insert_slice_async` still uses the old semantics. Another PR may be submitted later to give it semantics similar to `tensor.extract_slice`. - No encoding verification for `tensor.extract_slice`. - 3d tensor ops are broken. - Strided accesses are not allowed. - May cause a slight performance slowdown since strides are passed as values rather than constants (e.g., int). It would be difficult to pass strides as attributes when there is control flow. A block argument may accept tensors with different strides.
2022-11-06 22:59:03 -08:00
parent a4ff0c362c
commit fdd59900f7
26 changed files with 507 additions and 339 deletions
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -1,6 +1,7 @@
 #include "triton/Analysis/Allocation.h"
 #include "mlir/Analysis/Liveness.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "triton/Analysis/Alias.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -76,13 +77,13 @@ SmallVector<unsigned> getScratchConfigForReduce(triton::ReduceOp op) {
  auto srcShape = srcTy.getShape();
  auto axis = op.axis();

-  bool fast_reduce = axis == 1; // FIXME(Qingyi): The fastest-changing dimension
+  bool fastReduce = axis == 1; // FIXME(Qingyi): The fastest-changing dimension

  SmallVector<unsigned> smemShape;
  for (auto d : srcShape)
    smemShape.push_back(d);

-  if (fast_reduce) {
+  if (fastReduce) {
    unsigned sizeInterWarps = srcLayout.getWarpsPerCTA()[axis];
    smemShape[axis] = sizeInterWarps;
  } else {
@@ -123,7 +124,7 @@ private:
    // For example: %a = scf.if -> yield
    // %a must be allocated elsewhere by other operations.
    // FIXME(Keren): extract and insert are always alias for now
-    if (!maybeSharedAllocationOp(op) || isa<triton::gpu::ExtractSliceOp>(op) ||
+    if (!maybeSharedAllocationOp(op) || isa<tensor::ExtractSliceOp>(op) ||
        isa<triton::gpu::InsertSliceAsyncOp>(op)) {
      return;
    }