[Triton-MLIR] Replace triton.extract_slice with tensor.extract_slice and support more general tensor slicing (#837)

## Features - Allow taking a block slice of a tensor, as long as each dimension is contiguous (unit stride). - Fix some problems in the semantics of `insert_slice_async`. - More general verification for ops that return a shared layout encoding. ## Known Limitations - `insert_slice_async` still uses the old semantics. Another PR may be submitted later to give it semantics similar to `tensor.extract_slice`. - No encoding verification for `tensor.extract_slice`. - 3D tensor ops are broken. - Strided accesses are not allowed. - May cause a slight performance slowdown, since strides are now passed as values rather than constants (e.g., int). It would be difficult to pass strides as attributes when there is control flow, because a block argument may accept tensors with different strides.
2022-11-06 22:59:03 -08:00
parent a4ff0c362c
commit fdd59900f7
26 changed files with 507 additions and 339 deletions
--- a/include/triton/Conversion/Passes.td
+++ b/include/triton/Conversion/Passes.td
@@ -38,6 +38,7 @@ def ConvertTritonGPUToLLVM : Pass<"convert-triton-gpu-to-llvm", "mlir::ModuleOp"
                             "mlir::gpu::GPUDialect",
                             "mlir::scf::SCFDialect",
                             "mlir::LLVM::LLVMDialect",
+                             "mlir::tensor::TensorDialect",
                             "mlir::triton::TritonDialect",
                             "mlir::triton::gpu::TritonGPUDialect",
                             "mlir::NVVM::NVVMDialect",
--- a/include/triton/Dialect/Triton/IR/Dialect.h
+++ b/include/triton/Dialect/Triton/IR/Dialect.h
@@ -1,7 +1,6 @@
 #ifndef TRITON_DIALECT_TRITON_IR_DIALECT_H_
 #define TRITON_DIALECT_TRITON_IR_DIALECT_H_

-#include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
--- a/include/triton/Dialect/Triton/IR/TritonDialect.td
+++ b/include/triton/Dialect/Triton/IR/TritonDialect.td
@@ -27,7 +27,6 @@ def Triton_Dialect : Dialect {
    "math::MathDialect",
    "StandardOpsDialect",
    "scf::SCFDialect",
-    "gpu::GPUDialect",

    // Since LLVM 15
    // "cf::ControlFlowDialect",
--- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -2,6 +2,7 @@
 #define TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_

 #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"

@@ -9,6 +10,7 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"

 #include "triton/Dialect/TritonGPU/IR/Dialect.h.inc"
+#include "triton/Dialect/TritonGPU/IR/Traits.h"

 #define GET_ATTRDEF_CLASSES
 #include "triton/Dialect/Triton/IR/AttrInterfaces.h.inc"
--- a/include/triton/Dialect/TritonGPU/IR/Traits.h
+++ b/include/triton/Dialect/TritonGPU/IR/Traits.h
@@ -0,0 +1,31 @@
+#ifndef TRITON_GPU_IR_TRAITS_H_
+#define TRITON_GPU_IR_TRAITS_H_
+
+#include "mlir/IR/OpDefinition.h"
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace OpTrait {
+
+// Out-of-line implementations of the trait verifiers declared below; keeping
+// them non-template avoids instantiating/duplicating the logic per op type.
+// ResultsAreSharedEncoding::verifyTrait simply forwards to the function here.
+namespace impl {
+LogicalResult verifyResultsAreSharedEncoding(Operation *op);
+} // namespace impl
+
+template <typename ConcreteType>
+class ResultsAreSharedEncoding
+    : public TraitBase<ConcreteType, ResultsAreSharedEncoding> {
+public:
+  static LogicalResult verifyTrait(Operation *op) {
+    return impl::verifyResultsAreSharedEncoding(op);
+  }
+};
+
+} // namespace OpTrait
+} // namespace mlir
+
+#endif // TRITON_GPU_IR_TRAITS_H_
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
@@ -16,7 +16,8 @@ def TritonGPU_Dialect : Dialect {

  let dependentDialects = [
    "triton::TritonDialect",
-    "mlir::gpu::GPUDialect"
+    "mlir::gpu::GPUDialect",
+    "tensor::TensorDialect",
  ];

  let extraClassDeclaration = [{
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -10,6 +10,8 @@ include "mlir/IR/OpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td" // NoSideEffect
 include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType

+def ResultsAreSharedEncoding: NativeOpTrait<"ResultsAreSharedEncoding">;
+
 class TTG_Op<string mnemonic, list<Trait> traits = []> :
    Op<TritonGPU_Dialect, mnemonic, traits>;

@@ -75,7 +77,8 @@ def TTG_SelectOp : TTG_Op<"select", [NoSideEffect]> {


 def TTG_InsertSliceAsyncOp : TTG_Op<"insert_slice_async",
-                                    [SameVariadicOperandSize,
+                                    [AttrSizedOperandSegments,
+                                     ResultsAreSharedEncoding,
                                     // MemoryEffects<[MemRead]>, doesn't work with CSE but seems like it should?
                                     NoSideEffect,
                                     TypesMatchWith<"infer mask type from src type",
@@ -93,6 +96,10 @@ def TTG_InsertSliceAsyncOp : TTG_Op<"insert_slice_async",
      It returns a copy of `$dst` with the proper slice updated asynchronously with the value of `$src`.
      This operation is non-blocking, and `$results` will have the updated value after the corresponding async_wait.

+      When converting from `tt.load` to `triton_gpu.insert_slice_async`, the `$evict`, `$cache`, and `$isVolatile` fields
+      might be ignored on certain hardware. For example, on NVIDIA GPUs, the cache policy is determined by the backend,
+      and `$evict` and `$isVolatile` are ignored because they apply to L1 cache only.
+
      The insert_slice_async operation supports the following arguments:

      * src: the tensor that is inserted.
@@ -149,48 +156,9 @@ def TTG_InsertSliceAsyncOp : TTG_Op<"insert_slice_async",
  let parser = [{ return parseInsertSliceAsyncOp(parser, result); }];

  let printer = [{ return printInsertSliceAsyncOp(p, *this); }];
-
-  // result needs to be of shared layout
-  let verifier = [{ return ::verify(*this); }];
 }

-def TTG_ExtractSliceOp : TTG_Op<"extract_slice", [NoSideEffect, InferTypeOpInterface]> {
-  let summary = "extract slice";
-  let description = [{
-    The "extract_slice" operation extracts a `$result` tensor from a `$src` tensor as
-    specified by the operation's `$index` and `$axis` arguments.
-
-    The extract_slice operation supports the following arguments:
-
-    * src: the tensor that is extracted from.
-    * index: the index at the given `$axis` from which the `$src` tensor is extracted
-
-    Example:
-
-    ```
-    // Rank-reducing extract_slice.
-    %1 = tensor.extract_slice %0, %index {axis = 0} : tensor<8x16x4xf32> -> tensor<1x16x4xf32>
-    ```
-  }];
-
-  let arguments = (ins TT_Tensor:$src, I32:$index, I32Attr:$axis);
-
-  let results = (outs TT_Tensor:$result);
-
-  let assemblyFormat = [{$src `,` $index attr-dict `:` type($src) `->` type($result)}];
-
-  let extraClassDeclaration = [{
-    static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *context,
-          ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands,
-          ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions,
-          ::llvm::SmallVectorImpl<::mlir::Type> &inferredReturnTypes);
-  }];
-
-  // result needs to be of shared layout
-  let verifier = [{ return ::verify(*this); }];
-}
-
-def TTG_AllocTensorOp : TTG_Op<"alloc_tensor", [NoSideEffect]> {
+def TTG_AllocTensorOp : TTG_Op<"alloc_tensor", [NoSideEffect, ResultsAreSharedEncoding]> {
  let summary = "allocate tensor";

  let description = [{
@@ -203,9 +171,6 @@ def TTG_AllocTensorOp : TTG_Op<"alloc_tensor", [NoSideEffect]> {
  let assemblyFormat = [{attr-dict `:` type($result)}];

  let results = (outs TT_Tensor:$result);
-
-  // result needs to be of shared layout
-  let verifier = [{ return ::verify(*this); }];
 }

 #endif