Keren/tensor slice insert alloc (#94)

This branch defines three new triton_gpu operations to partially solve #87. Below is an overview: ``` %tensor = triton_gpu.alloc_tensor : tensor<2x16x16xf16, #A> %b = triton_gpu.insert_slice_async %a_ptr, %tensor, %offset {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f16>, #AL> -> tensor<2x16x16xf16, #A> %c = triton_gpu.extract_slice %b, %offset {axis = 0 : i32} : tensor<2x16x16xf16, #A> -> tensor<16x16xf16, #A> ``` We plan to fully replace `copy_async` with `insert_slice_async`. **This hasn't been done yet.**
2022-09-01 12:37:17 -07:00
parent d01353de07
commit 328b87aec6
10 changed files with 260 additions and 40 deletions
--- a/include/triton/Analysis/Allocation.h
+++ b/include/triton/Analysis/Allocation.h
@@ -17,24 +17,24 @@ class AllocationAnalysis;
 }

 /// Modified from llvm-15.0: llvm/ADT/AddressRanges.h
-/// A class that represents a range, specified using a start and an end values:
-/// [Start, End).
-template <typename T> class Range {
+/// A class that represents a half-open interval, specified by its start and
+/// end values: [Start, End).
+template <typename T> class Interval {
 public:
-  Range() {}
-  Range(T S, T E) : Start(S), End(E) { assert(Start <= End); }
+  Interval() {}
+  Interval(T S, T E) : Start(S), End(E) { assert(Start <= End); }
  T start() const { return Start; }
  T end() const { return End; }
  T size() const { return End - Start; }
  bool contains(T Addr) const { return Start <= Addr && Addr < End; }
-  bool intersects(const Range &R) const {
+  bool intersects(const Interval &R) const {
    return Start < R.End && R.Start < End;
  }
-  bool operator==(const Range &R) const {
+  bool operator==(const Interval &R) const {
    return Start == R.Start && End == R.End;
  }
-  bool operator!=(const Range &R) const { return !(*this == R); }
-  bool operator<(const Range &R) const {
+  bool operator!=(const Interval &R) const { return !(*this == R); }
+  bool operator<(const Interval &R) const {
    return std::make_pair(Start, End) < std::make_pair(R.Start, R.End);
  }

@@ -137,8 +137,9 @@ private:
        : kind(kind), size(size), offset(offset), id(nextId++) {}

    bool intersects(const BufferT &other) const {
-      return Range<size_t>(offset, offset + size)
-          .intersects(Range<size_t>(other.offset, other.offset + other.size));
+      return Interval<size_t>(offset, offset + size)
+          .intersects(
+              Interval<size_t>(other.offset, other.offset + other.size));
    }
  };

--- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -1,6 +1,7 @@
 #ifndef TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
 #define TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_

+#include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"

--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td
@@ -15,7 +15,8 @@ def TritonGPU_Dialect : Dialect {
  }];

  let dependentDialects = [
-    "triton::TritonDialect"
+    "triton::TritonDialect",
+    "mlir::gpu::GPUDialect"
  ];
 }

--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -58,7 +58,7 @@ def TTG_CopyAsyncOp : TTG_Op<"copy_async",
                     "triton::EvictionPolicy":$evict, "bool":$isVolatile)>,
  ];

-  let results = (outs TT_Type:$result);
+  let results = (outs TT_Tensor:$result);

  // let assemblyFormat = "operands attr-dict `:` type($ptr) `->` type($result)";
  let parser = [{ return parseCopyAsyncOp(parser, result); }];
@@ -97,4 +97,137 @@ def TTG_CmpFOp : TTG_Op<"cmpf"> {
  let results = (outs TT_BoolLike:$result);
 }

+def TTG_InsertSliceAsyncOp : TTG_Op<"insert_slice_async", 
+                                    [SameVariadicOperandSize,
+                                     MemoryEffects<[MemRead, MemWrite]>,
+                                     TypesMatchWith<"infer mask type from src type",
+                                                    "src", "mask", "getI1SameShape($_self)",
+                                                    "($_op.getOperands().size() <= 3) || std::equal_to<>()">,
+                                     TypesMatchWith<"infer other type from src type",
+                                                    "src", "other", "getPointeeType($_self)",
+                                                    "($_op.getOperands().size() <= 4) || std::equal_to<>()">]> {
+  let summary = "insert slice async";
+
+  let description = [{
+      This operation inserts a tensor `$src` into another tensor `$dst` as specified by the operation’s
+      `$offset` argument and `$axis` attribute.
+
+      It returns a copy of `$dst` with the proper slice updated asynchronously with the value of `$src`.
+      This operation is non-blocking, and `$result` will have the updated value after the corresponding async_wait.
+
+      The insert_slice_async operation supports the following arguments:
+
+      * src: the tensor that is inserted.
+      * dst: the tensor into which the `$src` tensor is inserted.
+      * offset: the offset along `$axis` within the `$dst` tensor at which the `$src` tensor is inserted.
+      * mask: optional tensor-rank number of boolean masks which specify which
+              elements of the `$src` tensor are inserted into the `$dst` tensor.
+      * other: optional tensor-rank number of other tensors which specify what
+              values are inserted into the `$dst` tensor if the corresponding
+              element of the `$mask` tensor is false.
+      
+      In the future, we may decompose this operation into a sequence of:
+
+      * `async` operation to specify a sequence of asynchronous operations
+      * `load` operation to load a tensor from global memory
+      * `insert_slice` operations to insert the `$src` tensor into the `$dst` tensor
+
+      Example:
+
+      ```
+      %1 = triton_gpu.alloc_tensor : tensor<2x32xf32, #A>
+      %2 = triton_gpu.insert_slice_async %0, %1, %offset { axis = 0 } : tensor<32x!tt.ptr<f32>, #AL> -> tensor<2x32xf32, #A>
+      triton_gpu.async_wait { num = 0 : i32 }
+      ```
+  }];
+
+  let arguments = (ins TT_PtrTensor:$src, TT_Tensor:$dst, I32:$offset,
+                       Optional<I1Tensor>:$mask, Optional<TT_Type>:$other,
+                       TT_CacheModifierAttr:$cache, TT_EvictionPolicyAttr:$evict,
+                       BoolAttr:$isVolatile, I32Attr:$axis);
+
+  let builders = [
+      OpBuilder<(ins "Value":$src, "Value":$dst, "Value":$offset,
+                     "triton::CacheModifier":$cache,
+                     "triton::EvictionPolicy":$evict, "bool":$isVolatile, "int":$axis)>,
+      OpBuilder<(ins "Value":$src, "Value":$dst, "Value":$offset, "Value":$mask,
+                     "triton::CacheModifier":$cache,
+                     "triton::EvictionPolicy":$evict, "bool":$isVolatile, "int":$axis)>,
+      OpBuilder<(ins "Value":$src, "Value":$dst, "Value":$offset,
+                     "Value":$mask, "Value":$other,
+                     "triton::CacheModifier":$cache,
+                     "triton::EvictionPolicy":$evict, "bool":$isVolatile, "int":$axis)>,
+  ];
+
+  let results = (outs TT_Tensor:$result);
+
+  //let assemblyFormat = [{
+  //  $src `,` $dst ``
+  //  $offset, $mask, $other
+  //  attr-dict `:` type($src) `->` type($dst)
+  //}];
+
+  // The custom parser could be replaced with oilist in LLVM-16
+  let parser = [{ return parseInsertSliceAsyncOp(parser, result); }];
+
+  let printer = [{ return printInsertSliceAsyncOp(p, *this); }];
+
+  // result needs to be of shared layout
+  let verifier = [{ return ::verify(*this); }];
+}
+
+def TTG_ExtractSliceOp : TTG_Op<"extract_slice", [NoSideEffect, InferTypeOpInterface]> {
+  let summary = "extract slice";
+  let description = [{
+    The "extract_slice" operation extracts a `$result` tensor from a `$src` tensor as
+    specified by the operation's `$offset` and `$axis` arguments.
+
+    The extract_slice operation supports the following arguments:
+
+    * src: the tensor that is extracted from.
+    * offset: the offset at the given `$axis` from which the `$src` tensor is extracted
+
+    Example:
+
+    ```
+    // Rank-reducing extract_slice.
+    %1 = triton_gpu.extract_slice %0, %offset {axis = 0} : tensor<8x16x4xf32> -> tensor<16x4xf32>
+    ```
+  }];
+
+  let arguments = (ins TT_Tensor:$src, I32:$offset, I32Attr:$axis);
+
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{$src `,` $offset attr-dict `:` type($src) `->` type($result)}];
+
+  let extraClassDeclaration = [{
+    static ::mlir::LogicalResult inferReturnTypes(::mlir::MLIRContext *context,
+          ::llvm::Optional<::mlir::Location> location, ::mlir::ValueRange operands,
+          ::mlir::DictionaryAttr attributes, ::mlir::RegionRange regions,
+          ::llvm::SmallVectorImpl<::mlir::Type> &inferredReturnTypes);
+  }];
+
+  // result needs to be of shared layout
+  let verifier = [{ return ::verify(*this); }];
+}
+
+def TTG_AllocTensorOp : TTG_Op<"alloc_tensor", [NoSideEffect]> {
+  let summary = "allocate tensor";
+
+  let description = [{
+    This operation defines a tensor of a particular shape.
+    The contents of the tensor are supposed to be in shared memory.
+
+    Note: This op can be replaced by `bufferization.alloc_tensor` in LLVM 16.
+  }];
+
+  let assemblyFormat = [{attr-dict `:` type($result)}]; 
+
+  let results = (outs TT_Tensor:$result);
+
+  // result needs to be of shared layout
+  let verifier = [{ return ::verify(*this); }];
+}
+
 #endif
--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.td
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
@@ -80,7 +80,8 @@ def TritonGPUVerifier : Pass<"tritongpu-verifier", "mlir::ModuleOp"> {

  let constructor = "mlir::createTritonGPUVerifier()";

-  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::gpu::GPUDialect"];
 }

 #endif