[OPTIMIZER] Rewrite patterns for layout conversions (#64)

2022-08-18 12:49:37 -07:00
parent e0bedeb44c
commit 192be76b3c
19 changed files with 851 additions and 127 deletions
--- a/include/triton/Dialect/Triton/IR/TritonOps.td
+++ b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -225,7 +225,7 @@ def TT_DotOp : TT_Op<"dot", [NoSideEffect,
    // let hasCanonicalizer = 1;
 }

-def TT_ReduceOp : TT_Op<"reduce"> {
+def TT_ReduceOp : TT_Op<"reduce", [NoSideEffect]> {
    let summary = "reduce";

    let arguments = (ins TT_RedOpAttr:$redOp, TT_Tensor:$operand, I32Attr:$axis);
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -37,7 +37,7 @@ Right now, Triton implements two classes of layouts: shared, and distributed.
 // Shared Layout Encoding
 //===----------------------------------------------------------------------===//

-def TritonGPUSharedEncodingAttr : TritonGPU_Attr<"TritonGPUSharedEncoding"> {
+def SharedEncodingAttr : TritonGPU_Attr<"SharedEncoding"> {
  let mnemonic = "shared";

  let description = [{
@@ -70,9 +70,7 @@ A_{3, 2}  A_{3, 3}  A_{3, 0}  A_{3, 1} ...   [phase 1] /
 // Distributed Layout Encoding
 //===----------------------------------------------------------------------===//

-class TritonGPUDistributedEncodingAttr : TritonGPU_Attr<"TritonGPUDistributedEncoding"> {
-  let mnemonic = "distributed";
-
+class DistributedEncoding<string name> : TritonGPU_Attr<name> {
  let description = [{
 Distributed encodings have a layout function that is entirely characterized
 by a d-dimensional tensor L. Note that L doesn't need to have the same shape
@@ -97,12 +95,11 @@ L(A) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
  }];
 }

-
 //===----------------------------------------------------------------------===//
 // Blocked Layout Encoding
 //===----------------------------------------------------------------------===//

-def TritonGPUBlockedEncodingAttr : TritonGPU_Attr<"TritonGPUBlockedEncoding"> {
+def BlockedEncodingAttr : DistributedEncoding<"BlockedEncoding"> {
  let mnemonic = "blocked";

  let description = [{
@@ -174,6 +171,10 @@ for
    }]>
  ];

+  let extraClassDeclaration = [{
+    SliceEncodingAttr squeeze(int axis);
+  }];
+

  let parameters = (
    ins
@@ -197,7 +198,7 @@ for
 //===----------------------------------------------------------------------===//
 // TODO: MMAv1 and MMAv2 should be two instances of the same class

-def TritonGPUMmaEncodingAttr : TritonGPU_Attr<"TritonGPUMmaEncoding"> {
+def MmaEncodingAttr : DistributedEncoding<"MmaEncoding"> {
  let mnemonic = "mma";

  let description = [{
@@ -283,5 +284,34 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
  );
 }

+def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
+  let mnemonic = "slice";
+
+  let description = [{
+    TODO: improve docs
+
+    A = [x  x  x  x  x  x  x  x]
+        [x  x  x  x  x  x  x  x]
+    L_parent = [0  1  2  3 ]
+               [4  5  6  7 ]
+               [8  9  10 11]
+               [12 13 14 15]
+    dim = 0
+
+    Then the data of A would be distributed as follows among the 16 CUDA threads:
+    L(A) = [ {0,4,8,12} , {1,5,9,13} , ... {3,7,11,15} ]
+
+    This is useful for constructing the inverse layout of an expand_dims operation during some optimization passes.
+
+  }];
+  
+  let parameters = (
+    ins
+    "unsigned":$dim,
+    // TODO: add a constraint so that only distributed encodings are accepted as parent
+    "Attribute":$parent
+  );
+}
+

 #endif
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -20,7 +20,7 @@ class TTG_Op<string mnemonic, list<Trait> traits = []> :
    Op<TritonGPU_Dialect, mnemonic, traits>;

 def TTG_ConvertLayoutOp : TTG_Op<"convert_layout",
-                                 [NoSideEffect, SameOperandsAndResultType]> {
+                                 [NoSideEffect]> {
  let summary = "convert layout";

  let arguments = (ins TT_Tensor:$src);
@@ -65,7 +65,7 @@ def TTG_CopyAsyncOp : TTG_Op<"copy_async",
 // This is needed because Arith's Cmp ops don't
 // handle encodings
 // https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td#L111
-def TTG_CmpIOp : TTG_Op<"cmpi"> {
+def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect]> {
  let summary = "integer comparison operation";

  let description = [{}];
--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.h
@@ -6,6 +6,8 @@
 namespace mlir {
 std::unique_ptr<Pass> createTritonGPUPipelinePass(int numStages = 2);

+std::unique_ptr<Pass> createTritonGPUCanonicalizeLoopsPass();
+
 std::unique_ptr<Pass> createTritonGPUCoalescePass();

 std::unique_ptr<Pass> createTritonGPUCombineOpsPass();
--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.td
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
@@ -60,6 +60,19 @@ def TritonGPUCombineOps : Pass<"tritongpu-combine", "mlir::ModuleOp"> {
                           "mlir::triton::TritonDialect"];
 }

+def TritonGPUCanonicalizeLoops: Pass<"tritongpu-canonicalize-loops", "mlir::ModuleOp"> {
+  let summary = "canonicalize scf.ForOp ops";
+
+  let description = [{
+    This implements some optimizations that are missing in the standard scf.ForOp
+    canonicalizer.
+  }];
+
+  let constructor = "mlir::createTritonGPUCanonicalizeLoopsPass()";
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+}
+
 def TritonGPUVerifier : Pass<"tritongpu-verifier", "mlir::ModuleOp"> {
  let summary = "verify TritonGPU IR";