[Triton-MLIR][OPTIMIZER] Cleaned up swizzling (#869)

Swizzling is no longer implemented as a separate pass. It is instead done in a specialized constructor of SharedEncodingAttr, and tested via google tests instead of triton-opt + filecheck. In the future we may want to implement it as a pass again once we have an additional dialect between TritonGPU and LLVM.
2022-11-10 12:05:46 -08:00
parent 2aa538ec2e
commit f40c63fb03
14 changed files with 170 additions and 318 deletions
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -71,6 +71,70 @@ A_{3, 2}  A_{3, 3}  A_{3, 0}  A_{3, 1} ...   [phase 1] /
    ArrayRefParameter<"unsigned", "order of axes by the rate of changing">:$order
  );

+  let builders = [
+    AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
+                     "ArrayRef<int64_t>":$shape,
+                     "Type":$eltTy), [{
+        auto mmaEnc = dotOpEnc.getParent().dyn_cast<MmaEncodingAttr>();
+        // Only support row major for now
+        // TODO(Keren): check why column major code crashes
+        SmallVector<unsigned> order = {1, 0};
+
+        if(!mmaEnc)
+          return $_get(context, 1, 1, 1, order);
+
+        int version = mmaEnc.getVersion();
+        int opIdx = dotOpEnc.getOpIdx();
+
+        // number of rows per phase
+        int perPhase = 128 / (shape[order[0]] * (eltTy.getIntOrFloatBitWidth() / 8));
+        perPhase = std::max<int>(perPhase, 1);
+        
+        // index of the inner dimension in `order`
+        unsigned inner = (opIdx == 0) ? 0 : 1;
+
+        // ---- begin version 1 ----
+        // TODO: handle rep (see
+        // https://github.com/openai/triton/blob/master/lib/codegen/analysis/layout.cc#L209)
+        if (version == 1) {
+          int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase;
+          return $_get(context, 1, perPhase, maxPhase, order);
+        } 
+
+        // ---- begin version 2 ----
+        if (version == 2) {
+          std::vector<size_t> matShape = {8, 8,
+                                          2 * 64 / eltTy.getIntOrFloatBitWidth()};
+          // for now, disable swizzle when using transposed int8 tensor cores
+          if (eltTy.isInteger(8) && order[0] == inner)
+            return $_get(context, 1, 1, 1, order);
+            
+          // --- handle A operand ---
+          if (opIdx == 0) { // compute swizzling for A operand
+              int vec = (order[0] == 1) ? matShape[2] : matShape[0]; // k : m
+              int mmaStride = (order[0] == 1) ? matShape[0] : matShape[2];
+              int maxPhase = mmaStride / perPhase;
+              return $_get(context, vec, perPhase, maxPhase, order);
+          } 
+
+          // --- handle B operand ---
+          if (opIdx == 1) {
+              int vec = (order[0] == 1) ? matShape[1] : matShape[2]; // n : k
+              int mmaStride = (order[0] == 1) ? matShape[2] : matShape[1];
+              int maxPhase = mmaStride / perPhase;
+              return $_get(context, vec, perPhase, maxPhase, order);
+          } 
+            
+          llvm_unreachable("invalid operand index");
+        }
+
+        // ---- not implemented ----
+        llvm_unreachable("unsupported swizzling for provided MMA version");
+
+
+    }]>
+  ];
+
  let extraClassDeclaration = extraBaseClassDeclaration;
 }

--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.h
@@ -11,8 +11,6 @@ std::unique_ptr<Pass> createTritonGPUPrefetchPass();

 std::unique_ptr<Pass> createTritonGPUCanonicalizeLoopsPass();

-std::unique_ptr<Pass> createTritonGPUSwizzlePass();
-
 std::unique_ptr<Pass> createTritonGPUCoalescePass();

 std::unique_ptr<Pass> createTritonGPUCombineOpsPass();
--- a/include/triton/Dialect/TritonGPU/Transforms/Passes.td
+++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
@@ -65,18 +65,6 @@ def TritonGPUCombineOps : Pass<"tritongpu-combine", "mlir::ModuleOp"> {
                           "mlir::triton::TritonDialect"];
 }

-def TritonGPUSwizzle : Pass<"tritongpu-swizzle", "mlir::ModuleOp"> {
-  let summary = "swizzle";
-
-  let description = [{
-    Inserts conversions to swizzled layout so as to avoid shared memory bank conflicts.
-  }];
-
-  let constructor = "mlir::createTritonGPUSwizzlePass()";
-
-  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
-}
-
 def TritonGPUCanonicalizeLoops: Pass<"tritongpu-canonicalize-loops", "mlir::ModuleOp"> {
  let summary = "canonicalize scf.ForOp ops";