[OPTIMIZER] Minor bugfixes that affected matmul codegen performance (#834)

2022-11-02 22:58:09 -07:00
parent 847a318a03
commit 91a9773b38
4 changed files with 59 additions and 29 deletions
--- a/lib/Dialect/TritonGPU/Transforms/Combine.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Combine.cpp
@@ -533,6 +533,35 @@ public:
  BlockedToMMA(mlir::MLIRContext *context)
      : mlir::RewritePattern(triton::DotOp::getOperationName(), 2, context) {}

+  static SmallVector<unsigned, 2>
+  getWarpsPerTile(const ArrayRef<int64_t> &shape, int version, int numWarps) {
+    assert(version == 2);
+    // TODO: Handle one warp per row for fused matmuls
+    // TODO: unsigned -> int64_t to keep things uniform
+    SmallVector<unsigned, 2> ret = {1, 1};
+    SmallVector<int64_t, 2> shapePerWarp = {16, 8};
+    bool changed = false;
+    // TODO (@daadaada): double-check.
+    // original logic in
+    // https://github.com/openai/triton/blob/master/lib/codegen/analysis/layout.cc#L252
+    // seems buggy for shape = [32, 16] ?
+    do {
+      changed = false;
+      if (ret[0] * ret[1] >= numWarps)
+        break;
+      if (shape[0] / shapePerWarp[0] / ret[0] >=
+          shape[1] / (shapePerWarp[1] * 2) / ret[1]) {
+        if (ret[0] < shape[0] / shapePerWarp[0]) {
+          ret[0] *= 2;
+        } else
+          ret[1] *= 2;
+      } else {
+        ret[1] *= 2;
+      }
+    } while (true);
+    return ret;
+  }
+
  mlir::LogicalResult
  matchAndRewrite(mlir::Operation *op,
                  mlir::PatternRewriter &rewriter) const override {
@@ -541,13 +570,20 @@ public:
    auto oldRetType = dotOp.getResult().getType().cast<RankedTensorType>();
    if (oldRetType.getEncoding().isa<triton::gpu::MmaEncodingAttr>())
      return failure();
-    // TODO: compute warpsPerCTA
-    auto newRetType = RankedTensorType::get(
-        oldRetType.getShape(), oldRetType.getElementType(),
-        triton::gpu::MmaEncodingAttr::get(oldRetType.getContext(), 2, {2, 2}));
+    // get MMA encoding for the given number of warps
+    auto retShape = oldRetType.getShape();
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod);
+    auto newRetType =
+        RankedTensorType::get(retShape, oldRetType.getElementType(),
+                              triton::gpu::MmaEncodingAttr::get(
+                                  oldRetType.getContext(), 2,
+                                  getWarpsPerTile(retShape, 2, numWarps)));
+    // convert accumulator
    auto oldAcc = dotOp.getOperand(2);
    auto newAcc = rewriter.create<triton::gpu::ConvertLayoutOp>(
        oldAcc.getLoc(), newRetType, oldAcc);
+    // convert output
    auto newDot = rewriter.create<triton::DotOp>(
        dotOp.getLoc(), newRetType, dotOp.getOperand(0), dotOp.getOperand(1),
        newAcc, dotOp.allowTF32(), dotOp.transA(), dotOp.transB());