[TritonGPU] Improved documentation and semantics of layout encodings (#30)

2022-07-31 13:59:44 -07:00
parent e02c82c765
commit d1593e6ca8
17 changed files with 399 additions and 566 deletions
--- a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp
@@ -3,6 +3,7 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include <algorithm>
+#include <numeric>

 using namespace mlir;
 using namespace mlir::triton::gpu;
@@ -11,54 +12,26 @@ using namespace mlir::triton::gpu;
 // TypeConverter
 //
 TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
-                                               int numThreads)
-    : context(context), numThreads(numThreads) {
+                                               int numWarps)
+    : context(context), numWarps(numWarps) {
  // TODO: how does MLIR pick the right conversion?
  addConversion([](Type type) { return type; });
  addConversion([this](RankedTensorType tensorType) -> RankedTensorType {
-    MLIRContext *context = this->context;
-    int numThreads = this->numThreads;
-
-    llvm::ArrayRef<int64_t> shape = tensorType.getShape();
-    Type elementType = tensorType.getElementType();
-    int64_t rank = tensorType.getRank();
-    int64_t numElements = tensorType.getNumElements();
-
-    // TODO: are there any better ways to raise this error?
-    if (!(numElements >= numThreads)) {
-      SmallVector<char> buffer;
-      llvm::raw_svector_ostream os(buffer);
-      os << tensorType << " has " << numElements << " numElements "
-         << " smaller than numThreads (" << numThreads << ")\n"
-         << "consider using smaller num-warps\n";
-      llvm::report_fatal_error(os.str());
-    }
-    assert(numElements % numThreads == 0);
-
-    // or assert no encoding?
-
-    // Now we assume:
-    //   contiguous = 1, order = 0, 1, 2, ...,
-    llvm::SmallVector<unsigned> threadTileSize(rank, 1); // naive layout
-    llvm::SmallVector<unsigned> warpTileSize(rank, 1);
-    llvm::SmallVector<unsigned> blockTileSize(rank);
+    // Tensor types that already carry an encoding are returned unchanged.
+    // TODO: check specifically for layout encodings (any encoding passes now).
+    if (tensorType.getEncoding())
+      return tensorType;
+    // Conservative defaults for the blocked-encoding attributes:
+    //   - sizePerThread = 1 along every dimension
+    //   - order = [0, 1, ..., rank - 1]
+    ArrayRef<int64_t> shape = tensorType.getShape();
+    int rank = shape.size();
    llvm::SmallVector<unsigned> order(rank);
-    llvm::SmallVector<unsigned> broadcastAxis;
-    int remainingThreads = numThreads;
-    int remainingLanes = /*warp size*/ 32;
-    for (int64_t dim = 0; dim < rank; ++dim) {
-      blockTileSize[dim] = std::clamp(remainingThreads, 1, int(shape[dim]));
-      warpTileSize[dim] = std::clamp(remainingLanes, 1, int(shape[dim]));
-      order[dim] = dim;
-
-      remainingThreads /= blockTileSize[dim];
-      remainingLanes /= warpTileSize[dim];
-      // TODO: will we need repetition?
-    }
+    std::iota(order.begin(), order.end(), 0);
+    llvm::SmallVector<unsigned> sizePerThread(rank, 1);
    Attribute encoding = triton::gpu::TritonGPUBlockedEncodingAttr::get(
-        context, threadTileSize, warpTileSize, blockTileSize, order,
-        broadcastAxis);
-    return RankedTensorType::get(shape, elementType, encoding);
+        this->context, shape, sizePerThread, order, this->numWarps);
+    return RankedTensorType::get(shape, tensorType.getElementType(), encoding);
  });

  //
@@ -86,8 +59,12 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
  // NOTE: only for remapped values.
  addTargetMaterialization([&](OpBuilder &builder, RankedTensorType tensorType,
                               ValueRange inputs, Location loc) {
-    llvm_unreachable("Not implemented");
-    return llvm::None;
+    auto cast =
+        builder.create<triton::gpu::ConvertLayoutOp>(loc, tensorType, inputs);
+    return Optional<Value>(cast.getResult());
+    // return Optional<Value>(cast.getResult(0));
+    // llvm_unreachable("Not implemented");
+    // return llvm::None;
  });
 }

@@ -122,87 +99,6 @@ TritonGPUConversionTarget::TritonGPUConversionTarget(
        aEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>() &&
        bEncoding && bEncoding.isa<triton::gpu::TritonGPUSharedEncodingAttr>())
      return true;
-    // // TODO: we should delete this
-    // if (this->typeConverter.isLegal(dotOp))
-    //   return true;
    return false;
  });
-}
-
-// %dst = tt.broadcast %src
-//   =>
-// %newSrc = convert_layout %src
-// %bcst = tt.broadcast %newSrc
-// %dst = convert_layout %bcst
-LogicalResult TritonGPUConversionTarget::refineLayouts(ModuleOp mod,
-                                                       int numThreads) {
-  // collect broadcasts
-  SmallVector<triton::BroadcastOp> broadcasts;
-  mod.walk([&](triton::BroadcastOp op) { broadcasts.push_back(op); });
-
-  BlockAndValueMapping mapping;
-  for (auto broadcast : broadcasts) {
-    OpBuilder builder(broadcast);
-    Value src = mapping.lookupOrDefault(broadcast.src());
-    Type originSrcType = src.getType();
-    Type originDstType = broadcast.getType();
-    auto originDstTensorType = originDstType.dyn_cast<RankedTensorType>();
-    unsigned dstRank = originDstTensorType.getRank();
-
-    // compute newSrcType & broadcastAxis
-    Type newSrcType;
-    SmallVector<unsigned> broadcastAxis;
-    bool isSrcScalar = false;
-    if (auto tensorType = originSrcType.dyn_cast<RankedTensorType>()) {
-      assert(tensorType.getRank() == dstRank &&
-             "src & dst should have same rank (verifier should catch this)");
-      for (unsigned ax = 0; ax < dstRank; ++ax)
-        if (tensorType.getShape()[ax] < originDstTensorType.getShape()[ax])
-          broadcastAxis.push_back(ax);
-
-      Attribute originSrcEnc = tensorType.getEncoding();
-      if (auto blockedEnc =
-              originSrcEnc.dyn_cast<TritonGPUBlockedEncodingAttr>()) {
-        auto newSrcEnc = TritonGPUBlockedMulticastEncodingAttr::get(
-            blockedEnc.getContext(), blockedEnc.getThreadTileSize(),
-            blockedEnc.getWarpTileSize(), blockedEnc.getBlockTileSize(),
-            blockedEnc.getOrder(), broadcastAxis);
-        newSrcType = RankedTensorType::get(
-            tensorType.getShape(), tensorType.getElementType(), newSrcEnc);
-      } else
-        llvm_unreachable("src of broadcast should have blocked encoding");
-    } else {
-      for (unsigned ax = 0; ax < dstRank; ++ax)
-        broadcastAxis.push_back(ax);
-      newSrcType = originSrcType;
-      isSrcScalar = true;
-    }
-
-    // create new src
-    if (!isSrcScalar) // we don't need to convert layout for scalar values
-      src = builder.create<triton::gpu::ConvertLayoutOp>(src.getLoc(),
-                                                         newSrcType, src);
-
-    // create new broadcast
-    // compute new type (encoding)
-    auto originDstEnc = originDstTensorType.getEncoding()
-                            .dyn_cast<TritonGPUBlockedEncodingAttr>();
-    auto newEnc = TritonGPUBlockedEncodingAttr::get(
-        originDstEnc.getContext(), originDstEnc.getThreadTileSize(),
-        originDstEnc.getWarpTileSize(), originDstEnc.getBlockTileSize(),
-        originDstEnc.getOrder(), broadcastAxis);
-    auto newType =
-        RankedTensorType::get(originDstTensorType.getShape(),
-                              originDstTensorType.getElementType(), newEnc);
-    Value newBroadcast =
-        builder.create<triton::BroadcastOp>(broadcast.getLoc(), newType, src);
-    // we don't want to change the encoding of the result
-    Value newDst = builder.create<triton::gpu::ConvertLayoutOp>(
-        broadcast.getLoc(), originDstType, newBroadcast);
-
-    broadcast.replaceAllUsesWith(newDst);
-    mapping.map(broadcast, newDst);
-  }
-
-  return success();
-}
+}