[Triton-MLIR][Backend] Fix number of warps and threads per warp when matrices are small (#917)

This commit is contained in:
Keren Zhou
2022-11-26 12:30:38 -08:00
committed by GitHub
parent f63be0e9b5
commit 35c9ec1103
7 changed files with 116 additions and 29 deletions

View File

@@ -5,6 +5,38 @@
namespace mlir {
// A reduction is "fast" when it runs along the fastest-varying dimension of
// the source layout, i.e. the reduction axis is the first entry of the
// layout's order.
bool ReduceOpHelper::isFastReduction() {
  auto encoding = srcTy.getEncoding();
  return op.axis() == triton::gpu::getOrder(encoding)[0];
}
// Number of warps cooperating along the reduction axis, clamped so small
// reduced dimensions do not claim more warps than they can occupy.
unsigned ReduceOpHelper::getInterWarpSize() {
  auto encoding = srcTy.getEncoding();
  auto axis = op.axis();
  auto reduceDimSize = static_cast<unsigned>(srcTy.getShape()[axis]);
  // getIntraWarpSize() never exceeds reduceDimSize, so this division >= 1.
  unsigned warpsNeeded = reduceDimSize / getIntraWarpSize();
  unsigned warpsAvailable = triton::gpu::getWarpsPerCTA(encoding)[axis];
  return std::min(warpsNeeded, warpsAvailable);
}
// Number of lanes within a single warp that participate in the reduction,
// clamped so a dimension smaller than the warp's extent along the axis
// only uses as many lanes as there are elements.
unsigned ReduceOpHelper::getIntraWarpSize() {
  auto encoding = srcTy.getEncoding();
  auto axis = op.axis();
  auto reduceDimSize = static_cast<unsigned>(srcTy.getShape()[axis]);
  unsigned lanesAlongAxis = triton::gpu::getThreadsPerWarp(encoding)[axis];
  return std::min(reduceDimSize, lanesAlongAxis);
}
// Total number of hardware threads laid out along the reduction axis:
// lanes per warp on that axis times warps per CTA on that axis.
// NOTE(review): unlike getInterWarpSize/getIntraWarpSize, this is NOT
// clamped by the reduced dimension's size.
unsigned ReduceOpHelper::getThreadsReductionAxis() {
  auto encoding = srcTy.getEncoding();
  auto axis = op.axis();
  unsigned lanesAlongAxis = triton::gpu::getThreadsPerWarp(encoding)[axis];
  unsigned warpsAlongAxis = triton::gpu::getWarpsPerCTA(encoding)[axis];
  return lanesAlongAxis * warpsAlongAxis;
}
bool isSharedEncoding(Value value) {
auto type = value.getType();
if (auto tensorType = type.dyn_cast<RankedTensorType>()) {