[Triton-MLIR][Backend] Support ConvertLayout blocked->shared and a few fixes related to mma (#716)

2022-10-03 19:33:25 +08:00
parent baba98ad69
commit f9d7f2f126
5 changed files with 458 additions and 227 deletions
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -87,7 +87,6 @@ SmallVector<unsigned> getShapePerCTA(const Attribute &layout) {
  } else {
    assert(0 && "Unimplemented usage of getShapePerCTA");
  }
-
  return shape;
 }

@@ -104,7 +103,7 @@ SmallVector<unsigned> getOrder(const Attribute &layout) {
    assert(0 && "Unimplemented usage of getOrder");
    return {};
  }
-}
+};

 } // namespace gpu
 } // namespace triton
@@ -215,9 +214,12 @@ unsigned SliceEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape) const {
 }

 unsigned MmaEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape) const {
-  int threads = product(getWarpsPerCTA());
-  int numElem = product(shape);
-  return numElem / threads;
+  size_t rank = shape.size();
+  assert(rank == 2 && "Unexpected rank of mma layout");
+  assert(getVersion() == 2 && "mmaLayout version = 1 is not implemented yet");
+  unsigned elemsCol = ceil<unsigned>(shape[0], 16 * getWarpsPerCTA()[0]) * 2;
+  unsigned elemsRow = ceil<unsigned>(shape[1], 8 * getWarpsPerCTA()[1]) * 2;
+  return elemsCol * elemsRow;
 }

 unsigned SharedEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape) const {