[Triton-MLIR] Support FP8 (#864)

Co-authored-by: Superjomn <yanchunwei@outlook.com>
2022-11-10 15:53:06 +08:00
parent 4946167241
commit 57fd1864a7
18 changed files with 571 additions and 160 deletions
--- a/lib/Dialect/Triton/IR/Ops.cpp
+++ b/lib/Dialect/Triton/IR/Ops.cpp
@@ -124,6 +124,29 @@ void printStoreOp(OpAsmPrinter &printer, StoreOp storeOp) {
 namespace mlir {
 namespace triton {

+//-- FpToFpOp --
+bool FpToFpOp::areCastCompatible(::mlir::TypeRange inputs,
+                               ::mlir::TypeRange outputs) {
+  if (inputs.size() != 1 || outputs.size() != 1)
+    return false;
+  auto srcEltType = inputs.front();
+  auto dstEltType = outputs.front();
+  auto srcTensorType = srcEltType.dyn_cast<mlir::RankedTensorType>();
+  auto dstTensorType = dstEltType.dyn_cast<mlir::RankedTensorType>();
+  if (srcTensorType && dstTensorType) {
+    srcEltType = srcTensorType.getElementType();
+    dstEltType = dstTensorType.getElementType();
+  }
+  // Check whether fp8 <=> fp16, bf16, f32, f64
+  // Make `srcEltType` always the fp8 side
+  if (dstEltType.dyn_cast<mlir::triton::Float8Type>())
+    std::swap(srcEltType, dstEltType);
+  if (!srcEltType.dyn_cast<mlir::triton::Float8Type>())
+    return false;
+  return dstEltType.isF16() || dstEltType.isBF16() ||
+         dstEltType.isF32() || dstEltType.isF64();
+}
+
 //-- StoreOp --
 void StoreOp::build(::mlir::OpBuilder &builder, ::mlir::OperationState &state,
                    ::mlir::Value ptr, ::mlir::Value value) {
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -44,7 +44,9 @@ namespace gpu {

 // TODO: Inheritation of layout attributes
 unsigned getElemsPerThread(Type type) {
-  if (type.isIntOrIndexOrFloat() || type.isa<triton::PointerType>())
+  if (type.isIntOrIndexOrFloat() ||
+      type.isa<triton::Float8Type>() ||
+      type.isa<triton::PointerType>())
    return 1;
  auto tensorType = type.cast<RankedTensorType>();
  auto layout = tensorType.getEncoding();
--- a/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Coalesce.cpp
@@ -32,7 +32,10 @@ struct CoalescePass : public TritonGPUCoalesceBase<CoalescePass> {
    // Thread tile size depends on memory alignment
    SmallVector<unsigned, 4> sizePerThread(rank, 1);
    PointerType ptrType = origType.getElementType().cast<PointerType>();
-    unsigned numBits = ptrType.getPointeeType().getIntOrFloatBitWidth();
+    auto pointeeType = ptrType.getPointeeType();
+    unsigned numBits =
+        pointeeType.isa<triton::Float8Type>() ?
+        8 : pointeeType.getIntOrFloatBitWidth();
    unsigned maxMultiple = info.getDivisibility(order[0]);
    unsigned maxContig = info.getContiguity(order[0]);
    unsigned alignment = std::min(maxMultiple, maxContig);