[BACKEND] Support of ConvertLayoutOp from blocked to blocked and SliceLayout with blocked parent (#658)

2022-09-18 05:58:42 +08:00
parent 13669b46a6
commit 15bfd0cb79
17 changed files with 1025 additions and 191 deletions
--- a/include/triton/Analysis/Allocation.h
+++ b/include/triton/Analysis/Allocation.h
@@ -14,7 +14,12 @@ namespace mlir {

 namespace triton {
 class AllocationAnalysis;
-}
+
+SmallVector<unsigned>
+getScratchConfigForCvtLayout(triton::gpu::ConvertLayoutOp op, unsigned &inVec,
+                             unsigned &outVec);
+
+} // namespace triton

 /// Modified from llvm-15.0: llvm/ADT/AddressRanges.h
 /// A class that represents an interval, specified using a start and an end
--- a/include/triton/Analysis/Utility.h
+++ b/include/triton/Analysis/Utility.h
@@ -2,7 +2,10 @@
 #define TRITON_ANALYSIS_UTILITY_H

 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include <algorithm>
+#include <numeric>
 #include <string>
+
 namespace mlir {

 bool isSharedEncoding(Value value);
@@ -11,6 +14,12 @@ bool maybeSharedAllocationOp(Operation *op);

 std::string getValueOperandName(Value value, AsmState &state);

+/// Returns the product of all elements of `arr`; an empty `arr` yields 1.
+///
+/// The accumulator is seeded with `Int{1}` so that every partial product is
+/// computed and stored as `Int`. Seeding with a plain `1` makes
+/// std::accumulate use an `int` accumulator, which narrows the running
+/// product whenever `Int` is wider than `int` (e.g. int64_t).
+template <typename Int> Int product(llvm::ArrayRef<Int> arr) {
+  return std::accumulate(arr.begin(), arr.end(), Int{1}, std::multiplies{});
+}
+
+/// Divides `m` by `n` using the expression `(m + n - 1) / n`, which rounds the
+/// quotient up for non-negative `m` and positive `n`.
+template <typename Int> Int ceil(Int m, Int n) {
+  // `auto` keeps the promoted type of the sum, exactly as a single expression
+  // would; the value is only converted back to `Int` on return.
+  const auto biased = m + n - 1;
+  return biased / n;
+}
+
 } // namespace mlir

 #endif // TRITON_ANALYSIS_UTILITY_H
--- a/include/triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h
+++ b/include/triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h
@@ -18,6 +18,14 @@ public:
                                      mlir::LLVMTypeConverter &typeConverter);
 };

+/// A ConversionTarget subclass constructed from an MLIRContext and an
+/// LLVMTypeConverter.
+/// NOTE(review): the name suggests this is the target used when converting
+/// function ops to LLVM; confirm against the constructor definition.
+class TritonLLVMFunctionConversionTarget : public ConversionTarget {
+public:
+  /// \param ctx context handed to the target.
+  /// \param typeConverter converter this target keeps a reference to.
+  explicit TritonLLVMFunctionConversionTarget(
+      MLIRContext &ctx, mlir::LLVMTypeConverter &typeConverter);
+
+private:
+  // Stored by reference, so the converter is not owned by this class.
+  mlir::LLVMTypeConverter &typeConverter;
+};
+
 namespace triton {

 // Names for identifying different NVVM annotations. It is used as attribute
--- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -16,4 +16,16 @@
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.h.inc"

+namespace mlir {
+namespace triton {
+namespace gpu {
+
+unsigned getElemsPerThread(Attribute layout, ArrayRef<int64_t> shape);
+
+unsigned getShapePerCTA(const Attribute &layout, unsigned d);
+
+} // namespace gpu
+} // namespace triton
+} // namespace mlir
+
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -31,6 +31,10 @@ Then, attaching $\mathcal{L} to a tensor $T$ would mean that:

 Right now, Triton implements two classes of layouts: shared, and distributed.
  }];
+
+  code extraBaseClassDeclaration = [{
+    unsigned getElemsPerThread(ArrayRef<int64_t> shape) const;
+  }];
 }

 //===----------------------------------------------------------------------===//
@@ -64,6 +68,8 @@ A_{3, 2}  A_{3, 3}  A_{3, 0}  A_{3, 1} ...   [phase 1] /
    "unsigned":$vec, "unsigned":$perPhase, "unsigned":$maxPhase,
    ArrayRefParameter<"unsigned", "order of axes by the rate of changing">:$order
  );
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
 }

 //===----------------------------------------------------------------------===//
@@ -93,6 +99,8 @@ Then the data of A would be distributed as follow between the 16 CUDA threads:
 L(A) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
         {4,12}, {5,13}, {6,14}, {7,15}, {4,12}, {5, 13}, {6, 14}, {7, 15} ]
  }];
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
 }

 //===----------------------------------------------------------------------===//
@@ -171,11 +179,10 @@ for
    }]>
  ];

-  let extraClassDeclaration = [{
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
    SliceEncodingAttr squeeze(int axis);
  }];

-
  let parameters = (
    ins
    ArrayRefParameter<"unsigned">:$sizePerThread,
@@ -282,6 +289,8 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
    "unsigned":$version,
    ArrayRefParameter<"unsigned">:$warpsPerCTA
  );
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
 }

 def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
@@ -311,6 +320,8 @@ def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
    // TODO: constraint here to only take distributed encodings
    "Attribute":$parent
  );
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
 }


--- a/include/triton/tools/sys/getenv.hpp
+++ b/include/triton/tools/sys/getenv.hpp
@@ -22,6 +22,7 @@
 #ifndef TDL_TOOLS_SYS_GETENV_HPP
 #define TDL_TOOLS_SYS_GETENV_HPP

+#include <algorithm>
 #include <cstdlib>
 #include <string>

@@ -37,6 +38,14 @@ inline std::string getenv(const char *name) {
  return result;
 }

+/// Reports whether the environment variable named `env` is switched on.
+///
+/// The value is lower-cased and must then equal exactly "on", "true" or "1";
+/// anything else, including an unset variable, gives false. The value is not
+/// trimmed, so surrounding whitespace makes the comparison fail.
+inline bool getBoolEnv(const std::string &env) {
+  const char *raw = std::getenv(env.c_str());
+  if (!raw)
+    return false; // unset: the empty string matches none of the spellings
+
+  std::string lowered(raw);
+  for (char &ch : lowered) {
+    // std::tolower needs its argument to be representable as unsigned char.
+    ch = std::tolower(static_cast<unsigned char>(ch));
+  }
+  return lowered == "on" || lowered == "true" || lowered == "1";
+}
+
 } // namespace tools

 } // namespace triton