[OPTIMIZER] Rewrite patterns for layout conversions (#64)
This commit is contained in:
@@ -225,7 +225,7 @@ def TT_DotOp : TT_Op<"dot", [NoSideEffect,
|
||||
// let hasCanonicalizer = 1;
|
||||
}
|
||||
|
||||
def TT_ReduceOp : TT_Op<"reduce"> {
|
||||
def TT_ReduceOp : TT_Op<"reduce", [NoSideEffect]> {
|
||||
let summary = "reduce";
|
||||
|
||||
let arguments = (ins TT_RedOpAttr:$redOp, TT_Tensor:$operand, I32Attr:$axis);
|
||||
|
@@ -37,7 +37,7 @@ Right now, Triton implements two classes of layouts: shared, and distributed.
|
||||
// Shared Layout Encoding
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def TritonGPUSharedEncodingAttr : TritonGPU_Attr<"TritonGPUSharedEncoding"> {
|
||||
def SharedEncodingAttr : TritonGPU_Attr<"SharedEncoding"> {
|
||||
let mnemonic = "shared";
|
||||
|
||||
let description = [{
|
||||
@@ -70,9 +70,7 @@ A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... [phase 1] /
|
||||
// Distributed Layout Encoding
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class TritonGPUDistributedEncodingAttr : TritonGPU_Attr<"TritonGPUDistributedEncoding"> {
|
||||
let mnemonic = "distributed";
|
||||
|
||||
class DistributedEncoding<string name> : TritonGPU_Attr<name> {
|
||||
let description = [{
|
||||
Distributed encodings have a layout function that is entirely characterized
|
||||
by a d-dimensional tensor L. Note that L doesn't need to have the same shape
|
||||
@@ -97,12 +95,11 @@ L(A) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
|
||||
}];
|
||||
}
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Blocked Layout Encoding
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def TritonGPUBlockedEncodingAttr : TritonGPU_Attr<"TritonGPUBlockedEncoding"> {
|
||||
def BlockedEncodingAttr : DistributedEncoding<"BlockedEncoding"> {
|
||||
let mnemonic = "blocked";
|
||||
|
||||
let description = [{
|
||||
@@ -174,6 +171,10 @@ for
|
||||
}]>
|
||||
];
|
||||
|
||||
let extraClassDeclaration = [{
|
||||
SliceEncodingAttr squeeze(int axis);
|
||||
}];
|
||||
|
||||
|
||||
let parameters = (
|
||||
ins
|
||||
@@ -197,7 +198,7 @@ for
|
||||
//===----------------------------------------------------------------------===//
|
||||
// TODO: MMAv1 and MMAv2 should be two instances of the same class
|
||||
|
||||
def TritonGPUMmaEncodingAttr : TritonGPU_Attr<"TritonGPUMmaEncoding"> {
|
||||
def MmaEncodingAttr : DistributedEncoding<"MmaEncoding"> {
|
||||
let mnemonic = "mma";
|
||||
|
||||
let description = [{
|
||||
@@ -283,5 +284,34 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
|
||||
);
|
||||
}
|
||||
|
||||
def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
|
||||
let mnemonic = "slice";
|
||||
|
||||
let description = [{
|
||||
TODO: improve docs
|
||||
|
||||
A = [x x x x x x x x]
|
||||
[x x x x x x x x]
|
||||
L_parent = [0 1 2 3 ]
|
||||
[4 5 6 7 ]
|
||||
[8 9 10 11]
|
||||
[12 13 14 15]
|
||||
dim = 0
|
||||
|
||||
Then the data of A would be distributed as follow between the 16 CUDA threads:
|
||||
L(A) = [ {0,4,8,12} , {1,5,9,13} , ... {3,7,11,15} ]
|
||||
|
||||
This is useful for constructing the inverse layout of an expand_dims operation during some optimization passes.
|
||||
|
||||
}];
|
||||
|
||||
let parameters = (
|
||||
ins
|
||||
"unsigned":$dim,
|
||||
// TODO: constraint here to only take distributed encodings
|
||||
"Attribute":$parent
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
@@ -20,7 +20,7 @@ class TTG_Op<string mnemonic, list<Trait> traits = []> :
|
||||
Op<TritonGPU_Dialect, mnemonic, traits>;
|
||||
|
||||
def TTG_ConvertLayoutOp : TTG_Op<"convert_layout",
|
||||
[NoSideEffect, SameOperandsAndResultType]> {
|
||||
[NoSideEffect]> {
|
||||
let summary = "convert layout";
|
||||
|
||||
let arguments = (ins TT_Tensor:$src);
|
||||
@@ -65,7 +65,7 @@ def TTG_CopyAsyncOp : TTG_Op<"copy_async",
|
||||
// This is needed because Arith's Cmp ops don't
|
||||
// handle encodings
|
||||
// https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td#L111
|
||||
def TTG_CmpIOp : TTG_Op<"cmpi"> {
|
||||
def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect]> {
|
||||
let summary = "integer comparison operation";
|
||||
|
||||
let description = [{}];
|
||||
|
@@ -6,6 +6,8 @@
|
||||
namespace mlir {
|
||||
std::unique_ptr<Pass> createTritonGPUPipelinePass(int numStages = 2);
|
||||
|
||||
std::unique_ptr<Pass> createTritonGPUCanonicalizeLoopsPass();
|
||||
|
||||
std::unique_ptr<Pass> createTritonGPUCoalescePass();
|
||||
|
||||
std::unique_ptr<Pass> createTritonGPUCombineOpsPass();
|
||||
|
@@ -60,6 +60,19 @@ def TritonGPUCombineOps : Pass<"tritongpu-combine", "mlir::ModuleOp"> {
|
||||
"mlir::triton::TritonDialect"];
|
||||
}
|
||||
|
||||
def TritonGPUCanonicalizeLoops: Pass<"tritongpu-canonicalize-loops", "mlir::ModuleOp"> {
|
||||
let summary = "canonicalize scf.ForOp ops";
|
||||
|
||||
let description = [{
|
||||
This implements some optimizations that are missing in the standard scf.ForOp
|
||||
canonicalizer.
|
||||
}];
|
||||
|
||||
let constructor = "mlir::createTritonGPUCanonicalizeLoopsPass()";
|
||||
|
||||
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
|
||||
}
|
||||
|
||||
def TritonGPUVerifier : Pass<"tritongpu-verifier", "mlir::ModuleOp"> {
|
||||
let summary = "verify TritonGPU IR";
|
||||
|
||||
|
Reference in New Issue
Block a user