[OPTIMIZER] Rewrite patterns for layout conversions (#64)

Philippe Tillet
2022-08-18 12:49:37 -07:00
committed by GitHub
parent e0bedeb44c
commit 192be76b3c
19 changed files with 851 additions and 127 deletions

View File

@@ -225,7 +225,7 @@ def TT_DotOp : TT_Op<"dot", [NoSideEffect,
// let hasCanonicalizer = 1;
}
def TT_ReduceOp : TT_Op<"reduce"> {
def TT_ReduceOp : TT_Op<"reduce", [NoSideEffect]> {
let summary = "reduce";
let arguments = (ins TT_RedOpAttr:$redOp, TT_Tensor:$operand, I32Attr:$axis);
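Note: adding NoSideEffect marks tt.reduce as pure, so generic MLIR transforms (DCE, CSE, the canonicalizer) are now allowed to erase unused reductions and deduplicate identical ones. Below is a minimal sketch of the check that generic dead-code elimination relies on; this is ordinary MLIR utility usage, not code from this commit:

    #include "mlir/IR/Operation.h"
    #include "mlir/Interfaces/SideEffectInterfaces.h"

    // With the NoSideEffect trait, a reduce whose result is never used counts
    // as "trivially dead", and a cleanup pass may simply erase it.
    static bool eraseIfTriviallyDead(mlir::Operation *op) {
      if (!mlir::isOpTriviallyDead(op))
        return false;
      op->erase();
      return true;
    }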

View File

@@ -37,7 +37,7 @@ Right now, Triton implements two classes of layouts: shared, and distributed.
// Shared Layout Encoding
//===----------------------------------------------------------------------===//
- def TritonGPUSharedEncodingAttr : TritonGPU_Attr<"TritonGPUSharedEncoding"> {
+ def SharedEncodingAttr : TritonGPU_Attr<"SharedEncoding"> {
let mnemonic = "shared";
let description = [{
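Note: the rename drops the redundant TritonGPU prefix from the generated C++ attribute classes, which already live in the mlir::triton::gpu namespace; the blocked, MMA, and slice encodings below get the same treatment. A hedged sketch of how pass code reads an encoding under the new names (the header path is an assumption):

    // Assumed header exposing the TritonGPU encoding attributes.
    #include "triton/Dialect/TritonGPU/IR/Dialect.h"

    using namespace mlir;

    // Return true if the ranked tensor type carries a shared-memory encoding.
    static bool hasSharedLayout(RankedTensorType ty) {
      Attribute enc = ty.getEncoding();
      // Member-style casts were the MLIR idiom at the time of this commit.
      return enc && enc.isa<triton::gpu::SharedEncodingAttr>();
    }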
@@ -70,9 +70,7 @@ A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... [phase 1] /
// Distributed Layout Encoding
//===----------------------------------------------------------------------===//
- class TritonGPUDistributedEncodingAttr : TritonGPU_Attr<"TritonGPUDistributedEncoding"> {
- let mnemonic = "distributed";
+ class DistributedEncoding<string name> : TritonGPU_Attr<name> {
let description = [{
Distributed encodings have a layout function that is entirely characterized
by a d-dimensional tensor L. Note that L doesn't need to have the same shape
@@ -97,12 +95,11 @@ L(A) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
}];
}
//===----------------------------------------------------------------------===//
// Blocked Layout Encoding
//===----------------------------------------------------------------------===//
- def TritonGPUBlockedEncodingAttr : TritonGPU_Attr<"TritonGPUBlockedEncoding"> {
+ def BlockedEncodingAttr : DistributedEncoding<"BlockedEncoding"> {
let mnemonic = "blocked";
let description = [{
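Note: BlockedEncodingAttr (and, below, MmaEncodingAttr and SliceEncodingAttr) now derives from the DistributedEncoding class introduced above, so all of them share the layout-function semantics from the distributed-encoding description: a tensor L of thread ids is overlaid on the data tensor A, and both wrap around. A small 1-D illustration of that rule; this is explanatory code only, not taken from the repository:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // 1-D version of the distributed-layout rule: overlay the layout vector L
    // (thread ids) on a data vector of sizeA elements, wrapping both sides.
    // If L is larger than A, several threads own the same element; if A is
    // larger than L, each thread owns several elements.
    std::vector<std::vector<int>> distribute1D(const std::vector<int> &L,
                                               int64_t sizeA) {
      std::vector<std::vector<int>> owners(sizeA);
      int64_t sizeL = static_cast<int64_t>(L.size());
      for (int64_t k = 0; k < std::max(sizeA, sizeL); ++k)
        owners[k % sizeA].push_back(L[k % sizeL]);
      return owners;
    }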
@@ -174,6 +171,10 @@ for
}]>
];
+ let extraClassDeclaration = [{
+   SliceEncodingAttr squeeze(int axis);
+ }];
let parameters = (
ins
@@ -197,7 +198,7 @@ for
//===----------------------------------------------------------------------===//
// TODO: MMAv1 and MMAv2 should be two instances of the same class
- def TritonGPUMmaEncodingAttr : TritonGPU_Attr<"TritonGPUMmaEncoding"> {
+ def MmaEncodingAttr : DistributedEncoding<"MmaEncoding"> {
let mnemonic = "mma";
let description = [{
@@ -283,5 +284,34 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
);
}
+ def SliceEncodingAttr : DistributedEncoding<"SliceEncoding"> {
+ let mnemonic = "slice";
+ let description = [{
+ TODO: improve docs
+ A = [x x x x x x x x]
+     [x x x x x x x x]
+ L_parent = [0  1  2  3 ]
+            [4  5  6  7 ]
+            [8  9  10 11]
+            [12 13 14 15]
+ dim = 0
+ Then the data of A would be distributed as follows between the 16 CUDA threads:
+ L(A) = [ {0,4,8,12} , {1,5,9,13} , ... {3,7,11,15} ]
+ This is useful for constructing the inverse layout of an expand_dims operation during some optimization passes.
+ }];
+ let parameters = (
+ ins
+ "unsigned":$dim,
+ // TODO: constraint here to only take distributed encodings
+ "Attribute":$parent
+ );
+ }
#endif
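Note: SliceEncodingAttr is the encoding that makes these rewrites expressible: slicing a parent layout along dim describes a tensor with that dimension removed, i.e. the inverse of an expand_dims, as the description above says. The squeeze(int axis) method added to BlockedEncodingAttr earlier in this file presumably returns exactly such an attribute. A hedged sketch of building one directly; the get() signature is inferred from the parameter list above, and the header path is an assumption:

    #include "mlir/IR/Attributes.h"
    // Assumed header exposing the TritonGPU encoding attributes.
    #include "triton/Dialect/TritonGPU/IR/Dialect.h"

    using namespace mlir;
    using namespace mlir::triton::gpu;

    // Build the slice layout describing the result of reducing (or the operand
    // of expand_dims) along `axis` of a tensor with a blocked parent layout.
    static Attribute sliceAlong(BlockedEncodingAttr parent, unsigned axis) {
      return SliceEncodingAttr::get(parent.getContext(), axis, parent);
    }

In pass code, parent.squeeze(axis) would be the more convenient entry point; the helper above only spells out what it presumably constructs.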

View File

@@ -20,7 +20,7 @@ class TTG_Op<string mnemonic, list<Trait> traits = []> :
Op<TritonGPU_Dialect, mnemonic, traits>;
def TTG_ConvertLayoutOp : TTG_Op<"convert_layout",
- [NoSideEffect, SameOperandsAndResultType]> {
+ [NoSideEffect]> {
let summary = "convert layout";
let arguments = (ins TT_Tensor:$src);
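Note: dropping SameOperandsAndResultType is the key change: ttg.convert_layout may now produce a tensor whose encoding differs from its operand's (shape and element type still match), and layout conversions of this form are what the commit's new rewrite patterns presumably target. A hedged sketch of the simplest shape such a pattern can take, folding a conversion whose source already has the destination type; this is illustrative only, not the commit's actual pattern set:

    #include "mlir/IR/PatternMatch.h"
    // Assumed header for the TritonGPU dialect and ConvertLayoutOp.
    #include "triton/Dialect/TritonGPU/IR/Dialect.h"

    using namespace mlir;

    // convert_layout is a no-op when source and result types (including the
    // encoding) already agree, so forward the operand and erase the op.
    struct RemoveRedundantConvert
        : public OpRewritePattern<triton::gpu::ConvertLayoutOp> {
      using OpRewritePattern<triton::gpu::ConvertLayoutOp>::OpRewritePattern;

      LogicalResult matchAndRewrite(triton::gpu::ConvertLayoutOp op,
                                    PatternRewriter &rewriter) const override {
        Value src = op->getOperand(0);
        if (src.getType() != op->getResult(0).getType())
          return failure();
        rewriter.replaceOp(op, {src});
        return success();
      }
    };

The patterns added by this commit likely go further (merging or moving conversions), but a pattern over convert_layout would typically take this OpRewritePattern form.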
@@ -65,7 +65,7 @@ def TTG_CopyAsyncOp : TTG_Op<"copy_async",
// This is needed because Arith's Cmp ops don't
// handle encodings
// https://github.com/llvm/llvm-project/blob/main/mlir/include/mlir/Dialect/Arithmetic/IR/ArithmeticOps.td#L111
- def TTG_CmpIOp : TTG_Op<"cmpi"> {
+ def TTG_CmpIOp : TTG_Op<"cmpi", [NoSideEffect]> {
let summary = "integer comparison operation";
let description = [{}];

View File

@@ -6,6 +6,8 @@
namespace mlir {
std::unique_ptr<Pass> createTritonGPUPipelinePass(int numStages = 2);
std::unique_ptr<Pass> createTritonGPUCanonicalizeLoopsPass();
std::unique_ptr<Pass> createTritonGPUCoalescePass();
std::unique_ptr<Pass> createTritonGPUCombineOpsPass();
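Note: the new declaration plugs into a pass manager like any other MLIR pass constructor. A short sketch of wiring it into a pipeline; the pass ordering and header path are illustrative assumptions:

    #include "mlir/Pass/PassManager.h"
    // Assumed path for the declarations shown above.
    #include "triton/Dialect/TritonGPU/Transforms/Passes.h"

    // Run the combine patterns, then the extra scf.ForOp canonicalizations.
    static void buildTritonGPUPipeline(mlir::PassManager &pm) {
      pm.addPass(mlir::createTritonGPUCombineOpsPass());
      pm.addPass(mlir::createTritonGPUCanonicalizeLoopsPass());
    }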

View File

@@ -60,6 +60,19 @@ def TritonGPUCombineOps : Pass<"tritongpu-combine", "mlir::ModuleOp"> {
"mlir::triton::TritonDialect"];
}
+ def TritonGPUCanonicalizeLoops: Pass<"tritongpu-canonicalize-loops", "mlir::ModuleOp"> {
+ let summary = "canonicalize scf.ForOp ops";
+ let description = [{
+ This implements some optimizations that are missing in the standard scf.ForOp
+ canonicalizer.
+ }];
+ let constructor = "mlir::createTritonGPUCanonicalizeLoopsPass()";
+ let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+ }
def TritonGPUVerifier : Pass<"tritongpu-verifier", "mlir::ModuleOp"> {
let summary = "verify TritonGPU IR";
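Note: the C++ side of the TritonGPUCanonicalizeLoops definition above would follow the usual mlir-tblgen pass skeleton. Everything below (class names, header paths, and the empty walk body) is an assumption sketched from that convention, not code from this diff:

    #include <memory>
    #include "mlir/Dialect/SCF/SCF.h" // SCF header location of that MLIR era
    #include "mlir/Pass/Pass.h"
    // Assumed paths for the pass declarations and the tablegen-generated bases.
    #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
    #define GEN_PASS_CLASSES
    #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"

    namespace {
    // The derived class name is arbitrary; the base class name follows the
    // convention generated for `def TritonGPUCanonicalizeLoops : Pass<...>`.
    struct CanonicalizeLoopsPass
        : public TritonGPUCanonicalizeLoopsBase<CanonicalizeLoopsPass> {
      void runOnOperation() override {
        // Visit every scf.for in the module and apply the loop rewrites that
        // the upstream scf.ForOp canonicalizer does not cover (elided here).
        getOperation()->walk([&](mlir::scf::ForOp forOp) { (void)forOp; });
      }
    };
    } // namespace

    std::unique_ptr<mlir::Pass> mlir::createTritonGPUCanonicalizeLoopsPass() {
      return std::make_unique<CanonicalizeLoopsPass>();
    }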