[Triton-MLIR][OPTIMIZER] Cleaned up swizzling (#869)

Swizzling is no longer implemented as a separate pass. It is instead
done in a specialized constructor of SharedEncodingAttr, and tested via
google tests instead of triton-opt + filecheck.

In the future we may want to implement it as a pass again once we have
an additional dialect between TritonGPU and LLVM.
This commit is contained in:
Philippe Tillet
2022-11-10 12:05:46 -08:00
committed by GitHub
parent 2aa538ec2e
commit f40c63fb03
14 changed files with 170 additions and 318 deletions

View File

@@ -71,6 +71,70 @@ A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ... [phase 1] /
ArrayRefParameter<"unsigned", "order of axes by the rate of changing">:$order
);
let builders = [
AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
"Type":$eltTy), [{
auto mmaEnc = dotOpEnc.getParent().dyn_cast<MmaEncodingAttr>();
// Only support row major for now
// TODO(Keren): check why column major code crashes
SmallVector<unsigned> order = {1, 0};
if(!mmaEnc)
return $_get(context, 1, 1, 1, order);
int version = mmaEnc.getVersion();
int opIdx = dotOpEnc.getOpIdx();
// number of rows per phase
int perPhase = 128 / (shape[order[0]] * (eltTy.getIntOrFloatBitWidth() / 8));
perPhase = std::max<int>(perPhase, 1);
// index of the inner dimension in `order`
unsigned inner = (opIdx == 0) ? 0 : 1;
// ---- begin version 1 ----
// TODO: handle rep (see
// https://github.com/openai/triton/blob/master/lib/codegen/analysis/layout.cc#L209)
if (version == 1) {
int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase;
return $_get(context, 1, perPhase, maxPhase, order);
}
// ---- begin version 2 ----
if (version == 2) {
std::vector<size_t> matShape = {8, 8,
2 * 64 / eltTy.getIntOrFloatBitWidth()};
// for now, disable swizzle when using transposed int8 tensor cores
if (eltTy.isInteger(8) && order[0] == inner)
return $_get(context, 1, 1, 1, order);
// --- handle A operand ---
if (opIdx == 0) { // compute swizzling for A operand
int vec = (order[0] == 1) ? matShape[2] : matShape[0]; // k : m
int mmaStride = (order[0] == 1) ? matShape[0] : matShape[2];
int maxPhase = mmaStride / perPhase;
return $_get(context, vec, perPhase, maxPhase, order);
}
// --- handle B operand ---
if (opIdx == 1) {
int vec = (order[0] == 1) ? matShape[1] : matShape[2]; // n : k
int mmaStride = (order[0] == 1) ? matShape[2] : matShape[1];
int maxPhase = mmaStride / perPhase;
return $_get(context, vec, perPhase, maxPhase, order);
}
llvm_unreachable("invalid operand index");
}
// ---- not implemented ----
llvm_unreachable("unsupported swizzling for provided MMA version");
}]>
];
let extraClassDeclaration = extraBaseClassDeclaration;
}

View File

@@ -11,8 +11,6 @@ std::unique_ptr<Pass> createTritonGPUPrefetchPass();
std::unique_ptr<Pass> createTritonGPUCanonicalizeLoopsPass();
std::unique_ptr<Pass> createTritonGPUSwizzlePass();
std::unique_ptr<Pass> createTritonGPUCoalescePass();
std::unique_ptr<Pass> createTritonGPUCombineOpsPass();

View File

@@ -65,18 +65,6 @@ def TritonGPUCombineOps : Pass<"tritongpu-combine", "mlir::ModuleOp"> {
"mlir::triton::TritonDialect"];
}
def TritonGPUSwizzle : Pass<"tritongpu-swizzle", "mlir::ModuleOp"> {
let summary = "swizzle";
let description = [{
Inserts conversions to swizzled layout so as to avoid shared memory bank conflicts.
}];
let constructor = "mlir::createTritonGPUSwizzlePass()";
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
}
def TritonGPUCanonicalizeLoops: Pass<"tritongpu-canonicalize-loops", "mlir::ModuleOp"> {
let summary = "canonicalize scf.ForOp ops";