Files
triton/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp

347 lines
13 KiB
C++
Raw Normal View History

2022-05-11 16:13:53 +08:00
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
2022-05-14 22:04:36 +08:00
#include "mlir/IR/BlockAndValueMapping.h"
2022-05-13 21:32:35 +08:00
//===----------------------------------------------------------------------===//
//
// This file implements loop software pipelining
2022-05-14 22:04:36 +08:00
// The implementation here is inspired by the pipeline pass in Triton (-v2.0)
// and SCF's LoopPipelining.
2022-05-13 21:32:35 +08:00
//
//===----------------------------------------------------------------------===//
2022-05-11 16:13:53 +08:00
using namespace mlir;
#define GEN_PASS_CLASSES
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
namespace {
2022-05-11 20:31:08 +08:00
class LoopPipeliner {
struct PipelineInfo {
triton::DotOp dotOp;
triton::LoadOp aLoadOp;
triton::LoadOp bLoadOp;
};
2022-05-13 21:32:35 +08:00
/// comments on numStages:
/// [0, numStages-1) are in the prologue
/// numStages-1 is appended after the loop body
2022-05-11 20:31:08 +08:00
int numStages;
/// cache forOp we are working on
scf::ForOp forOp;
/// dot & loads
PipelineInfo info;
/// value (in loop) => value at stage N
DenseMap<Value, SmallVector<Value>> valueMapping;
2022-05-13 21:32:35 +08:00
DenseSet<BlockArgument> depArgs;
DenseSet<Operation*> depOps;
void setValueMapping(Value origin, Value newValue, int stage);
2022-05-11 20:31:08 +08:00
2022-05-13 21:32:35 +08:00
/// collect values that v depends on and are defined inside the loop
void collectDeps(Value v);
2022-05-11 20:31:08 +08:00
public:
LoopPipeliner(scf::ForOp forOp, int numStages)
: forOp(forOp), numStages(numStages) {}
/// Collect loop info. Return success if we can pipeline this loop
LogicalResult initialize();
2022-05-13 21:32:35 +08:00
///
2022-05-11 20:31:08 +08:00
void emitPrologue();
2022-05-14 22:04:36 +08:00
/// create the new ForOp (add new args & insert prefetched ops)
scf::ForOp createNewForOp();
2022-05-11 20:31:08 +08:00
friend class PipelinePass;
};
2022-05-13 21:32:35 +08:00
// helpers
/// Store `newValue` as the version of `origin` that belongs to pipeline stage
/// `stage`. The per-value table is created on first use with one (initially
/// null) slot per stage.
void LoopPipeliner::setValueMapping(Value origin, Value newValue, int stage) {
  auto slot = valueMapping.find(origin);
  if (slot == valueMapping.end())
    slot = valueMapping
               .insert(std::make_pair(origin, SmallVector<Value>(numStages)))
               .first;
  slot->second[stage] = newValue;
}
/// Recursively gather everything inside the loop body that `v` depends on.
///
/// Block arguments are recorded in depArgs and operations in depOps. For a
/// loop-carried argument the value yielded for it is followed as well, so the
/// argument can be rematerialized for later iterations.
void LoopPipeliner::collectDeps(Value v) {
  // Values that do not live directly in the loop body are not tracked.
  if (v.getParentRegion() != &forOp.getLoopBody())
    return;

  if (auto arg = v.dyn_cast<BlockArgument>()) {
    // The induction variable is block argument #0 and has no yield operand:
    // `getArgNumber() - 1` would wrap around and index the yield out of
    // bounds. Its per-stage value is tracked explicitly by emitPrologue(), so
    // it is not recorded as a dependency argument.
    if (arg == forOp.getInductionVar())
      return;
    if (!depArgs.insert(arg).second)
      return; // already visited

    // we also need to rematerialize this arg
    auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
    // Note: we have iv as the first arg, so the op idx is arg.getArgNumber()-1
    collectDeps(yield->getOperand(arg.getArgNumber() - 1));
    return;
  }

  // Not a block argument, so it is the result of an operation.
  Operation *defOp = v.getDefiningOp();
  if (!depOps.insert(defOp).second)
    return; // already visited
  for (Value operand : defOp->getOperands())
    collectDeps(operand);
}
2022-05-11 20:31:08 +08:00
/// Decide whether the loop can be pipelined and, if so, record what is needed.
///
/// The loop is accepted when:
///   - its body contains exactly one triton::DotOp,
///   - both dot operands have the form `cvt(load %ptr)` with the load located
///     directly in the loop body, and
///   - both load pointers are block arguments (redefined inside the loop).
///
/// On success `info` is filled in and depArgs/depOps hold the dependencies of
/// both dot operands.
LogicalResult LoopPipeliner::initialize() {
  Block *loop = forOp.getBody();

  // TODO: can we use forOp.walk(...) here?
  SmallVector<triton::DotOp, 2> dots;
  for (Operation &op : *loop)
    if (auto dotOp = dyn_cast<triton::DotOp>(&op))
      dots.push_back(dotOp);

  // Don't know what to do if we have more than 1 dots inside the loop
  if (dots.size() != 1)
    return failure();
  triton::DotOp dotOp = dots[0];

  // dot (cvt (load %ptr0)), (cvt (load %ptr1))
  auto getDefiningLoad = [&](Value v) -> triton::LoadOp {
    auto cvt = v.getDefiningOp<triton::gpu::ConvertLayoutOp>();
    if (!cvt)
      return nullptr;
    auto load = cvt.src().getDefiningOp<triton::LoadOp>();
    // A load outside the loop body is ignored by collectDeps(), which would
    // leave the dot operands without any per-stage value; reject it here.
    if (!load || load->getBlock() != loop)
      return nullptr;
    return load;
  };
  triton::LoadOp aLoad = getDefiningLoad(dotOp.a());
  triton::LoadOp bLoad = getDefiningLoad(dotOp.b());
  if (!aLoad || !bLoad)
    return failure();

  // ptrs must be block args (phi nodes)
  if (!aLoad.ptr().isa<BlockArgument>() || !bLoad.ptr().isa<BlockArgument>())
    return failure();

  info.dotOp = dotOp;
  info.aLoadOp = aLoad;
  info.bLoadOp = bLoad;
  collectDeps(dotOp.a());
  collectDeps(dotOp.b());
  return success();
}
void LoopPipeliner::emitPrologue() {
2022-05-13 21:32:35 +08:00
// TODO: should we use rewriter here?
2022-05-11 20:31:08 +08:00
OpBuilder builder(forOp);
2022-05-13 21:32:35 +08:00
for (BlockArgument &arg : forOp.getRegionIterArgs()) {
OpOperand &operand = forOp.getOpOperandForRegionIterArg(arg);
setValueMapping(arg, operand.get(), 0);
}
2022-05-14 22:04:36 +08:00
// prologue from [0, numStage-1)
auto yield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
2022-05-15 22:29:27 +08:00
Value iv = forOp.getLowerBound();
2022-05-13 21:32:35 +08:00
for (int stage = 0; stage < numStages - 1; ++stage) {
// special handling for induction variable as the increment is implicit
if (stage != 0)
iv = builder.create<arith::AddIOp>(iv.getLoc(), iv, forOp.getStep());
setValueMapping(forOp.getInductionVar(), iv, stage);
// special handling for loop condition as there is no condition in ForOp
Value loopCond = builder.create<arith::CmpIOp>(
iv.getLoc(), arith::CmpIPredicate::slt, iv, forOp.getUpperBound());
// rematerialize peeled values
SmallVector<Operation*> orderedDeps;
for (Operation &op : forOp.getLoopBody().front())
if (depOps.contains(&op))
orderedDeps.push_back(&op);
assert(depOps.size() == orderedDeps.size() && "depOps contains invalid values");
for (Operation *op : orderedDeps) {
Operation *newOp = builder.clone(*op);
for (unsigned opIdx = 0; opIdx < op->getNumOperands(); ++opIdx) {
auto it = valueMapping.find(op->getOperand(opIdx));
if (it != valueMapping.end()) {
Value v = it->second[stage];
assert(v);
newOp->setOperand(opIdx, v);
} // else, op at opIdx is a loop-invariant value
}
// update mapping of results
for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) {
setValueMapping(op->getResult(dstIdx), newOp->getResult(dstIdx), stage);
2022-05-14 22:04:36 +08:00
// update mapping for loop-carried values (args)
for (OpOperand &operand : yield->getOpOperands()) {
if (operand.get() == op->getResult(dstIdx))
setValueMapping(forOp.getRegionIterArgs()[operand.getOperandNumber()],
newOp->getResult(dstIdx), stage + 1);
}
2022-05-13 21:32:35 +08:00
}
}
}
2022-05-11 20:31:08 +08:00
}
2022-05-14 22:04:36 +08:00
/// Build the pipelined replacement for `forOp` and return it.
///
/// The new loop carries, in addition to the original iter args, the
/// prefetched dot operands, the dependency arguments and the induction
/// variable of the most recently prefetched iteration. Its body is a clone of
/// the original body in which the dot reads the prefetched operands, followed
/// by clones of the dependency ops that prefetch a later iteration.
///
/// Must be called after emitPrologue(), which fills valueMapping. The
/// original loop is left in place; the caller replaces and erases it.
scf::ForOp LoopPipeliner::createNewForOp() {
  OpBuilder builder(forOp);

  // order of new args:
  //   (original args),
  //   (a at stage[0, numStages-1)), (b at stage[0, numStages-1))
  //   (depArgs at stage numStages-1)
  //   (iv at stage numStages-2, i.e. the last stage emitted by the prologue)
  SmallVector<Value> newLoopArgs;
  // We need this to update operands for yield
  //   original block arg => new arg's idx
  DenseMap<BlockArgument, size_t> depArgsIdx;
  for (auto v : forOp.getIterOperands())
    newLoopArgs.push_back(v);

  size_t aArgIdx = newLoopArgs.size();
  for (int i = 0; i < numStages - 1; ++i)
    newLoopArgs.push_back(valueMapping[info.dotOp.a()][i]);

  size_t bArgIdx = newLoopArgs.size();
  for (int i = 0; i < numStages - 1; ++i)
    newLoopArgs.push_back(valueMapping[info.dotOp.b()][i]);

  size_t depArgsBeginIdx = newLoopArgs.size();
  for (BlockArgument depArg : depArgs) {
    depArgsIdx[depArg] = newLoopArgs.size();
    newLoopArgs.push_back(valueMapping[depArg][numStages - 1]);
  }

  size_t nextIVIdx = newLoopArgs.size();
  newLoopArgs.push_back(valueMapping[forOp.getInductionVar()][numStages - 2]);

  for (size_t i = 0; i < newLoopArgs.size(); ++i)
    assert(newLoopArgs[i]);

  // signature of the new ForOp
  auto newForOp = builder.create<scf::ForOp>(
      forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(),
      forOp.getStep(), newLoopArgs);

  // body of the new ForOp
  builder.setInsertionPointToStart(newForOp.getBody());
  BlockAndValueMapping mapping;
  for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs()))
    mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);
  // Cloned body ops that read the induction variable must use the new loop's
  // iv; otherwise they keep referring to the loop that is about to be erased.
  mapping.map(forOp.getInductionVar(), newForOp.getInductionVar());

  for (Operation &op : forOp.getBody()->without_terminator()) {
    Operation *newOp = builder.clone(op, mapping);
    // update mapping of results
    for (unsigned dstIdx : llvm::seq(unsigned(0), op.getNumResults()))
      mapping.map(op.getResult(dstIdx), newOp->getResult(dstIdx));
    // The dot operands are patched directly rather than through `mapping`:
    // every cloned op re-maps its own results above, which would replace an
    // entry seeded for dotOp.a()/dotOp.b() before this loop.
    if (&op == info.dotOp.getOperation()) {
      newOp->setOperand(0, newForOp.getRegionIterArgs()[aArgIdx]);
      newOp->setOperand(1, newForOp.getRegionIterArgs()[bArgIdx]);
    }
  }

  // prefetch next iteration
  SmallVector<Operation *> orderedDeps;
  for (Operation &op : forOp.getLoopBody().front())
    if (depOps.contains(&op))
      orderedDeps.push_back(&op);
  assert(depOps.size() == orderedDeps.size() &&
         "depOps contains invalid values");

  BlockAndValueMapping nextMapping;
  DenseMap<BlockArgument, Value> depArgsMapping;
  // Use the recorded index of each dependency argument instead of relying on
  // a second iteration over the set visiting elements in the same order.
  for (BlockArgument arg : depArgs)
    nextMapping.map(arg, newForOp.getRegionIterArgs()[depArgsIdx.lookup(arg)]);

  // special handling for iv & loop condition
  Value nextIV = builder.create<arith::AddIOp>(
      newForOp.getInductionVar().getLoc(),
      newForOp.getRegionIterArgs()[nextIVIdx], newForOp.getStep());
  Value nextLoopCond = builder.create<arith::CmpIOp>(
      nextIV.getLoc(), arith::CmpIPredicate::slt, nextIV,
      newForOp.getUpperBound());
  // Prefetched ops that read the induction variable must see the iv of the
  // iteration being prefetched, not the iv of the original loop.
  nextMapping.map(forOp.getInductionVar(), nextIV);

  auto originYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
  for (Operation *op : orderedDeps) {
    // update loading mask
    if (op == info.aLoadOp.getOperation() ||
        op == info.bLoadOp.getOperation()) {
      auto loadOp = llvm::cast<triton::LoadOp>(op);
      Value mask = loadOp.mask();
      Value splatCond = builder.create<triton::BroadcastOp>(
          mask.getLoc(), mask.getType(), nextLoopCond);
      Value newMask = builder.create<arith::AndIOp>(
          mask.getLoc(), splatCond, nextMapping.lookupOrDefault(mask));
      // if mask is defined outside the loop, don't update the map more than
      // once
      if (!(forOp.isDefinedOutsideOfLoop(mask) && nextMapping.contains(mask)))
        nextMapping.map(mask, newMask);
    }

    Operation *nextOp = builder.clone(*op, nextMapping);
    // update mapping of results
    for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) {
      nextMapping.map(op->getResult(dstIdx), nextOp->getResult(dstIdx));
      // if this is a loop-carried value, update the mapping for yield
      for (OpOperand &operand : originYield->getOpOperands()) {
        if (operand.get() != op->getResult(dstIdx))
          continue;
        size_t originIdx = operand.getOperandNumber();
        // Only dependency arguments get an extra loop-carried slot.
        auto idxIt = depArgsIdx.find(forOp.getRegionIterArgs()[originIdx]);
        if (idxIt == depArgsIdx.end())
          continue;
        BlockArgument newArg = newForOp.getRegionIterArgs()[idxIt->second];
        depArgsMapping[newArg] = nextOp->getResult(dstIdx);
      }
    }
  }

  // Finally, the YieldOp, need to sync with the order of newLoopArgs
  SmallVector<Value> yieldValues;
  // lookupOrDefault: a yielded value defined outside the loop has no entry in
  // `mapping` and must be forwarded unchanged (lookup() would assert).
  for (Value v : forOp.getBody()->getTerminator()->getOperands())
    yieldValues.push_back(mapping.lookupOrDefault(v));
  // Shift the prefetched operands by one stage and append the new prefetch.
  for (int i = 1; i < numStages - 1; ++i)
    yieldValues.push_back(newForOp.getRegionIterArgs()[aArgIdx + i]);
  yieldValues.push_back(nextMapping.lookup(info.dotOp.a()));
  for (int i = 1; i < numStages - 1; ++i)
    yieldValues.push_back(newForOp.getRegionIterArgs()[bArgIdx + i]);
  yieldValues.push_back(nextMapping.lookup(info.dotOp.b()));
  for (size_t i = depArgsBeginIdx; i < nextIVIdx; ++i) {
    Value next = depArgsMapping.lookup(newForOp.getRegionIterArgs()[i]);
    assert(next && "dependency argument has no prefetched next value");
    yieldValues.push_back(next);
  }
  yieldValues.push_back(nextIV);

  builder.setInsertionPointToEnd(newForOp.getBody());
  builder.create<scf::YieldOp>(forOp.getBody()->getTerminator()->getLoc(),
                               yieldValues);
  return newForOp;
}
2022-05-11 20:31:08 +08:00
// ref: mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp
2022-05-11 16:13:53 +08:00
/// Pass that software-pipelines every scf::ForOp accepted by
/// LoopPipeliner::initialize(). Does nothing when numStages <= 1.
struct PipelinePass : public TritonGPUPipelineBase<PipelinePass> {
  PipelinePass() = default;
  /// `explicit` so an int is never silently converted into a pass object.
  explicit PipelinePass(int numStages) { this->numStages = numStages; }

  void runOnOperation() override {
    int numStages = this->numStages;

    // A single stage means there is nothing to overlap.
    if (numStages <= 1)
      return;

    // The default walk order is post-order, which permits erasing the
    // operation currently being visited.
    getOperation()->walk([&](scf::ForOp forOp) -> void {
      LoopPipeliner pipeliner(forOp, numStages);
      if (pipeliner.initialize().failed())
        return;

      pipeliner.emitPrologue();
      scf::ForOp newForOp = pipeliner.createNewForOp();

      // replace the original loop
      // The new loop has extra trailing results; the leading ones line up
      // with the original loop's results.
      for (unsigned i = 0; i < forOp->getNumResults(); ++i)
        forOp->getResult(i).replaceAllUsesWith(newForOp->getResult(i));
      forOp->erase();
    });
  }
};
} // anonymous namespace
/// Factory for the TritonGPU software-pipelining pass configured with
/// `numStages` stages.
std::unique_ptr<Pass> mlir::createTritonGPUPipelinePass(int numStages) {
  std::unique_ptr<Pass> pass = std::make_unique<PipelinePass>(numStages);
  return pass;
}