@@ -24,7 +24,7 @@ struct CanonicalizePass
     // The following piece of code is a workaround to
     // very crudely remove dead code, by making an iteration
     // argument yield itself if it is not used to create
-    // side-effects anywhere.
+    // side effects anywhere.
     getOperation()->walk([&](scf::ForOp forOp) -> void {
       for (size_t i = 0; i < forOp.getNumResults(); ++i) {
         // condition 1: no other iter arguments depend on it
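The workaround this hunk documents walks every scf.ForOp and rewires unused loop-carried values. A minimal sketch of that idea follows; it is not the diffed code itself, it only checks that a loop result is unused (the real pass also checks the conditions the comments enumerate), and it assumes the usual scf/MLIR headers are already pulled in by the pass.

    // Sketch only: make an iteration argument yield itself when its result is
    // never used outside the loop, so later canonicalization can drop it.
    getOperation()->walk([&](scf::ForOp forOp) -> void {
      auto yieldOp = llvm::cast<scf::YieldOp>(forOp.getBody()->getTerminator());
      for (size_t i = 0; i < forOp.getNumResults(); ++i) {
        if (!forOp.getResult(i).use_empty())
          continue;
        // Feed the iter argument straight back into the yield; the
        // loop-carried value becomes trivially dead.
        yieldOp->setOperand(i, forOp.getRegionIterArgs()[i]);
      }
    });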
@@ -29,7 +29,7 @@ namespace {
 // convert(blocked, dot_operand) ->
 // convert(blocked, mma) + convert(mma, dot_operand)
 // if this value is itself the result of a dot operation
-// this is a heuristic to accomodate some pattern seen in fused attention
+// this is a heuristic to accommodate some pattern seen in fused attention
 // kernels.
 // TODO: replace this by something more generic, i.e. layout-aware CSE
 class DecomposeDotOperand : public mlir::RewritePattern {
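The comments in this hunk describe splitting a blocked-to-dot_operand conversion into blocked-to-mma followed by mma-to-dot_operand. A rough illustration of how such a rewrite could be expressed inside the pattern's matchAndRewrite is shown below; `rewriter` is the pattern rewriter and `mmaEncoding` stands for an MMA layout attribute computed elsewhere, neither of which appears in the hunk, so treat this as a sketch rather than the actual implementation.

    // Sketch: route the conversion through an intermediate #mma-encoded tensor.
    auto dstType = convert.getType().cast<RankedTensorType>();
    auto midType = RankedTensorType::get(dstType.getShape(),
                                         dstType.getElementType(),
                                         mmaEncoding /* assumed */);
    auto viaMma = rewriter.create<triton::gpu::ConvertLayoutOp>(
        convert.getLoc(), midType, convert.getOperand());
    rewriter.replaceOpWithNewOp<triton::gpu::ConvertLayoutOp>(convert, dstType,
                                                              viaMma);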
@@ -81,7 +81,7 @@ public:
     auto srcType = convert.getOperand().getType().cast<RankedTensorType>();
     auto dstType = convert.getType().cast<RankedTensorType>();
     // we don't handle conversions to DotOperandEncodingAttr
-    // this is a heuristics to accomodate fused attention
+    // this is a heuristics to accommodate fused attention
     // if (dstType.getEncoding().isa<triton::gpu::DotOperandEncodingAttr>())
     //   return mlir::failure();
     // convert to the same layout -- we can delete
@@ -265,7 +265,7 @@ public:
         isSharedEncoding(cvt->getOperand(0)))
       return mlir::failure();
     // we don't handle conversions to DotOperandEncodingAttr
-    // this is a heuristics to accomodate fused attention
+    // this is a heuristics to accommodate fused attention
     auto targetType = cvt->getResultTypes()[0].cast<RankedTensorType>();
     if (targetType.getEncoding().isa<triton::gpu::DotOperandEncodingAttr>())
       return mlir::failure();
@@ -285,7 +285,7 @@ public:
       // we stop everything
       if (expensive_to_remat(currOp))
         break;
-      // a conversion will be removed here (i.e. transfered to operands)
+      // a conversion will be removed here (i.e. transferred to operands)
       numCvts -= 1;
       // done processing
       processed.insert(currOp);
@@ -110,7 +110,7 @@ Value LoopPipeliner::lookupOrDefault(Value origin, int stage) {
 }
 
 void LoopPipeliner::collectDeps(Value v, int stages, DenseSet<Value> &deps) {
-  // Loop-invarant value. skip
+  // Loop-invariant value, skip
   if (v.getParentRegion() != &forOp.getLoopBody())
     return;
 
@@ -125,7 +125,7 @@ void LoopPipeliner::collectDeps(Value v, int stages, DenseSet<Value> &deps) {
       collectDeps(yieldOp->getOperand(arg.getArgNumber() - 1), stages - 1, deps);
   } else { // value
     // v might be in deps, but we still need to visit v.
-    // This is because v might depends on value in previous iterations
+    // This is because v might depend on value in previous iterations
     deps.insert(v);
     for (Value op : v.getDefiningOp()->getOperands())
       collectDeps(op, stages, deps);
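The `arg.getArgNumber() - 1` above relies on how scf.for lays out its body block arguments: argument 0 is the induction variable and the remaining arguments are the iter args, which scf.yield feeds one-for-one. The fragment below restates that mapping using the same names as the hunk; it mirrors the diffed code rather than adding new behavior.

    // Block args of an scf.for body are [induction var, iter args...], while
    // scf.yield only carries the iter args, so iter arg N is produced by
    // yield operand N - 1 on the previous iteration.
    if (auto arg = v.dyn_cast<BlockArgument>()) {
      if (auto forOp = llvm::dyn_cast<scf::ForOp>(arg.getOwner()->getParentOp())) {
        if (arg.getArgNumber() >= 1) {
          auto yieldOp =
              llvm::cast<scf::YieldOp>(forOp.getBody()->getTerminator());
          Value fromPrevIter = yieldOp.getOperand(arg.getArgNumber() - 1);
          (void)fromPrevIter; // the pass recurses into this value
        }
      }
    }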
@@ -175,18 +175,18 @@ LogicalResult LoopPipeliner::initialize() {
   // other load in the prologue, which is against the point of the pipeline
   // pass)
   for (triton::LoadOp loadOp : allLoads) {
-    bool isCandiate = true;
+    bool isCandidate = true;
     for (triton::LoadOp other : allLoads) {
       if (loadDeps[loadOp].contains(other)) {
-        isCandiate = false;
+        isCandidate = false;
         break;
       }
     }
 
     // We only pipeline loads that have one covert_layout (to dot_op) use
     // TODO: lift this constraint in the future
-    if (isCandiate && loadOp.getResult().hasOneUse()) {
-      isCandiate = false;
+    if (isCandidate && loadOp.getResult().hasOneUse()) {
+      isCandidate = false;
       Operation *use = *loadOp.getResult().getUsers().begin();
       if (auto convertLayout = llvm::dyn_cast<ttg::ConvertLayoutOp>(use)) {
         if (auto tensorType = convertLayout.getResult()
@@ -194,7 +194,7 @@ LogicalResult LoopPipeliner::initialize() {
                                 .dyn_cast<RankedTensorType>()) {
           if (auto dotOpEnc = tensorType.getEncoding()
                                   .dyn_cast<ttg::DotOperandEncodingAttr>()) {
-            isCandiate = true;
+            isCandidate = true;
             loadsMapping[loadOp] = convertLayout;
             auto ty = loadOp.getType().cast<RankedTensorType>();
             SmallVector<int64_t> bufferShape(ty.getShape().begin(),
@@ -208,9 +208,9 @@ LogicalResult LoopPipeliner::initialize() {
           }
         }
       } else
-        isCandiate = false;
+        isCandidate = false;
 
-    if (isCandiate)
+    if (isCandidate)
       loads.insert(loadOp);
   }
 
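Taken together, the candidacy checks in the last three hunks boil down to one predicate. A hypothetical helper restating it is sketched below; the lambda name `isPipelineCandidate` is invented here, while `allLoads`, `loadDeps`, and `ttg` are the names already used in the surrounding code, so read this as a summary rather than the committed implementation.

    // Hypothetical restatement of the candidate test (not in the diff):
    auto isPipelineCandidate = [&](triton::LoadOp loadOp) -> bool {
      // 1. must not depend on another load (it would force that load into the
      //    prologue, defeating the pipeline pass)
      for (triton::LoadOp other : allLoads)
        if (loadDeps[loadOp].contains(other))
          return false;
      // 2. must have exactly one use, and that use is a convert_layout ...
      if (!loadOp.getResult().hasOneUse())
        return false;
      auto convertLayout = llvm::dyn_cast<ttg::ConvertLayoutOp>(
          *loadOp.getResult().getUsers().begin());
      if (!convertLayout)
        return false;
      // 3. ... whose result is a ranked tensor with a dot-operand encoding
      auto tensorType =
          convertLayout.getResult().getType().dyn_cast<RankedTensorType>();
      if (!tensorType)
        return false;
      Attribute enc = tensorType.getEncoding();
      return enc && enc.isa<ttg::DotOperandEncodingAttr>();
    };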
@@ -317,7 +317,7 @@ void LoopPipeliner::emitPrologue() {
     for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) {
       Value originalResult = op->getResult(dstIdx);
       // copy_async will update the value of its only use
-      // TODO: load should no be used in the preheader?
+      // TODO: load should not be used in the preheader?
       if (loads.contains(originalResult)) {
         break;
         // originalResult = loadsMapping[originalResult];
@@ -35,7 +35,7 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
   });
 
   //
-  // materailizations
+  // Materializations
   //
   // This will be called when (newArgType != origArgType)
   // This will create newArg, and map(origArg, newArg)
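The "Materializations" section this hunk retitles registers the TypeConverter hooks MLIR invokes when an argument's converted type differs from its original type, as the trailing comments say. Below is a generic sketch of what registering such a hook looks like; the callback body is a placeholder (the diff does not show Triton's actual one) and it assumes the surrounding TritonGPUTypeConverter constructor scope.

    // Sketch of an argument materialization hook: invoked when
    // newArgType != origArgType, it builds newArg and the framework then maps
    // origArg to it.
    addArgumentMaterialization([](OpBuilder &builder, RankedTensorType newType,
                                  ValueRange inputs,
                                  Location loc) -> llvm::Optional<Value> {
      if (inputs.size() != 1)
        return llvm::None;
      // Placeholder behavior: bridge the type change with a layout conversion.
      return builder
          .create<triton::gpu::ConvertLayoutOp>(loc, newType, inputs[0])
          .getResult();
    });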