[BUILD] Fix Warnings and Enable Warnings as Errors (#794)
@@ -49,7 +49,6 @@ unsigned getElemsPerThread(Type type) {
   auto tensorType = type.cast<RankedTensorType>();
   auto layout = tensorType.getEncoding();
   auto shape = tensorType.getShape();
-  size_t rank = shape.size();
   if (auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>()) {
     return blockedLayout.getElemsPerThread(shape);
   } else if (auto sliceLayout = layout.dyn_cast<SliceEncodingAttr>()) {
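This hunk, like several below, deletes a local that is computed but never read; once the build promotes warnings to errors, `-Wunused-variable` makes such leftovers fatal. A minimal, self-contained sketch of the pattern (file and function names here are illustrative, not from the Triton sources):

#include <vector>

// Compile with: c++ -Wall -Werror -c unused.cpp
int firstElem(const std::vector<int> &shape) {
  // size_t rank = shape.size(); // error: unused variable 'rank' [-Werror,-Wunused-variable]
  return shape.empty() ? 0 : shape.front(); // keep only what is actually used
}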
@@ -109,7 +108,7 @@ SmallVector<unsigned> getThreadsPerCTA(const Attribute &layout) {
 SmallVector<unsigned> getShapePerCTA(const Attribute &layout) {
   SmallVector<unsigned> shape;
   if (auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>()) {
-    for (int d = 0, n = blockedLayout.getOrder().size(); d < n; ++d)
+    for (unsigned d = 0, n = blockedLayout.getOrder().size(); d < n; ++d)
       shape.push_back(blockedLayout.getSizePerThread()[d] *
                       blockedLayout.getThreadsPerWarp()[d] *
                       blockedLayout.getWarpsPerCTA()[d]);
@@ -117,7 +116,7 @@ SmallVector<unsigned> getShapePerCTA(const Attribute &layout) {
     unsigned dim = sliceLayout.getDim();
     auto parent = sliceLayout.getParent();
     if (auto blockedParent = parent.dyn_cast<BlockedEncodingAttr>()) {
-      for (int d = 0, n = blockedParent.getOrder().size(); d < n; ++d) {
+      for (unsigned d = 0, n = blockedParent.getOrder().size(); d < n; ++d) {
         if (d == dim)
           continue;
         shape.push_back(blockedParent.getSizePerThread()[d] *
@@ -258,7 +257,6 @@ SliceEncodingAttr::paddedShape(ArrayRef<int64_t> shape) const {
 unsigned SliceEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape) const {
   size_t rank = shape.size();
   auto parent = getParent();
-  unsigned dim = getDim();
   if (auto blockedParent = parent.dyn_cast<BlockedEncodingAttr>()) {
     assert(rank == blockedParent.getSizePerThread().size() - 1 &&
            "unexpected rank in SliceEncodingAttr::getElemsPerThread");
@@ -512,11 +510,11 @@ mlir::LogicalResult ExtractSliceOp::inferReturnTypes(
   auto encoding = srcType.getEncoding();
   auto srcShape = srcType.getShape();
   auto axis = attributes.get("axis").cast<IntegerAttr>().getInt();
-  if (axis < 0 || axis > srcShape.size())
+  if (axis < 0 || (size_t)axis > srcShape.size())
     return failure();
   SmallVector<int64_t, 4> dstShape;
-  for (int i = 0; i < srcShape.size(); i++)
-    if (i != axis)
+  for (size_t i = 0; i < srcShape.size(); i++)
+    if (i != (size_t)axis)
       dstShape.push_back(srcShape[i]);
   auto returnType =
       RankedTensorType::get(dstShape, srcType.getElementType(), encoding);
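The signedness hunks above all silence `-Wsign-compare`: `size()` returns an unsigned type, so comparing it against a signed `int` (or the `int64_t` from `getInt()`) warns, and `-Werror` makes that fatal. The fix is an unsigned induction variable, or an explicit cast once `axis < 0` has already been rejected. A minimal sketch under those assumptions (names illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

// Compile with: c++ -Wall -Werror -c signcompare.cpp
void printAllBut(const std::vector<int> &v, long axis) {
  if (axis < 0 || (size_t)axis >= v.size()) // cast is safe: axis >= 0 here
    return;
  for (size_t i = 0; i < v.size(); i++) // unsigned induction variable
    if (i != (size_t)axis)
      std::printf("%d\n", v[i]);
}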
@@ -578,15 +576,17 @@ struct TritonGPUInferLayoutInterface
     : public triton::DialectInferLayoutInterface {
   using DialectInferLayoutInterface::DialectInferLayoutInterface;

-  LogicalResult inferReduceOpEncoding(Attribute operandEncoding, int axis,
-                                      Attribute &resultEncoding) const {
+  LogicalResult
+  inferReduceOpEncoding(Attribute operandEncoding, unsigned axis,
+                        Attribute &resultEncoding) const override {
     resultEncoding = SliceEncodingAttr::get(getDialect()->getContext(), axis,
                                             operandEncoding);
     return success();
   }

-  LogicalResult inferExpandDimsOpEncoding(Attribute operandEncoding, int axis,
-                                          Attribute &resultEncoding) const {
+  LogicalResult
+  inferExpandDimsOpEncoding(Attribute operandEncoding, unsigned axis,
+                            Attribute &resultEncoding) const override {
     auto sliceEncoding = operandEncoding.dyn_cast<SliceEncodingAttr>();
     if (!sliceEncoding) {
       llvm::report_fatal_error(
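Besides widening `axis` from `int` to `unsigned` so the signatures actually match the base interface, the new declarations add `override`, which makes any future signature drift a hard error; Clang flags the omission with `-Winconsistent-missing-override`. A minimal sketch with a hypothetical interface (not the real `DialectInferLayoutInterface`):

// Hypothetical base interface, for illustration only.
struct InferLayoutInterface {
  virtual ~InferLayoutInterface() = default;
  virtual bool inferEncoding(unsigned axis) const = 0;
};

struct TritonGPUInferLayout : InferLayoutInterface {
  // With `int axis` and no `override`, this would silently declare a brand-new
  // overload; `override` turns that mismatch into a compile error.
  bool inferEncoding(unsigned axis) const override { return axis < 8; }
};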
@@ -87,7 +87,6 @@ public:
     if (!llvm::isa<triton::gpu::ConvertLayoutOp>(op))
       return mlir::failure();
     auto convert = llvm::cast<triton::gpu::ConvertLayoutOp>(op);
-    auto srcType = convert.getOperand().getType().cast<RankedTensorType>();
     auto dstType = convert.getType().cast<RankedTensorType>();
     // we don't handle conversions to DotOperandEncodingAttr
     // this is a heuristic to accommodate fused attention
@@ -219,10 +218,10 @@ Operation *cloneWithInferType(mlir::PatternRewriter &rewriter, Operation *op,
   auto typeInfer = dyn_cast<InferTypeOpInterface>(newOp);
   if (typeInfer) {
     SmallVector<Type, 1> newType;
-    auto sucess = typeInfer.inferReturnTypes(
+    auto success = typeInfer.inferReturnTypes(
         newOp->getContext(), newOp->getLoc(), newOp->getOperands(),
         newOp->getAttrDictionary(), newOp->getRegions(), newType);
-    if (success)
+    if (succeeded(success))
       newOp->getResult(0).setType(newType.front());
   }
   return newOp;
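This hunk fixes two problems at once: renaming the misspelled `sucess` means `if (success)` no longer tests the address of the `mlir::success` function (which is always true and triggers `-Waddress`), and the result is now queried with `succeeded()`, the intended way to read a `LogicalResult`. A sketch of the corrected idiom, assuming only the standard MLIR support header:

#include "mlir/Support/LogicalResult.h"

// Sketch: a LogicalResult is queried via succeeded()/failed(),
// not treated as a plain boolean.
mlir::LogicalResult tryInfer(bool ok) {
  return ok ? mlir::success() : mlir::failure();
}

bool run() {
  auto result = tryInfer(true);
  return mlir::succeeded(result); // use succeeded(result), not the raw value
}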
@@ -364,10 +363,6 @@ public:
   rematerializeForLoop(mlir::PatternRewriter &rewriter, scf::ForOp &forOp,
                        size_t i, RankedTensorType newType,
                        triton::gpu::ConvertLayoutOp origConversion) const {
-
-    auto newEncoding = newType.cast<RankedTensorType>().getEncoding();
-    auto ctx = forOp.getContext();
-    auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; };
     // Rewrite init argument
     Type origType = forOp.getInitArgs()[i].getType();
     SmallVector<Value, 4> newInitArgs = forOp.getInitArgs();
@@ -418,11 +413,10 @@ public:
     return newResults;
   }
-
-  mlir::LogicalResult matchAndRewrite(mlir::Operation *op,
-                                      mlir::PatternRewriter &rewriter) const {
-
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op,
+                  mlir::PatternRewriter &rewriter) const override {
     auto forOp = cast<scf::ForOp>(op);
     auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; };
     auto iterArgs = forOp.getRegionIterArgs();
     for (auto iterArg : llvm::enumerate(iterArgs)) {
       // if (iterArg.index() != 1)
@@ -480,7 +474,6 @@ public:
     auto forOp = dyn_cast<scf::ForOp>(cvt->getParentOp());
     if (!forOp)
       return mlir::failure();
-    auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
     auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; };

     SetVector<Operation *> cvtSlices;
@@ -17,11 +17,6 @@ using namespace mlir;

 namespace {
 class LoopPipeliner {
-  /// comments on numStages:
-  /// [0, numStages-1) are in the prologue
-  /// numStages-1 is appended after the loop body
-  int numStages;
-
   /// cache forOp we are working on
   scf::ForOp forOp;

@@ -43,6 +38,11 @@ class LoopPipeliner {
   ///
   Value loopIterIdx;

+  /// comments on numStages:
+  /// [0, numStages-1) are in the prologue
+  /// numStages-1 is appended after the loop body
+  int numStages;
+
   /// value (in loop) => value at stage N
   DenseMap<Value, SmallVector<Value>> valueMapping;

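Moving the `numStages` member (with its comment block) below `loopIterIdx` is consistent with fixing `-Wreorder`: members are initialized in declaration order, and the compiler warns when a constructor's initializer list names them in a different order, which `-Werror` makes fatal. The constructor is not shown in this diff, so the sketch below assumes such an initializer list:

// Hypothetical sketch of -Wreorder: initialization follows declaration order,
// so the declarations were moved to agree with the initializer list.
class Pipeliner {
  int loopIterIdx; // was declared after numStages before the fix
  int numStages;

public:
  // warning (pre-fix): 'numStages' will be initialized after 'loopIterIdx'
  Pipeliner(int n) : loopIterIdx(0), numStages(n) {}
};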
@@ -58,9 +58,6 @@ class LoopPipeliner {

   Value lookupOrDefault(Value origin, int stage);

-  /// return true if this op uses any of `loads`
-  bool isDirectUserOfAsyncLoad(Operation &op);
-
   /// returns an empty buffer of size <numStages, ...>
   triton::gpu::AllocTensorOp allocateEmptyBuffer(Operation *op,
                                                  OpBuilder &builder);
@@ -84,7 +81,7 @@ public:
   /// create the new ForOp (add new args & insert prefetched ops)
   scf::ForOp createNewForOp();

-  friend class PipelinePass;
+  friend struct PipelinePass;
 };

 // helpers
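`PipelinePass` is evidently defined with the `struct` keyword, and Clang and MSVC warn under `-Wmismatched-tags` when a redeclaration, such as this friend declaration, uses the other tag. A short sketch:

struct PipelinePass; // defined elsewhere as a struct

class LoopPipeliner {
  int numStages = 0;
  // `friend class PipelinePass;` would warn: struct redeclared as class
  friend struct PipelinePass; // tag now matches the definition
};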
@@ -123,19 +120,6 @@ void LoopPipeliner::collectDeps(Value v, int stages, DenseSet<Value> &deps) {
   }
 }

-bool LoopPipeliner::isDirectUserOfAsyncLoad(Operation &op) {
-  for (Value loadOp : loads) {
-    assert(loadOp.hasOneUse() &&
-           "load should only have one use (ConvertLayout)");
-    Value loadUseResult = loadOp.getUsers().begin()->getResult(0);
-    for (Value opOperand : op.getOperands()) {
-      if (opOperand == loadUseResult)
-        return true;
-    }
-  }
-  return false;
-}
-
 triton::gpu::AllocTensorOp
 LoopPipeliner::allocateEmptyBuffer(Operation *op, OpBuilder &builder) {
   // allocate a buffer for each pipelined tensor
@@ -356,8 +340,8 @@ void LoopPipeliner::emitPrologue() {
   } // for (int stage = 0; stage < numStages - 1; ++stage)

   // async.wait & extract_slice
-  Operation *asyncWait = builder.create<triton::gpu::AsyncWaitOp>(
-      loads[0].getLoc(), loads.size() * (numStages - 2));
+  builder.create<triton::gpu::AsyncWaitOp>(loads[0].getLoc(),
+                                           loads.size() * (numStages - 2));
   loopIterIdx = builder.create<arith::ConstantIntOp>(iv.getLoc(), 0, 32);
   for (Value loadOp : loads) {
     Value extractSlice = builder.create<triton::gpu::ExtractSliceOp>(
@@ -380,8 +364,7 @@ void LoopPipeliner::emitEpilogue() {
   OpBuilder builder(forOp);
   OpBuilder::InsertionGuard g(builder);
   builder.setInsertionPointAfter(forOp);
-  Operation *asyncWait =
-      builder.create<triton::gpu::AsyncWaitOp>(forOp.getLoc(), 0);
+  builder.create<triton::gpu::AsyncWaitOp>(forOp.getLoc(), 0);
 }

 scf::ForOp LoopPipeliner::createNewForOp() {
@@ -575,8 +558,8 @@ scf::ForOp LoopPipeliner::createNewForOp() {
   yieldValues.push_back(loopIterIdx);

   builder.setInsertionPointToEnd(newForOp.getBody());
-  auto test = builder.create<scf::YieldOp>(
-      forOp.getBody()->getTerminator()->getLoc(), yieldValues);
+  builder.create<scf::YieldOp>(forOp.getBody()->getTerminator()->getLoc(),
+                               yieldValues);
   return newForOp;
 }

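The last three hunks share one shape: locals such as `asyncWait` and `test` existed only to receive the result of `builder.create<...>()` and were never read, so `-Wunused-variable` fires under `-Werror`. Because `create` already inserts the operation into the IR as a side effect, the call can stand alone. A self-contained sketch of the pattern (a toy builder, not MLIR's `OpBuilder`):

#include <string>
#include <vector>

// Sketch: when a builder records its product as a side effect, the returned
// handle can be discarded instead of parked in a never-read local.
struct Builder {
  std::vector<std::string> ops;
  int create(const std::string &name) { // returns a handle for optional use
    ops.push_back(name);
    return static_cast<int>(ops.size()) - 1;
  }
};

void emit(Builder &b) {
  // auto test = b.create("scf.yield"); // error: unused variable 'test'
  b.create("scf.yield"); // the op is still inserted; no local needed
}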
@@ -30,7 +30,7 @@ struct SwizzlePass : public TritonGPUSwizzleBase<SwizzlePass> {
         (ty.getElementType().getIntOrFloatBitWidth() / 8));
     perPhase = std::max<int>(perPhase, 1);
     // index of the inner dimension in `order`
-    int inner = (opIdx == 0) ? 0 : 1;
+    size_t inner = (opIdx == 0) ? 0 : 1;
     if (version == 1) {
       int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase;
       // TODO: handle rep (see
@@ -67,7 +67,6 @@ struct SwizzlePass : public TritonGPUSwizzleBase<SwizzlePass> {

   void runOnOperation() override {
     Operation *op = getOperation();
-    MLIRContext *context = &getContext();
     op->walk([&](triton::DotOp dotOp) -> void {
       OpBuilder builder(dotOp);
       auto _retEncoding =
@@ -73,7 +73,7 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
 //
 TritonGPUConversionTarget::TritonGPUConversionTarget(
     MLIRContext &context, TritonGPUTypeConverter &typeConverter)
-    : ConversionTarget(context), typeConverter(typeConverter) {
+    : ConversionTarget(context) {
   // TODO: we should also verify ops of TritonGPUDialect
   addLegalDialect<triton::gpu::TritonGPUDialect>();

@@ -90,7 +90,7 @@ TritonGPUConversionTarget::TritonGPUConversionTarget(
   });

   // We have requirements for the data layouts
-  addDynamicallyLegalOp<triton::DotOp>([this](triton::DotOp dotOp) -> bool {
+  addDynamicallyLegalOp<triton::DotOp>([](triton::DotOp dotOp) -> bool {
     Attribute aEncoding =
         dotOp.a().getType().cast<RankedTensorType>().getEncoding();
     Attribute bEncoding =
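Two related cleanups close out the commit: the constructor stops initializing a `typeConverter` member (presumably the field itself was removed as unused; Clang flags such fields with `-Wunused-private-field`), and the lambda drops its `this` capture because the body never touches it, which `-Wunused-lambda-capture` reports. A sketch of the capture fix:

#include <functional>

// Sketch of -Wunused-lambda-capture: capturing `this` (or any variable) that
// the lambda body never uses is a warning, fatal under -Werror.
struct Target {
  std::function<bool(int)> pred;
  void setup() {
    // pred = [this](int x) { return x > 0; }; // warns: 'this' is not used
    pred = [](int x) { return x > 0; };        // capture nothing instead
  }
};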