[BUILD] Fix Warnings and Enable Warnings as Errors (#794)
@@ -49,7 +49,6 @@ unsigned getElemsPerThread(Type type) {
   auto tensorType = type.cast<RankedTensorType>();
   auto layout = tensorType.getEncoding();
   auto shape = tensorType.getShape();
-  size_t rank = shape.size();
   if (auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>()) {
     return blockedLayout.getElemsPerThread(shape);
   } else if (auto sliceLayout = layout.dyn_cast<SliceEncodingAttr>()) {
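This hunk, like several below, deletes a local that is computed but never read; once the build promotes warnings to errors, `-Wunused-variable` makes such leftovers fatal. A minimal, self-contained sketch of the pattern (file and function names here are illustrative, not from the Triton sources):

#include <vector>

// Compile with: c++ -Wall -Werror -c unused.cpp
int firstElem(const std::vector<int> &shape) {
  // size_t rank = shape.size(); // error: unused variable 'rank' [-Werror,-Wunused-variable]
  return shape.empty() ? 0 : shape.front(); // keep only what is actually used
}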
@@ -109,7 +108,7 @@ SmallVector<unsigned> getThreadsPerCTA(const Attribute &layout) {
 SmallVector<unsigned> getShapePerCTA(const Attribute &layout) {
   SmallVector<unsigned> shape;
   if (auto blockedLayout = layout.dyn_cast<BlockedEncodingAttr>()) {
-    for (int d = 0, n = blockedLayout.getOrder().size(); d < n; ++d)
+    for (unsigned d = 0, n = blockedLayout.getOrder().size(); d < n; ++d)
       shape.push_back(blockedLayout.getSizePerThread()[d] *
                       blockedLayout.getThreadsPerWarp()[d] *
                       blockedLayout.getWarpsPerCTA()[d]);
@@ -117,7 +116,7 @@ SmallVector<unsigned> getShapePerCTA(const Attribute &layout) {
     unsigned dim = sliceLayout.getDim();
     auto parent = sliceLayout.getParent();
     if (auto blockedParent = parent.dyn_cast<BlockedEncodingAttr>()) {
-      for (int d = 0, n = blockedParent.getOrder().size(); d < n; ++d) {
+      for (unsigned d = 0, n = blockedParent.getOrder().size(); d < n; ++d) {
         if (d == dim)
           continue;
         shape.push_back(blockedParent.getSizePerThread()[d] *
@@ -258,7 +257,6 @@ SliceEncodingAttr::paddedShape(ArrayRef<int64_t> shape) const {
 unsigned SliceEncodingAttr::getElemsPerThread(ArrayRef<int64_t> shape) const {
   size_t rank = shape.size();
   auto parent = getParent();
-  unsigned dim = getDim();
   if (auto blockedParent = parent.dyn_cast<BlockedEncodingAttr>()) {
     assert(rank == blockedParent.getSizePerThread().size() - 1 &&
            "unexpected rank in SliceEncodingAttr::getElemsPerThread");
@@ -512,11 +510,11 @@ mlir::LogicalResult ExtractSliceOp::inferReturnTypes(
   auto encoding = srcType.getEncoding();
   auto srcShape = srcType.getShape();
   auto axis = attributes.get("axis").cast<IntegerAttr>().getInt();
-  if (axis < 0 || axis > srcShape.size())
+  if (axis < 0 || (size_t)axis > srcShape.size())
     return failure();
   SmallVector<int64_t, 4> dstShape;
-  for (int i = 0; i < srcShape.size(); i++)
-    if (i != axis)
+  for (size_t i = 0; i < srcShape.size(); i++)
+    if (i != (size_t)axis)
       dstShape.push_back(srcShape[i]);
   auto returnType =
       RankedTensorType::get(dstShape, srcType.getElementType(), encoding);
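The signedness hunks above all silence `-Wsign-compare`: `size()` returns an unsigned type, so comparing it against a signed `int` (or the `int64_t` from `getInt()`) warns, and `-Werror` makes that fatal. The fix is an unsigned induction variable, or an explicit cast once `axis < 0` has already been rejected. A minimal sketch under those assumptions (names illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

// Compile with: c++ -Wall -Werror -c signcompare.cpp
void printAllBut(const std::vector<int> &v, long axis) {
  if (axis < 0 || (size_t)axis >= v.size()) // cast is safe: axis >= 0 here
    return;
  for (size_t i = 0; i < v.size(); i++) // unsigned induction variable
    if (i != (size_t)axis)
      std::printf("%d\n", v[i]);
}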
@@ -578,15 +576,17 @@ struct TritonGPUInferLayoutInterface
     : public triton::DialectInferLayoutInterface {
   using DialectInferLayoutInterface::DialectInferLayoutInterface;

-  LogicalResult inferReduceOpEncoding(Attribute operandEncoding, int axis,
-                                      Attribute &resultEncoding) const {
+  LogicalResult
+  inferReduceOpEncoding(Attribute operandEncoding, unsigned axis,
+                        Attribute &resultEncoding) const override {
     resultEncoding = SliceEncodingAttr::get(getDialect()->getContext(), axis,
                                             operandEncoding);
     return success();
   }

-  LogicalResult inferExpandDimsOpEncoding(Attribute operandEncoding, int axis,
-                                          Attribute &resultEncoding) const {
+  LogicalResult
+  inferExpandDimsOpEncoding(Attribute operandEncoding, unsigned axis,
+                            Attribute &resultEncoding) const override {
     auto sliceEncoding = operandEncoding.dyn_cast<SliceEncodingAttr>();
     if (!sliceEncoding) {
       llvm::report_fatal_error(
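Besides widening `axis` from `int` to `unsigned` so the signatures actually match the base interface, the new declarations add `override`, which makes any future signature drift a hard error; Clang flags the omission with `-Winconsistent-missing-override`. A minimal sketch with a hypothetical interface (not the real `DialectInferLayoutInterface`):

// Hypothetical base interface, for illustration only.
struct InferLayoutInterface {
  virtual ~InferLayoutInterface() = default;
  virtual bool inferEncoding(unsigned axis) const = 0;
};

struct TritonGPUInferLayout : InferLayoutInterface {
  // With `int axis` and no `override`, this would silently declare a brand-new
  // overload; `override` turns that mismatch into a compile error.
  bool inferEncoding(unsigned axis) const override { return axis < 8; }
};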
@@ -87,7 +87,6 @@ public:
     if (!llvm::isa<triton::gpu::ConvertLayoutOp>(op))
       return mlir::failure();
     auto convert = llvm::cast<triton::gpu::ConvertLayoutOp>(op);
-    auto srcType = convert.getOperand().getType().cast<RankedTensorType>();
     auto dstType = convert.getType().cast<RankedTensorType>();
     // we don't handle conversions to DotOperandEncodingAttr
     // this is a heuristic to accommodate fused attention
@@ -219,10 +218,10 @@ Operation *cloneWithInferType(mlir::PatternRewriter &rewriter, Operation *op,
   auto typeInfer = dyn_cast<InferTypeOpInterface>(newOp);
   if (typeInfer) {
     SmallVector<Type, 1> newType;
-    auto sucess = typeInfer.inferReturnTypes(
+    auto success = typeInfer.inferReturnTypes(
         newOp->getContext(), newOp->getLoc(), newOp->getOperands(),
         newOp->getAttrDictionary(), newOp->getRegions(), newType);
-    if (success)
+    if (succeeded(success))
       newOp->getResult(0).setType(newType.front());
   }
   return newOp;
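This hunk fixes two problems at once: renaming the misspelled `sucess` means `if (success)` no longer tests the address of the `mlir::success` function (which is always true and triggers `-Waddress`), and the result is now queried with `succeeded()`, the intended way to read a `LogicalResult`. A sketch of the corrected idiom, assuming only the standard MLIR support header:

#include "mlir/Support/LogicalResult.h"

// Sketch: a LogicalResult is queried via succeeded()/failed(),
// not treated as a plain boolean.
mlir::LogicalResult tryInfer(bool ok) {
  return ok ? mlir::success() : mlir::failure();
}

bool run() {
  auto result = tryInfer(true);
  return mlir::succeeded(result); // use succeeded(result), not the raw value
}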
@@ -364,10 +363,6 @@ public:
   rematerializeForLoop(mlir::PatternRewriter &rewriter, scf::ForOp &forOp,
                        size_t i, RankedTensorType newType,
                        triton::gpu::ConvertLayoutOp origConversion) const {
-
-    auto newEncoding = newType.cast<RankedTensorType>().getEncoding();
-    auto ctx = forOp.getContext();
-    auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; };
     // Rewrite init argument
     Type origType = forOp.getInitArgs()[i].getType();
     SmallVector<Value, 4> newInitArgs = forOp.getInitArgs();
@@ -418,11 +413,10 @@ public:
     return newResults;
   }
-
-  mlir::LogicalResult matchAndRewrite(mlir::Operation *op,
-                                      mlir::PatternRewriter &rewriter) const {
-
+  mlir::LogicalResult
+  matchAndRewrite(mlir::Operation *op,
+                  mlir::PatternRewriter &rewriter) const override {
     auto forOp = cast<scf::ForOp>(op);
     auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; };
     auto iterArgs = forOp.getRegionIterArgs();
     for (auto iterArg : llvm::enumerate(iterArgs)) {
       // if (iterArg.index() != 1)
@@ -480,7 +474,6 @@ public:
     auto forOp = dyn_cast<scf::ForOp>(cvt->getParentOp());
     if (!forOp)
       return mlir::failure();
-    auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
     auto isInLoop = [&](Operation *op) { return op->getParentOp() == forOp; };

     SetVector<Operation *> cvtSlices;
@@ -17,11 +17,6 @@ using namespace mlir;

 namespace {
 class LoopPipeliner {
-  /// comments on numStages:
-  /// [0, numStages-1) are in the prologue
-  /// numStages-1 is appended after the loop body
-  int numStages;
-
   /// cache forOp we are working on
   scf::ForOp forOp;

@@ -43,6 +38,11 @@ class LoopPipeliner {
   ///
   Value loopIterIdx;

+  /// comments on numStages:
+  /// [0, numStages-1) are in the prologue
+  /// numStages-1 is appended after the loop body
+  int numStages;
+
   /// value (in loop) => value at stage N
   DenseMap<Value, SmallVector<Value>> valueMapping;

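Moving the `numStages` member (with its comment block) below `loopIterIdx` is consistent with fixing `-Wreorder`: members are initialized in declaration order, and the compiler warns when a constructor's initializer list names them in a different order, which `-Werror` makes fatal. The constructor is not shown in this diff, so the sketch below assumes such an initializer list:

// Hypothetical sketch of -Wreorder: initialization follows declaration order,
// so the declarations were moved to agree with the initializer list.
class Pipeliner {
  int loopIterIdx; // was declared after numStages before the fix
  int numStages;

public:
  // warning (pre-fix): 'numStages' will be initialized after 'loopIterIdx'
  Pipeliner(int n) : loopIterIdx(0), numStages(n) {}
};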
@@ -58,9 +58,6 @@ class LoopPipeliner {

   Value lookupOrDefault(Value origin, int stage);

-  /// return true if this op uses any of `loads`
-  bool isDirectUserOfAsyncLoad(Operation &op);
-
   /// returns an empty buffer of size <numStages, ...>
   triton::gpu::AllocTensorOp allocateEmptyBuffer(Operation *op,
                                                  OpBuilder &builder);
@@ -84,7 +81,7 @@ public:
   /// create the new ForOp (add new args & insert prefetched ops)
   scf::ForOp createNewForOp();

-  friend class PipelinePass;
+  friend struct PipelinePass;
 };

 // helpers
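`PipelinePass` is evidently defined with the `struct` keyword, and Clang and MSVC warn under `-Wmismatched-tags` when a redeclaration, such as this friend declaration, uses the other tag. A short sketch:

struct PipelinePass; // defined elsewhere as a struct

class LoopPipeliner {
  int numStages = 0;
  // `friend class PipelinePass;` would warn: struct redeclared as class
  friend struct PipelinePass; // tag now matches the definition
};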
@@ -123,19 +120,6 @@ void LoopPipeliner::collectDeps(Value v, int stages, DenseSet<Value> &deps) {
   }
 }

-bool LoopPipeliner::isDirectUserOfAsyncLoad(Operation &op) {
-  for (Value loadOp : loads) {
-    assert(loadOp.hasOneUse() &&
-           "load should only have one use (ConvertLayout)");
-    Value loadUseResult = loadOp.getUsers().begin()->getResult(0);
-    for (Value opOperand : op.getOperands()) {
-      if (opOperand == loadUseResult)
-        return true;
-    }
-  }
-  return false;
-}
-
 triton::gpu::AllocTensorOp
 LoopPipeliner::allocateEmptyBuffer(Operation *op, OpBuilder &builder) {
   // allocate a buffer for each pipelined tensor
@@ -356,8 +340,8 @@ void LoopPipeliner::emitPrologue() {
   } // for (int stage = 0; stage < numStages - 1; ++stage)

   // async.wait & extract_slice
-  Operation *asyncWait = builder.create<triton::gpu::AsyncWaitOp>(
-      loads[0].getLoc(), loads.size() * (numStages - 2));
+  builder.create<triton::gpu::AsyncWaitOp>(loads[0].getLoc(),
+                                           loads.size() * (numStages - 2));
   loopIterIdx = builder.create<arith::ConstantIntOp>(iv.getLoc(), 0, 32);
   for (Value loadOp : loads) {
     Value extractSlice = builder.create<triton::gpu::ExtractSliceOp>(
@@ -380,8 +364,7 @@ void LoopPipeliner::emitEpilogue() {
   OpBuilder builder(forOp);
   OpBuilder::InsertionGuard g(builder);
   builder.setInsertionPointAfter(forOp);
-  Operation *asyncWait =
-      builder.create<triton::gpu::AsyncWaitOp>(forOp.getLoc(), 0);
+  builder.create<triton::gpu::AsyncWaitOp>(forOp.getLoc(), 0);
 }

 scf::ForOp LoopPipeliner::createNewForOp() {
@@ -575,8 +558,8 @@ scf::ForOp LoopPipeliner::createNewForOp() {
   yieldValues.push_back(loopIterIdx);

   builder.setInsertionPointToEnd(newForOp.getBody());
-  auto test = builder.create<scf::YieldOp>(
-      forOp.getBody()->getTerminator()->getLoc(), yieldValues);
+  builder.create<scf::YieldOp>(forOp.getBody()->getTerminator()->getLoc(),
+                               yieldValues);
   return newForOp;
 }

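The last three hunks share one shape: locals such as `asyncWait` and `test` existed only to receive the result of `builder.create<...>()` and were never read, so `-Wunused-variable` fires under `-Werror`. Because `create` already inserts the operation into the IR as a side effect, the call can stand alone. A self-contained sketch of the pattern (a toy builder, not MLIR's `OpBuilder`):

#include <string>
#include <vector>

// Sketch: when a builder records its product as a side effect, the returned
// handle can be discarded instead of parked in a never-read local.
struct Builder {
  std::vector<std::string> ops;
  int create(const std::string &name) { // returns a handle for optional use
    ops.push_back(name);
    return static_cast<int>(ops.size()) - 1;
  }
};

void emit(Builder &b) {
  // auto test = b.create("scf.yield"); // error: unused variable 'test'
  b.create("scf.yield"); // the op is still inserted; no local needed
}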
@@ -30,7 +30,7 @@ struct SwizzlePass : public TritonGPUSwizzleBase<SwizzlePass> {
         (ty.getElementType().getIntOrFloatBitWidth() / 8));
     perPhase = std::max<int>(perPhase, 1);
     // index of the inner dimension in `order`
-    int inner = (opIdx == 0) ? 0 : 1;
+    size_t inner = (opIdx == 0) ? 0 : 1;
     if (version == 1) {
       int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase;
       // TODO: handle rep (see
@@ -67,7 +67,6 @@ struct SwizzlePass : public TritonGPUSwizzleBase<SwizzlePass> {

   void runOnOperation() override {
     Operation *op = getOperation();
-    MLIRContext *context = &getContext();
     op->walk([&](triton::DotOp dotOp) -> void {
       OpBuilder builder(dotOp);
       auto _retEncoding =
@@ -73,7 +73,7 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context,
 //
 TritonGPUConversionTarget::TritonGPUConversionTarget(
     MLIRContext &context, TritonGPUTypeConverter &typeConverter)
-    : ConversionTarget(context), typeConverter(typeConverter) {
+    : ConversionTarget(context) {
   // TODO: we should also verify ops of TritonGPUDialect
   addLegalDialect<triton::gpu::TritonGPUDialect>();

@@ -90,7 +90,7 @@ TritonGPUConversionTarget::TritonGPUConversionTarget(
   });

   // We have requirements for the data layouts
-  addDynamicallyLegalOp<triton::DotOp>([this](triton::DotOp dotOp) -> bool {
+  addDynamicallyLegalOp<triton::DotOp>([](triton::DotOp dotOp) -> bool {
     Attribute aEncoding =
         dotOp.a().getType().cast<RankedTensorType>().getEncoding();
     Attribute bEncoding =
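Two related cleanups close out the commit: the constructor stops initializing a `typeConverter` member (presumably the field itself was removed as unused; Clang flags such fields with `-Wunused-private-field`), and the lambda drops its `this` capture because the body never touches it, which `-Wunused-lambda-capture` reports. A sketch of the capture fix:

#include <functional>

// Sketch of -Wunused-lambda-capture: capturing `this` (or any variable) that
// the lambda body never uses is a warning, fatal under -Werror.
struct Target {
  std::function<bool(int)> pred;
  void setup() {
    // pred = [this](int x) { return x > 0; }; // warns: 'this' is not used
    pred = [](int x) { return x > 0; };        // capture nothing instead
  }
};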