[Triton-MLIR] Replace triton.extract_slice with tensor.extract_slice and support more general tensor slicing (#837)

## Features

- Allow taking a block slice of a tensor, as long as each dimension is
accessed contiguously (unit stride); see the sketch after this list.
- Fix some problems in `insert_slice_async`'s semantics.
- Add more general verification for ops that return a shared layout encoding.
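
For illustration, here is a minimal sketch of the kind of block slicing this enables, written against the upstream `tensor.extract_slice` op. The shapes and value names are made up for the example, and the TritonGPU layout encoding that the real operands carry is omitted:

```mlir
// Hypothetical example: extract a contiguous 32x64 block of a 128x128 tensor
// starting at (%off0, %off1). All strides must be 1 (unit stride) in this PR.
func.func @extract_block(%src: tensor<128x128xf16>,
                         %off0: index, %off1: index) -> tensor<32x64xf16> {
  %tile = tensor.extract_slice %src[%off0, %off1] [32, 64] [1, 1]
      : tensor<128x128xf16> to tensor<32x64xf16>
  return %tile : tensor<32x64xf16>
}
```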

## Known Limitations

- `insert_slice_async` still uses the old semantics. A follow-up PR may add
semantics similar to `tensor.extract_slice`'s.
- No encoding verification for `tensor.extract_slice`.
- 3D tensor ops are broken.
- Strided accesses are not allowed.
- May cause a slight performance slowdown, since strides are passed as values
rather than compile-time constants (e.g., `int`). Passing strides as
attributes would be difficult in the presence of control flow, where a block
argument may receive tensors with different strides; see the sketch after
this list.
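
To see why slice parameters end up as SSA values rather than attributes, consider a loop that advances the slice position on every iteration. The sketch below uses made-up names and only upstream ops; the offset stands in for the general case, and the same reasoning would apply to strides:

```mlir
func.func @tiles(%src: tensor<256x128xf16>, %ntiles: index) {
  %c0  = arith.constant 0 : index
  %c1  = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  %last = scf.for %i = %c0 to %ntiles step %c1
      iter_args(%off = %c0) -> (index) {
    // %off is a block argument that changes every iteration, so the slice
    // position cannot be baked into the op as a compile-time attribute.
    %tile = tensor.extract_slice %src[%off, 0] [32, 128] [1, 1]
        : tensor<256x128xf16> to tensor<32x128xf16>
    %next = arith.addi %off, %c32 : index
    scf.yield %next : index
  }
  return
}
```
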
Author: Keren Zhou
Date: 2022-11-06 22:59:03 -08:00 (committed by GitHub)
Parent: a4ff0c362c
Commit: fdd59900f7

26 changed files with 507 additions and 339 deletions


@@ -474,7 +474,7 @@ void SharedEncodingAttr::print(AsmPrinter &printer) const {
 ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
                                     OperationState &result) {
-  SmallVector<OpAsmParser::OperandType, 4> allOperands;
+  SmallVector<OpAsmParser::OperandType, 8> allOperands;
   Type srcType, dstType;
   SMLoc allOperandLoc = parser.getCurrentLocation();
   if (parser.parseOperandList(allOperands) ||
@@ -489,14 +489,27 @@ ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
   operandTypes.push_back(dstType); // dst
   operandTypes.push_back(
       IntegerType::get(parser.getBuilder().getContext(), 32)); // index
-  if (allOperands.size() >= 4)
+  int hasMask = 0, hasOther = 0;
+  if (allOperands.size() >= 4) {
     operandTypes.push_back(triton::getI1SameShape(srcType)); // mask
-  if (allOperands.size() >= 5)
+    hasMask = 1;
+  }
+  if (allOperands.size() >= 5) {
     operandTypes.push_back(triton::getPointeeType(srcType)); // other
+    hasOther = 1;
+  }
   if (parser.resolveOperands(allOperands, operandTypes, allOperandLoc,
                              result.operands))
     return failure();
+  // Deduce operand_segment_sizes from the number of the operands.
+  auto operand_segment_sizesAttrName =
+      InsertSliceAsyncOp::operand_segment_sizesAttrName(result.name);
+  result.addAttribute(
+      operand_segment_sizesAttrName,
+      parser.getBuilder().getI32VectorAttr({1, 1, 1, hasMask, hasOther}));
   return success();
 }
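
For reference, a hand-written sketch of the two textual forms this parser accepts, with and without the optional mask/other operands. The operand names, shapes, `#blocked`/`#shared` layout aliases, and the omitted attribute dictionary are illustrative assumptions, not taken from this commit:

```mlir
// Required operands only: src, dst, index.
%a = triton_gpu.insert_slice_async %src, %dst, %idx
    : tensor<32x64x!tt.ptr<f16>, #blocked> -> tensor<2x32x64xf16, #shared>

// With the optional mask and other operands appended.
%b = triton_gpu.insert_slice_async %src, %dst, %idx, %mask, %other
    : tensor<32x64x!tt.ptr<f16>, #blocked> -> tensor<2x32x64xf16, #shared>
```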
@@ -504,39 +517,16 @@ void printInsertSliceAsyncOp(OpAsmPrinter &printer,
                                InsertSliceAsyncOp insertSliceAsyncOp) {
   printer << " ";
   printer << insertSliceAsyncOp.getOperation()->getOperands();
-  printer.printOptionalAttrDict(insertSliceAsyncOp->getAttrs(),
-                                /*elidedAttrs=*/{});
+  // "operand_segment_sizes" can be deduced, so we don't print it.
+  printer.printOptionalAttrDict(
+      insertSliceAsyncOp->getAttrs(),
+      {insertSliceAsyncOp.operand_segment_sizesAttrName()});
   printer << " : ";
   printer.printStrippedAttrOrType(insertSliceAsyncOp.src().getType());
   printer << " -> ";
   printer.printStrippedAttrOrType(insertSliceAsyncOp.result().getType());
 }
-//===----------------------------------------------------------------------===//
-// ExtractSliceOp
-//===----------------------------------------------------------------------===//
-mlir::LogicalResult ExtractSliceOp::inferReturnTypes(
-    ::mlir::MLIRContext *context, llvm::Optional<::mlir::Location> location,
-    ::mlir::ValueRange operands, mlir::DictionaryAttr attributes,
-    ::mlir::RegionRange regions,
-    llvm::SmallVectorImpl<::mlir::Type> &inferredReturnTypes) {
-  auto srcType = operands[0].getType().cast<RankedTensorType>();
-  auto encoding = srcType.getEncoding();
-  auto srcShape = srcType.getShape();
-  auto axis = attributes.get("axis").cast<IntegerAttr>().getInt();
-  if (axis < 0 || (size_t)axis > srcShape.size())
-    return failure();
-  SmallVector<int64_t, 4> dstShape;
-  for (size_t i = 0; i < srcShape.size(); i++)
-    if (i != (size_t)axis)
-      dstShape.push_back(srcShape[i]);
-  auto returnType =
-      RankedTensorType::get(dstShape, srcType.getElementType(), encoding);
-  inferredReturnTypes.assign({returnType});
-  return success();
-}
 //===----------------------------------------------------------------------===//
 // DotOperand Encoding
 //===----------------------------------------------------------------------===//
@@ -631,32 +621,6 @@ void TritonGPUDialect::initialize() {
   addInterfaces<TritonGPUInferLayoutInterface>();
 }
-//===----------------------------------------------------------------------===//
-// Verification
-//===----------------------------------------------------------------------===//
-static LogicalResult verify(InsertSliceAsyncOp op) {
-  if (!isSharedEncoding(op.getResult())) {
-    return op.emitOpError(
-        "insert_slice_async should return a shared memory tensor");
-  }
-  return success();
-}
-static LogicalResult verify(ExtractSliceOp op) {
-  if (!isSharedEncoding(op.getResult())) {
-    return op.emitOpError("extract_slice should return a shared memory tensor");
-  }
-  return success();
-}
-static LogicalResult verify(AllocTensorOp op) {
-  if (!isSharedEncoding(op.getResult())) {
-    return op.emitOpError("alloc_tensor should return a shared memory tensor");
-  }
-  return success();
-}
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"