[Triton-MLIR] Replace triton.extract_slice with tensor.extract_slice and support more general tensor slicing (#837)

## Features

- Allow taking a block slice of a tensor, as long as each dimension is
accessed contiguously (unit stride); see the sketch after this list.
- Fix some problems in `insert_slice_async`'s semantics.
- Add more general verification for ops that return a shared layout encoding.
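
For illustration, here is a minimal sketch of the kind of block slicing this enables, written against the upstream `tensor.extract_slice` op. The shapes and value names are made up for the example, and the TritonGPU layout encoding that the real operands carry is omitted:

```mlir
// Hypothetical example: extract a contiguous 32x64 block of a 128x128 tensor
// starting at (%off0, %off1). All strides must be 1 (unit stride) in this PR.
func.func @extract_block(%src: tensor<128x128xf16>,
                         %off0: index, %off1: index) -> tensor<32x64xf16> {
  %tile = tensor.extract_slice %src[%off0, %off1] [32, 64] [1, 1]
      : tensor<128x128xf16> to tensor<32x64xf16>
  return %tile : tensor<32x64xf16>
}
```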

## Known Limitations

- `insert_slice_async` still uses the old semantics. A follow-up PR may add
semantics similar to `tensor.extract_slice`'s.
- No encoding verification for `tensor.extract_slice`.
- 3D tensor ops are broken.
- Strided accesses are not allowed.
- May cause a slight performance slowdown, since strides are passed as values
rather than compile-time constants (e.g., `int`). Passing strides as
attributes would be difficult in the presence of control flow, where a block
argument may receive tensors with different strides; see the sketch after
this list.
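
To see why slice parameters end up as SSA values rather than attributes, consider a loop that advances the slice position on every iteration. The sketch below uses made-up names and only upstream ops; the offset stands in for the general case, and the same reasoning would apply to strides:

```mlir
func.func @tiles(%src: tensor<256x128xf16>, %ntiles: index) {
  %c0  = arith.constant 0 : index
  %c1  = arith.constant 1 : index
  %c32 = arith.constant 32 : index
  %last = scf.for %i = %c0 to %ntiles step %c1
      iter_args(%off = %c0) -> (index) {
    // %off is a block argument that changes every iteration, so the slice
    // position cannot be baked into the op as a compile-time attribute.
    %tile = tensor.extract_slice %src[%off, 0] [32, 128] [1, 1]
        : tensor<256x128xf16> to tensor<32x128xf16>
    %next = arith.addi %off, %c32 : index
    scf.yield %next : index
  }
  return
}
```
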
Author: Keren Zhou
Date: 2022-11-06 22:59:03 -08:00 (committed by GitHub)
Parent: a4ff0c362c
Commit: fdd59900f7

26 changed files with 507 additions and 339 deletions


@@ -474,7 +474,7 @@ void SharedEncodingAttr::print(AsmPrinter &printer) const {
 ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
                                     OperationState &result) {
-  SmallVector<OpAsmParser::OperandType, 4> allOperands;
+  SmallVector<OpAsmParser::OperandType, 8> allOperands;
   Type srcType, dstType;
   SMLoc allOperandLoc = parser.getCurrentLocation();
   if (parser.parseOperandList(allOperands) ||
@@ -489,14 +489,27 @@ ParseResult parseInsertSliceAsyncOp(OpAsmParser &parser,
   operandTypes.push_back(dstType); // dst
   operandTypes.push_back(
       IntegerType::get(parser.getBuilder().getContext(), 32)); // index
-  if (allOperands.size() >= 4)
+  int hasMask = 0, hasOther = 0;
+  if (allOperands.size() >= 4) {
     operandTypes.push_back(triton::getI1SameShape(srcType)); // mask
-  if (allOperands.size() >= 5)
+    hasMask = 1;
+  }
+  if (allOperands.size() >= 5) {
     operandTypes.push_back(triton::getPointeeType(srcType)); // other
+    hasOther = 1;
+  }
   if (parser.resolveOperands(allOperands, operandTypes, allOperandLoc,
                              result.operands))
     return failure();
+  // Deduce operand_segment_sizes from the number of the operands.
+  auto operand_segment_sizesAttrName =
+      InsertSliceAsyncOp::operand_segment_sizesAttrName(result.name);
+  result.addAttribute(
+      operand_segment_sizesAttrName,
+      parser.getBuilder().getI32VectorAttr({1, 1, 1, hasMask, hasOther}));
   return success();
 }
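
For reference, a hand-written sketch of the two textual forms this parser accepts, with and without the optional mask/other operands. The operand names, shapes, `#blocked`/`#shared` layout aliases, and the omitted attribute dictionary are illustrative assumptions, not taken from this commit:

```mlir
// Required operands only: src, dst, index.
%a = triton_gpu.insert_slice_async %src, %dst, %idx
    : tensor<32x64x!tt.ptr<f16>, #blocked> -> tensor<2x32x64xf16, #shared>

// With the optional mask and other operands appended.
%b = triton_gpu.insert_slice_async %src, %dst, %idx, %mask, %other
    : tensor<32x64x!tt.ptr<f16>, #blocked> -> tensor<2x32x64xf16, #shared>
```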
@@ -504,39 +517,16 @@ void printInsertSliceAsyncOp(OpAsmPrinter &printer,
                                InsertSliceAsyncOp insertSliceAsyncOp) {
   printer << " ";
   printer << insertSliceAsyncOp.getOperation()->getOperands();
-  printer.printOptionalAttrDict(insertSliceAsyncOp->getAttrs(),
-                                /*elidedAttrs=*/{});
+  // "operand_segment_sizes" can be deduced, so we don't print it.
+  printer.printOptionalAttrDict(
+      insertSliceAsyncOp->getAttrs(),
+      {insertSliceAsyncOp.operand_segment_sizesAttrName()});
   printer << " : ";
   printer.printStrippedAttrOrType(insertSliceAsyncOp.src().getType());
   printer << " -> ";
   printer.printStrippedAttrOrType(insertSliceAsyncOp.result().getType());
 }
-//===----------------------------------------------------------------------===//
-// ExtractSliceOp
-//===----------------------------------------------------------------------===//
-mlir::LogicalResult ExtractSliceOp::inferReturnTypes(
-    ::mlir::MLIRContext *context, llvm::Optional<::mlir::Location> location,
-    ::mlir::ValueRange operands, mlir::DictionaryAttr attributes,
-    ::mlir::RegionRange regions,
-    llvm::SmallVectorImpl<::mlir::Type> &inferredReturnTypes) {
-  auto srcType = operands[0].getType().cast<RankedTensorType>();
-  auto encoding = srcType.getEncoding();
-  auto srcShape = srcType.getShape();
-  auto axis = attributes.get("axis").cast<IntegerAttr>().getInt();
-  if (axis < 0 || (size_t)axis > srcShape.size())
-    return failure();
-  SmallVector<int64_t, 4> dstShape;
-  for (size_t i = 0; i < srcShape.size(); i++)
-    if (i != (size_t)axis)
-      dstShape.push_back(srcShape[i]);
-  auto returnType =
-      RankedTensorType::get(dstShape, srcType.getElementType(), encoding);
-  inferredReturnTypes.assign({returnType});
-  return success();
-}
 //===----------------------------------------------------------------------===//
 // DotOperand Encoding
 //===----------------------------------------------------------------------===//
@@ -631,32 +621,6 @@ void TritonGPUDialect::initialize() {
   addInterfaces<TritonGPUInferLayoutInterface>();
 }
-//===----------------------------------------------------------------------===//
-// Verification
-//===----------------------------------------------------------------------===//
-static LogicalResult verify(InsertSliceAsyncOp op) {
-  if (!isSharedEncoding(op.getResult())) {
-    return op.emitOpError(
-        "insert_slice_async should return a shared memory tensor");
-  }
-  return success();
-}
-static LogicalResult verify(ExtractSliceOp op) {
-  if (!isSharedEncoding(op.getResult())) {
-    return op.emitOpError("extract_slice should return a shared memory tensor");
-  }
-  return success();
-}
-static LogicalResult verify(AllocTensorOp op) {
-  if (!isSharedEncoding(op.getResult())) {
-    return op.emitOpError("alloc_tensor should return a shared memory tensor");
-  }
-  return success();
-}
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"