[Backend] Use post-order traversal for liveness numbering (#1027)
Also add tests for `tt.trans`.
This commit is contained in:
@@ -289,7 +289,7 @@ def TT_CatOp : TT_Op<"cat", [NoSideEffect,
|
|||||||
}
|
}
|
||||||
|
|
||||||
def TT_TransOp : TT_Op<"trans", [NoSideEffect,
|
def TT_TransOp : TT_Op<"trans", [NoSideEffect,
|
||||||
SameOperandsAndResultElementType]> {
|
SameOperandsAndResultElementType]> {
|
||||||
|
|
||||||
let summary = "transpose a tensor";
|
let summary = "transpose a tensor";
|
||||||
|
|
||||||
|
@@ -25,13 +25,14 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
|
|||||||
if (maybeSharedAllocationOp(op)) {
|
if (maybeSharedAllocationOp(op)) {
|
||||||
// These ops may allocate a new shared memory buffer.
|
// These ops may allocate a new shared memory buffer.
|
||||||
auto result = op->getResult(0);
|
auto result = op->getResult(0);
|
||||||
// FIXME(Keren): extract and insert are always alias for now
|
// XXX(Keren): the following ops are always aliasing for now
|
||||||
if (isa<tensor::ExtractSliceOp, triton::TransOp>(op)) {
|
if (isa<tensor::ExtractSliceOp, triton::TransOp>(op)) {
|
||||||
// extract_slice %src
|
// extract_slice %src
|
||||||
|
// trans %src
|
||||||
aliasInfo = AliasInfo(operands[0]->getValue());
|
aliasInfo = AliasInfo(operands[0]->getValue());
|
||||||
pessimistic = false;
|
pessimistic = false;
|
||||||
} else if (isa<tensor::InsertSliceOp>(op) ||
|
} else if (isa<tensor::InsertSliceOp, triton::gpu::InsertSliceAsyncOp>(
|
||||||
isa<triton::gpu::InsertSliceAsyncOp>(op)) {
|
op)) {
|
||||||
// insert_slice_async %src, %dst, %index
|
// insert_slice_async %src, %dst, %index
|
||||||
// insert_slice %src into %dst[%offsets]
|
// insert_slice %src into %dst[%offsets]
|
||||||
aliasInfo = AliasInfo(operands[1]->getValue());
|
aliasInfo = AliasInfo(operands[1]->getValue());
|
||||||
|
@@ -298,10 +298,24 @@ private:
|
|||||||
|
|
||||||
/// Resolves liveness of all values involved under the root operation.
|
/// Resolves liveness of all values involved under the root operation.
|
||||||
void resolveLiveness() {
|
void resolveLiveness() {
|
||||||
// In the SCF dialect, we always have a sequentially nested structure of
|
// Assign an ID to each operation using post-order traversal.
|
||||||
// blocks
|
// To achieve the correct liveness range, the parent operation's ID
|
||||||
|
// should be greater than the ID of each of its child operations.
|
||||||
|
// Example:
|
||||||
|
// ...
|
||||||
|
// %5 = triton.convert_layout %4
|
||||||
|
// %6 = scf.for ... iter_args(%arg0 = %0) -> (i32) {
|
||||||
|
// %2 = triton.convert_layout %5
|
||||||
|
// ...
|
||||||
|
// scf.yield %arg0
|
||||||
|
// }
|
||||||
|
// For example, %5 is defined in the parent region and used in
|
||||||
|
// the child region, and is not passed as a block argument.
|
||||||
|
// %6 should have an ID greater than its child operations,
|
||||||
|
// otherwise %5's liveness range ends before the child operation's liveness
|
||||||
|
// range ends.
|
||||||
DenseMap<Operation *, size_t> operationId;
|
DenseMap<Operation *, size_t> operationId;
|
||||||
operation->walk<WalkOrder::PreOrder>(
|
operation->walk<WalkOrder::PostOrder>(
|
||||||
[&](Operation *op) { operationId[op] = operationId.size(); });
|
[&](Operation *op) { operationId[op] = operationId.size(); });
|
||||||
|
|
||||||
// Analyze liveness of explicit buffers
|
// Analyze liveness of explicit buffers
|
||||||
|
@@ -52,6 +52,15 @@ func @convert(%A : !tt.ptr<f16>) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: trans
|
||||||
|
func @trans(%A : !tt.ptr<f16>) {
|
||||||
|
// CHECK: %cst -> %cst
|
||||||
|
%tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
|
||||||
|
// CHECK: %0 -> %cst
|
||||||
|
%b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// CHECK-LABEL: insert_slice_async
|
// CHECK-LABEL: insert_slice_async
|
||||||
func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
|
func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
|
||||||
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
|
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
|
||||||
|
@@ -174,6 +174,14 @@ func @scratch() {
|
|||||||
// CHECK-NEXT: size = 512
|
// CHECK-NEXT: size = 512
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: trans
|
||||||
|
func @trans(%A : !tt.ptr<f16>) {
|
||||||
|
// CHECK: offset = 0, size = 1024
|
||||||
|
%tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
|
||||||
|
%b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// CHECK-LABEL: insert_slice_async
|
// CHECK-LABEL: insert_slice_async
|
||||||
func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
|
func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
|
||||||
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
|
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
|
||||||
@@ -285,6 +293,25 @@ func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %
|
|||||||
// CHECK-NEXT: size = 24576
|
// CHECK-NEXT: size = 24576
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// c0 cannot be released in the loop
|
||||||
|
// CHECK-LABEL: for_use_ancestor
|
||||||
|
func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
|
||||||
|
// CHECK: offset = 0, size = 8192
|
||||||
|
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
|
||||||
|
// CHECK-NEXT: offset = 8192, size = 8192
|
||||||
|
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
|
||||||
|
// CHECK-NEXT: offset = 16384, size = 8192
|
||||||
|
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
|
||||||
|
%a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
|
||||||
|
%c0 = tt.trans %c_shared_init : (tensor<128x32xf16, #A_SHARED>) -> tensor<32x128xf16, #A_SHARED>
|
||||||
|
// CHECK-NEXT: offset = 24576, size = 8192
|
||||||
|
%c1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
|
||||||
|
scf.yield %b_shared, %a_shared: tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>
|
||||||
|
}
|
||||||
|
return
|
||||||
|
// CHECK-NEXT: size = 32768
|
||||||
|
}
|
||||||
|
|
||||||
// The liveness ranges of a_shared_init, b_shared_init, and c_shared_init span the entire function before cst2.
|
// The liveness ranges of a_shared_init, b_shared_init, and c_shared_init span the entire function before cst2.
|
||||||
// So they cannot be reused by cst0 and cst1, but can be reused by cst2.
|
// So they cannot be reused by cst0 and cst1, but can be reused by cst2.
|
||||||
// CHECK-LABEL: for_if_for
|
// CHECK-LABEL: for_if_for
|
||||||
|
@@ -111,6 +111,13 @@ func @extract_slice() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CHECK-LABEL: trans
|
||||||
|
func @trans() {
|
||||||
|
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
|
||||||
|
%b = tt.trans %cst0 : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// CHECK-LABEL: insert_slice_async
|
// CHECK-LABEL: insert_slice_async
|
||||||
func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
|
func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
|
||||||
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
|
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
|
||||||
|
Reference in New Issue
Block a user