From 678b9f53a2c973ba645e5219c653075b3ab13aef Mon Sep 17 00:00:00 2001
From: Keren Zhou <kerenzhou@openai.com>
Date: Tue, 3 Jan 2023 18:11:54 -0500
Subject: [PATCH 1/2] [Backend] Use post-order traversal for liveness numbering
 (#1027)

Also add tests for `tt.trans`.
---
 include/triton/Dialect/Triton/IR/TritonOps.td |  2 +-
 lib/Analysis/Alias.cpp                        |  7 ++---
 lib/Analysis/Allocation.cpp                   | 20 +++++++++++---
 test/Analysis/test-alias.mlir                 |  9 +++++++
 test/Analysis/test-allocation.mlir            | 27 +++++++++++++++++++
 test/Analysis/test-membar.mlir                |  7 +++++
 6 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
index 1a7a982ac..ef9597318 100644
--- a/include/triton/Dialect/Triton/IR/TritonOps.td
+++ b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -289,7 +289,7 @@ def TT_CatOp : TT_Op<"cat", [NoSideEffect,
 }
 
 def TT_TransOp : TT_Op<"trans", [NoSideEffect,
-                                SameOperandsAndResultElementType]> {
+                                 SameOperandsAndResultElementType]> {
 
   let summary = "transpose a tensor";
 
diff --git a/lib/Analysis/Alias.cpp b/lib/Analysis/Alias.cpp
index db01e6fc3..a39e4de9a 100644
--- a/lib/Analysis/Alias.cpp
+++ b/lib/Analysis/Alias.cpp
@@ -25,13 +25,14 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
   if (maybeSharedAllocationOp(op)) {
     // These ops may allocate a new shared memory buffer.
     auto result = op->getResult(0);
-    // FIXME(Keren): extract and insert are always alias for now
+    // XXX(Keren): the following ops are always aliasing for now
     if (isa<tensor::ExtractSliceOp, triton::TransOp>(op)) {
       // extract_slice %src
+      // trans %src
       aliasInfo = AliasInfo(operands[0]->getValue());
       pessimistic = false;
-    } else if (isa<tensor::InsertSliceOp>(op) ||
-               isa<triton::gpu::InsertSliceAsyncOp>(op)) {
+    } else if (isa<tensor::InsertSliceOp, triton::gpu::InsertSliceAsyncOp>(
+                   op)) {
       // insert_slice_async %src, %dst, %index
       // insert_slice %src into %dst[%offsets]
       aliasInfo = AliasInfo(operands[1]->getValue());
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
index aad43241c..515b76411 100644
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -298,10 +298,24 @@ private:
 
   /// Resolves liveness of all values involved under the root operation.
   void resolveLiveness() {
-    // In the SCF dialect, we always have a sequentially nested structure of
-    // blocks
+    // Assign an ID to each operation using post-order traversal.
+    // To achieve the correct liveness range, the parent operation's ID
+    // should be greater than each of its child operation's ID.
+    // Example:
+    //     ...
+    //     %5 = triton.convert_layout %4
+    //     %6 = scf.for ... iter_args(%arg0 = %0) -> (i32) {
+    //       %2 = triton.convert_layout %5
+    //       ...
+    //       scf.yield %arg0
+    //     }
+    // For example, %5 is defined in the parent region and used in
+    // the child region, and is not passed as a block argument.
+    // %6 should have an ID greater than its child operations,
+    // otherwise %5's liveness range ends before the child operation's
+    // liveness range ends.
     DenseMap<Operation *, size_t> operationId;
-    operation->walk<WalkOrder::PreOrder>(
+    operation->walk<WalkOrder::PostOrder>(
         [&](Operation *op) { operationId[op] = operationId.size(); });
 
     // Analyze liveness of explicit buffers
diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir
index 6a4407a31..c72f97ce5 100644
--- a/test/Analysis/test-alias.mlir
+++ b/test/Analysis/test-alias.mlir
@@ -52,6 +52,15 @@ func @convert(%A : !tt.ptr<f16>) {
   return
 }
 
+// CHECK-LABEL: trans
+func @trans(%A : !tt.ptr<f16>) {
+  // CHECK: %cst -> %cst
+  %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
+  // CHECK: %0 -> %cst
+  %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
+  return
+}
+
 // CHECK-LABEL: insert_slice_async
 func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
index 888a30dad..cce9bdd4e 100644
--- a/test/Analysis/test-allocation.mlir
+++ b/test/Analysis/test-allocation.mlir
@@ -174,6 +174,14 @@ func @scratch() {
   // CHECK-NEXT: size = 512
 }
 
+// CHECK-LABEL: trans
+func @trans(%A : !tt.ptr<f16>) {
+  // CHECK: offset = 0, size = 1024
+  %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
+  %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
+  return
+}
+
 // CHECK-LABEL: insert_slice_async
 func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
@@ -285,6 +293,25 @@ func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %i1 : i1) {
   // CHECK-NEXT: size = 24576
 }
 
+// c0 cannot be released in the loop
+// CHECK-LABEL: for_use_ancestor
+func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
+  // CHECK: offset = 0, size = 8192
+  %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+  // CHECK-NEXT: offset = 8192, size = 8192
+  %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+  // CHECK-NEXT: offset = 16384, size = 8192
+  %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+  %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
+    %c0 = tt.trans %c_shared_init : (tensor<128x32xf16, #A_SHARED>) -> tensor<32x128xf16, #A_SHARED>
+    // CHECK-NEXT: offset = 24576, size = 8192
+    %c1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+    scf.yield %b_shared, %a_shared: tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>
+  }
+  return
+  // CHECK-NEXT: size = 32768
+}
+
 // a_shared_init, b_shared_init, and c_shared_init's liveness ranges are span over the entire function before cst2.
 // So they cannot be reused by cst0 and cst1, but can be reused by cst2.
 // CHECK-LABEL: for_if_for
diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir
index 130445448..6187126fb 100644
--- a/test/Analysis/test-membar.mlir
+++ b/test/Analysis/test-membar.mlir
@@ -111,6 +111,13 @@ func @extract_slice() {
   return
 }
 
+// CHECK-LABEL: trans
+func @trans() {
+  %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
+  %b = tt.trans %cst0 : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
+  return
+}
+
 // CHECK-LABEL: insert_slice_async
 func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>

From 8460ea3df1a2aac65e0e717793bbbfb11d227663 Mon Sep 17 00:00:00 2001
From: Keren Zhou <kerenzhou@openai.com>
Date: Tue, 3 Jan 2023 18:48:05 -0500
Subject: [PATCH 2/2] [Frontend] Fix import for libdevice (#1028)

This is a hotfix for issue 1 in
https://github.com/openai/triton/issues/1017
---
 python/triton/language/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/triton/language/__init__.py b/python/triton/language/__init__.py
index 0e7875e42..038e26bbe 100644
--- a/python/triton/language/__init__.py
+++ b/python/triton/language/__init__.py
@@ -5,6 +5,7 @@ from ..impl import (
     ir,
     builtin,
 )
+from . import libdevice
 from .core import (
     abs,
     arange,
@@ -130,6 +131,7 @@ __all__ = [
     "int64",
     "int8",
     "ir",
+    "libdevice",
    "load",
     "log",
     "max",