From 678b9f53a2c973ba645e5219c653075b3ab13aef Mon Sep 17 00:00:00 2001
From: Keren Zhou <kerenzhou@openai.com>
Date: Tue, 3 Jan 2023 18:11:54 -0500
Subject: [PATCH 1/2] [Backend] Use post-order traversal for liveness numbering
 (#1027)

Also add tests for `tt.trans`.
---
 include/triton/Dialect/Triton/IR/TritonOps.td |  2 +-
 lib/Analysis/Alias.cpp                        |  7 ++---
 lib/Analysis/Allocation.cpp                   | 20 +++++++++++---
 test/Analysis/test-alias.mlir                 |  9 +++++++
 test/Analysis/test-allocation.mlir            | 27 +++++++++++++++++++
 test/Analysis/test-membar.mlir                |  7 +++++
 6 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
index 1a7a982ac..ef9597318 100644
--- a/include/triton/Dialect/Triton/IR/TritonOps.td
+++ b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -289,7 +289,7 @@ def TT_CatOp : TT_Op<"cat", [NoSideEffect,
 }
 
 def TT_TransOp : TT_Op<"trans", [NoSideEffect,
-                                SameOperandsAndResultElementType]> {
+                                 SameOperandsAndResultElementType]> {
 
   let summary = "transpose a tensor";
 
diff --git a/lib/Analysis/Alias.cpp b/lib/Analysis/Alias.cpp
index db01e6fc3..a39e4de9a 100644
--- a/lib/Analysis/Alias.cpp
+++ b/lib/Analysis/Alias.cpp
@@ -25,13 +25,14 @@ ChangeResult SharedMemoryAliasAnalysis::visitOperation(
   if (maybeSharedAllocationOp(op)) {
     // These ops may allocate a new shared memory buffer.
     auto result = op->getResult(0);
-    // FIXME(Keren): extract and insert are always alias for now
+    // XXX(Keren): the following ops are always aliasing for now
     if (isa<tensor::ExtractSliceOp, triton::TransOp>(op)) {
       // extract_slice %src
+      // trans %src
       aliasInfo = AliasInfo(operands[0]->getValue());
       pessimistic = false;
-    } else if (isa<tensor::InsertSliceOp>(op) ||
-               isa<triton::gpu::InsertSliceAsyncOp>(op)) {
+    } else if (isa<tensor::InsertSliceOp, triton::gpu::InsertSliceAsyncOp>(
+                   op)) {
       // insert_slice_async %src, %dst, %index
       // insert_slice %src into %dst[%offsets]
       aliasInfo = AliasInfo(operands[1]->getValue());
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
index aad43241c..515b76411 100644
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -298,10 +298,24 @@ private:
 
   /// Resolves liveness of all values involved under the root operation.
   void resolveLiveness() {
-    // In the SCF dialect, we always have a sequentially nested structure of
-    // blocks
+    // Assign an ID to each operation using post-order traversal.
+    // To achieve the correct liveness range, the parent operation's ID
+    // should be greater than each of its child operation's ID.
+    // Example:
+    //     ...
+    //     %5 = triton.convert_layout %4
+    //     %6 = scf.for ... iter_args(%arg0 = %0) -> (i32) {
+    //       %2 = triton.convert_layout %5
+    //       ...
+    //       scf.yield %arg0
+    //     }
+    // For example, %5 is defined in the parent region and used in
+    // the child region, and is not passed as a block argument.
+    // %6 should have an ID greater than its child operations,
+    // otherwise %5's liveness range ends before the child operation's
+    // liveness range ends.
     DenseMap<Operation *, size_t> operationId;
-    operation->walk<WalkOrder::PreOrder>(
+    operation->walk<WalkOrder::PostOrder>(
         [&](Operation *op) { operationId[op] = operationId.size(); });
 
     // Analyze liveness of explicit buffers
diff --git a/test/Analysis/test-alias.mlir b/test/Analysis/test-alias.mlir
index 6a4407a31..c72f97ce5 100644
--- a/test/Analysis/test-alias.mlir
+++ b/test/Analysis/test-alias.mlir
@@ -52,6 +52,15 @@ func @convert(%A : !tt.ptr<f16>) {
   return
 }
 
+// CHECK-LABEL: trans
+func @trans(%A : !tt.ptr<f16>) {
+  // CHECK: %cst -> %cst
+  %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
+  // CHECK: %0 -> %cst
+  %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
+  return
+}
+
 // CHECK-LABEL: insert_slice_async
 func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
index 888a30dad..cce9bdd4e 100644
--- a/test/Analysis/test-allocation.mlir
+++ b/test/Analysis/test-allocation.mlir
@@ -174,6 +174,14 @@ func @scratch() {
   // CHECK-NEXT: size = 512
 }
 
+// CHECK-LABEL: trans
+func @trans(%A : !tt.ptr<f16>) {
+  // CHECK: offset = 0, size = 1024
+  %tensor = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
+  %b = tt.trans %tensor : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
+  return
+}
+
 // CHECK-LABEL: insert_slice_async
 func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
@@ -285,6 +293,25 @@ func @for_if_slice(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %i1 : i1) {
   // CHECK-NEXT: size = 24576
 }
 
+// c0 cannot be released in the loop
+// CHECK-LABEL: for_use_ancestor
+func @for_use_ancestor(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
+  // CHECK: offset = 0, size = 8192
+  %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+  // CHECK-NEXT: offset = 8192, size = 8192
+  %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+  // CHECK-NEXT: offset = 16384, size = 8192
+  %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+  %a_shared, %b_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init) -> (tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>) {
+    %c0 = tt.trans %c_shared_init : (tensor<128x32xf16, #A_SHARED>) -> tensor<32x128xf16, #A_SHARED>
+    // CHECK-NEXT: offset = 24576, size = 8192
+    %c1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A_SHARED>
+    scf.yield %b_shared, %a_shared: tensor<128x32xf16, #A_SHARED>, tensor<128x32xf16, #A_SHARED>
+  }
+  return
+  // CHECK-NEXT: size = 32768
+}
+
 // a_shared_init, b_shared_init, and c_shared_init's liveness ranges are span over the entire function before cst2.
 // So they cannot be reused by cst0 and cst1, but can be reused by cst2.
 // CHECK-LABEL: for_if_for
diff --git a/test/Analysis/test-membar.mlir b/test/Analysis/test-membar.mlir
index 130445448..6187126fb 100644
--- a/test/Analysis/test-membar.mlir
+++ b/test/Analysis/test-membar.mlir
@@ -111,6 +111,13 @@ func @extract_slice() {
   return
 }
 
+// CHECK-LABEL: trans
+func @trans() {
+  %cst0 = arith.constant dense<0.000000e+00> : tensor<16x32xf16, #A_SHARED>
+  %b = tt.trans %cst0 : (tensor<16x32xf16, #A_SHARED>) -> tensor<32x16xf16, #A_SHARED>
+  return
+}
+
 // CHECK-LABEL: insert_slice_async
 func @insert_slice_async(%A : !tt.ptr<f16>, %i1 : i1) {
   %a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>

From 8460ea3df1a2aac65e0e717793bbbfb11d227663 Mon Sep 17 00:00:00 2001
From: Keren Zhou <kerenzhou@openai.com>
Date: Tue, 3 Jan 2023 18:48:05 -0500
Subject: [PATCH 2/2] [Frontend] Fix import for libdevice (#1028)

This is a hotfix for issue 1 in
https://github.com/openai/triton/issues/1017
---
 python/triton/language/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/triton/language/__init__.py b/python/triton/language/__init__.py
index 0e7875e42..038e26bbe 100644
--- a/python/triton/language/__init__.py
+++ b/python/triton/language/__init__.py
@@ -5,6 +5,7 @@ from ..impl import (
     ir,
     builtin,
 )
+from . import libdevice
 from .core import (
     abs,
     arange,
@@ -130,6 +131,7 @@ __all__ = [
     "int64",
     "int8",
     "ir",
+    "libdevice",
    "load",
     "log",
     "max",