Analyze shared memory alias (#81)

The purpose of this PR is to analyze shared memory aliases so that we can
fix memory allocation bugs and save memory allocations in Triton code
involving complex control flow.

Changes to memory bar and allocation are on the way.

Co-authored-by: Philippe Tillet <phil@openai.com>
This commit is contained in:
Keren Zhou
2022-08-29 10:43:20 -07:00
committed by GitHub
parent 83287d7193
commit 02ebf24d35
15 changed files with 761 additions and 61 deletions

View File

@@ -0,0 +1,216 @@
// RUN: triton-opt %s --mlir-disable-threading -test-print-alias -split-input-file 2>&1 | FileCheck %s
#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#A = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
#B = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
#C = #triton_gpu.mma<{version = 2, warpsPerCTA = [4, 1]}>
// CHECK-LABEL: matmul_loop
// Matmul-style loop: every iteration loads one tile through %a_ptr and one
// through %b_ptr in the blocked layouts (#AL / #BL), converts each tile to the
// shared layouts (#A / #B), feeds both to tt.dot, and advances the pointers.
// The two expectations sit directly above the convert_layout ops and each maps
// a value only to itself.
func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
%a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
%b_ptr_init = tt.broadcast %B : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #BL>
%a_mask = arith.constant dense<true> : tensor<128x32xi1, #AL>
%a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL>
%b_mask = arith.constant dense<true> : tensor<32x128xi1, #BL>
%b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL>
%c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C>
%a_off = arith.constant dense<4> : tensor<128x32xi32, #AL>
%b_off = arith.constant dense<4> : tensor<32x128xi32, #BL>
// The loop carries the two pointer tensors and the accumulator; none of the
// iter_args uses a shared layout.
scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr<f16>, #AL>, tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>) {
%a_ = tt.load %a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isOtherUnspecified = false, isVolatile = false} : tensor<128x32xf16, #AL>
// CHECK: %4 -> %4
%a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A>
%b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isOtherUnspecified = false, isVolatile = false} : tensor<32x128xf16, #BL>
// CHECK-NEXT: %6 -> %6
%b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B>
%c = tt.dot %a, %b, %prev_c {allowTF32 = true} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C>
%next_a_ptr = tt.getelementptr %a_ptr, %a_off : tensor<128x32x!tt.ptr<f16>, #AL>
%next_b_ptr = tt.getelementptr %b_ptr, %b_off : tensor<32x128x!tt.ptr<f16>, #BL>
scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr<f16>, #AL>, tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>
}
return
}
// CHECK-LABEL: alloc
// Two constants with identical values but different layouts: %cst0 uses the
// shared layout #A, %cst1 the blocked layout #AL. Only one entry is expected
// in the report (a value mapped to itself).
func @alloc(%A : !tt.ptr<f16>) {
// CHECK: %cst -> %cst
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
return
}
// CHECK-LABEL: convert
// A blocked (#AL) constant is converted to the shared layout #A. The single
// expectation sits above the convert_layout op and maps its result to itself.
func @convert(%A : !tt.ptr<f16>) {
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
// CHECK: %0 -> %0
%cst1 = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #AL>) -> tensor<16x16xf16, #A>
return
}
// CHECK-LABEL: copy_async
// triton_gpu.copy_async takes a blocked (#AL) pointer tensor, mask and
// fallback value and produces a tensor in the shared layout #A. The single
// expectation sits above the copy_async op and maps its result to itself.
func @copy_async(%A : !tt.ptr<f16>, %i1 : i1) {
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<16x16x!tt.ptr<f16>, #AL>
%mask = tt.splat %i1 : (i1) -> tensor<16x16xi1, #AL>
%other = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
// CHECK: %2 -> %2
%a = triton_gpu.copy_async %a_ptr, %mask, %other {cache = 1 : i32, evict = 1 : i32, isOtherUnspecified = false, isVolatile = false} : tensor<16x16x!tt.ptr<f16>, #AL> -> tensor<16x16xf16, #A>
return
}
// COM: Enable the following test once we support view on shared memory tensors
// COM: // CHECK-LABEL: view
// COM: func @view(%A : !tt.ptr<f16>) {
// COM: // CHECK: res0:0 -> 0
// COM: %cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// COM: // CHECK-NEXT: res1:0 -> 0
// COM: %cst1 = tt.view %cst0 : (tensor<16x16xf16, #A>) -> tensor<32x8xf16, #A>
// COM: return
// COM: }
// CHECK-LABEL: if_cat
// Both branches of the scf.if build a new shared tensor with tt.cat and yield
// it. The scf.if result is expected to map to two entries, one per branch.
// NOTE(review): both entries print as %1, presumably because the two sibling
// regions reuse the same SSA number - confirm against the printer's output.
func @if_cat(%i1 : i1) {
// CHECK: %cst -> %cst
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK: %cst_0 -> %cst_0
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK: %0 -> %1,%1
%cst2 = scf.if %i1 -> tensor<32x16xf16, #A> {
// CHECK: %1 -> %1
%a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
scf.yield %a : tensor<32x16xf16, #A>
} else {
// CHECK: %1 -> %1
%b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
scf.yield %b : tensor<32x16xf16, #A>
}
return
}
// CHECK-LABEL: if_alias
// Each branch of the scf.if yields one of the two shared constants defined
// before it (%cst0 in the then-branch, %cst1 in the else-branch); no new
// tensor is created inside the branches. The scf.if result is expected to map
// to both constants.
func @if_alias(%i1 : i1) {
// CHECK: %cst -> %cst
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: %cst_0 -> %cst_0
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: %0 -> %cst,%cst_0
%cst2 = scf.if %i1 -> tensor<16x16xf16, #A> {
scf.yield %cst0 : tensor<16x16xf16, #A>
} else {
scf.yield %cst1 : tensor<16x16xf16, #A>
}
return
}
// CHECK-LABEL: for
// The loop body does nothing but permute its iter_args in the yield
// (%b_shared, %a_shared, %a_shared); %c_shared is never yielded.
// Expected report: each region iter arg (%arg6..%arg8) maps to its own init
// constant, and all three loop results map to {%cst, %cst_0}.
func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
// CHECK: %cst -> %cst
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: %cst_0 -> %cst_0
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: %cst_1 -> %cst_1
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: %arg6 -> %cst
// CHECK-NEXT: %arg7 -> %cst_0
// CHECK-NEXT: %arg8 -> %cst_1
// CHECK-NEXT: %0#0 -> %cst,%cst_0
// CHECK-NEXT: %0#1 -> %cst,%cst_0
// CHECK-NEXT: %0#2 -> %cst,%cst_0
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
return
}
// COM: // Enable the following test once we support view on shared memory tensors
// COM: // CHECK-LABEL: for_if
// COM: func @for_if(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
// COM: // CHECK: res0:0 -> 0
// COM: %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: // CHECK-NEXT: res1:0 -> 1
// COM: %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: // CHECK-NEXT: res2:0 -> 2
// COM: %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: // CHECK-NEXT: arg3:0 -> 0
// COM: // CHECK-NEXT: arg3:1 -> 1
// COM: // CHECK-NEXT: arg3:2 -> 2
// COM: // CHECK-NEXT: res3:0 -> 0,1
// COM: // CHECK-NEXT: res3:1 -> 0,1
// COM: // CHECK-NEXT: res3:2 -> 0,1
// COM: %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
// COM: scf.if %i1 {
// COM: // CHECK-NEXT: res5:0 -> 0,1
// COM: %cst0 = tt.view %a_shared : (tensor<128x32xf16, #A>) -> tensor<32x128xf16, #A>
// COM: scf.yield
// COM: }
// COM: scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
// COM: }
// COM: return
// COM: }
// COM: // Enable the following test once we support view on shared memory tensors
// COM: // CHECK-LABEL: for_if_else
// COM: func @for_if_else(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
// COM: // CHECK: res0:0 -> 0
// COM: %a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: // CHECK-NEXT: res1:0 -> 1
// COM: %b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: // CHECK-NEXT: res2:0 -> 2
// COM: %c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: // CHECK-NEXT: arg3:0 -> 0
// COM: // CHECK-NEXT: arg3:1 -> 1
// COM: // CHECK-NEXT: arg3:2 -> 2
// COM: // CHECK-NEXT: res3:0 -> 0
// COM: // CHECK-NEXT: res3:1 -> 1
// COM: // CHECK-NEXT: res3:2 -> 0,7
// COM: %a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
// COM: // CHECK-NEXT: res4:0 -> 0,7
// COM: %c_shared_next = scf.if %i1 -> tensor<128x32xf16, #A> {
// COM: // CHECK-NEXT: res5:0 -> 0
// COM: %cst0 = tt.view %a_shared : (tensor<128x32xf16, #A>) -> tensor<128x32xf16, #A>
// COM: scf.yield %cst0 : tensor<128x32xf16, #A>
// COM: } else {
// COM: // CHECK-NEXT: res7:0 -> 7
// COM: %cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// COM: scf.yield %cst0 : tensor<128x32xf16, #A>
// COM: }
// COM: scf.yield %a_shared, %b_shared, %c_shared_next : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
// COM: }
// COM: return
// COM: }
// CHECK-LABEL: for_if_for
// Nested control flow: the outer loop passes %a_shared and %b_shared through
// unchanged and replaces %c_shared with the result of an inner loop. The inner
// loop's body yields the result of an scf.if whose two branches each create a
// new shared constant.
// NOTE(review): the two branch constants both print as %cst_2, presumably
// because the sibling regions reuse the same SSA name - confirm against the
// printer's output.
func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
// CHECK: %cst -> %cst
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: %cst_0 -> %cst_0
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: %cst_1 -> %cst_1
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: %arg7 -> %cst
// CHECK-NEXT: %arg8 -> %cst_0
// CHECK-NEXT: %arg9 -> %cst_1
// CHECK-NEXT: %0#0 -> %cst
// CHECK-NEXT: %0#1 -> %cst_0
// CHECK-NEXT: %0#2 -> %cst_2,%cst_2
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
// CHECK-NEXT: %arg11 -> %cst_1,%cst_2,%cst_2
// CHECK-NEXT: %1 -> %cst_2,%cst_2
%c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (tensor<128x32xf16, #A>) {
// CHECK-NEXT: %2 -> %cst_2,%cst_2
%c_shared_next_next = scf.if %i1 -> tensor<128x32xf16, #A> {
// CHECK-NEXT: %cst_2 -> %cst_2
%cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
scf.yield %cst0 : tensor<128x32xf16, #A>
} else {
// CHECK-NEXT: %cst_2 -> %cst_2
%cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
scf.yield %cst0 : tensor<128x32xf16, #A>
}
scf.yield %c_shared_next_next : tensor<128x32xf16, #A>
}
scf.yield %a_shared, %b_shared, %c_shared_next : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
return
}

View File

@@ -151,21 +151,17 @@ func @longlive(%A : !tt.ptr<f16>) {
// CHECK-LABEL: scratch
func @scratch() {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1056, size = 1024
%a = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: scratch offset = 32, size = 1024
// CHECK-NEXT: offset = 0, size = 32
%b = tt.reduce %a {redOp = 1 : i32, axis = 0 : i32} : tensor<32x16xf16, #A> -> tensor<16xf16, #A>
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
// CHECK: scratch offset = 0, size = 512
%b = tt.reduce %cst0 {redOp = 1 : i32, axis = 0 : i32} : tensor<16x16xf16, #AL> -> tensor<16xf16, #AL>
return
// CHECK-NEXT: size = 2080
// CHECK-NEXT: size = 512
}
// B0 -> (B1) -> B0
// Memory used by B1 can be reused by B0.
// CHECK-LABEL: multi_blocks_reuse
func @multi_blocks_reuse(%i1 : i1) {
// CHECK-LABEL: if
func @if(%i1 : i1) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
@@ -188,8 +184,8 @@ func @multi_blocks_reuse(%i1 : i1) {
// B0 -> (B1) -> (B2) -> B0
// Memory used by B0 cannot be reused by B1 or B2.
// CHECK-LABEL: multi_blocks_noreuse
func @multi_blocks_noreuse(%i1 : i1) {
// CHECK-LABEL: if_else
func @if_else(%i1 : i1) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
@@ -212,3 +208,51 @@ func @multi_blocks_noreuse(%i1 : i1) {
return
// CHECK-NEXT: size = 3072
}
// Block arguments and yields are memory aliases that do not trigger a new
// allocation.
// CHECK-LABEL: for
// Three shared 128x32xf16 constants are expected at consecutive offsets
// 0 / 8192 / 16384, each of size 8192 (128 * 32 elements * 2 bytes per f16).
// The loop only permutes its iter_args and carries no offset expectation of
// its own; the expected total is 24576 = 3 * 8192.
func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 8192
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 8192, size = 8192
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 16384, size = 8192
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
return
// CHECK-NEXT: size = 24576
}
// The liveness ranges of a_shared_init, b_shared_init, and c_shared_init span the entire function before cst2.
// So their memory cannot be reused by cst0 and cst1, but can be reused by cst2.
// CHECK-LABEL: for_if_for
// Expected layout: the three init constants occupy offsets 0 / 8192 / 16384,
// the two constants created inside the scf.if branches get fresh offsets
// 24576 and 32768, and %cst2 (defined after the outer loop) is expected back
// at offset 0. The expected total is 40960 = 5 * 8192.
func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
// CHECK: offset = 0, size = 8192
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 8192, size = 8192
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 16384, size = 8192
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
%c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (tensor<128x32xf16, #A>) {
%c_shared_next_next = scf.if %i1 -> tensor<128x32xf16, #A> {
// CHECK-NEXT: offset = 24576, size = 8192
%cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
scf.yield %cst0 : tensor<128x32xf16, #A>
} else {
// CHECK-NEXT: offset = 32768, size = 8192
%cst1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
scf.yield %cst1 : tensor<128x32xf16, #A>
}
scf.yield %c_shared_next_next : tensor<128x32xf16, #A>
}
scf.yield %a_shared, %b_shared, %c_shared_next : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
// CHECK-NEXT: offset = 0, size = 8192
%cst2 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
return
// CHECK-NEXT: size = 40960
}

View File

@@ -176,3 +176,36 @@ func @multi_blocks_nested_scf(%i1 : i1, %i2 : i1) {
%a_ = triton_gpu.convert_layout %cst0 : (tensor<16x16xf16, #A>) -> tensor<16x16xf16, #AL>
return
}
// CHECK-LABEL: for
// Three shared constants are carried through a loop whose body concatenates
// the first two region iter args with tt.cat and then yields a permutation of
// the iter_args. The single expectation sits directly above that tt.cat.
// NOTE(review): the number after "Membar" is presumably an index identifying
// the op the barrier is reported for - confirm against TestMembar.cpp.
func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
// CHECK-NEXT: Membar 3
%cst0 = tt.cat %a_shared, %b_shared {axis = 0} : (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) -> tensor<256x32xf16, #A>
scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
return
}
// Although a_shared and b_shared are synced before entering the loop,
// they are reassociated with aliases (c_shared) and thus require a barrier.
// CHECK-LABEL: for_alias
// %cst0 reads %a_shared_init and %b_shared_init before the loop. Inside the
// loop, %cst1 reads the same two init values again (no expectation is placed
// above it), while %cst2 reads the region iter args, which the yield rotates
// (%c_shared, %a_shared, %b_shared). After the loop, %cst3 reads %cst0 twice.
// Expectations are placed above %cst0, %cst2 and %cst3.
// NOTE(review): the number after "Membar" is presumably an index identifying
// the op the barrier is reported for - confirm against TestMembar.cpp.
func @for_alias(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: Membar 2
%cst0 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) -> tensor<256x32xf16, #A>
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
%cst1 = tt.cat %a_shared_init, %b_shared_init {axis = 0} : (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) -> tensor<256x32xf16, #A>
// CHECK-NEXT: Membar 6
%cst2 = tt.cat %a_shared, %b_shared {axis = 0} : (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) -> tensor<256x32xf16, #A>
scf.yield %c_shared, %a_shared, %b_shared : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
// CHECK-NEXT: Membar 9
%cst3 = tt.cat %cst0, %cst0 {axis = 0} : (tensor<256x32xf16, #A>, tensor<256x32xf16, #A>) -> tensor<512x32xf16, #A>
return
}

View File

@@ -1,4 +1,5 @@
add_mlir_library(TritonTestAnalysis
TestAlias.cpp
TestAxisInfo.cpp
TestAllocation.cpp
TestMembar.cpp

View File

@@ -0,0 +1,92 @@
#include "mlir/IR/AsmState.h"
#include "mlir/Pass/Pass.h"
#include "triton/Analysis/Alias.h"
#include "triton/Analysis/Utility.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
using namespace mlir;
namespace {
/// Test-only function pass that runs SharedMemoryAliasAnalysis on a function
/// and dumps the result to stderr, one `<value> -> <alias>,<alias>,...` line
/// per value that has at least one recorded allocation. The lit tests match
/// this output with FileCheck.
struct TestAliasPass
    : public PassWrapper<TestAliasPass, OperationPass<FuncOp>> {

  // LLVM15+
  // MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestAliasPass);

  /// Writes `name -> v0,v1,...` followed by a newline to `os`.
  /// Writes nothing at all when `vals` is empty, so values without any
  /// recorded allocation never show up in the report.
  static void print(StringRef name, const SmallVector<std::string, 4> &vals,
                    raw_ostream &os) {
    if (vals.empty())
      return;
    os << name << " -> ";
    bool needsSeparator = false;
    // Iterate by const reference: the elements are std::strings and must not
    // be copied just to be streamed.
    for (const std::string &val : vals) {
      if (needsSeparator)
        os << ",";
      os << val;
      needsSeparator = true;
    }
    os << "\n";
  }

  StringRef getArgument() const final { return "test-print-alias"; }
  StringRef getDescription() const final {
    return "print the result of the alias analysis pass";
  }

  void runOnOperation() override {
    Operation *operation = getOperation();
    auto &os = llvm::errs();

    // Header line: the symbol name of the function being analyzed.
    os << SymbolTable::getSymbolName(operation).getValue() << "\n";

    SharedMemoryAliasAnalysis analysis(&getContext());
    analysis.run(operation);

    // Value names are resolved relative to the enclosing module.
    AsmState state(operation->getParentOfType<ModuleOp>());

    // Returns the sorted printed names of every allocation recorded for
    // `value`; empty when the analysis has no lattice element for it.
    auto getAllocOpNames = [&](Value value) {
      SmallVector<std::string, 4> opNames;
      if (LatticeElement<AliasInfo> *latticeElement =
              analysis.lookupLatticeElement(value)) {
        auto &info = latticeElement->getValue();
        for (Value alias : info.getAllocs()) {
          // Name the first result of the defining op, as before. A value
          // without a defining op (e.g. a block argument) is named directly
          // instead of dereferencing a null Operation*.
          Operation *allocOp = alias.getDefiningOp();
          Value named = allocOp ? Value(allocOp->getResult(0)) : alias;
          opNames.push_back(getValueOperandName(named, state));
        }
      }
      // Ensure deterministic output
      std::sort(opNames.begin(), opNames.end());
      return opNames;
    };

    operation->walk<WalkOrder::PreOrder>([&](Operation *op) {
      if (op->getNumResults() < 1)
        return;

      // For loops, report each region iter arg first. The aliases shown are
      // those of the operand that initializes the iter arg.
      if (auto forOp = dyn_cast<scf::ForOp>(op)) {
        for (auto arg : forOp.getRegionIterArgs()) {
          auto operand = forOp.getOpOperandForRegionIterArg(arg).get();
          auto opNames = getAllocOpNames(operand);
          auto argName = getValueOperandName(arg, state);
          print(argName, opNames, os);
        }
      }

      // Then report every result of the op itself.
      for (Value result : op->getResults()) {
        auto opNames = getAllocOpNames(result);
        auto resultName = getValueOperandName(result, state);
        print(resultName, opNames, os);
      }
    });
  }
};
} // namespace
namespace mlir {
namespace test {
// Registration hook for the alias test pass: constructs a
// PassRegistration<TestAliasPass>, which makes the pass selectable through
// the argument it reports from getArgument() ("test-print-alias").
void registerTestAliasPass() { PassRegistration<TestAliasPass>(); }
} // namespace test
} // namespace mlir