triton/test/Analysis/test-allocation.mlir
Keren Zhou 02ebf24d35 Analyze shared memory alias (#81)
The purpose of this PR is to analyze shared memory aliases so that we can
fix memory allocation bugs and avoid redundant memory allocations in Triton
code involving complex control flow.

Changes to the memory barrier and allocation analyses are on the way.

Co-authored-by: Philippe Tillet <phil@openai.com>
2022-08-29 10:43:20 -07:00

// RUN: triton-opt %s -split-input-file --mlir-disable-threading -test-print-allocation 2>&1 | FileCheck %s
#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#A = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
#B = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
#C = #triton_gpu.mma<{version = 2, warpsPerCTA = [4, 1]}>
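// Buffer sizes below follow from element count times element width: a 128x32
// (or 32x128) f16 tensor takes 128 * 32 * 2 = 8192 bytes of shared memory.
// In matmul_loop, both converted operands of tt.dot are live at the same time,
// so the peak usage is 8192 + 8192 = 16384 bytes.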
// CHECK-LABEL: matmul_loop
func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
%a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
%b_ptr_init = tt.broadcast %B : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #BL>
%a_mask = arith.constant dense<true> : tensor<128x32xi1, #AL>
%a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL>
%b_mask = arith.constant dense<true> : tensor<32x128xi1, #BL>
%b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL>
%c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C>
%a_off = arith.constant dense<4> : tensor<128x32xi32, #AL>
%b_off = arith.constant dense<4> : tensor<32x128xi32, #BL>
scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr<f16>, #AL>, tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>) {
%a_ = tt.load %a_ptr, %a_mask, %a_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL>
// CHECK: offset = 0, size = 8192
%a = triton_gpu.convert_layout %a_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A>
%b_ = tt.load %b_ptr, %b_mask, %b_other {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #BL>
// CHECK-NEXT: offset = 8192, size = 8192
%b = triton_gpu.convert_layout %b_ : (tensor<32x128xf16, #BL>) -> tensor<32x128xf16, #B>
%c = tt.dot %a, %b, %prev_c {allowTF32 = true} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C>
%next_a_ptr = tt.getelementptr %a_ptr, %a_off : tensor<128x32x!tt.ptr<f16>, #AL>
%next_b_ptr = tt.getelementptr %b_ptr, %b_off : tensor<32x128x!tt.ptr<f16>, #BL>
scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr<f16>, #AL>, tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>
}
return
// CHECK-NEXT: size = 16384
}
// Shared memory is available after a tensor's liveness range ends
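// At the first tt.dot, %a1, %a2, and %a3 are all live, giving the peak of
// 3 * 8192 = 24576 bytes. After that dot, %a1 and %a2 are dead, so %a4 can
// take over the region at offset 0.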
// CHECK-LABEL: reusable
func @reusable(%A : !tt.ptr<f16>) {
%cst1 = arith.constant dense<true> : tensor<128x32xi1, #AL>
%cst2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #AL>
%cst3 = arith.constant dense<true> : tensor<32x128xi1, #AL>
%cst4 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #AL>
%c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C>
%a_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
%b_ptr = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<32x128x!tt.ptr<f16>, #AL>
%a1_ = tt.load %a_ptr, %cst1, %cst2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL>
// CHECK: offset = 0, size = 8192
%a1 = triton_gpu.convert_layout %a1_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A>
%a2_ = tt.load %b_ptr, %cst3, %cst4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #AL>
// CHECK-NEXT: offset = 8192, size = 8192
%a2 = triton_gpu.convert_layout %a2_ : (tensor<32x128xf16, #AL>) -> tensor<32x128xf16, #A>
%a3_ = tt.load %a_ptr, %cst1, %cst2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x32xf16, #AL>
// CHECK-NEXT: offset = 16384, size = 8192
%a3 = triton_gpu.convert_layout %a3_ : (tensor<128x32xf16, #AL>) -> tensor<128x32xf16, #A>
%c = tt.dot %a1, %a2, %c_init {allowTF32 = true} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C>
%a4_ = tt.load %b_ptr, %cst3, %cst4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x128xf16, #AL>
// CHECK-NEXT: offset = 0, size = 8192
%a4 = triton_gpu.convert_layout %a4_ : (tensor<32x128xf16, #AL>) -> tensor<32x128xf16, #A>
%c1 = tt.dot %a3, %a4, %c {allowTF32 = true} : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C>
return
// CHECK-NEXT: size = 24576
}
// A tensor may be assigned a shared memory offset larger than it alone
// requires so that tensors allocated later can be accommodated. Reuse chains:
// %cst0->%c
// %cst1->%cst4
// %cst3->%g->%h->%i
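// For example, %c (1024 bytes) goes back to offset 0 once %a and %b have
// consumed %cst0, and %g, %h, and %i (4096 bytes each) all land at offset
// 2048 because each is dead before the next one is built. The peak of
// 12288 bytes is reached at %f (offset 10240 + 2048).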
// CHECK-LABEL: preallocate
func @preallocate(%A : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 512
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 512
%cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 2048, size = 1024
%a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 3072, size = 1024
%b = tt.cat %cst0, %cst2 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 0, size = 1024
%c = tt.cat %cst1, %cst2 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 1024
%cst4 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 6144, size = 2048
%e = tt.cat %a, %cst4 {axis = 0} : (tensor<32x16xf16, #A>, tensor<32x16xf16, #A>) -> tensor<64x16xf16, #A>
// CHECK-NEXT: offset = 8192, size = 2048
%d = tt.cat %b, %cst4 {axis = 0} : (tensor<32x16xf16, #A>, tensor<32x16xf16, #A>) -> tensor<64x16xf16, #A>
// CHECK-NEXT: offset = 10240, size = 2048
%f = tt.cat %c, %cst4 {axis = 0} : (tensor<32x16xf16, #A>, tensor<32x16xf16, #A>) -> tensor<64x16xf16, #A>
// CHECK-NEXT: offset = 0, size = 2048
%cst5 = arith.constant dense<0.000000e+00> : tensor<64x16xf16, #A>
// CHECK-NEXT: offset = 2048, size = 4096
%g = tt.cat %e, %cst5 {axis = 0} : (tensor<64x16xf16, #A>, tensor<64x16xf16, #A>) -> tensor<128x16xf16, #A>
// CHECK-NEXT: offset = 2048, size = 4096
%h = tt.cat %d, %cst5 {axis = 0} : (tensor<64x16xf16, #A>, tensor<64x16xf16, #A>) -> tensor<128x16xf16, #A>
// CHECK-NEXT: offset = 2048, size = 4096
%i = tt.cat %f, %cst5 {axis = 0} : (tensor<64x16xf16, #A>, tensor<64x16xf16, #A>) -> tensor<128x16xf16, #A>
return
// CHECK-NEXT: size = 12288
}
// Unused tensors are immediately released
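// %cst0 (1024 bytes) has no users, so its region at offset 0 is handed to
// %cst1 right away; the peak of 2048 bytes comes from %cst1, %cst2, and %a
// (512 + 512 + 1024).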
// CHECK-LABEL: unused
func @unused(%A : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 1024
%cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 0, size = 512
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
%cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 1024
%a = tt.cat %cst1, %cst2 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
return
// CHECK: size = 2048
}
// cst0 is live through the entire function, so it cannot be released before the function returns
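// The peak of 2560 bytes is reached at the first tt.cat, when %cst0, %cst1,
// %cst2, and the 1024-byte result are live together (512 * 3 + 1024). The
// region at offset 0 stays reserved for %cst0 until the function returns.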
// CHECK-LABEL: longlive
func @longlive(%A : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 512
%cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 1024
%a = tt.cat %cst1, %cst2 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
%cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 512
%cst4 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 1024
%b = tt.cat %cst3, %cst4 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 512
%cst5 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 512
%cst6 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 1024
%c = tt.cat %cst3, %cst4 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 1024
%d = tt.cat %cst0, %cst0 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
return
// CHECK-NEXT: size = 2560
}
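// tt.reduce produces no shared-memory tensor, but the op itself is assigned a
// scratch buffer (presumably for exchanging partial results between threads);
// its 512 bytes match the 16x16 f16 input (16 * 16 * 2).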
// CHECK-LABEL: scratch
func @scratch() {
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #AL>
// CHECK: scratch offset = 0, size = 512
%b = tt.reduce %cst0 {redOp = 1 : i32, axis = 0 : i32} : tensor<16x16xf16, #AL> -> tensor<16xf16, #AL>
return
// CHECK-NEXT: size = 512
}
// B0 -> (B1) -> B0
// Memory used by B1 can be reused by B0.
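// %cst0 and %cst1 are last used inside the then-block, so after the scf.if
// their regions at offsets 0 and 512 are free for %cst2 and %cst3, and the
// branch's 1024-byte region at offset 1024 is free for the final tt.cat.
// Peak usage is 2048 bytes.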
// CHECK-LABEL: if
func @if(%i1 : i1) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
scf.if %i1 {
// CHECK-NEXT: offset = 1024, size = 1024
%a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 1024
%b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
}
// CHECK-NEXT: offset = 0, size = 512
%cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
%cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 1024
%a = tt.cat %cst2, %cst3 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
return
// CHECK-NEXT: size = 2048
}
// B0 -> (B1) -> (B2) -> B0
// Memory used by B0 cannot be reused by B1 or B2.
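// %cst0 and %cst1 are still needed by the tt.cat after the scf.if, so the
// else-branch buffers must be placed above them; with %cst2, %cst3, and the
// 1024-byte tt.cat result live at once, the peak is 512 * 4 + 1024 = 3072 bytes.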
// CHECK-LABEL: if_else
func @if_else(%i1 : i1) {
// CHECK: offset = 0, size = 512
%cst0 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 512, size = 512
%cst1 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
scf.if %i1 {
// CHECK-NEXT: offset = 1024, size = 1024
%a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
// CHECK-NEXT: offset = 1024, size = 1024
%b = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
} else {
// CHECK-NEXT: offset = 1024, size = 512
%cst2 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 1536, size = 512
%cst3 = arith.constant dense<0.000000e+00> : tensor<16x16xf16, #A>
// CHECK-NEXT: offset = 2048, size = 1024
%a = tt.cat %cst2, %cst3 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
}
// CHECK-NEXT: offset = 1024, size = 1024
%a = tt.cat %cst0, %cst1 {axis = 0} : (tensor<16x16xf16, #A>, tensor<16x16xf16, #A>) -> tensor<32x16xf16, #A>
return
// CHECK-NEXT: size = 3072
}
// Block arguments and yields are memory aliases that do not trigger a new
// allocation.
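// The loop-carried values %a_shared, %b_shared, and %c_shared alias the three
// buffers allocated before the loop, so no additional shared memory is needed
// inside it; the total stays at 3 * 8192 = 24576 bytes.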
// CHECK-LABEL: for
func @for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
// CHECK: offset = 0, size = 8192
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 8192, size = 8192
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 16384, size = 8192
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
scf.yield %b_shared, %a_shared, %a_shared : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
return
// CHECK-NEXT: size = 24576
}
// The liveness ranges of a_shared_init, b_shared_init, and c_shared_init span
// the entire function up to cst2, so their buffers cannot be reused by cst0
// and cst1 but can be reused by cst2.
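// Inside the loop, %cst0 and %cst1 each need a fresh 8192-byte region above
// the three loop-carried buffers, giving the peak of 5 * 8192 = 40960 bytes.
// After the loop the results are no longer used, so %cst2 reuses offset 0.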
// CHECK-LABEL: for_if_for
func @for_if_for(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>, %i1 : i1) {
// CHECK: offset = 0, size = 8192
%a_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 8192, size = 8192
%b_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
// CHECK-NEXT: offset = 16384, size = 8192
%c_shared_init = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
%a_shared, %b_shared, %c_shared = scf.for %iv = %lb to %ub step %step iter_args(%a_shared = %a_shared_init, %b_shared = %b_shared_init, %c_shared = %c_shared_init) -> (tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>) {
%c_shared_next = scf.for %jv = %lb to %ub step %step iter_args(%c_shared_next = %c_shared) -> (tensor<128x32xf16, #A>) {
%c_shared_next_next = scf.if %i1 -> tensor<128x32xf16, #A> {
// CHECK-NEXT: offset = 24576, size = 8192
%cst0 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
scf.yield %cst0 : tensor<128x32xf16, #A>
} else {
// CHECK-NEXT: offset = 32768, size = 8192
%cst1 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
scf.yield %cst1 : tensor<128x32xf16, #A>
}
scf.yield %c_shared_next_next : tensor<128x32xf16, #A>
}
scf.yield %a_shared, %b_shared, %c_shared_next : tensor<128x32xf16, #A>, tensor<128x32xf16, #A>, tensor<128x32xf16, #A>
}
// CHECK-NEXT: offset = 0, size = 8192
%cst2 = arith.constant dense<0.00e+00> : tensor<128x32xf16, #A>
return
// CHECK-NEXT: size = 40960
}