[OPTIMIZER] Better pipeline pass (#100)

* Use `insert_slice_async` instead of `CopyAsync`
* Move `async_wait` to the loop header (see the CUDA sketch below)

Co-authored-by: Jokeren <kerenzhou@openai.com>
Author: Da Yan
Date:   2022-09-06 23:31:13 +08:00
Commit: 35e346bcff (parent a0bab9748e, committed via GitHub)
2 changed files with 141 additions and 54 deletions
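
What the rewrite does: instead of tagging each prefetched tile with a standalone `triton_gpu.copy_async`, the pipeliner now materializes an explicit staging buffer. `alloc_tensor` creates a multi-stage buffer, `insert_slice_async` launches an asynchronous copy of one tile into one stage, `async_wait {num = N}` blocks until at most N of those copies are still in flight, and `extract_slice` reads a finished stage back out. On Ampere this is the natural fit for the `cp.async` instruction family; the helpers below are a minimal CUDA sketch of that mapping (my reading of the ops, not code from this PR; compile with `-arch=sm_80` or newer):

```cuda
#include <cstdint>

// insert_slice_async analogue: copy 16 bytes global -> shared without
// blocking the issuing thread (one vector element per call).
__device__ inline void cp_async_16B(void* smem_dst, const void* gmem_src) {
    uint32_t dst = static_cast<uint32_t>(__cvta_generic_to_shared(smem_dst));
    asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
                 :: "r"(dst), "l"(gmem_src));
}

// Close the cp.async ops issued so far into one commit group.
__device__ inline void cp_async_commit() {
    asm volatile("cp.async.commit_group;");
}

// async_wait {num = N} analogue: block until at most N commit groups
// are still in flight.
template <int N>
__device__ inline void cp_async_wait() {
    asm volatile("cp.async.wait_group %0;" :: "n"(N));
}
```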

@@ -9,15 +9,24 @@
#C = #triton_gpu.mma<{version = 2, warpsPerCTA = [4, 1]}>
// CHECK: func @matmul_loop
-// CHECK: %[[A0:.*]] = triton_gpu.copy_async
-// CHECK: %[[B0:.*]] = triton_gpu.copy_async
-// CHECK: %[[A1:.*]] = triton_gpu.copy_async
-// CHECK: %[[B1:.*]] = triton_gpu.copy_async
-// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
+// CHECK: %[[ABUFFER:.*]] = triton_gpu.alloc_tensor
+// CHECK: %[[A0BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[BBUFFER:.*]] = triton_gpu.alloc_tensor
+// CHECK: %[[B0BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[A1BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[B1BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: triton_gpu.async_wait {num = 2 : i32}
+// CHECK: %[[A0:.*]] = triton_gpu.extract_slice %[[A0BUFFER]]
+// CHECK: %[[B0:.*]] = triton_gpu.extract_slice %[[B0BUFFER]]
+// CHECK: %[[A1:.*]] = triton_gpu.extract_slice %[[A1BUFFER]]
+// CHECK: %[[B1:.*]] = triton_gpu.extract_slice %[[B1BUFFER]]
+// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
// CHECK: tt.dot %[[arg_a0]], %[[arg_b0]], {{.*}}
-// CHECK: %[[NEXT_A:.*]] = triton_gpu.copy_async
-// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
+// CHECK: %[[NEXT_A_BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[NEXT_B_BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: triton_gpu.async_wait {num = 2 : i32}
+// CHECK: %[[NEXT_A:.*]] = triton_gpu.extract_slice %[[NEXT_A_BUFFER]]
+// CHECK: %[[NEXT_B:.*]] = triton_gpu.extract_slice %[[NEXT_B_BUFFER]]
// CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, %[[arg_a1]], %[[NEXT_A]], %[[arg_b1]], %[[NEXT_B]]
func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
%a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
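
The CHECK sequence above is a textbook double-buffered mainloop: four `insert_slice_async` copies in the prologue (A0, B0, A1, B1), a wait that lets the two stage-1 copies stay in flight, then two refills plus a wait per iteration. A hedged CUDA analogue, reusing the helpers sketched earlier; the tile size, indexing, and dot-product stand-in are placeholders, and A/B are assumed to hold `iters + STAGES` tiles so the prefetch never runs off the end:

```cuda
constexpr int STAGES = 2;   // double buffering, as in the test
constexpr int TILE   = 32;  // float4 elements per tile (placeholder)

__global__ void pipelined_loop(const float4* A, const float4* B,
                               float* out, int iters) {
    __shared__ float4 a_buf[STAGES][TILE];  // alloc_tensor %ABUFFER
    __shared__ float4 b_buf[STAGES][TILE];  // alloc_tensor %BBUFFER
    const int t = threadIdx.x;              // assumes blockDim.x == TILE
    float acc = 0.f;

    // Prologue: A0, B0, A1, B1 -- four commit groups, one per
    // insert_slice_async in the CHECK lines above.
    for (int s = 0; s < STAGES; ++s) {
        cp_async_16B(&a_buf[s][t], &A[s * TILE + t]); cp_async_commit();
        cp_async_16B(&b_buf[s][t], &B[s * TILE + t]); cp_async_commit();
    }
    cp_async_wait<2>();  // async_wait {num = 2}: A0/B0 have landed
    __syncthreads();

    for (int i = 0; i < iters; ++i) {
        const int slot = i % STAGES;  // extract_slice: read this stage
        acc += a_buf[slot][t].x * b_buf[slot][t].x;  // tt.dot stand-in
        __syncthreads();              // everyone is done reading `slot`

        // NEXT_A / NEXT_B: refill the slot just consumed with the tile
        // STAGES iterations ahead (two fresh commit groups).
        cp_async_16B(&a_buf[slot][t], &A[(i + STAGES) * TILE + t]);
        cp_async_commit();
        cp_async_16B(&b_buf[slot][t], &B[(i + STAGES) * TILE + t]);
        cp_async_commit();
        cp_async_wait<2>();  // the two oldest groups (next stage) landed
        __syncthreads();
    }
    if (t == 0) out[blockIdx.x] = acc;  // placeholder epilogue
}
```

Here `num` is exactly the number of copies allowed to remain in flight, i.e. the copies for the stage that is not needed yet: two in this test (one per operand), one in the single-operand test further down.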
@@ -50,15 +59,24 @@ func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
// CHECK: func @matmul_loop_nested
// CHECK: scf.for
-// CHECK: %[[A0:.*]] = triton_gpu.copy_async
-// CHECK: %[[B0:.*]] = triton_gpu.copy_async
-// CHECK: %[[A1:.*]] = triton_gpu.copy_async
-// CHECK: %[[B1:.*]] = triton_gpu.copy_async
-// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
+// CHECK: %[[ABUFFER:.*]] = triton_gpu.alloc_tensor
+// CHECK: %[[A0BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[BBUFFER:.*]] = triton_gpu.alloc_tensor
+// CHECK: %[[B0BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[A1BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[B1BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: triton_gpu.async_wait {num = 2 : i32}
+// CHECK: %[[A0:.*]] = triton_gpu.extract_slice %[[A0BUFFER]]
+// CHECK: %[[B0:.*]] = triton_gpu.extract_slice %[[B0BUFFER]]
+// CHECK: %[[A1:.*]] = triton_gpu.extract_slice %[[A1BUFFER]]
+// CHECK: %[[B1:.*]] = triton_gpu.extract_slice %[[B1BUFFER]]
+// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
// CHECK: tt.dot %[[arg_a0]], %[[arg_b0]], {{.*}}
-// CHECK: %[[NEXT_A:.*]] = triton_gpu.copy_async
-// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
+// CHECK: %[[NEXT_A_BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[NEXT_B_BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: triton_gpu.async_wait {num = 2 : i32}
+// CHECK: %[[NEXT_A:.*]] = triton_gpu.extract_slice %[[NEXT_A_BUFFER]]
+// CHECK: %[[NEXT_B:.*]] = triton_gpu.extract_slice %[[NEXT_B_BUFFER]]
// CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, %[[arg_a1]], %[[NEXT_A]], %[[arg_b1]], %[[NEXT_B]]
func @matmul_loop_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
scf.for %iv0 = %lb to %ub step %step {
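
`matmul_loop_nested` pins down where the prologue lands relative to the outer loop: the CHECK lines place `alloc_tensor` and the four-copy prologue after the outer `scf.for`, so the staging buffers are rebuilt on every outer iteration rather than hoisted above it. Schematically, with `prologue()` and `pipelined_body()` as hypothetical stand-ins for the corresponding pieces of the kernel sketched above:

```cuda
// Nested variant: the whole pipeline, prologue included, is re-emitted
// inside the outer loop (lb/ub/step/iters are placeholder bounds).
for (int iv0 = lb; iv0 < ub; iv0 += step) {  // outer scf.for
    prologue();                    // 4 cp.async groups + cp_async_wait<2>()
    for (int i = 0; i < iters; ++i)
        pipelined_body(i);         // dot + 2 refills + cp_async_wait<2>()
}
```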
@@ -92,12 +110,17 @@ func @matmul_loop_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f
// CHECK: func @matmul_loop_single_pipeline
-// CHECK: %[[B0:.*]] = triton_gpu.copy_async
-// CHECK: %[[B1:.*]] = triton_gpu.copy_async
+// CHECK: %[[BBUFFER:.*]] = triton_gpu.alloc_tensor
+// CHECK: %[[B0BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: %[[B1BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: triton_gpu.async_wait {num = 1 : i32}
+// CHECK: %[[B0:.*]] = triton_gpu.extract_slice %[[B0BUFFER]]
+// CHECK: %[[B1:.*]] = triton_gpu.extract_slice %[[B1BUFFER]]
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
+// CHECK: triton_gpu.async_wait {num = 1 : i32}
// CHECK: tt.dot {{.*}}, %[[arg_b0]], {{.*}}
-// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
+// CHECK: %[[NEXT_B_BUFFER:.*]] = triton_gpu.insert_slice_async
+// CHECK: triton_gpu.async_wait {num = 1 : i32}
+// CHECK: %[[NEXT_B:.*]] = triton_gpu.extract_slice %[[NEXT_B_BUFFER]]
// CHECK: scf.yield {{.*}}, {{.*}}, %[[arg_b1]], %[[NEXT_B]]
func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B : !tt.ptr<f16>) {
%a_ptr_init = tt.broadcast %A : (!tt.ptr<f16>) -> tensor<128x32x!tt.ptr<f16>, #AL>
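
`matmul_loop_single_pipeline` covers the case where only the B operand is loop-carried through shared memory, so the prologue commits two groups (B0, B1) and waits with `num = 1`, leaving just the B1 copy in flight. Note the extra `async_wait` the CHECKs expect right after the `scf.for` header, which appears to be the "move `async_wait` to loop header" item from the commit message. The prologue, as a fragment under the same assumptions and helpers as the kernel above:

```cuda
__shared__ float4 b_buf[STAGES][TILE];  // alloc_tensor %BBUFFER only
for (int s = 0; s < STAGES; ++s) {      // B0, B1: two commit groups
    cp_async_16B(&b_buf[s][t], &B[s * TILE + t]);
    cp_async_commit();
}
cp_async_wait<1>();  // async_wait {num = 1}: B0 landed, B1 still in flight
__syncthreads();
```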