[OPTIMIZER] Fix Num in AsyncWaitOp generated by the pipeline pass (#72)
This commit is contained in:
@@ -337,8 +337,9 @@ scf::ForOp LoopPipeliner::createNewForOp() {
|
|||||||
for (Operation &op : forOp.getBody()->without_terminator()) {
|
for (Operation &op : forOp.getBody()->without_terminator()) {
|
||||||
if (!asyncWaitInserted && isDirectUserOfAsyncLoad(op)) {
|
if (!asyncWaitInserted && isDirectUserOfAsyncLoad(op)) {
|
||||||
asyncWaitInserted = true;
|
asyncWaitInserted = true;
|
||||||
|
assert(numStages >= 2);
|
||||||
builder.create<triton::gpu::AsyncWaitOp>(op.getLoc(),
|
builder.create<triton::gpu::AsyncWaitOp>(op.getLoc(),
|
||||||
loads.size() * (numStages - 1));
|
loads.size() * (numStages - 2));
|
||||||
}
|
}
|
||||||
Operation *newOp = builder.clone(op, mapping);
|
Operation *newOp = builder.clone(op, mapping);
|
||||||
// update mapping of results
|
// update mapping of results
|
||||||
|
@@ -14,7 +14,7 @@
|
|||||||
// CHECK: %[[A1:.*]] = triton_gpu.copy_async
|
// CHECK: %[[A1:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: %[[B1:.*]] = triton_gpu.copy_async
|
// CHECK: %[[B1:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
|
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
|
||||||
// CHECK: triton_gpu.async_wait {num = 4 : i32}
|
// CHECK: triton_gpu.async_wait {num = 2 : i32}
|
||||||
// CHECK: tt.dot %[[arg_a0]], %[[arg_b0]], {{.*}}
|
// CHECK: tt.dot %[[arg_a0]], %[[arg_b0]], {{.*}}
|
||||||
// CHECK: %[[NEXT_A:.*]] = triton_gpu.copy_async
|
// CHECK: %[[NEXT_A:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
|
// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
|
||||||
@@ -55,7 +55,7 @@ func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f16>, %B
|
|||||||
// CHECK: %[[A1:.*]] = triton_gpu.copy_async
|
// CHECK: %[[A1:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: %[[B1:.*]] = triton_gpu.copy_async
|
// CHECK: %[[B1:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
|
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, {{.*}}, %[[arg_a0:.*]] = %[[A0]], %[[arg_a1:.*]] = %[[A1]], %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
|
||||||
// CHECK: triton_gpu.async_wait {num = 4 : i32}
|
// CHECK: triton_gpu.async_wait {num = 2 : i32}
|
||||||
// CHECK: tt.dot %[[arg_a0]], %[[arg_b0]], {{.*}}
|
// CHECK: tt.dot %[[arg_a0]], %[[arg_b0]], {{.*}}
|
||||||
// CHECK: %[[NEXT_A:.*]] = triton_gpu.copy_async
|
// CHECK: %[[NEXT_A:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
|
// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
|
||||||
@@ -95,7 +95,7 @@ func @matmul_loop_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f
|
|||||||
// CHECK: %[[B0:.*]] = triton_gpu.copy_async
|
// CHECK: %[[B0:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: %[[B1:.*]] = triton_gpu.copy_async
|
// CHECK: %[[B1:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
|
// CHECK: scf.for {{.*}} iter_args({{.*}}, {{.*}}, %[[arg_b0:.*]] = %[[B0]], %[[arg_b1:.*]] = %[[B1]], {{.*}})
|
||||||
// CHECK: triton_gpu.async_wait {num = 2 : i32}
|
// CHECK: triton_gpu.async_wait {num = 1 : i32}
|
||||||
// CHECK: tt.dot {{.*}}, %[[arg_b0]], {{.*}}
|
// CHECK: tt.dot {{.*}}, %[[arg_b0]], {{.*}}
|
||||||
// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
|
// CHECK: %[[NEXT_B:.*]] = triton_gpu.copy_async
|
||||||
// CHECK: scf.yield {{.*}}, {{.*}}, %[[arg_b1]], %[[NEXT_B]]
|
// CHECK: scf.yield {{.*}}, {{.*}}, %[[arg_b1]], %[[NEXT_B]]
|
||||||
|
Reference in New Issue
Block a user