[Triton-MLIR][Backend] Remove unnecessary barriers (#901)
Cross-operation barriers are handled by the Membar pass. Explicit barriers are only required when synchronization is needed within a single operation.
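
For context, here is a minimal, self-contained sketch of the cross-operation rule the pass applies. The RegionInfo fields mirror the syncReadBuffers/syncWriteBuffers sets touched in the first hunk below; isConflicted is a hypothetical helper, not the pass's real API.

    // Sketch only: a barrier is needed between two operations when one
    // writes a shared memory buffer that the other reads or writes.
    #include <cassert>
    #include <set>

    struct RegionInfo {
      std::set<int> syncReadBuffers;   // buffers read since the last barrier
      std::set<int> syncWriteBuffers;  // buffers written since the last barrier
    };

    // True if an access to bufferId conflicts with pending accesses, i.e.
    // the pass would have to insert a barrier before it.
    bool isConflicted(const RegionInfo &pending, int bufferId, bool isWrite) {
      if (pending.syncWriteBuffers.count(bufferId))
        return true;                                        // RAW / WAW hazard
      return isWrite && pending.syncReadBuffers.count(bufferId);  // WAR hazard
    }

    int main() {
      RegionInfo info;
      info.syncWriteBuffers.insert(7);       // op A wrote buffer 7
      assert(isConflicted(info, 7, false));  // op B reads it -> barrier
      assert(!isConflicted(info, 8, true));  // disjoint buffer -> no barrier
    }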
@@ -86,9 +86,10 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
       curRegionInfo.syncWriteBuffers.insert(bufferId);
     }
   }
-  // Scratch buffer is considered as a shared memory read
+  // Scratch buffer is considered as both shared memory write & read
   auto bufferId = allocation->getBufferId(op);
   if (bufferId != Allocation::InvalidBufferId) {
+    curRegionInfo.syncWriteBuffers.insert(bufferId);
     curRegionInfo.syncReadBuffers.insert(bufferId);
   }
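
The hunk above now registers an op's scratch buffer as a shared memory write as well as a read. A hedged illustration of the hazard this closes (hypothetical buffer id; plain sets stand in for the RegionInfo fields): with the scratch buffer only in the read set, a later op reading the reused allocation saw no conflict even though this op wrote it.

    #include <cassert>
    #include <set>

    int main() {
      std::set<int> syncRead, syncWrite;
      const int scratchId = 3;
      syncRead.insert(scratchId);           // old behavior: read only
      assert(!syncWrite.count(scratchId));  // RAW hazard goes undetected
      syncWrite.insert(scratchId);          // the fix: also a write
      assert(syncWrite.count(scratchId));   // later read -> barrier inserted
    }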
@@ -3077,7 +3077,8 @@ LogicalResult ConvertLayoutOpConversion::lowerDistributedToDistributed(
   for (unsigned repId = 0; repId < accumNumReplicates; ++repId) {
     auto multiDimRepId =
         getMultiDimIndex<unsigned>(repId, numReplicates, outOrd);
-    barrier();
+    if (repId != 0)
+      barrier();
     if (srcLayout.isa<BlockedEncodingAttr>() ||
         srcLayout.isa<SliceEncodingAttr>() ||
         srcLayout.isa<MmaEncodingAttr>()) {
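
Since the Membar pass already synchronizes before the op, the shared buffer is safe to write on the first replication; the hunk above therefore emits a barrier only from the second iteration on. A schematic trace of the resulting pattern, where barrier() and the store/load lines stand in for the emitted nvvm.barrier0 and the real lowering:

    #include <cstdio>

    void barrier() { std::puts("  barrier"); }  // stand-in for nvvm.barrier0

    int main() {
      const unsigned accumNumReplicates = 3;
      for (unsigned repId = 0; repId < accumNumReplicates; ++repId) {
        if (repId != 0)
          barrier();  // wait for all reads of the previous replication
        std::printf("  store rep %u to shared memory\n", repId);
        barrier();    // make stores visible before cross-thread loads
        std::printf("  load rep %u in the destination layout\n", repId);
      }
    }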
@@ -3169,11 +3170,6 @@ LogicalResult ConvertLayoutOpConversion::lowerBlockedToShared(
   auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter);
   auto numWordsEachRep = product<unsigned>(wordsInEachRep);
   SmallVector<Value> wordVecs(numWordsEachRep);
-  // TODO: We should get less barriers if it is handled by membar pass
-  // instead of the backend, since the later can only handle it in
-  // the most conservative way. However just keep for now and revisit
-  // in the future in case necessary.
-  barrier();
   for (unsigned i = 0; i < numElems; ++i) {
     if (i % srcAccumSizeInThreads == 0) {
       // start of a replication
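
The removed TODO already stated the trade-off: an explicit backend barrier fires unconditionally, while the Membar pass can elide it when the buffers involved never conflict. A hedged sketch of that difference, with interval overlap as a stand-in for the pass's actual allocation analysis (Interval and overlaps are illustrative, not Triton's API):

    #include <cstdio>

    struct Interval { int start, end; };  // byte range in shared memory

    bool overlaps(Interval a, Interval b) {
      return a.start < b.end && b.start < a.end;
    }

    int main() {
      Interval prevAccess{0, 128}, thisWrite{256, 384};  // disjoint buffers
      int backendBarriers = 1;  // unconditional barrier() in the old lowering
      int membarBarriers = overlaps(prevAccess, thisWrite) ? 1 : 0;
      std::printf("backend: %d, membar: %d\n", backendBarriers, membarBarriers);
    }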
@@ -577,7 +577,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked
   func @convert_layout_blocked_blocked(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<1xf32>, 3>
     // CHECK: llvm.store
@@ -625,7 +624,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked_vec
   func @convert_layout_blocked_blocked_vec(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
     // CHECK: llvm.store
@@ -649,7 +647,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked_multi_rep
   func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
     // CHECK: nvvm.barrier0
@@ -717,7 +714,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
   // CHECK-LABEL: convert_layout_mma_block
   func @convert_layout_mma_blocked(%arg0: tensor<32x16xf32, #mma>) {
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<2xf32>, 3>
     // CHECK: llvm.store