From 2afebcd79ba8423a5385f811e9d77a1ac772a3d8 Mon Sep 17 00:00:00 2001
From: Keren Zhou
Date: Tue, 22 Nov 2022 10:03:29 -0800
Subject: [PATCH] [Triton-MLIR][Backend] Remove unnecessary barriers (#901)

Cross-operation barriers are taken care of by the Membar pass. Explicit
barriers are only required if there is any synchronization necessary
within each operation.
---
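The guard added in lowerDistributedToDistributed below leans on the Membar
pass for the first iteration: Membar already inserts a barrier between this op
and any earlier op that touched the same shared-memory buffer, so only the
iterations that reuse the scratch buffer need to wait for the previous
replica's loads. A minimal standalone sketch of the resulting barrier
schedule; barrier() here models nvvm.barrier0, and storeReplica/loadReplica
are hypothetical stand-ins for the per-replica store/load phases, not the real
Triton helpers:

    #include <cstdio>

    // Hypothetical stand-ins for the per-replica store/load phases that the
    // lowering emits; not the real Triton APIs.
    static void barrier() { std::puts("barrier"); } // models nvvm.barrier0
    static void storeReplica(unsigned repId) { std::printf("store %u\n", repId); }
    static void loadReplica(unsigned repId) { std::printf("load %u\n", repId); }

    int main() {
      unsigned accumNumReplicates = 3;
      for (unsigned repId = 0; repId < accumNumReplicates; ++repId) {
        // repId == 0 needs no leading barrier: the Membar pass already
        // separates this op from earlier users of the same shared buffer.
        // repId > 0 must wait for the previous replica's loads to finish
        // before overwriting the shared scratch buffer.
        if (repId != 0)
          barrier();
        storeReplica(repId); // threads write fragments to shared memory
        barrier();           // stores must be visible before cross-thread loads
        loadReplica(repId);  // threads read fragments back in the target layout
      }
      return 0;
    }

For a single replica this emits exactly one barrier, between the stores and
the loads; the cross-operation barrier that used to precede the loop is now
the Membar pass's responsibility.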
 lib/Analysis/Membar.cpp                            |  3 ++-
 lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp |  8 ++------
 test/Conversion/tritongpu_to_llvm.mlir             |  4 ----
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
index eab1636e5..2822b7ace 100644
--- a/lib/Analysis/Membar.cpp
+++ b/lib/Analysis/Membar.cpp
@@ -86,9 +86,10 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
       curRegionInfo.syncWriteBuffers.insert(bufferId);
     }
   }
-  // Scratch buffer is considered as a shared memory read
+  // Scratch buffer is considered as both shared memory write & read
   auto bufferId = allocation->getBufferId(op);
   if (bufferId != Allocation::InvalidBufferId) {
+    curRegionInfo.syncWriteBuffers.insert(bufferId);
     curRegionInfo.syncReadBuffers.insert(bufferId);
   }

diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
index dbc057c85..a14691c6c 100644
--- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
@@ -3077,7 +3077,8 @@ LogicalResult ConvertLayoutOpConversion::lowerDistributedToDistributed(
   for (unsigned repId = 0; repId < accumNumReplicates; ++repId) {
     auto multiDimRepId = getMultiDimIndex<unsigned>(repId, numReplicates, outOrd);
-    barrier();
+    if (repId != 0)
+      barrier();
     if (srcLayout.isa<BlockedEncodingAttr>() ||
         srcLayout.isa<MmaEncodingAttr>() ||
         srcLayout.isa<SliceEncodingAttr>()) {
@@ -3169,11 +3170,6 @@ LogicalResult ConvertLayoutOpConversion::lowerBlockedToShared(
   auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter);
   auto numWordsEachRep = product(wordsInEachRep);
   SmallVector<Value> wordVecs(numWordsEachRep);
-  // TODO: We should get less barriers if it is handled by membar pass
-  //       instead of the backend, since the later can only handle it in
-  //       the most conservative way. However just keep for now and revisit
-  //       in the future in case necessary.
-  barrier();
   for (unsigned i = 0; i < numElems; ++i) {
     if (i % srcAccumSizeInThreads == 0) { // start of a replication
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
index da5b908cc..a7fb4551a 100644
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -577,7 +577,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked
   func @convert_layout_blocked_blocked(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<1xf32>, 3>
     // CHECK: llvm.store
@@ -625,7 +624,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked_vec
   func @convert_layout_blocked_blocked_vec(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
     // CHECK: llvm.store
@@ -649,7 +647,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked_multi_rep
   func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
     // CHECK: nvvm.barrier0
@@ -717,7 +714,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
   // CHECK-LABEL: convert_layout_mma_block
   func @convert_layout_mma_blocked(%arg0: tensor<32x16xf32, #mma>) {
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<1xf32>, 3>
     // CHECK: llvm.store
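
The Membar.cpp hunk is what makes dropping the ops' own leading barriers safe:
an op's scratch buffer is now recorded as both a shared-memory write and a
read, so the analysis conservatively inserts a barrier whenever the scratch
buffer may conflict with a pending access. A toy, self-contained model of that
bookkeeping, assuming schematic BufferId/RegionInfo/isIntersected stand-ins
rather than the real Triton types:

    #include <iostream>
    #include <set>

    // Toy model of the Membar bookkeeping changed above; these are schematic
    // stand-ins, not the real Triton data structures.
    using BufferId = int;

    struct RegionInfo {
      std::set<BufferId> syncReadBuffers;
      std::set<BufferId> syncWriteBuffers;

      // A barrier is needed on a read-after-write, write-after-read, or
      // write-after-write hazard on the same shared-memory buffer.
      bool isIntersected(const RegionInfo &other) const {
        for (BufferId b : syncWriteBuffers)
          if (other.syncReadBuffers.count(b) || other.syncWriteBuffers.count(b))
            return true;
        for (BufferId b : syncReadBuffers)
          if (other.syncWriteBuffers.count(b))
            return true;
        return false;
      }
    };

    int main() {
      RegionInfo pending;            // an earlier op read buffer 0
      pending.syncReadBuffers = {0};

      RegionInfo scratchOp;          // a later op whose scratch buffer is buffer 0
      // Recording the scratch buffer only as a read (the old behavior) misses
      // the write-after-read hazard; recording it as both write & read catches
      // it, which is what lets the lowerings drop their leading barriers.
      scratchOp.syncWriteBuffers = {0};
      scratchOp.syncReadBuffers = {0};

      std::cout << (pending.isIntersected(scratchOp) ? "insert barrier\n"
                                                     : "no barrier\n");
      return 0;
    }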