From 2afebcd79ba8423a5385f811e9d77a1ac772a3d8 Mon Sep 17 00:00:00 2001
From: Keren Zhou
Date: Tue, 22 Nov 2022 10:03:29 -0800
Subject: [PATCH] [Triton-MLIR][Backend] Remove unnecessary barriers (#901)

Cross-operation barriers are taken care of by the Membar pass. Explicit
barriers are only required if there is any synchronization necessary
within each operation.
---
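The guard added in lowerDistributedToDistributed below leans on the Membar
pass for the first iteration: Membar already inserts a barrier between this op
and any earlier op that touched the same shared-memory buffer, so only the
iterations that reuse the scratch buffer need to wait for the previous
replica's loads. A minimal standalone sketch of the resulting barrier
schedule; barrier() here models nvvm.barrier0, and storeReplica/loadReplica
are hypothetical stand-ins for the per-replica store/load phases, not the real
Triton helpers:

    #include <cstdio>

    // Hypothetical stand-ins for the per-replica store/load phases that the
    // lowering emits; not the real Triton APIs.
    static void barrier() { std::puts("barrier"); } // models nvvm.barrier0
    static void storeReplica(unsigned repId) { std::printf("store %u\n", repId); }
    static void loadReplica(unsigned repId) { std::printf("load %u\n", repId); }

    int main() {
      unsigned accumNumReplicates = 3;
      for (unsigned repId = 0; repId < accumNumReplicates; ++repId) {
        // repId == 0 needs no leading barrier: the Membar pass already
        // separates this op from earlier users of the same shared buffer.
        // repId > 0 must wait for the previous replica's loads to finish
        // before overwriting the shared scratch buffer.
        if (repId != 0)
          barrier();
        storeReplica(repId); // threads write fragments to shared memory
        barrier();           // stores must be visible before cross-thread loads
        loadReplica(repId);  // threads read fragments back in the target layout
      }
      return 0;
    }

For a single replica this emits exactly one barrier, between the stores and
the loads; the cross-operation barrier that used to precede the loop is now
the Membar pass's responsibility.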
 lib/Analysis/Membar.cpp                            |  3 ++-
 lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp |  8 ++------
 test/Conversion/tritongpu_to_llvm.mlir             |  4 ----
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
index eab1636e5..2822b7ace 100644
--- a/lib/Analysis/Membar.cpp
+++ b/lib/Analysis/Membar.cpp
@@ -86,9 +86,10 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
       curRegionInfo.syncWriteBuffers.insert(bufferId);
     }
   }
-  // Scratch buffer is considered as a shared memory read
+  // Scratch buffer is considered as both shared memory write & read
   auto bufferId = allocation->getBufferId(op);
   if (bufferId != Allocation::InvalidBufferId) {
+    curRegionInfo.syncWriteBuffers.insert(bufferId);
     curRegionInfo.syncReadBuffers.insert(bufferId);
   }

diff --git a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
index dbc057c85..a14691c6c 100644
--- a/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.cpp
@@ -3077,7 +3077,8 @@ LogicalResult ConvertLayoutOpConversion::lowerDistributedToDistributed(
   for (unsigned repId = 0; repId < accumNumReplicates; ++repId) {
     auto multiDimRepId = getMultiDimIndex<unsigned>(repId, numReplicates, outOrd);
-    barrier();
+    if (repId != 0)
+      barrier();
     if (srcLayout.isa<BlockedEncodingAttr>() ||
         srcLayout.isa<MmaEncodingAttr>() ||
         srcLayout.isa<SliceEncodingAttr>()) {
@@ -3169,11 +3170,6 @@ LogicalResult ConvertLayoutOpConversion::lowerBlockedToShared(
   auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter);
   auto numWordsEachRep = product(wordsInEachRep);
   SmallVector<Value> wordVecs(numWordsEachRep);
-  // TODO: We should get less barriers if it is handled by membar pass
-  //       instead of the backend, since the later can only handle it in
-  //       the most conservative way. However just keep for now and revisit
-  //       in the future in case necessary.
-  barrier();
   for (unsigned i = 0; i < numElems; ++i) {
     if (i % srcAccumSizeInThreads == 0) { // start of a replication
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
index da5b908cc..a7fb4551a 100644
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -577,7 +577,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked
   func @convert_layout_blocked_blocked(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<1xf32>, 3>
     // CHECK: llvm.store
@@ -625,7 +624,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked_vec
   func @convert_layout_blocked_blocked_vec(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
     // CHECK: llvm.store
@@ -649,7 +647,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK-LABEL: convert_layout_blocked_blocked_multi_rep
   func @convert_layout_blocked_blocked_multi_rep(%arg0: tensor<16x16xf32, #blocked0>) {
     // CHECK: llvm.mlir.addressof @global_smem
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<4xf32>, 3>
     // CHECK: nvvm.barrier0
@@ -717,7 +714,6 @@ module attributes {"triton_gpu.num-warps" = 1 : i32} {
   // CHECK: llvm.mlir.global external @global_smem() {addr_space = 3 : i32} : !llvm.array<0 x i8>
   // CHECK-LABEL: convert_layout_mma_block
   func @convert_layout_mma_blocked(%arg0: tensor<32x16xf32, #mma>) {
-    // CHECK: nvvm.barrier0
     // CHECK: llvm.store
     // CHECK-SAME: !llvm.ptr<vector<1xf32>, 3>
     // CHECK: llvm.store
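
The Membar.cpp hunk is what makes dropping the ops' own leading barriers safe:
an op's scratch buffer is now recorded as both a shared-memory write and a
read, so the analysis conservatively inserts a barrier whenever the scratch
buffer may conflict with a pending access. A toy, self-contained model of that
bookkeeping, assuming schematic BufferId/RegionInfo/isIntersected stand-ins
rather than the real Triton types:

    #include <iostream>
    #include <set>

    // Toy model of the Membar bookkeeping changed above; these are schematic
    // stand-ins, not the real Triton data structures.
    using BufferId = int;

    struct RegionInfo {
      std::set<BufferId> syncReadBuffers;
      std::set<BufferId> syncWriteBuffers;

      // A barrier is needed on a read-after-write, write-after-read, or
      // write-after-write hazard on the same shared-memory buffer.
      bool isIntersected(const RegionInfo &other) const {
        for (BufferId b : syncWriteBuffers)
          if (other.syncReadBuffers.count(b) || other.syncWriteBuffers.count(b))
            return true;
        for (BufferId b : syncReadBuffers)
          if (other.syncWriteBuffers.count(b))
            return true;
        return false;
      }
    };

    int main() {
      RegionInfo pending;            // an earlier op read buffer 0
      pending.syncReadBuffers = {0};

      RegionInfo scratchOp;          // a later op whose scratch buffer is buffer 0
      // Recording the scratch buffer only as a read (the old behavior) misses
      // the write-after-read hazard; recording it as both write & read catches
      // it, which is what lets the lowerings drop their leading barriers.
      scratchOp.syncWriteBuffers = {0};
      scratchOp.syncReadBuffers = {0};

      std::cout << (pending.isIntersected(scratchOp) ? "insert barrier\n"
                                                     : "no barrier\n");
      return 0;
    }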