[Triton-MLIR][BACKEND] Fix the membar pass to add missing barriers caused by scf.for (#933)
1. Add missing barriers and revert the previous temporary solution. 2. Extract the `run` method from the membar analysis, because the analysis should have two phases: construction, which doesn't modify any IR, and modification, which adds barrier IRs. Hopefully this makes the use of membar clearer.
This commit is contained in:
@@ -24,21 +24,43 @@ void MembarAnalysis::dfsOperation(Operation *operation,
|
||||
// scf.if only: two regions
|
||||
// scf.for: one region
|
||||
RegionInfo curRegionInfo;
|
||||
for (auto &region : operation->getRegions()) {
|
||||
// Copy the parent info as the current info.
|
||||
RegionInfo regionInfo = *parentRegionInfo;
|
||||
for (auto &block : region.getBlocks()) {
|
||||
assert(region.getBlocks().size() == 1 &&
|
||||
"Multiple blocks in a region is not supported");
|
||||
for (auto &op : block.getOperations()) {
|
||||
// Traverse the nested operation.
|
||||
dfsOperation(&op, &regionInfo, builder);
|
||||
auto traverseRegions = [&]() -> auto{
|
||||
for (auto &region : operation->getRegions()) {
|
||||
// Copy the parent info as the current info.
|
||||
RegionInfo regionInfo = *parentRegionInfo;
|
||||
for (auto &block : region.getBlocks()) {
|
||||
assert(region.getBlocks().size() == 1 &&
|
||||
"Multiple blocks in a region is not supported");
|
||||
for (auto &op : block.getOperations()) {
|
||||
// Traverse the nested operation.
|
||||
dfsOperation(&op, &regionInfo, builder);
|
||||
}
|
||||
}
|
||||
curRegionInfo.join(regionInfo);
|
||||
}
|
||||
curRegionInfo.join(regionInfo);
|
||||
// Set the parent region info as the union of the nested region info.
|
||||
*parentRegionInfo = curRegionInfo;
|
||||
};
|
||||
|
||||
traverseRegions();
|
||||
if (isa<scf::ForOp>(operation)) {
|
||||
// scf.for can have two possible inputs: the init value and the
|
||||
// previous iteration's result. Although we've applied alias analysis,
|
||||
// there could be unsynced memory accesses on reused memories.
|
||||
// For example, consider the following code:
|
||||
// %1 = convert_layout %0: blocked -> shared
|
||||
// ...
|
||||
// gpu.barrier
|
||||
// ...
|
||||
// %5 = convert_layout %4 : shared -> dot
|
||||
// %6 = tt.dot %2, %5
|
||||
// scf.yield
|
||||
//
|
||||
// Though %5 could be released before scf.yield, it may share the same
|
||||
// memory with %1. So we actually have to insert a barrier before %1 to
|
||||
// make sure the memory is synced.
|
||||
traverseRegions();
|
||||
}
|
||||
// Set the parent region info as the union of the nested region info.
|
||||
*parentRegionInfo = curRegionInfo;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,8 +71,7 @@ void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
|
||||
// Do not insert barriers before control flow operations and
|
||||
// alloc/extract/insert
|
||||
// alloc is an allocation op without memory write.
|
||||
// In contrast, arith.constant is an allocation op with memory write.
|
||||
// FIXME(Keren): extract is always alias for now
|
||||
// FIXME(Keren): extract_slice is always alias for now
|
||||
return;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user