#include "triton/Analysis/Membar.h" #include "triton/Analysis/Alias.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" namespace mlir { void MembarAnalysis::run() { auto *operation = allocation->getOperation(); RegionInfo regionInfo; OpBuilder builder(operation); dfsOperation(operation, ®ionInfo, &builder); } void MembarAnalysis::dfsOperation(Operation *operation, RegionInfo *parentRegionInfo, OpBuilder *builder) { transfer(operation, parentRegionInfo, builder); if (operation->getNumRegions()) { // If there's any nested regions, we need to visit them. // scf.if and scf.else: two regions // scf.if only: two regions // scf.for: one region RegionInfo curRegionInfo; auto traverseRegions = [&]() -> auto{ for (auto ®ion : operation->getRegions()) { // Copy the parent info as the current info. RegionInfo regionInfo = *parentRegionInfo; for (auto &block : region.getBlocks()) { assert(region.getBlocks().size() == 1 && "Multiple blocks in a region is not supported"); for (auto &op : block.getOperations()) { // Traverse the nested operation. dfsOperation(&op, ®ionInfo, builder); } } curRegionInfo.join(regionInfo); } // Set the parent region info as the union of the nested region info. *parentRegionInfo = curRegionInfo; }; traverseRegions(); if (isa(operation)) { // scf.for can have two possible inputs: the init value and the // previous iteration's result. Although we've applied alias analysis, // there could be unsynced memory accesses on reused memories. // For example, consider the following code: // %1 = convert_layout %0: blocked -> shared // ... // gpu.barrier // ... // %5 = convert_layout %4 : shared -> dot // %6 = tt.dot %2, %5 // scf.yield // // Though %5 could be released before scf.yield, it may shared the same // memory with %1. So we actually have to insert a barrier before %1 to // make sure the memory is synced. traverseRegions(); } } } void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo, OpBuilder *builder) { if (isa(op) || isa(op) || isa(op) || isa(op) || isa(op)) { // Do not insert barriers before control flow operations and // alloc/extract/insert // alloc is an allocation op without memory write. 
    // FIXME(Keren): extract_slice is always alias for now
    return;
  }

  if (isa<gpu::BarrierOp>(op)) {
    // If the current op is a barrier, we sync previous reads and writes.
    regionInfo->sync();
    return;
  }

  if (isa<triton::gpu::AsyncWaitOp>(op) &&
      !isa<gpu::BarrierOp>(op->getNextNode())) {
    // If the current op is an async wait and the next op is not a barrier, we
    // insert a barrier op and sync.
    regionInfo->sync();
    OpBuilder::InsertionGuard g(*builder);
    builder->setInsertionPointAfter(op);
    builder->create<gpu::BarrierOp>(op->getLoc());
    regionInfo->sync();
    return;
  }

  RegionInfo curRegionInfo;
  for (Value value : op->getOperands()) {
    for (auto bufferId : allocation->getBufferIds(value)) {
      if (bufferId != Allocation::InvalidBufferId) {
        if (isa<triton::gpu::InsertSliceAsyncOp>(op) ||
            isa<tensor::InsertSliceOp>(op)) {
          // FIXME(Keren): insert_slice and insert_slice_async are always alias
          // for now
          curRegionInfo.syncWriteBuffers.insert(bufferId);
        } else {
          // ConvertLayoutOp: shared memory -> registers
          curRegionInfo.syncReadBuffers.insert(bufferId);
        }
      }
    }
  }
  for (Value value : op->getResults()) {
    // ConvertLayoutOp: registers -> shared memory
    auto bufferId = allocation->getBufferId(value);
    if (bufferId != Allocation::InvalidBufferId) {
      curRegionInfo.syncWriteBuffers.insert(bufferId);
    }
  }
  // A scratch buffer is considered both a shared memory write and a read.
  auto bufferId = allocation->getBufferId(op);
  if (bufferId != Allocation::InvalidBufferId) {
    curRegionInfo.syncWriteBuffers.insert(bufferId);
    curRegionInfo.syncReadBuffers.insert(bufferId);
  }

  if (regionInfo->isIntersected(curRegionInfo, allocation)) {
    // The current op's accesses conflict with pending accesses in the region,
    // so insert a barrier right before it and reset the pending sets.
    OpBuilder::InsertionGuard g(*builder);
    builder->setInsertionPoint(op);
    builder->create<gpu::BarrierOp>(op->getLoc());
    regionInfo->sync();
  }
  // Update the region info; even if a barrier is inserted, we have to maintain
  // the current op's read/write buffers.
  regionInfo->join(curRegionInfo);
}

} // namespace mlir
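
// A minimal usage sketch, assuming the analysis is driven from a lowering
// pipeline: run() reads buffer ids from a precomputed Allocation and mutates
// the IR in place by inserting gpu.barrier ops. The construction below is an
// assumption for illustration and may differ from the actual pipeline code:
//
//   ModuleOp mod = ...;                  // lowered TritonGPU module
//   Allocation allocation(mod);          // shared-memory allocation analysis
//   MembarAnalysis membar(&allocation);  // assumed constructor signature
//   membar.run();                        // inserts gpu.barrier where needed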