Files
triton/lib/Analysis/Membar.cpp
Keren Zhou 328b87aec6 Keren/tensor slice insert alloc (#94)
This branch defines three new triton_gpu operations to partially solve #87. Below is an overview:

```
%tensor = triton_gpu.alloc_tensor : tensor<2x16x16xf16, #A>
%b = triton_gpu.insert_slice_async %a_ptr, %tensor, %offset {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<16x16x!tt.ptr<f16>, #AL> -> tensor<2x16x16xf16, #A>
%c = triton_gpu.extract_slice %b, %offset {axis = 0 : i32} : tensor<2x16x16xf16, #A> -> tensor<16x16xf16, #A>
```

We plan to fully replace `copy_async` with `insert_slice_async`. **This hasn't been done yet.**
2022-09-01 12:37:17 -07:00

105 lines
3.4 KiB
C++

#include "triton/Analysis/Membar.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
namespace mlir {
void MembarAnalysis::run() {
auto *operation = allocation->getOperation();
RegionInfo regionInfo;
OpBuilder builder(operation);
dfsOperation(operation, &regionInfo, &builder);
}
/// Depth-first walk over `operation` and every operation nested in its
/// regions.
///
/// `transfer` is applied to `operation` first, against `parentRegionInfo`.
/// If the operation owns regions, each region is then walked starting from its
/// own copy of the parent state, and `*parentRegionInfo` is finally
/// overwritten with the join of the states reached at the end of each region.
///
/// \param operation        operation to visit.
/// \param parentRegionInfo state of the enclosing region; updated in place.
/// \param builder          builder forwarded to `transfer` for barrier
///                         creation.
void MembarAnalysis::dfsOperation(Operation *operation,
                                  RegionInfo *parentRegionInfo,
                                  OpBuilder *builder) {
  transfer(operation, parentRegionInfo, builder);

  // No nested regions: the parent state stays exactly as `transfer` left it.
  if (operation->getNumRegions() == 0)
    return;

  // Region counts for the control-flow ops this walk expects:
  //   scf.if with an else branch -> two regions
  //   scf.if without one         -> still two regions
  //   scf.for                    -> one region
  RegionInfo mergedInfo;
  for (auto &region : operation->getRegions()) {
    // Every region starts from a private copy of the parent's state, so
    // sibling regions do not observe each other's accesses.
    RegionInfo branchInfo = *parentRegionInfo;
    auto &blocks = region.getBlocks();
    // Only regions with at most one block are handled.
    assert(blocks.size() <= 1 &&
           "Multiple blocks in a region is not supported");
    for (auto &block : blocks) {
      for (auto &nestedOp : block.getOperations()) {
        // Recurse into the nested operation.
        dfsOperation(&nestedOp, &branchInfo, builder);
      }
    }
    mergedInfo.join(branchInfo);
  }

  // The parent continues with the union of what its regions ended with.
  *parentRegionInfo = mergedInfo;
}
/// Transfer function of the analysis for a single operation.
///
/// Updates `regionInfo` with the buffers `op` reads and writes, and creates a
/// `gpu.barrier` in front of `op` when `regionInfo->isIntersected(...)`
/// reports that those accesses conflict with the ones already recorded.
///
/// Special cases, checked in this order:
///  - scf.for / scf.if / scf.yield: ignored.
///  - gpu.barrier: the recorded state is synced, nothing is inserted.
///  - triton_gpu.async_wait: a barrier is placed right after it (unless one
///    is already there) and the recorded state is synced.
///  - any other op without results: ignored.
///
/// \param op         operation being visited.
/// \param regionInfo read/write state of the enclosing region; updated in
///                   place.
/// \param builder    builder used to create barrier ops; its insertion point
///                   is restored before returning.
void MembarAnalysis::transfer(Operation *op, RegionInfo *regionInfo,
                              OpBuilder *builder) {
  if (dyn_cast<scf::ForOp>(op) || dyn_cast<scf::IfOp>(op) ||
      dyn_cast<scf::YieldOp>(op)) {
    // Do not insert barriers before control flow operations.
    return;
  }

  // Barriers and async waits have to be recognized BEFORE the result-count
  // filter further down: gpu.barrier defines no results (and async_wait
  // presumably does not either — TODO confirm against the op definition), so
  // filtering on the result count first made both branches unreachable.
  if (dyn_cast<gpu::BarrierOp>(op)) {
    // If the current op is a barrier, we sync previous reads and writes.
    regionInfo->sync();
    return;
  }

  if (dyn_cast<triton::gpu::AsyncWaitOp>(op)) {
    // If the current op is an async wait, make sure a barrier follows it and
    // sync previous reads and writes. When the next op already is a barrier,
    // reuse it instead of stacking a second one behind the wait.
    Operation *nextOp = op->getNextNode();
    if (!nextOp || !dyn_cast<gpu::BarrierOp>(nextOp)) {
      OpBuilder::InsertionGuard g(*builder);
      builder->setInsertionPointAfter(op);
      builder->create<gpu::BarrierOp>(op->getLoc());
    }
    regionInfo->sync();
    return;
  }

  // Remaining ops that produce no results are not tracked.
  if (op->getNumResults() < 1)
    return;

  RegionInfo curRegionInfo;
  for (Value value : op->getOperands()) {
    // ConvertLayoutOp: shared memory -> registers
    // Need to consider all alias buffers
    for (auto bufferId : allocation->getBufferIds(value)) {
      if (bufferId != Allocation::InvalidBufferId) {
        curRegionInfo.syncReadBuffers.insert(bufferId);
      }
    }
  }

  for (Value value : op->getResults()) {
    // ConvertLayoutOp: registers -> shared memory
    auto bufferId = allocation->getBufferId(value);
    if (bufferId != Allocation::InvalidBufferId) {
      curRegionInfo.syncWriteBuffers.insert(bufferId);
    }
  }

  // Scratch buffer is considered as a shared memory read
  auto bufferId = allocation->getBufferId(op);
  if (bufferId != Allocation::InvalidBufferId) {
    curRegionInfo.syncReadBuffers.insert(bufferId);
  }

  if (regionInfo->isIntersected(curRegionInfo, allocation)) {
    // Conflict with the accesses recorded so far: put a barrier in front of
    // this op and start over from a synced state.
    OpBuilder::InsertionGuard g(*builder);
    builder->setInsertionPoint(op);
    builder->create<gpu::BarrierOp>(op->getLoc());
    regionInfo->sync();
  }
  // Update the region info, even if barrier is inserted, we have to maintain
  // the current op's read/write buffers.
  regionInfo->join(curRegionInfo);
}
} // namespace mlir