[OPTIMIZER] Let the pipeline pass insert async wait. (#63)

2022-08-19 01:31:57 +08:00
parent d69ce77b19
commit 8776ad1a0e
3 changed files with 28 additions and 0 deletions
--- a/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/Pipeline.cpp
@@ -47,6 +47,9 @@ class LoopPipeliner {

  void setValueMapping(Value origin, Value newValue, int stage);

+  /// Returns true if `op` has, among its operands, result 0 of the single
+  /// user of any value in `loads`. Note this does not test for a direct use of
+  /// the load value itself: the definition asserts that every load has exactly
+  /// one use (expected to be a ConvertLayout) and matches against that user's
+  /// result instead.
+  bool isDirectUserOfAsyncLoad(Operation &op);
+
 public:
  LoopPipeliner(scf::ForOp forOp, int numStages)
      : forOp(forOp), numStages(numStages) {
@@ -96,6 +99,19 @@ void LoopPipeliner::collectDeps(Value v, int stages, DenseSet<Value> &deps) {
  }
 }

+/// Reports whether `op` consumes the value produced by the sole user of any
+/// entry in `loads`.
+///
+/// Each load is asserted to have exactly one use (expected to be a
+/// ConvertLayout); the value matched against `op`'s operands is result 0 of
+/// that user, not the load value itself.
+///
+/// \param op the operation whose operands are inspected.
+/// \returns true as soon as one operand matches, false if none does.
+bool LoopPipeliner::isDirectUserOfAsyncLoad(Operation &op) {
+  for (Value load : loads) {
+    assert(load.hasOneUse() &&
+           "load should only have one use (ConvertLayout)");
+    // With exactly one use, the first user is the only user.
+    Operation *soleUser = *load.getUsers().begin();
+    Value convertedLoad = soleUser->getResult(0);
+    if (llvm::is_contained(op.getOperands(), convertedLoad))
+      return true;
+  }
+  return false;
+}
+
 /// A load instruction can be pipelined if:
 ///   - the load doesn't depend on any other loads (after loop peeling)
 ///   - (?) this load is not a loop-invariant value (we should run LICM before
@@ -318,7 +334,14 @@ scf::ForOp LoopPipeliner::createNewForOp() {
    mapping.map(arg.value(), newForOp.getRegionIterArgs()[arg.index()]);

  // 2.1 clone the loop body, replace original args with args of the new ForOp
+  // Insert async wait if necessary.
+  bool asyncWaitInserted = false;
  for (Operation &op : forOp.getBody()->without_terminator()) {
+    if (!asyncWaitInserted && isDirectUserOfAsyncLoad(op)) {
+      asyncWaitInserted = true;
+      builder.create<triton::gpu::AsyncWaitOp>(op.getLoc(),
+                                               loads.size() * (numStages - 1));
+    }
    Operation *newOp = builder.clone(op, mapping);
    // update mapping of results
    for (unsigned dstIdx : llvm::seq(unsigned(0), op.getNumResults()))