Skip to content

Commit 3728fdf

Browse files
authored
[AMD] Combine redundant AsyncWaits in StreamPipeliner (#6435)
Moved `combineRedundantWaitOps` from `WGMMAPipeline` to `PipeliningUtility` to reuse it in the StreamPipeliner.
1 parent 61cb963 commit 3728fdf

File tree

4 files changed

+45
-36
lines changed

4 files changed

+45
-36
lines changed

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ Value createAlloc(scf::ForOp forOp, RankedTensorType ty, Location loc,
7373
// Determine if the operation is a TMA load.
7474
bool isTMALoad(Operation *op);
7575

76+
// Look for consecutive wait ops and combine them into a single wait op.
77+
void combineRedundantWaitOps(
78+
llvm::SmallSetVector<gpu::AsyncWaitOp, 8> &waitOps);
79+
7680
// Get the type of the view of a multi-buffered tensor value.
7781
gpu::MemDescType getBufferViewType(gpu::MemDescType allocTy);
7882
// Get a generic shared encoding for a tensor.

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,40 @@ bool mlir::triton::isTMALoad(Operation *op) {
327327
return isa<tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op);
328328
}
329329

330+
void mlir::triton::combineRedundantWaitOps(
331+
llvm::SmallSetVector<ttg::AsyncWaitOp, 8> &waitOps) {
332+
llvm::MapVector<ttg::AsyncWaitOp, ttg::AsyncWaitOp> toDelete;
333+
for (auto waitOp : waitOps) {
334+
if (toDelete.count(waitOp))
335+
continue;
336+
SmallVector<ttg::AsyncWaitOp> waitGroup = {waitOp};
337+
SmallVector<Value> depTokens = waitOp.getOperands();
338+
unsigned minWaitNumber = waitOp.getNum();
339+
Operation *next = waitOp->getNextNode();
340+
while (next && !isa<ttg::AsyncCommitGroupOp>(next)) {
341+
if (auto nextWait = dyn_cast<ttg::AsyncWaitOp>(next)) {
342+
waitGroup.push_back(nextWait);
343+
minWaitNumber = std::min(minWaitNumber, nextWait.getNum());
344+
depTokens.append(nextWait.getOperands().begin(),
345+
nextWait.getOperands().end());
346+
}
347+
next = next->getNextNode();
348+
}
349+
if (waitGroup.size() == 1)
350+
continue;
351+
OpBuilder builder(waitGroup.front());
352+
auto newWaitOp = builder.create<ttg::AsyncWaitOp>(waitOp.getLoc(),
353+
depTokens, minWaitNumber);
354+
for (auto waitOp : waitGroup) {
355+
toDelete[waitOp] = newWaitOp;
356+
}
357+
}
358+
for (auto waitOp : toDelete) {
359+
waitOp.first->replaceAllUsesWith(waitOp.second);
360+
waitOp.first->erase();
361+
}
362+
}
363+
330364
ttg::MemDescType mlir::triton::getBufferViewType(ttg::MemDescType allocTy) {
331365
Attribute sharedMemorySpace =
332366
ttg::SharedMemorySpaceAttr::get(allocTy.getContext());

lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp

Lines changed: 1 addition & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -100,41 +100,6 @@ static int minNumInterleavedCommitOps(Operation *waitOp) {
100100
return minCommits;
101101
}
102102

103-
// Look for consecutive wait ops and combine them into a single wait op.
104-
static void
105-
combineRedundantWaitOps(llvm::SmallSetVector<ttg::AsyncWaitOp, 8> &waitOps) {
106-
llvm::MapVector<ttg::AsyncWaitOp, ttg::AsyncWaitOp> toDelete;
107-
for (auto waitOp : waitOps) {
108-
if (toDelete.count(waitOp))
109-
continue;
110-
SmallVector<ttg::AsyncWaitOp> waitGroup = {waitOp};
111-
SmallVector<Value> depTokens = waitOp.getOperands();
112-
unsigned minWaitNumber = waitOp.getNum();
113-
Operation *next = waitOp->getNextNode();
114-
while (next && !isa<ttg::AsyncCommitGroupOp>(next)) {
115-
if (auto nextWait = dyn_cast<ttg::AsyncWaitOp>(next)) {
116-
waitGroup.push_back(nextWait);
117-
minWaitNumber = std::min(minWaitNumber, nextWait.getNum());
118-
depTokens.append(nextWait.getOperands().begin(),
119-
nextWait.getOperands().end());
120-
}
121-
next = next->getNextNode();
122-
}
123-
if (waitGroup.size() == 1)
124-
continue;
125-
OpBuilder builder(waitGroup.front());
126-
auto newWaitOp = builder.create<ttg::AsyncWaitOp>(waitOp.getLoc(),
127-
depTokens, minWaitNumber);
128-
for (auto waitOp : waitGroup) {
129-
toDelete[waitOp] = newWaitOp;
130-
}
131-
}
132-
for (auto waitOp : toDelete) {
133-
waitOp.first->replaceAllUsesWith(waitOp.second);
134-
waitOp.first->erase();
135-
}
136-
}
137-
138103
/// Update wait op number by analyzing the number of async_commit_group ops
139104
/// along all paths.
140105
void mlir::triton::updateWaits(ModuleOp module) {
@@ -144,7 +109,7 @@ void mlir::triton::updateWaits(ModuleOp module) {
144109
waitOp.setNum(minNumCommits);
145110
waitOps.insert(waitOp);
146111
});
147-
combineRedundantWaitOps(waitOps);
112+
tt::combineRedundantWaitOps(waitOps);
148113
}
149114

150115
// Add the given values as operands of the given wait, and replace all uses of

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,6 +1050,12 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineBase<PipelinePass> {
10501050
globalPrefetch, localPrefetch, useAsyncCopy);
10511051
(void)sp.pipelineLoop();
10521052
}
1053+
1054+
if (useAsyncCopy) {
1055+
llvm::SmallSetVector<ttg::AsyncWaitOp, 8> waitOps;
1056+
moduleOp.walk([&](ttg::AsyncWaitOp waitOp) { waitOps.insert(waitOp); });
1057+
tt::combineRedundantWaitOps(waitOps);
1058+
}
10531059
}
10541060
};
10551061
} // namespace

0 commit comments

Comments
 (0)