[AMD] Restrict merging async_wait in StreamPipeliner (#7577)

jungpark-mlir · web-flow · commit 1e0a37113914 · 2025-07-20T08:27:55.000-07:00
Disable merging of async_wait ops when pipelining with num_stages=3.
This avoids the incorrect operation order that combineRedundantWaitOps produces in that case.
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
@@ -765,14 +765,15 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase<PipelinePass> {
                          useAsyncCopy, waitAtTail);
     }
 
-    if (useAsyncCopy) {
+    if (useAsyncCopy && numStages != 3) {
       llvm::SmallSetVector<ttg::AsyncWaitOp, 8> waitOps;
       moduleOp.walk([&](ttg::AsyncWaitOp waitOp) {
         if (auto maybeForOp = dyn_cast<scf::ForOp>(waitOp->getParentOp()))
           // FIXME: There's potential bug in combinRedundantWaitOps(), it
           // generate incorrect IR order when numStages==3.
           if (tt::getNumStagesOrDefault(maybeForOp, numStages) == 3)
-            waitOps.insert(waitOp);
+            return;
+        waitOps.insert(waitOp);
       });
       tt::combineRedundantWaitOps(waitOps);
     }