Skip to content

Commit 16b25e1

Browse files
authored
[BACKEND] combineRedundantWaitOps should not combine across loops/branches (#7593)
`combineRedundantWaitOps` skipped over branches/loops, so if we ended up with IR like: ```mlir ttg.async_wait scf.for .... scf.yield ttg.async_wait ``` we merged the `async_wait`s in the prologue and epilogue because we did not find a `ttg.commit_group` in between. This PR stops the forward search if we encounter a branch/loop. We could also walk through all successor blocks if we think that is worth the effort. This problem was not triggered before because the `ttg.async_wait` was scheduled in the same stage as its user(s), so we either ended up with no `ttg.async_wait` in the prologue, or there was another prefetch after it in the prologue. Since triton-lang/triton#7458 we might place the `ttg.async_wait` one stage earlier than its user(s), so we can end up with the problematic IR.
1 parent fdd694d commit 16b25e1

File tree

3 files changed

+39
-10
lines changed

3 files changed

+39
-10
lines changed

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,10 @@ void mlir::triton::combineRedundantWaitOps(
469469
SmallVector<Value> depTokens = waitOp.getOperands();
470470
unsigned minWaitNumber = waitOp.getNum();
471471
Operation *next = waitOp->getNextNode();
472-
while (next && !isa<ttg::AsyncCommitGroupOp>(next)) {
472+
// Stop if we reach the end of the block or if there is another commit group
473+
// or a branching op (forOp, ifOp, whileOp) in between the waits
474+
while (next &&
475+
!isa<ttg::AsyncCommitGroupOp, RegionBranchOpInterface>(next)) {
473476
if (auto nextWait = dyn_cast<ttg::AsyncWaitOp>(next)) {
474477
waitGroup.push_back(nextWait);
475478
minWaitNumber = std::min(minWaitNumber, nextWait.getNum());
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=3 use_async_copy=1 use_pingpong=1" | FileCheck %s
2+
3+
#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
4+
#mma = #ttg.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
5+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 32 : i32} {
6+
// CHECK-LABEL: tt.func @simple_pipelined_load
7+
// We expect one ttg.async_wait in the prologue, one in the loop and one in the epilogue
8+
// CHECK: ttg.async_wait
9+
// CHECK-NOT: ttg.async_wait
10+
// CHECK: scf.for
11+
// CHECK: ttg.async_wait
12+
// CHECK-NOT: ttg.async_wait
13+
// CHECK: scf.yield
14+
// CHECK: ttg.async_wait
15+
// CHECK-NOT: ttg.async_wait
16+
tt.func @simple_pipelined_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg3: i32, %arg4: i32) -> tensor<128x16xf32, #mma> {
17+
%c0_i32 = arith.constant 0 : i32
18+
%cst = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma>
19+
%0 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<1x16x!tt.ptr<f16>, #blocked>
20+
%1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
21+
%2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
22+
%3 = tt.broadcast %0 : tensor<1x16x!tt.ptr<f16>, #blocked> -> tensor<64x16x!tt.ptr<f16>, #blocked>
23+
%4 = tt.broadcast %2 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked>
24+
%5 = tt.addptr %3, %4 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
25+
%6 = scf.for %arg6 = %c0_i32 to %arg3 step %arg4 iter_args(%arg5 = %cst) -> (tensor<128x16xf32, #mma>) : i32 {
26+
%7 = tt.load %5 : tensor<64x16x!tt.ptr<f16>, #blocked>
27+
%8 = ttg.convert_layout %7 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
28+
%9 = tt.dot %arg2, %8, %cst : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma>
29+
scf.yield %9 : tensor<128x16xf32, #mma>
30+
}
31+
tt.return %6 : tensor<128x16xf32, #mma>
32+
}
33+
}

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -765,16 +765,9 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase<PipelinePass> {
765765
useAsyncCopy, waitAtTail);
766766
}
767767

768-
if (useAsyncCopy && numStages != 3) {
768+
if (useAsyncCopy) {
769769
llvm::SmallSetVector<ttg::AsyncWaitOp, 8> waitOps;
770-
moduleOp.walk([&](ttg::AsyncWaitOp waitOp) {
771-
if (auto maybeForOp = dyn_cast<scf::ForOp>(waitOp->getParentOp()))
772-
// FIXME: There's potential bug in combinRedundantWaitOps(), it
773-
// generate incorrect IR order when numStages==3.
774-
if (tt::getNumStagesOrDefault(maybeForOp, numStages) == 3)
775-
return;
776-
waitOps.insert(waitOp);
777-
});
770+
moduleOp.walk([&](ttg::AsyncWaitOp waitOp) { waitOps.insert(waitOp); });
778771
tt::combineRedundantWaitOps(waitOps);
779772
}
780773
}

0 commit comments

Comments
 (0)