
Commit 6f4f943

[BACKEND] Add missing waits in WGMMA rhs in register pipelining (#8964)
We had special-case logic for rsDotNeedsWait that skipped the normal code path for inserting a wait before the accumulator is accessed. The skip is necessary because we split one `warp_group_dot` into many ops acting on the same accumulator, so we don't want a wait 0 inserted between those wgmma ops. However, it broke cases where the accumulator genuinely is accessed, e.g. in the epilogue of a persistent matmul. This PR instead makes the rsDotNeedsWait logic a completely separate loop over the dots, and adds a condition to the generic code path so it does not emit waits before wgmmas that consume the accumulator of another wgmma, since those are pipelined in hardware.
1 parent 5784490 commit 6f4f943
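To make the new generic-path rule concrete, here is a dependency-free C++ sketch of the use-selection logic in the patch: uses of an async dot's result are grouped per block and sorted into program order, wgmmas that consume the result as their own accumulator (operand 2 of `warp_group_dot`) are skipped because the hardware pipelines them, and the wait is inserted before the first remaining use. The `Use` struct, the block/position fields, and the example data are illustrative stand-ins for MLIR's `Operation`/`OpOperand` machinery, not types from the patch itself:

// Sketch: which users of an async warp_group_dot result get a wait?
// Toy stand-ins for MLIR's Operation/OpOperand; nothing here is the real API.
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Use {
  int block;           // block the user lives in
  int posInBlock;      // program order within that block
  bool isWarpGroupDot; // the user is itself a wgmma
  int operandNumber;   // operand index at which it consumes the dot result
  std::string name;
};

// Operand 2 of ttng.warp_group_dot is the accumulator C; a wgmma consuming
// the result there is pipelined in hardware and needs no wait (this mirrors
// the `operand->getOperandNumber() == 2` check in the patch).
static bool isHardwarePipelinedUse(const Use &u) {
  return u.isWarpGroupDot && u.operandNumber == 2;
}

int main() {
  // One split dot's users: two follow-up wgmmas that feed the result back in
  // as their accumulator, plus a genuine epilogue read (e.g. arith.addf).
  std::vector<Use> uses = {
      {0, 3, false, 0, "epilogue addf"},
      {0, 1, true, 2, "wgmma (chained accumulator)"},
      {0, 2, true, 2, "wgmma (chained accumulator)"},
  };

  // Group uses by block, as the patch does with blockToUses.
  std::map<int, std::vector<Use>> blockToUses;
  for (const Use &u : uses)
    blockToUses[u.block].push_back(u);

  for (auto &[block, blockUses] : blockToUses) {
    // Program order; the patch uses Operation::isBeforeInBlock instead.
    std::sort(blockUses.begin(), blockUses.end(),
              [](const Use &a, const Use &b) {
                return a.posInBlock < b.posInBlock;
              });
    // Skip the hardware-pipelined accumulator uses.
    auto firstUse = std::find_if_not(blockUses.begin(), blockUses.end(),
                                     isHardwarePipelinedUse);
    if (firstUse == blockUses.end())
      continue; // every use is pipelined; no wait emitted in this block
    // The real pass creates ttng.warp_group_dot_wait with pendings = 0 here
    // and threads the remaining uses through it.
    std::printf("block %d: wait(pendings = 0) before '%s'\n", block,
                firstUse->name.c_str());
  }
}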

File tree: 2 files changed, +122 −25 lines

lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp

Lines changed: 45 additions & 25 deletions

@@ -348,7 +348,7 @@ std::vector<ttng::WarpGroupDotOp> splitRSDot(ttng::WarpGroupDotOp dotOp) {
         dotOp.getInputPrecision(), numImpreciseAcc, dotOp.getIsAsync());
     dots.push_back(dot);
     C = dot.getResult();
-    useC = mlir::arith::ConstantIntOp::create(builder, loc, 1, 1);
+    useC = {};
   }
   dotOp.replaceAllUsesWith(dots.back().getResult());
   dotOp.erase();
@@ -588,44 +588,64 @@ static void insertAsyncWarpGroupDotWaitInLoop(
   // Insert waits before the users of the properly async dots other than loop
   // yield.
   for (auto asyncDot : llvm::make_first_range(properlyAsyncDots)) {
-    // If the dot takes the LHS on registers i, we add a wait for the number
-    // of properly async dots in the loop minus one.
-    // This makes sure that the dot will wait until itself from the previous
-    // iteration has completed, as to avoid rewriting the registers.
-    if (rsDotNeedsWait(asyncDot, forOp)) {
-      OpBuilder builder(asyncDot);
-      builder.setInsertionPointAfter(asyncDot);
-      auto newWait = ttng::WarpGroupDotWaitOp::create(
-          builder, asyncDot->getLoc(), ArrayRef<Value>{},
-          properlyAsyncDots.size() - 1);
-      SmallVector<Value> waitOperands = {asyncDot->getResult(0)};
-      threadValuesThroughWait(newWait, waitOperands);
-      continue;
-    }
-
-    SmallVector<OpOperand *> uses;
+    DenseMap<Block *, SmallVector<OpOperand *>> blockToUses;
     for (auto &use : asyncDot->getUses()) {
       if (auto yieldOp = dyn_cast<scf::YieldOp>(use.getOwner())) {
         continue;
       }
-      uses.push_back(&use);
-    }
 
-    DenseMap<Block *, SmallVector<Value>> blockToUsers;
-    for (auto use : uses) {
-      auto block = use->getOwner()->getBlock();
-      blockToUsers[block].push_back(use->get());
+      auto block = use.getOwner()->getBlock();
+      blockToUses[block].push_back(&use);
     }
 
-    for (auto [block, users] : blockToUsers) {
-      OpBuilder builder(block, block->begin());
+    for (auto [block, uses] : blockToUses) {
+      // Insert a wait before the first use in the block
+      std::sort(uses.begin(), uses.end(), [](OpOperand *lhs, OpOperand *rhs) {
+        Operation *lhsOp = lhs->getOwner();
+        Operation *rhsOp = rhs->getOwner();
+        return lhsOp->isBeforeInBlock(rhsOp);
+      });
+
+      // If a wgmma uses the same accumulator registers, it will be implicitly
+      // pipelined by the hardware and doesn't need a wait.
+      auto firstUse =
+          std::find_if_not(uses.begin(), uses.end(), [](OpOperand *operand) {
+            return (isa<ttng::WarpGroupDotOp>(operand->getOwner()) &&
                    operand->getOperandNumber() == 2);
+          });
+      if (firstUse == uses.end()) {
+        continue;
+      }
+
+      OpBuilder builder((*firstUse)->getOwner());
       auto newWait = ttng::WarpGroupDotWaitOp::create(
           builder, asyncDot->getLoc(), ArrayRef<Value>{}, 0);
 
+      SmallVector<Value> users;
+      for (; firstUse != uses.end(); ++firstUse) {
+        users.push_back((*firstUse)->get());
+      }
       threadValuesThroughWait(newWait, users);
     }
   }
 
+  for (auto asyncDot : llvm::make_first_range(properlyAsyncDots)) {
+    // If the dot takes the LHS on registers i, we add a wait for the number
+    // of properly async dots in the loop minus one.
+    // This makes sure that the dot will wait until itself from the previous
+    // iteration has completed, as to avoid rewriting the registers.
+    if (!rsDotNeedsWait(asyncDot, forOp))
+      continue;
+
+    OpBuilder builder(asyncDot);
+    builder.setInsertionPointAfter(asyncDot);
+    auto newWait = ttng::WarpGroupDotWaitOp::create(
+        builder, asyncDot->getLoc(), ArrayRef<Value>{},
+        properlyAsyncDots.size() - 1);
+    SmallVector<Value> waitOperands = {asyncDot->getResult(0)};
+    threadValuesThroughWait(newWait, waitOperands);
+  }
+
   // Add the wait right after the last properly-async dot. This only needs to
   // wait for all properly-async dots from the i-1'th iteration to complete, IOW
   // we wait until there are most `asyncDots.size()` dots in flight.
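The register-LHS rule, now hoisted into its own loop above, still emits a wait with `pendings = properlyAsyncDots.size() - 1` after each such wgmma. A minimal worked example of why that count protects the LHS registers; the split count of 4 (and hence `pendings = 3`) matches the test below, the rest is an assumption for illustration:

// With N properly-async dots issued per iteration, allowing N - 1 of them to
// remain pending guarantees the oldest outstanding dot, i.e. this same dot
// from the previous iteration, has retired before its LHS registers are
// overwritten by the current iteration.
#include <cstdio>

int main() {
  const int numProperlyAsyncDots = 4; // e.g. one warp_group_dot split into 4
  const int pendings = numProperlyAsyncDots - 1;
  // The test below checks exactly four ttng.warp_group_dot ops, each followed
  // by ttng.warp_group_dot_wait {pendings = 3 : i32}.
  std::printf("wait until at most %d dots remain in flight\n", pendings);
}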

test/TritonGPU/loop-pipeline-hopper.mlir

Lines changed: 77 additions & 0 deletions

@@ -816,6 +816,83 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: dot_lhs_in_reg_with_epilogue
+  tt.func @dot_lhs_in_reg_with_epilogue(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: i1) -> tensor<128x16xf32, #mma> {
+    %cst = arith.constant dense<0> : tensor<128x64xi32, #blocked1>
+    %cst1 = arith.constant dense<0> : tensor<64x16xi32, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked>
+    %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1>
+    %c0_i64 = arith.constant 0 : i64
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma>
+    %cst_3 = arith.constant dense<0> : tensor<128x64xi32, #blocked1>
+    %cst_4 = arith.constant dense<2.0> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    %c1_i32 = arith.constant 1 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr<f16>, i64
+    %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr<f16>, i64
+    %2 = tt.splat %1 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked1>
+    %3 = tt.addptr %2, %cst_1 : tensor<128x1x!tt.ptr<f16>, #blocked1>, tensor<128x1xi32, #blocked1>
+    %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
+    %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1>
+    %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr<f16>, #blocked1> -> tensor<128x64x!tt.ptr<f16>, #blocked1>
+    %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1>
+    %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
+    %10 = tt.splat %0 : !tt.ptr<f16> -> tensor<1x16x!tt.ptr<f16>, #blocked>
+    %11 = tt.addptr %10, %cst_0 : tensor<1x16x!tt.ptr<f16>, #blocked>, tensor<1x16xi32, #blocked>
+    %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
+    %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr<f16>, #blocked> -> tensor<64x16x!tt.ptr<f16>, #blocked>
+    %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked>
+    %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
+    // CHECK: scf.for
+    // CHECK: ttg.async_wait {{.*}} {num = 2 : i32}
+    // CHECK: ttng.warp_group_dot
+    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
+    // CHECK: ttng.warp_group_dot
+    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
+    // CHECK: ttng.warp_group_dot
+    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
+    // CHECK: ttng.warp_group_dot
+    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
+    // CHECK: ttg.async_copy_global_to_local
+    // CHECK: ttg.async_copy_global_to_local
+    // CHECK: ttg.async_commit_group
+    // CHECK: scf.if
+    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 0 : i32}
+    // CHECK: } else {
+    // CHECK-NOT: ttng.warp_group_dot_wait
+    // CHECK: scf.yield
+    %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %8, %arg6 = %16) -> (tensor<128x16xf32, #mma>, tensor<128x64x!tt.ptr<f16>, #blocked1>,
+            tensor<64x16x!tt.ptr<f16>, #blocked>) : i32 {
+      %a_block = tt.load %arg5 : tensor<128x64x!tt.ptr<f16>, #blocked1>
+      %b_block = tt.load %arg6 : tensor<64x16x!tt.ptr<f16>, #blocked>
+      %a_dotop = ttg.convert_layout %a_block : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+      %a_dotop_mul = arith.mulf %a_dotop, %cst_4 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+      %b_smem = ttg.local_alloc %b_block : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
+      %25 = ttng.warp_group_dot %a_dotop_mul, %b_smem, %arg4 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<128x16xf32, #mma>
+      %26 = tt.addptr %arg5, %cst : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
+      %27 = tt.addptr %arg6, %cst1 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
+      %28 = scf.if %arg2 -> tensor<128x16xf32, #mma> {
+        %29 = arith.addf %25, %25 : tensor<128x16xf32, #mma>
+        scf.yield %29: tensor<128x16xf32, #mma>
+      } else {
+        scf.yield %25: tensor<128x16xf32, #mma>
+      }
+      scf.yield %28, %26, %27 : tensor<128x16xf32, #mma>, tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<64x16x!tt.ptr<f16>, #blocked>
+    }
+    tt.return %17#0 : tensor<128x16xf32, #mma>
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
