Commit ba13f84

Revert "[BACKEND] Add missing waits in WGMMA rhs in register pipelining" (#8970)
Temporarily revert due to performance regressions #8964
1 parent 6f4f943 commit ba13f84

2 files changed: +25 -122 lines changed

lib/Dialect/TritonGPU/Transforms/Pipeliner/WGMMAPipeline.cpp

Lines changed: 25 additions & 45 deletions
@@ -348,7 +348,7 @@ std::vector<ttng::WarpGroupDotOp> splitRSDot(ttng::WarpGroupDotOp dotOp) {
           dotOp.getInputPrecision(), numImpreciseAcc, dotOp.getIsAsync());
     dots.push_back(dot);
     C = dot.getResult();
-    useC = {};
+    useC = mlir::arith::ConstantIntOp::create(builder, loc, 1, 1);
   }
   dotOp.replaceAllUsesWith(dots.back().getResult());
   dotOp.erase();
@@ -588,64 +588,44 @@ static void insertAsyncWarpGroupDotWaitInLoop(
   // Insert waits before the users of the properly async dots other than loop
   // yield.
   for (auto asyncDot : llvm::make_first_range(properlyAsyncDots)) {
-    DenseMap<Block *, SmallVector<OpOperand *>> blockToUses;
+    // If the dot takes the LHS on registers i, we add a wait for the number
+    // of properly async dots in the loop minus one.
+    // This makes sure that the dot will wait until itself from the previous
+    // iteration has completed, as to avoid rewriting the registers.
+    if (rsDotNeedsWait(asyncDot, forOp)) {
+      OpBuilder builder(asyncDot);
+      builder.setInsertionPointAfter(asyncDot);
+      auto newWait = ttng::WarpGroupDotWaitOp::create(
+          builder, asyncDot->getLoc(), ArrayRef<Value>{},
+          properlyAsyncDots.size() - 1);
+      SmallVector<Value> waitOperands = {asyncDot->getResult(0)};
+      threadValuesThroughWait(newWait, waitOperands);
+      continue;
+    }
+
+    SmallVector<OpOperand *> uses;
     for (auto &use : asyncDot->getUses()) {
       if (auto yieldOp = dyn_cast<scf::YieldOp>(use.getOwner())) {
         continue;
       }
-
-      auto block = use.getOwner()->getBlock();
-      blockToUses[block].push_back(&use);
+      uses.push_back(&use);
     }

-    for (auto [block, uses] : blockToUses) {
-      // Insert a wait before the first use in the block
-      std::sort(uses.begin(), uses.end(), [](OpOperand *lhs, OpOperand *rhs) {
-        Operation *lhsOp = lhs->getOwner();
-        Operation *rhsOp = rhs->getOwner();
-        return lhsOp->isBeforeInBlock(rhsOp);
-      });
-
-      // If a wgmma uses the same accumulator registers, it will be implicitly
-      // pipelined by the hardware and doesn't need a wait.
-      auto firstUse =
-          std::find_if_not(uses.begin(), uses.end(), [](OpOperand *operand) {
-            return (isa<ttng::WarpGroupDotOp>(operand->getOwner()) &&
-                    operand->getOperandNumber() == 2);
-          });
-      if (firstUse == uses.end()) {
-        continue;
-      }
+    DenseMap<Block *, SmallVector<Value>> blockToUsers;
+    for (auto use : uses) {
+      auto block = use->getOwner()->getBlock();
+      blockToUsers[block].push_back(use->get());
+    }

-      OpBuilder builder((*firstUse)->getOwner());
+    for (auto [block, users] : blockToUsers) {
+      OpBuilder builder(block, block->begin());
       auto newWait = ttng::WarpGroupDotWaitOp::create(
           builder, asyncDot->getLoc(), ArrayRef<Value>{}, 0);

-      SmallVector<Value> users;
-      for (; firstUse != uses.end(); ++firstUse) {
-        users.push_back((*firstUse)->get());
-      }
       threadValuesThroughWait(newWait, users);
     }
   }

-  for (auto asyncDot : llvm::make_first_range(properlyAsyncDots)) {
-    // If the dot takes the LHS on registers i, we add a wait for the number
-    // of properly async dots in the loop minus one.
-    // This makes sure that the dot will wait until itself from the previous
-    // iteration has completed, as to avoid rewriting the registers.
-    if (!rsDotNeedsWait(asyncDot, forOp))
-      continue;
-
-    OpBuilder builder(asyncDot);
-    builder.setInsertionPointAfter(asyncDot);
-    auto newWait = ttng::WarpGroupDotWaitOp::create(
-        builder, asyncDot->getLoc(), ArrayRef<Value>{},
-        properlyAsyncDots.size() - 1);
-    SmallVector<Value> waitOperands = {asyncDot->getResult(0)};
-    threadValuesThroughWait(newWait, waitOperands);
-  }
-
   // Add the wait right after the last properly-async dot. This only needs to
   // wait for all properly-async dots from the i-1'th iteration to complete, IOW
   // we wait until there are most `asyncDots.size()` dots in flight.

test/TritonGPU/loop-pipeline-hopper.mlir

Lines changed: 0 additions & 77 deletions
@@ -816,83 +816,6 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-

 // -----

-#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
-#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
-#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}>
-#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
-#smem = #ttg.shared_memory
-module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
-// CHECK-LABEL: dot_lhs_in_reg_with_epilogue
-  tt.func @dot_lhs_in_reg_with_epilogue(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: i1) -> tensor<128x16xf32, #mma> {
-    %cst = arith.constant dense<0> : tensor<128x64xi32, #blocked1>
-    %cst1 = arith.constant dense<0> : tensor<64x16xi32, #blocked>
-    %c0_i32 = arith.constant 0 : i32
-    %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked>
-    %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1>
-    %c0_i64 = arith.constant 0 : i64
-    %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma>
-    %cst_3 = arith.constant dense<0> : tensor<128x64xi32, #blocked1>
-    %cst_4 = arith.constant dense<2.0> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-    %c1_i32 = arith.constant 1 : i32
-    %c8_i32 = arith.constant 8 : i32
-    %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr<f16>, i64
-    %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr<f16>, i64
-    %2 = tt.splat %1 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked1>
-    %3 = tt.addptr %2, %cst_1 : tensor<128x1x!tt.ptr<f16>, #blocked1>, tensor<128x1xi32, #blocked1>
-    %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
-    %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1>
-    %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr<f16>, #blocked1> -> tensor<128x64x!tt.ptr<f16>, #blocked1>
-    %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1>
-    %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
-    %10 = tt.splat %0 : !tt.ptr<f16> -> tensor<1x16x!tt.ptr<f16>, #blocked>
-    %11 = tt.addptr %10, %cst_0 : tensor<1x16x!tt.ptr<f16>, #blocked>, tensor<1x16xi32, #blocked>
-    %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
-    %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
-    %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr<f16>, #blocked> -> tensor<64x16x!tt.ptr<f16>, #blocked>
-    %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked>
-    %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
-    // CHECK: scf.for
-    // CHECK: ttg.async_wait {{.*}} {num = 2 : i32}
-    // CHECK: ttng.warp_group_dot
-    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
-    // CHECK: ttng.warp_group_dot
-    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
-    // CHECK: ttng.warp_group_dot
-    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
-    // CHECK: ttng.warp_group_dot
-    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 3 : i32}
-    // CHECK: ttg.async_copy_global_to_local
-    // CHECK: ttg.async_copy_global_to_local
-    // CHECK: ttg.async_commit_group
-    // CHECK: scf.if
-    // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 0 : i32}
-    // CHECK: } else {
-    // CHECK-NOT: ttng.warp_group_dot_wait
-    // CHECK: scf.yield
-    %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %8, %arg6 = %16) -> (tensor<128x16xf32, #mma>, tensor<128x64x!tt.ptr<f16>, #blocked1>,
-                                                                                                                   tensor<64x16x!tt.ptr<f16>, #blocked>) : i32 {
-      %a_block = tt.load %arg5 : tensor<128x64x!tt.ptr<f16>, #blocked1>
-      %b_block = tt.load %arg6 : tensor<64x16x!tt.ptr<f16>, #blocked>
-      %a_dotop = ttg.convert_layout %a_block : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-      %a_dotop_mul = arith.mulf %a_dotop, %cst_4 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
-      %b_smem = ttg.local_alloc %b_block : (tensor<64x16xf16, #blocked>) -> !ttg.memdesc<64x16xf16, #shared, #smem>
-      %25 = ttng.warp_group_dot %a_dotop_mul, %b_smem, %arg4 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * !ttg.memdesc<64x16xf16, #shared, #smem> -> tensor<128x16xf32, #mma>
-      %26 = tt.addptr %arg5, %cst : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
-      %27 = tt.addptr %arg6, %cst1 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
-      %28 = scf.if %arg2 -> tensor<128x16xf32, #mma> {
-        %29 = arith.addf %25, %25 : tensor<128x16xf32, #mma>
-        scf.yield %29: tensor<128x16xf32, #mma>
-      } else {
-        scf.yield %25: tensor<128x16xf32, #mma>
-      }
-      scf.yield %28, %26, %27 : tensor<128x16xf32, #mma>, tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<64x16x!tt.ptr<f16>, #blocked>
-    }
-    tt.return %17#0 : tensor<128x16xf32, #mma>
-  }
-}
-
-// -----
-
 #blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
