Skip to content

Commit 366de71

Browse files
authored
[TritonGPU] Tweaks to warp specialization to reduce register pressure (#6403)
* Place TMEM accumulator acquire over the entire epilogue to improve instruction scheduling * Give the load partition 2 warps This marginally improves the performance of the tutorial matmul (~2.5%) but is important for cases where spilling may occur
1 parent 8de17d2 commit 366de71

File tree

4 files changed

+46
-16
lines changed

4 files changed

+46
-16
lines changed

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,17 +381,29 @@ LogicalResult triton::gpu::specializeLoadMMADependencies(scf::ForOp &loop,
381381
donePt.getPoint()->isBeforeInBlock(&*b.getInsertionPoint()));
382382
donePt = b.saveInsertionPoint();
383383

384-
// Acquire and get the accumulator result.
385-
b.setInsertionPoint(domOp);
386384
Partition *userPartition = schedule.addPartition(numStages + numMmaStages);
385+
// Acquire and get the accumulator result. Normally, we want to acquire the
386+
// accumulator for as small of a critical section as possible to unblock
387+
// dependents, but if the most dominating user is inside a conditional,
388+
// acquire the accumulator for the whole branch. This will improve
389+
// instruction scheduling and interleaving of the TMEM load.
390+
bool userInConditional = isa<scf::IfOp>(domOp->getParentOp());
391+
b.setInsertionPoint(domOp);
392+
if (userInConditional)
393+
b.setInsertionPointToStart(domOp->getBlock());
387394
createInPartition<ttng::WaitBarrierOp>(b, *userPartition, curAccReadyBar,
388395
accPhase);
396+
397+
b.setInsertionPoint(domOp);
389398
Value acc = createInPartition<ttng::TMEMLoadOp>(
390399
b, *userPartition, info.accLoad.getType(), curAccBuf);
391400
for (Operation *user : accUses)
392401
user->replaceUsesOfWith(info.accLoad, acc);
402+
393403
// Signal the accumulator buffer is ready for the next iteration. Because
394404
// the mbarriers got shifted over by 1, we have to signal the next mbarrier.
405+
if (userInConditional)
406+
b.setInsertionPoint(domOp->getBlock()->getTerminator());
395407
Value nextIndex =
396408
b.create<arith::AddIOp>(accIndex, intCst(numMmaStages - 1));
397409
nextIndex = b.create<arith::RemUIOp>(nextIndex, intCst(numMmaStages));

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/OptimizePartitionWarps.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
#include "triton/Analysis/AxisInfo.h"
66
#include "triton/Conversion/TritonToTritonGPU/Passes.h"
77
#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
8+
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
89
#include "llvm/ADT/ScopeExit.h"
910

1011
using namespace mlir;
1112
using namespace triton;
1213
using namespace triton::gpu;
14+
namespace ttng = triton::nvidia_gpu;
1315

1416
//===----------------------------------------------------------------------===//
1517
// relayoutWarps
@@ -182,14 +184,28 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
182184
// If the compiler could control that, then we could allow non-uniform
183185
// register distributions, mostly beneficial for single-warp warpgroups that
184186
// just do some arithmetic.
185-
constexpr unsigned nTotalRegs = 65536; // for Blackwell SMs
187+
constexpr unsigned nTotalRegs = 1 << 16; // for Blackwell SMs
186188
const unsigned threadsPerWarp =
187189
TritonGPUDialect::getThreadsPerWarp(axisInfo.getModuleOp());
188190
const unsigned defaultNumWarps = lookupNumWarps(wsOp);
189191

190192
SmallVector<int32_t> partitionNumWarps =
191193
llvm::to_vector(wsOp.getPartitionNumWarps());
192194

195+
// Some instructions have critical throughput if they have low register usage. Make
196+
// sure there are enough warps for these ops to execute quickly.
197+
SmallVector<int32_t> minWarpsForPartition(partitionNumWarps.size(), 1);
198+
for (auto [minWarps, region] :
199+
llvm::zip(minWarpsForPartition, wsOp.getPartitionRegions())) {
200+
region->walk([minWarps = &minWarps](Operation *op) {
201+
if (!isa<scf::ForOp>(op->getParentOp()))
202+
return;
203+
if (isa<ttng::AsyncTMAGatherOp, ttng::AsyncTMAScatterOp,
204+
ttng::AsyncTMACopyGlobalToLocalOp>(op))
205+
*minWarps = 2;
206+
});
207+
}
208+
193209
bool changed;
194210
do {
195211
changed = false;
@@ -215,9 +231,9 @@ static LogicalResult optimizePartitionNumWarps(ModuleAxisInfoAnalysis &axisInfo,
215231
int32_t curTotalNumWarps = std::accumulate(
216232
partitionNumWarps.begin(), partitionNumWarps.end(), defaultNumWarps);
217233

218-
for (auto [numWarps, tensorRegs] :
219-
llvm::zip(partitionNumWarps, maxTensorRegs)) {
220-
if (numWarps == 1)
234+
for (auto [minWarps, numWarps, tensorRegs] :
235+
llvm::zip(minWarpsForPartition, partitionNumWarps, maxTensorRegs)) {
236+
if (numWarps <= minWarps)
221237
continue;
222238
// Check if reducing the number of warps will still fit the tensor. If it
223239
// didn't fit to begin with, it won't fit after shrinking.

test/TritonGPU/automatic-warp-specialization.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ tt.func @matmul_change_desc_in_prologue(
3232
// BASE-NOT: tt.make_tensor_descriptor
3333
// PIPELINE-NOT: tt.experimental_tensormap_create
3434
// CHECK-LABEL: partition1
35-
// CHECK-SAME: num_warps(1)
35+
// CHECK-SAME: num_warps(2)
3636
// BASE-COUNT-2: tt.make_tensor_descriptor
3737
// PIPELINE-COUNT-2: ttg.global_scratch_alloc {alignment = 128 : i32, nbytes = 512 : i32}
3838
// PIPELINE-COUNT-2: tt.experimental_tensormap_create
@@ -87,7 +87,7 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use(
8787
// CHECK-LABEL: partition0
8888
// CHECK-SAME: num_warps(1)
8989
// CHECK-LABEL: partition1
90-
// CHECK-SAME: num_warps(1)
90+
// CHECK-SAME: num_warps(2)
9191
// CHECK: [[INDICES:%.*]] = tt.splat %{{.*}} : i32 -> tensor<128xi32,
9292
// CHECK: ttng.async_tma_gather %{{.*}}[[[INDICES]],
9393
// CHECK-LABEL: partition2

test/TritonGPU/load-mma-specialization.mlir

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -484,10 +484,10 @@ tt.func @matmul_tma_acc_with_conditional_user(
484484
scf.if %do_epilogue {
485485
// CHECK-NEXT: ttng.wait_barrier [[CUR_ACC_READY_BAR]], [[ACC_PHASE]] {ttg.partition = 0 : i32}
486486
// CHECK-NEXT: [[C:%.*]] = ttng.tmem_load [[ACC_BUF]] {ttg.partition = 0 : i32}
487-
// CHECK-NEXT: [[NEXT_ACC_EMPTY_BAR:%.*]] = ttg.memdesc_subview [[ACC_EMPTY_BUFS]][[[NEXT_ACC_INDEX]]]
488-
// CHECK-NEXT: ttng.arrive_barrier [[NEXT_ACC_EMPTY_BAR]], 1 {ttg.partition = 0 : i32}
489487
// CHECK-NEXT: "acc_user"([[C]])
490488
"acc_user"(%c) : (tensor<128x128xf32, #acc_layout>) -> ()
489+
// CHECK-NEXT: [[NEXT_ACC_EMPTY_BAR:%.*]] = ttg.memdesc_subview [[ACC_EMPTY_BUFS]][[[NEXT_ACC_INDEX]]]
490+
// CHECK-NEXT: ttng.arrive_barrier [[NEXT_ACC_EMPTY_BAR]], 1 {ttg.partition = 0 : i32}
491491
// CHECK-NEXT: } {ttg.partition = 0 : i32}
492492
}
493493

@@ -513,7 +513,7 @@ tt.func @matmul_tma_acc_with_conditional_user(
513513

514514
// AWS: ttg.warp_specialize
515515
// AWS: num_warps(4)
516-
// AWS: num_warps(1)
516+
// AWS: num_warps(2)
517517
// AWS: num_warps(1)
518518

519519
// CHECK: @matmul_tma_acc_with_conditional_def
@@ -612,7 +612,7 @@ tt.func @matmul_tma_acc_with_conditional_def(
612612

613613
// AWS: ttg.warp_specialize
614614
// AWS: num_warps(4)
615-
// AWS: num_warps(1)
615+
// AWS: num_warps(2)
616616
// AWS: num_warps(1)
617617

618618
// CHECK: @matmul_tma_acc_with_conditional_def_and_use
@@ -682,10 +682,10 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use(
682682
scf.if %do_epilogue {
683683
// CHECK-NEXT: ttng.wait_barrier [[CUR_ACC_READY_BAR]], [[ACC_PHASE]] {ttg.partition = 0 : i32}
684684
// CHECK-NEXT: [[C:%.*]] = ttng.tmem_load [[ACC_BUF]] {ttg.partition = 0 : i32}
685-
// CHECK-NEXT: [[NEXT_ACC_EMPTY_BAR:%.*]] = ttg.memdesc_subview [[ACC_EMPTY_BUFS]][[[NEXT_ACC_INDEX]]]
686-
// CHECK-NEXT: ttng.arrive_barrier [[NEXT_ACC_EMPTY_BAR]], 1 {ttg.partition = 0 : i32}
687685
// CHECK-NEXT: "acc_user"([[C]])
688686
"acc_user"(%c) : (tensor<128x128xf32, #acc_layout>) -> ()
687+
// CHECK-NEXT: [[NEXT_ACC_EMPTY_BAR:%.*]] = ttg.memdesc_subview [[ACC_EMPTY_BUFS]][[[NEXT_ACC_INDEX]]]
688+
// CHECK-NEXT: ttng.arrive_barrier [[NEXT_ACC_EMPTY_BAR]], 1 {ttg.partition = 0 : i32}
689689
// CHECK-NEXT: } {ttg.partition = 0 : i32}
690690
}
691691

@@ -714,7 +714,7 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use(
714714

715715
// AWS: ttg.warp_specialize
716716
// AWS: num_warps(1)
717-
// AWS: num_warps(1)
717+
// AWS: num_warps(2)
718718
// AWS: num_warps(1)
719719

720720
// CHECK: @matmul_tma_acc_with_conditional_def_and_use_no_multibuf
@@ -791,10 +791,12 @@ tt.func @matmul_tma_acc_with_conditional_def_and_use_no_multibuf_flag(
791791
// CHECK-NEXT: scf.if [[DO_EPILOGUE]]
792792
scf.if %do_epilogue {
793793
// CHECK-NEXT: ttng.wait_barrier [[ACC_READY_BUF0]], [[ACC_PHASE]] {ttg.partition = 0 : i32}
794+
// CHECK-NEXT: "some_op"()
795+
"some_op"() : () -> ()
794796
// CHECK-NEXT: [[C:%.*]] = ttng.tmem_load [[ACC_BUF]] {ttg.partition = 0 : i32}
795-
// CHECK-NEXT: ttng.arrive_barrier [[ACC_EMPTY_BUF0]], 1 {ttg.partition = 0 : i32}
796797
// CHECK-NEXT: "acc_user"([[C]])
797798
"acc_user"(%c) : (tensor<128x128xf32, #acc_layout>) -> ()
799+
// CHECK-NEXT: ttng.arrive_barrier [[ACC_EMPTY_BUF0]], 1 {ttg.partition = 0 : i32}
798800
// CHECK-NEXT: } {ttg.partition = 0 : i32}
799801
}
800802

0 commit comments

Comments
 (0)