Commit 12419f6

[Blackwell] Fix the TMEM message heuristic (#6692)
Based on feedback from @csullivan: the heuristic is also supposed to avoid using two `.x128` messages when the total workload size is 256 elements per thread. Account for that, in addition to the register size of each individual message.
1 parent 576e889 commit 12419f6
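
For intuition, here is a minimal sketch of the register-budget arithmetic behind the fix. All numbers are assumptions chosen for illustration (maxnreg of 256 and the 256 elements-per-thread workload mentioned above), not values taken from the lowering:

#include <cstdio>

int main() {
  // Illustrative assumptions, not values read from the compiler.
  const int maxnreg = 256;         // registers available per thread
  const int budget = maxnreg / 2;  // heuristic: use at most half of them
  const int workloadRegs = 256;    // 256 elements per thread, one reg each
  const int x128Regs = 128;        // regs held live by one .x128 message
  const int x64Regs = 64;          // regs held live by one .x64 message

  // A single .x128 message fits the budget on its own (128 <= 128), which is
  // all the old heuristic checked, so it emitted two .x128 messages.
  printf(".x128 per-message fit: %s\n", x128Regs <= budget ? "yes" : "no");

  // But the workload as a whole needs 256 regs (> 128), so the fixed
  // heuristic narrows the message and emits four smaller ones instead.
  printf("workload fits budget:  %s\n", workloadRegs <= budget ? "yes" : "no");
  printf("messages emitted: %d x .x64\n", workloadRegs / x64Regs);
  return 0;
}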

2 files changed (+17, -8 lines)

test/Conversion/tritongpu_to_llvm_blackwell.mlir

Lines changed: 2 additions & 2 deletions
@@ -389,10 +389,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: @tensor_memory_ld_128x256
-  // CHECK-COUNT-2: tcgen05.st.sync.aligned.32x32b.x128.b32
+  // CHECK-COUNT-4: tcgen05.st.sync.aligned.32x32b.x64.b32
   // CHECK-NOT: tcgen05.st
   // CHECK: tcgen05.wait::st.sync.aligned
-  // CHECK-COUNT-2: tcgen05.ld.sync.aligned.32x32b.x128.b32
+  // CHECK-COUNT-4: tcgen05.ld.sync.aligned.32x32b.x64.b32
   // CHECK-NOT: tcgen05.ld
   // CHECK: tcgen05.wait::ld.sync.aligned
   tt.func public @tensor_memory_ld_128x256(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) {

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp

Lines changed: 15 additions & 6 deletions
@@ -134,14 +134,19 @@ TMemMessageTraits getTMemMessageFromAtom(const TMemAccessAtom &atom,
   return m;
 }
 
-// Only allows half of the thread registers to be used for tensor memory access
-// to avoid register pressure. This ensures the largest tmem message width is
-// used for the workload without inducing spills.
-int getTMemMessageNarrowingFactor(const TMemAccessAtom &atom, int maxnreg) {
+// Narrow the TMEM message by reducing the number of registers per TMEM
+// instruction such that:
+// - No instruction uses more than half the available registers at a time.
+// - If the total number of registers required by the workload is more than half
+//   of the available registers, don't use the largest TMEM message.
+int getTMemMessageNarrowingFactor(const TMemAccessAtom &atom,
+                                  int workloadThreadRegs, int maxnreg) {
   const int allowedRegUsage = maxnreg / 2;
   int narrowingFactor = 1;
   while (getTMemMessageFromAtom(atom, narrowingFactor).numRegs >
-             allowedRegUsage) {
+             allowedRegUsage ||
+         workloadThreadRegs > allowedRegUsage) {
+    workloadThreadRegs /= 2;
     narrowingFactor *= 2;
   }
   return narrowingFactor;
@@ -381,7 +386,11 @@ void createWaitOpSt(Location loc, ConversionPatternRewriter &rewriter) {
 TMemMessageTraits selectTMemMessage(const TMemRuntimeInfo &info, int maxnreg) {
   auto atom = info.useStridedMessage ? TMemAccess16x32bx2 : TMemAccess32x32b;
 
-  int narrowingFactor = getTMemMessageNarrowingFactor(atom, maxnreg);
+  int totalRegsNeeded =
+      getEffectiveRegs(info.unpackedb16, info.useStridedMessage,
+                       info.numCols / info.numWarpGroups);
+  int narrowingFactor =
+      getTMemMessageNarrowingFactor(atom, totalRegsNeeded, maxnreg);
   auto narrowedMessage = getTMemMessageFromAtom(atom, narrowingFactor);
   narrowedMessage = constrainMessageFromWorkload(narrowedMessage, info,
                                                  narrowedMessage.numRegs);
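
As a rough illustration of how the new loop behaves for the 128x256 test above, here is a self-contained sketch. messageRegs is a hypothetical stand-in for getTMemMessageFromAtom(atom, factor).numRegs, and the concrete numbers (256 workload registers per thread, maxnreg of 256) are assumptions, not values traced from the compiler:

#include <cstdio>

// Hypothetical stand-in for getTMemMessageFromAtom(...).numRegs: assume the
// widest 32x32b message (.x128) holds 128 regs and each doubling of the
// narrowing factor halves that (.x64 -> 64, .x32 -> 32, ...).
static int messageRegs(int narrowingFactor) { return 128 / narrowingFactor; }

// Mirrors the narrowing loop from the patch, outside the compiler.
static int narrowingFactorFor(int workloadThreadRegs, int maxnreg) {
  const int allowedRegUsage = maxnreg / 2;
  int narrowingFactor = 1;
  while (messageRegs(narrowingFactor) > allowedRegUsage ||
         workloadThreadRegs > allowedRegUsage) {
    workloadThreadRegs /= 2;
    narrowingFactor *= 2;
  }
  return narrowingFactor;
}

int main() {
  // Assumed inputs for the 128x256 case: 256 regs of workload per thread
  // and maxnreg = 256, i.e. a budget of 128 regs.
  int factor = narrowingFactorFor(/*workloadThreadRegs=*/256, /*maxnreg=*/256);
  // Prints 2: the .x128 atom is narrowed once, to .x64, so the 256-reg
  // workload is covered by four .x64 messages (matching the updated test).
  printf("narrowing factor = %d\n", factor);
  return 0;
}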
