Skip to content

Commit a2d179d

Browse files
authored
[Blackwell] Fix perf regression (#7643)
Somehow, optimizing this code when we know the number of warp groups is 1 results in major performance regressions...
1 parent cf399b4 commit a2d179d

File tree

1 file changed

+4
-8
lines changed

1 file changed

+4
-8
lines changed

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -308,14 +308,10 @@ void calculateAddressAndEmitTmemMessage(
308308

309309
TritonLLVMOpBuilder b(loc, rewriter);
310310
Value warpId = rewriter.create<nvgpu::WarpIdOp>(loc);
311-
Value warpIdInGroup, warpGroupId;
312-
if (info.numWarpGroups == 1) {
313-
warpIdInGroup = warpId;
314-
warpGroupId = b.i32_val(0);
315-
} else {
316-
warpIdInGroup = b.urem(warpId, b.i32_val(4));
317-
warpGroupId = b.udiv(warpId, b.i32_val(4));
318-
}
311+
// Note: optimizing this when we know `info.numWarpGroups` is 1 can result in
312+
// performance regressions.
313+
Value warpIdInGroup = b.urem(warpId, b.i32_val(4));
314+
Value warpGroupId = b.udiv(warpId, b.i32_val(4));
319315

320316
// When split along M, blockM=128 and num_warps=8, and a strided message is
321317
// selected such that all 8 warps read a 16 rows of a block at a time.

0 commit comments

Comments
 (0)