[Backend] Disable LLVM LICM on warp specialize switch loop (#6870)

Mogball · web-flow · commit cd57ce910122 · 2025-05-19T13:56:01.000-07:00
This prevents LLVM from hoisting an arbitrary number of values out of the
switch loop, which become live across all partition regions. This can
induce tons of spilling in warp specialized kernels.
diff --git a/test/Conversion/warp_specialize_to_llvm.mlir b/test/Conversion/warp_specialize_to_llvm.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -allow-unregistered-dialect -convert-warp-specialize-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file -mlir-print-local-scope -allow-unregistered-dialect -convert-warp-specialize-to-llvm | FileCheck %s
 
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.total-num-warps" = 11 : i32} {
 
@@ -76,7 +76,7 @@ llvm.func @generate_switch_loop() attributes {allocation.offset = 32 : i32} {
   // CHECK: [[DEFAULT]]:
   // CHECK-NEXT: barrier.sync 1 ;
   // CHECK-NEXT: barrier.sync 1 ;
-  // CHECK-NEXT: llvm.br [[SWITCH_LOOP]]
+  // CHECK-NEXT: llvm.br [[SWITCH_LOOP]] {loop_annotation = #llvm.loop_annotation<licm = <disable = true>>}
 
   // CHECK: [[EXIT]]:
   // CHECK-NEXT: llvm.return
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp
@@ -294,6 +294,23 @@ static void rewritePartitionRegions(WarpSpecializeOp ws, Block *switchLoop,
   }
 }
 
+// The switch loop emitted when lowering `ttg.warp_specialize` is an attractive
+// target for LLVM's LICM, which will hoist code out of it. Neither NVPTX nor
+// `ptxas` rematerializes the hoisted code back into the partition regions, so
+// an arbitrary number of registers end up with long live ranges.
+//
+// Since warp groups have a reduced register count, these live values can
+// induce spilling in the partition regions. Disable LICM on the switch loop.
+static void disableLICM(LLVM::BrOp latchBr) {
+  MLIRContext *context = latchBr.getContext();
+  auto licm =
+      LLVM::LoopLICMAttr::get(context, Builder(context).getBoolAttr(true), {});
+  // Only the LICM slot is populated; the other annotation fields stay empty.
+  auto annotation = LLVM::LoopAnnotationAttr::get(
+      context, {}, {}, {}, {}, {}, licm, {}, {}, {}, {}, {}, {}, {}, {}, {});
+  latchBr.setLoopAnnotationAttr(annotation);
+}
+
 static LogicalResult lowerWarpSpecialize(LLVM::LLVMFuncOp func,
                                          const NVIDIA::TargetInfo &targetInfo) {
   SmallVector<WarpSpecializeOp> wsOps;
@@ -415,7 +432,8 @@ static LogicalResult lowerWarpSpecialize(LLVM::LLVMFuncOp func,
                 /*aligned=*/false);
   createBarrier(b, kSwitchLoopBarrierIdx, /*numThreads=*/std::nullopt,
                 /*aligned=*/false);
-  b.create<LLVM::BrOp>(switchLoop);
+  auto latchBr = b.create<LLVM::BrOp>(switchLoop);
+  disableLICM(latchBr);
 
   // Exit state.
   Block *switchExit = new Block;