[Backend] Codegen warpId to 0 when there is 1 contextual warp (#6823)

Mogball · web-flow · commit 6377474b48d7 · 2025-05-14T23:43:47.000-07:00
Emitting a constant 0 for the warp ID allows a bunch of code to fold away
trivially, especially in the MMA and load partitions of warp-specialized kernels.
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -207,9 +207,16 @@ std::pair<Value, Value> getLaneAndWarpId(OpBuilder &rewriter, Location loc) {
   Value tid = getThreadId(rewriter, loc);
   int threadsPerWarp = triton::gpu::lookupThreadsPerWarp(rewriter);
   Value warpSizeVal = b.i32_val(threadsPerWarp);
-
   Value laneId = b.urem(tid, warpSizeVal);
-  Value warpId = b.udiv(tid, warpSizeVal);
+
+  // If there is only one warp, the warp ID is always 0.
+  Operation *lookupPt = &rewriter.getInsertionBlock()->front();
+  Value warpId;
+  if (triton::gpu::lookupNumWarps(lookupPt) == 1)
+    warpId = b.i32_val(0);
+  else
+    warpId = b.udiv(tid, warpSizeVal);
+
   return {laneId, warpId};
 }
 
diff --git a/test/Conversion/nvgpu_to_llvm.mlir b/test/Conversion/nvgpu_to_llvm.mlir
@@ -211,3 +211,40 @@ llvm.func @warpid_warp_specialize() {
 }
 
 }
+
+// -----
+
+module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
+
+// CHECK-LABEL: @one_warp
+tt.func @one_warp() -> i32 {
+  // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
+  %0 = nvgpu.warp_id
+  // CHECK-NEXT: return [[C0]]
+  tt.return %0 : i32
+}
+
+}
+
+// -----
+
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+
+// CHECK-LABEL: @one_contextual_warp
+tt.func @one_contextual_warp() {
+  ttg.warp_specialize()
+  default {
+    ttg.warp_yield
+  }
+  // CHECK: partition0
+  partition0() num_warps(1) {
+    // CHECK-NEXT: [[C0:%.*]] = llvm.mlir.constant(0 : i32)
+    %0 = nvgpu.warp_id
+    // CHECK-NEXT: "use"([[C0]])
+    "use"(%0) : (i32) -> ()
+    ttg.warp_return
+  } : () -> ()
+  tt.return
+}
+
+}
diff --git a/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp b/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp
@@ -232,6 +232,12 @@ class WarpIdOpPattern : public OpRewritePattern<ttn::WarpIdOp> {
     auto loc = op.getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
 
+    if (triton::gpu::lookupNumWarps(op) == 1) {
+      // If there is only one warp, the warp ID is always 0.
+      rewriter.replaceOp(op, b.i32_val(0));
+      return success();
+    }
+
     // If this is inside a warp specialize op, compute the relative thread ID
     // within the warp group.
     Value tid = rewriter.create<NVVM::ThreadIdXOp>(loc, i32_ty);