
Commit 0714ac4

rmsnorm bwd: make blocking math more robust
ghstack-source-id: 1b9366a
Pull-Request: #532
1 parent 496c120 commit 0714ac4

File tree

1 file changed (+2, -2 lines)


tritonbench/operators/rms_norm/fused_triton.py

Lines changed: 2 additions & 2 deletions
@@ -106,8 +106,8 @@ def backward(ctx, dy):
 
         M, N = x_arg.shape
         NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
-        BLOCK_SIZE_M = min(2048, triton.next_power_of_2(M // (8 * NUM_SMS)))
-        PARTIAL_SIZE = math.ceil(M / BLOCK_SIZE_M)
+        BLOCK_SIZE_M = min(2048, triton.next_power_of_2(triton.cdiv(M, (8 * NUM_SMS))))
+        PARTIAL_SIZE = triton.cdiv(M, BLOCK_SIZE_M)
 
         # Columnwise stride for reducing partial sums at end, contiguous loads
         _dw = torch.empty((PARTIAL_SIZE, N), dtype=w.dtype, device=w.device)
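Why the new math is more robust (a minimal sketch, not part of the commit): when M is smaller than 8 * NUM_SMS, the old floor division M // (8 * NUM_SMS) evaluates to 0, and Triton's bit-twiddling next_power_of_2 maps 0 back to 0, so BLOCK_SIZE_M becomes min(2048, 0) == 0 and math.ceil(M / BLOCK_SIZE_M) raises ZeroDivisionError. triton.cdiv rounds up, so the inner quotient is at least 1; using it for PARTIAL_SIZE also keeps the arithmetic in integers. The stand-in helpers and the NUM_SMS value below are illustrative assumptions, not tritonbench code.

import math

def cdiv(a, b):
    # Stand-in for triton.cdiv: ceiling division in pure integers.
    return (a + b - 1) // b

def next_power_of_2(n):
    # Stand-in for triton.next_power_of_2; like Triton's bit-twiddling
    # version, it maps 0 to 0, which is exactly the hazard here.
    return 0 if n == 0 else 1 << (n - 1).bit_length()

M, NUM_SMS = 100, 132  # small batch, H100-class SM count: M < 8 * NUM_SMS

# Old blocking math: floor division collapses to 0 for small M.
block_old = min(2048, next_power_of_2(M // (8 * NUM_SMS)))  # == 0
try:
    partial_old = math.ceil(M / block_old)
except ZeroDivisionError:
    print("old math fails: division by zero")

# New blocking math: cdiv rounds up, so the block size is at least 1.
block_new = min(2048, next_power_of_2(cdiv(M, 8 * NUM_SMS)))  # == 1
partial_new = cdiv(M, block_new)                              # == 100
print("new math:", block_new, partial_new)

For large M the behavior is unchanged: the quotient is clamped at 2048 either way, and triton.cdiv gives the same partial count as math.ceil while avoiding float division.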

0 commit comments