Add less warps config to inner reductions (pytorch#162447)

PaulZhang12 · pytorchmergebot · commit 768361e67f0e · 2025-09-24T19:09:02.000Z
Add a config with fewer warps to ensure proper vectorization and memory coalescing for inner reductions, preferring more work per thread. <img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" /> Pull Request resolved: pytorch#162447 Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314
diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
@@ -2333,6 +2333,7 @@ def triton_config_reduction(
     num_warps=None,
     register_intensive=False,
     dynamic_scale_rblock=True,
+    reduction_hint=None,
 ) -> Config:
     """
     Construct a reduction triton config with some adjustment heuristics
@@ -2360,7 +2361,12 @@ def total_numel() -> int:
             rnumels[prefix] *= 2
 
     if num_warps is None:
-        num_warps = total_numel() // 128
+        if reduction_hint == ReductionHint.INNER:
+            # r is contiguous, so ensure that each thread has 8 elements for
+            # vectorized loads, assuming bf16/fp16
+            num_warps = r // (32 * 8)
+        else:
+            num_warps = total_numel() // 128
 
     max_num_warps = 16 if r <= 8192 else 32
     num_warps = _num_warps(
@@ -2630,6 +2636,7 @@ def make_config(
                 num_stages=num_stages,
                 register_intensive=register_intensive,
                 dynamic_scale_rblock=dynamic_scale_rblock,
+                reduction_hint=reduction_hint,
             )
 
     def outer_config_opt():
@@ -2681,7 +2688,7 @@ def outer_config_opt():
         )
 
     contiguous_config = make_config(
-        1,
+        1 if rnumel > 2048 else 2,  # 1024 or less is persistent
         min(rnumel, MAX_R0_BLOCK),
         register_intensive=register_intensive,
     )
@@ -2911,7 +2918,13 @@ def _persistent_reduction_configs(
 
     if "y" not in size_hints:
         configs = [
-            triton_config_reduction(size_hints, xblock, rnumel, register_intensive=True)
+            triton_config_reduction(
+                size_hints,
+                xblock,
+                rnumel,
+                register_intensive=True,
+                reduction_hint=reduction_hint,
+            )
             for xblock in (1, 8, 32, 128)
             if xblock == 1
             or (rnumel * xblock <= MAX_PERSISTENT_BLOCK_NUMEL and xblock <= xnumel)
@@ -2954,6 +2967,7 @@ def _persistent_reduction_configs(
                     x_block,
                     rnumel,
                     register_intensive=True,
+                    reduction_hint=reduction_hint,
                 )
             ]
 
@@ -2965,6 +2979,7 @@ def _persistent_reduction_configs(
                 size_hints,
                 2 * (256 // rnumel) if rnumel <= 256 else 1,
                 rnumel,
+                reduction_hint=reduction_hint,
             )
         ]
     for c in configs: