Revert "[Inductor] Update Outer Reduction Heuristic (pytorch#159093)"

pytorchmergebot · pytorchmergebot · commit 4e630f0629d1 · 2025-08-26T22:37:56.000Z
This reverts commit ca9fe01. Reverted pytorch#159093 on behalf of https://github.com/PaulZhang12 in order to address internal implications before relanding (see the comment on pytorch#159093).
diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
@@ -2478,7 +2478,7 @@ def pointwise(
 
 
 def _reduction_configs(
-    *, size_hints: dict[str, int], inductor_meta: dict[str, Any], num_dynamic=0
+    *, size_hints: dict[str, int], inductor_meta: dict[str, Any]
 ) -> list[Config]:
     reduction_hint = inductor_meta.get("reduction_hint", None)
 
@@ -2531,68 +2531,17 @@ def make_config(x, r, num_warps=None, num_stages=1, register_intensive=False):
                 register_intensive=register_intensive,
             )
 
-    def outer_config_opt():
-        # Default to 64 for vectorized loads
-        max_x_block, x_block = 256, 64
-        load_factor = inductor_meta.get("num_load", 0)
-        x = size_hints["x"]
-        num_warps = None
-
-        # Try to use all SMs with small x
-        if x <= 1024:
-            x_block = max(min(x // 128, 8), 2)
-            outer_r_block = min(rnumel, 64)
-        # Lower bound x = 1024, 1024 // 16 = 128 around # of SMs
-        elif x // 4096 <= 8:
-            x_block = 16
-            outer_r_block = 512 // x_block
-        elif num_dynamic > 1:
-            # Lots of compute with multiple dynamic shape per loop iteration
-            # Larger RBLOCK minimizes loop iteration
-            outer_r_block = max(min((rnumel // 64), 64), 8)
-        elif num_dynamic == 1:
-            # Dynamic shapes introduce a lot register pressure for indexing
-            outer_r_block = (
-                1
-                if load_factor >= 3
-                else min(next_power_of_2(max(rnumel, 128) // 128), 8)
-            )
-        else:
-            x_block = max(min(max_x_block, next_power_of_2(x // 4096)), x_block)
-            if load_factor < 4 or rnumel <= 128:
-                outer_r_block = 512 // x_block
-            else:
-                # Heavier reductions contain a lot more overhead per loop iteration
-                # We minimize the overhead by enlarging r block
-                if rnumel >= 2048:
-                    outer_r_block = 64
-                else:
-                    outer_r_block = 32
-                x_block = min(x_block, 32)
-                num_warps = 4
-
-        # Set register intensive to true by default as we try to maximize tiles with heuristic
-        return make_config(
-            x_block,
-            outer_r_block,
-            num_warps=num_warps,
-            register_intensive=register_intensive,
-        )
-
     contiguous_config = make_config(
         1,
         min(rnumel, MAX_R0_BLOCK),
         register_intensive=register_intensive,
     )
+    outer_config = make_config(64, 8, register_intensive=register_intensive)
     tiny_config = make_config(
         2 * (256 // rnumel) if rnumel <= 256 else 1,
         min(rnumel, MAX_R0_BLOCK),
         register_intensive=register_intensive,
     )
-
-    outer_config = make_config(64, 8, register_intensive=register_intensive)
-    if not torch.version.hip:
-        outer_config = outer_config_opt()
     # For 3d tiling, default to more autotuning initially
     if "y" in size_hints:
         pass
@@ -2712,15 +2661,7 @@ def reduction(
 
     assert triton_meta is not None
 
-    num_dynamic = 0
-    for k in triton_meta["signature"].keys():
-        if "ks" in k:
-            num_dynamic += 1
-
-    configs = _reduction_configs(
-        size_hints=size_hints, inductor_meta=inductor_meta, num_dynamic=num_dynamic
-    )
-
+    configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta)
     configs = _maybe_filter_configs_for_tma_restrictions(inductor_meta, configs)
     return cached_autotune(
         size_hints,