Bug fix and optimisation for persistent reduction kernel tuning

jataylo · web-flow · commit fdf61977af3e · 2025-09-03T16:38:26.000+01:00
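The original PR had incorrect indentation: the `else` was aligned with the `for` loop instead of with `if max_autotune_enabled`, so it acted as a `for`/`else` clause. This update changes the logic so that, when max-autotune is enabled, the tiny configs are always added to the candidate list (for any reduction hint); otherwise only the configs selected by the reduction hint are used, with `ReductionHint.OUTER_TINY` using the tiny configs alone.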
diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
@@ -2595,20 +2595,20 @@ def _persistent_reduction_configs(
         elif reduction_hint == ReductionHint.OUTER:
             configs = configs[-1:]
 
-    if reduction_hint == ReductionHint.OUTER_TINY:
-        tiny_configs = [
-            triton_config_reduction(
-                size_hints,
-                2 * (256 // rnumel) if rnumel <= 256 else 1,
-                rnumel,
-            )
-        ]
-        if max_autotune_enabled:
-            for tconfig in tiny_configs:
-                if tconfig not in configs:
-                    configs.append(tconfig)
-            else:
-                configs = tiny_configs
+    tiny_configs = [
+        triton_config_reduction(
+            size_hints,
+            2 * (256 // rnumel) if rnumel <= 256 else 1,
+            rnumel,
+        )
+    ]
+
+    if max_autotune_enabled:
+        for conf in tiny_configs:
+            if conf not in configs:
+                configs.append(conf)
+    elif reduction_hint == ReductionHint.OUTER_TINY:
+        configs = tiny_configs
 
     for c in configs:
         # we don't need Rn_BLOCK for persistent reduction