
Commit c58ceb1

okakarpa authored and jataylo committed
[AUTOGENERATED] [release/2.8] [SWDEV-539215] - Autotune support for persistent reduction and no_x_dim removal (#2454)
Cherry-pick of #2417. Need to resolve conflicts.

Co-authored-by: Jack Taylor <[email protected]>
(cherry picked from commit eb47158)
1 parent 5e67be1 commit c58ceb1
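
The change only pays off when Inductor's autotuning is actually enabled. A minimal usage sketch, assuming a stock PyTorch 2.x build with a Triton-capable GPU (the function and tensor shapes below are illustrative, not taken from this commit):

    import torch

    def fn(x):
        # A row-wise reduction is a typical candidate for a persistent reduction kernel.
        return x.sum(dim=1)

    x = torch.rand(32, 256, device="cuda")

    # mode="max-autotune" enables Inductor autotuning; with this commit the
    # persistent-reduction configs are benchmarked instead of being pre-filtered
    # down to a single config by the reduction hint.
    compiled = torch.compile(fn, mode="max-autotune")
    out = compiled(x)

Which configs end up being benchmarked is decided by the heuristics changed in the diffs below.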

File tree

5 files changed: +37 -29 lines


test/inductor/test_combo_kernels.py

Lines changed: 0 additions & 17 deletions

@@ -296,23 +296,6 @@ def fn(a0, a1, a2, b0, b1, b2):
 
         self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
 
-    @requires_cuda_and_triton
-    def test_persistent_reduction_no_x_dim(self):
-        def fn(x, y):
-            return x.sum(1), y.sum(1)
-
-        inps = (
-            torch.rand(16, 256, device="cuda"),
-            torch.rand(32, 256, device="cuda"),
-        )
-        torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
-        torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
-        out_eager = fn(*inps)
-        out_compiled = torch.compile(fn)(*inps)
-
-        self.assertEqual(out_eager, out_compiled)
-        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
-
 
 @instantiate_parametrized_tests
 class ComboKernelDynamicShapesTests(TestCase):

test/inductor/test_torchinductor_strided_blocks.py

Lines changed: 1 addition & 0 deletions

@@ -816,6 +816,7 @@ def test_2d_reduction_odd_shapes(
         # Check the code for multiple Rn_BLOCK's
         self._assert_reduction_ndims(code, 2)
 
+
     @parametrize(
         "size,expected_num_block_pointers,expected_num_triton_kernels,expect_fallback",
         [

torch/_inductor/choices.py

Lines changed: 12 additions & 0 deletions

@@ -232,6 +232,18 @@ def should_use_persistent_reduction(
             features.reduction_numel, threshold
         )  # type: ignore[arg-types]
 
+    @staticmethod
+    def want_no_x_dim(features: SIMDKernelFeatures) -> bool:
+        """
+        Heuristic to decide if we should drop the X dimension from a persistent reduction kernel.
+        So the [XBLOCK, RBLOCK] block becomes a [RBLOCK] block and XBLOCK is forced to be always 1.
+        Strangely this is faster than a [1, RBLOCK] block in some cases.
+
+        ROCm branch change: Remove want_no_x_dim for persistent reduction.
+        Inductor benchmarks show no perf advantage and simplifies autotune flow.
+        """
+        return False
+
     @staticmethod
     def reduction_split_factor(
         device: torch.device,
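
To make the role of this hook concrete, here is a small self-contained sketch of how a want_no_x_dim-style heuristic feeds a kernel-planning decision. The plan_persistent_reduction helper and PlannedKernel container are hypothetical stand-ins for illustration only; with the method above hard-wired to False, the [XBLOCK, RBLOCK] layout is always kept and the block sizes are left to the autotuner.

    from dataclasses import dataclass

    @dataclass
    class PlannedKernel:
        # Hypothetical container: the block dimensions the generated kernel is tiled over.
        block_dims: tuple

    def want_no_x_dim(features) -> bool:
        # Mirrors the new default above: never collapse [XBLOCK, RBLOCK] down to [RBLOCK].
        return False

    def plan_persistent_reduction(features) -> PlannedKernel:
        # Hypothetical call site: the heuristic picks the block layout up front.
        if want_no_x_dim(features):
            return PlannedKernel(block_dims=("RBLOCK",))
        return PlannedKernel(block_dims=("XBLOCK", "RBLOCK"))

    print(plan_persistent_reduction(features=None).block_dims)  # ('XBLOCK', 'RBLOCK')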

torch/_inductor/codegen/triton.py

Lines changed: 5 additions & 6 deletions

@@ -2030,12 +2030,11 @@ def should_use_persistent_reduction(self) -> bool:
         )
 
     def want_no_x_dim(self):
-        return (
-            self.persistent_reduction
-            and len(self.numels) == self.num_reduction_dims + 1
-            and self.fixed_config
-            and self.fixed_config["XBLOCK"] == 1
-        )
+        """
+        ROCm branch change: Remove want_no_x_dim for persistent reduction.
+        Inductor benchmarks show no perf advantage and simplifies autotune flow.
+        """
+        return False
 
     @property
     def assert_function(self) -> str:

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 19 additions & 6 deletions

@@ -2870,6 +2870,10 @@ def _persistent_reduction_configs(
     rnumel = get_total_reduction_numel(size_hints)
 
     MAX_PERSISTENT_BLOCK_NUMEL = 4096
+    max_autotune_enabled = not disable_pointwise_autotuning(inductor_meta) or (
+        inductor_meta.get("max_autotune")
+        or inductor_meta.get("max_autotune_pointwise")
+    )
 
     if "y" not in size_hints:
         configs = [
@@ -2899,18 +2903,27 @@
     if "y" in size_hints:
         pass
     # TODO(jansel): we should be able to improve these heuristics
-    elif reduction_hint == ReductionHint.INNER and rnumel >= 256:
-        configs = configs[:1]
-    elif reduction_hint == ReductionHint.OUTER:
-        configs = configs[-1:]
-    elif reduction_hint == ReductionHint.OUTER_TINY:
-        configs = [
+    if not max_autotune_enabled:  # Don't filter if tuning enabled
+        if reduction_hint == ReductionHint.INNER and rnumel >= 256:
+            configs = configs[:1]
+        elif reduction_hint == ReductionHint.OUTER:
+            configs = configs[-1:]
+
+    if reduction_hint == ReductionHint.OUTER_TINY:
+        tiny_configs = [
             triton_config_reduction(
                 size_hints,
                 2 * (256 // rnumel) if rnumel <= 256 else 1,
                 rnumel,
             )
         ]
+        if max_autotune_enabled:
+            for tconfig in tiny_configs:
+                if tconfig not in configs:
+                    configs.append(tconfig)
+        else:
+            configs = tiny_configs
+
     for c in configs:
         # we don't need Rn_BLOCK for persistent reduction
         for prefix in size_hints:
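
The effect of this hunk can be summarized with a standalone sketch (the tuple-based configs and the select_persistent_reduction_configs name below are simplified stand-ins, not the real triton_config_reduction objects): with autotuning off, the reduction hint prunes the candidate list exactly as before; with autotuning on, the list is left intact and the OUTER_TINY config is appended as one more candidate to benchmark instead of replacing the others.

    from enum import Enum

    class ReductionHint(Enum):
        DEFAULT = "default"
        INNER = "inner"
        OUTER = "outer"
        OUTER_TINY = "outer_tiny"

    def select_persistent_reduction_configs(configs, reduction_hint, rnumel, max_autotune_enabled):
        # Simplified stand-in for the filtering logic added to _persistent_reduction_configs.
        if not max_autotune_enabled:  # don't filter when tuning is enabled
            if reduction_hint is ReductionHint.INNER and rnumel >= 256:
                configs = configs[:1]
            elif reduction_hint is ReductionHint.OUTER:
                configs = configs[-1:]

        if reduction_hint is ReductionHint.OUTER_TINY:
            tiny_configs = [("tiny", 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel)]
            if max_autotune_enabled:
                # Keep the existing candidates and add the tiny config for benchmarking.
                for cfg in tiny_configs:
                    if cfg not in configs:
                        configs.append(cfg)
            else:
                # Without autotuning, the tiny config is the only candidate, as before.
                configs = tiny_configs
        return configs

    base = [("xblock64", 64, 128), ("xblock8", 8, 512)]
    print(select_persistent_reduction_configs(list(base), ReductionHint.OUTER_TINY, 128, True))
    print(select_persistent_reduction_configs(list(base), ReductionHint.OUTER_TINY, 128, False))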
