5 files changed (+18, -94 lines)

test/inductor/test_max_autotune.py

Lines changed: 1 addition & 21 deletions
@@ -19,7 +19,7 @@
 from torch._dynamo import reset
 from torch._dynamo.exc import BackendCompilerFailed
 from torch._dynamo.testing import rand_strided, reset_rng_state
-from torch._dynamo.utils import counters, same
+from torch._dynamo.utils import same
 from torch._inductor import config
 from torch._inductor.autotune_process import (
     _TestBenchmarkRequest,
@@ -1682,26 +1682,6 @@ def mm(x, y):
         out, code = run_and_get_code(compiled_f, a, b)
         torch.testing.assert_close(out, mm(a, b), atol=1e-2, rtol=1e-2)
 
-    @config.patch(
-        max_autotune_gemm=True,
-        max_autotune_prune_choices_based_on_shared_mem=True,
-    )
-    def test_max_autotune_prune_choices(self):
-        def mm(x, y):
-            return x @ y
-
-        M, K, N = (3, 3, 3)
-
-        x = torch.rand([M, K], device=GPU_TYPE, dtype=torch.float32)
-        y = torch.rand([K, N], device=GPU_TYPE, dtype=torch.float32)
-
-        compiled_f = torch.compile(mm)
-        compiled_f(x, y)
-
-        self.assertEqual(
-            counters["inductor"]["select_algorithm_num_precompilation_exceptions"], 0
-        )
-
 
 class TestMaxAutotunePrecompile(TestCase):
     def test_precompilation_threads(self):
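The deleted test compiled a tiny matmul under max-autotune with the now-removed flag enabled and asserted that the precompilation-exception counter stayed at zero. For context, a minimal sketch of that test pattern, assuming a CUDA-capable GPU and using `torch._dynamo.utils.counters` (the counter key shown is the one the deleted test checked; after this change nothing increments it anymore):

import torch
from torch._dynamo.utils import counters
from torch._inductor import config


def check_counter_after_compile():
    # Toggle an inductor config flag just for this block; max_autotune_gemm is a real flag.
    with config.patch(max_autotune_gemm=True):
        counters.clear()

        def mm(x, y):
            return x @ y

        x = torch.rand(3, 3, device="cuda")
        y = torch.rand(3, 3, device="cuda")
        torch.compile(mm)(x, y)

    # counters["inductor"] is a collections.Counter, so a missing key reads as 0.
    return counters["inductor"]["select_algorithm_num_precompilation_exceptions"]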

test/inductor/test_triton_heuristics.py

Lines changed: 1 addition & 30 deletions
@@ -9,13 +9,7 @@
 from torch._dynamo.testing import rand_strided
 from torch._inductor.runtime.triton_compat import HAS_WARP_SPEC
 from torch._inductor.utils import clone_preserve_strides
-from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
-    IS_LINUX,
-    parametrize,
-    runOnRocm,
-    skipIfXpu,
-)
+from torch.testing._internal.common_utils import IS_LINUX, runOnRocm, skipIfXpu
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
@@ -73,7 +67,6 @@ def get_autotuned_amd_sqr_kernel():
     )(amd_sqr_kernel)
 
 
-@instantiate_parametrized_tests
 class TestTritonHeuristics(TestCase):
     device_type = GPU_TYPE
 
@@ -269,28 +262,6 @@ def fn(x):
         res = torch.compile(fn)(x)
         self.assertEqual(ref, res)
 
-    @parametrize("do_pruning", [False, True])
-    def test_prune_configs_over_shared_memory_limit(self, do_pruning):
-        from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig
-
-        expected_count = 1 if do_pruning else 2
-        mm_configs = [
-            GemmConfig(32, 32, 32, 1, 8, 8),
-            GemmConfig(
-                128, 128, 128, 100, 8, 4
-            ),  # intentionally large to exceed shared memory limit
-        ]
-        with config.patch(
-            {"max_autotune_prune_choices_based_on_shared_mem": do_pruning}
-        ):
-            config_heuristic = CUDAConfigHeuristic()
-            config_heuristic.should_scale_configs = False
-            config_heuristic.mm_configs = mm_configs
-            configs = list(
-                config_heuristic.get_mm_configs()(3, 3, 3, dtype_size=4, op_name="mm")
-            )
-            self.assertEqual(len(configs), expected_count)
-
 
 class TestArgumentCloneAndRestore(TestCase):
     # Our tensor is large enough. If a unexpected copy happens, the
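The deleted test was the only parametrized test in this file, which is why the `parametrize`/`instantiate_parametrized_tests` imports and the class decorator go away too. For reference, a minimal sketch of how those helpers fit together (the class and test names below are illustrative only):

from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
    TestCase,
)


@instantiate_parametrized_tests
class ExampleParametrizedTests(TestCase):
    @parametrize("do_pruning", [False, True])
    def test_example(self, do_pruning):
        # The class decorator expands this into one concrete test per value,
        # e.g. test_example_do_pruning_False and test_example_do_pruning_True.
        self.assertIn(do_pruning, (False, True))


if __name__ == "__main__":
    run_tests()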

torch/_inductor/config.py

Lines changed: 0 additions & 6 deletions
@@ -448,12 +448,6 @@ def prologue_fusion_enabled() -> bool:
     os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_REPORT_CHOICES_STATS", "1") == "1"
 )
 
-# Prune configs that require more shared memory than the hardware limit
-max_autotune_prune_choices_based_on_shared_mem = (
-    os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_PRUNE_CHOICES_BASED_ON_SHARED_MEM", "1")
-    == "1"
-)
-
 # enable inductor graph partition to allow multiple inductor graphs for the same dynamo graph
 graph_partition: bool = (
     os.environ.get("TORCHINDUCTOR_GRAPH_PARTITION", "1" if not is_fbcode() else "0")

torch/_inductor/select_algorithm.py

Lines changed: 0 additions & 3 deletions
@@ -2760,9 +2760,6 @@ def wait_on_futures():
                     timeout=precompilation_timeout_seconds,
                 ):
                     if e := future.exception():
-                        counters["inductor"][
-                            "select_algorithm_num_precompilation_exceptions"
-                        ] += 1
                         exceptions.append((futures[future], e))
                         from torch._inductor.codegen.cuda.cuda_kernel import (
                             CUDATemplateCaller,
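The removed increment was the only writer of that counter key. `torch._dynamo.utils.counters` is a `defaultdict` of `collections.Counter`, so dropping the writer is safe for readers: the key simply reads back as zero. A small sketch of those semantics:

from torch._dynamo.utils import counters

# Incrementing needs no initialization; namespaces and keys spring into existence.
counters["inductor"]["hypothetical_event"] += 1
assert counters["inductor"]["hypothetical_event"] == 1

# A key that is never written, such as the one removed here, reads as 0.
assert counters["inductor"]["select_algorithm_num_precompilation_exceptions"] == 0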

torch/_inductor/template_heuristics.py

Lines changed: 16 additions & 34 deletions
@@ -540,43 +540,34 @@ def _scale_mm_configs(
 
         return scaled_configs
 
-    def _exceed_available_shared_memeory(
-        self, gemm_config: BaseConfig, dtype_size: int
-    ) -> bool:
-        try:
-            if dtype_size <= 0:
-                return False
-
-            device = torch.cuda.current_device()
-            props = torch.cuda.get_device_properties(device)
-            if not hasattr(props, "shared_memory_per_block_optin"):
-                return False
-            sm_available = props.shared_memory_per_block_optin  # type: ignore[attr-defined]
-            shared_mem_accum = dtype_size * (
-                gemm_config.block_m * gemm_config.block_k
-                + gemm_config.block_n * gemm_config.block_k
-            )
-            return shared_mem_accum * gemm_config.num_stages > sm_available
-        except Exception:
-            return False
-
     def _prune_exhaustive_configs(
         self,
         configs: list[BaseConfig],
         dtype_size: int,
     ) -> list[BaseConfig]:
+        import torch
+
         pruned_configs = []
         for gemm_config in configs:
-            # Will use more shared memory than available
-            if self._exceed_available_shared_memeory(gemm_config, dtype_size):
-                continue
-
+            device = torch.cuda.current_device()
+            props = torch.cuda.get_device_properties(device)
+            sm_available = props.shared_memory_per_block_optin  # type: ignore[attr-defined]
             NUM_REG = 255
+
             acc_regs = math.ceil(
                 gemm_config.block_m * gemm_config.block_n / (gemm_config.num_warps * 32)
            )
+
+            shared_mem_accum = dtype_size * (
+                gemm_config.block_m * gemm_config.block_k
+                + gemm_config.block_n * gemm_config.block_k
+            )
+
+            # Will use more shared memory than available
+            if shared_mem_accum * gemm_config.num_stages > sm_available:
+                continue
             # Lower bound for register spillage, if exceeds the kernel will certainly spill
-            if acc_regs > NUM_REG:
+            elif acc_regs > NUM_REG:
                 continue
 
             pruned_configs.append(gemm_config)
@@ -608,15 +599,6 @@ def preprocess_mm_configs(
         scaled_configs = self._scale_mm_configs(
             m, n, k, configs, scale, has_int8_tensor, exclude
         )
-
-        # Filter out configs that require more shared memory than is available.
-        if dtype_size > 0 and config.max_autotune_prune_choices_based_on_shared_mem:
-            scaled_configs = [
-                c
-                for c in scaled_configs
-                if not self._exceed_available_shared_memeory(c, dtype_size)
-            ]
-
         if config.max_autotune_gemm_search_space == "EXHAUSTIVE":
             assert dtype_size > 0, "dtype_size must be provided for exhaustive search"
             scaled_configs = self._prune_exhaustive_configs(scaled_configs, dtype_size)
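With the helper gone, the shared-memory check lives only on the exhaustive-search path above. The arithmetic it performs is simple: per pipeline stage, the A tile (`block_m x block_k`) and B tile (`block_n x block_k`) are kept in shared memory, and the total must fit in the device's per-block opt-in limit. A standalone sketch of that check, assuming a CUDA device is available:

import torch


def tile_exceeds_shared_memory(
    block_m: int, block_n: int, block_k: int, num_stages: int, dtype_size: int
) -> bool:
    """Mirror of the inlined check above: estimated shared-memory footprint of
    one GEMM config versus the per-block opt-in limit of the current GPU."""
    props = torch.cuda.get_device_properties(torch.cuda.current_device())
    sm_available = props.shared_memory_per_block_optin
    shared_mem_per_stage = dtype_size * (block_m * block_k + block_n * block_k)
    return shared_mem_per_stage * num_stages > sm_available


# The oversized config from the deleted test (128x128x128 tiles, 100 stages, fp32)
# needs 100 * 4 * (128*128 + 128*128) bytes, roughly 13 MB of shared memory, far
# above the ~100-228 KB opt-in limits of current NVIDIA GPUs, so it would be pruned.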
