
Commit dfa2649

Revert "[Inductor] Fix epilogue fusion decision with 1 Triton caller as choice (pytorch#156500)"
This reverts commit c48d0f4. Reverted pytorch#156500 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](pytorch#156500 (comment)))
1 parent 5277276 commit dfa2649

File tree

2 files changed (+3 −85 lines)


test/inductor/test_max_autotune.py

Lines changed: 1 addition & 61 deletions
@@ -10,7 +10,6 @@
 import re
 import tempfile
 import unittest
-from functools import partial
 from typing import Callable, Optional
 from unittest import mock
 from unittest.mock import MagicMock
@@ -36,11 +35,7 @@
     TritonTemplate,
     TritonTemplateCaller,
 )
-from torch._inductor.template_heuristics import (
-    BaseConfigHeuristic,
-    CUDAConfigHeuristic,
-    GemmConfig,
-)
+from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -1555,61 +1550,6 @@ def f(a, b):
         if "benchmark_gpu" in counter:
             self.assertEqual(counters["inductor"][counter], 2)

-    @unittest.skipIf(
-        not has_triton_tma_device(), "Need device-side TMA support in Triton"
-    )
-    @config.patch(
-        max_autotune=True,
-        max_autotune_gemm_backends="TRITON",
-        autotune_fallback_to_aten=False,
-    )
-    def test_one_triton_choice_epilogue_fusion(self):
-        """
-        Here we test the fusion case with only 1 Triton choice for mm lowering.
-        The hardcoded config itself is valid, but when fused with the torch.float32
-        case, the shared memory requirements is higher than the amount available on H100.
-
-        This test checks that the fusion does not occur in this edge case. This is important
-        for future work on lookup table for autotuned gemm configs.
-        """
-
-        def f(a, b):
-            return (a @ b).to(torch.float32)
-
-        a = torch.randn(512, 1152, device="cuda", dtype=torch.bfloat16)
-        b = torch.randn(1152, 7680, device="cuda", dtype=torch.bfloat16)
-
-        config_heuristic = BaseConfigHeuristic()
-        with config.patch(
-            {
-                "triton.enable_persistent_tma_matmul": "1",
-            }
-        ):
-            with (
-                mock.patch(
-                    "torch._inductor.kernel.mm.V.choices.get_base_mm_configs"
-                ) as base_mm_mock,
-                mock.patch(
-                    "torch._inductor.kernel.mm.V.choices.get_persistent_mm_configs"
-                ) as persistent_mm_mock,
-            ):
-                base_mm_mock.return_value = partial(
-                    config_heuristic.preprocess_mm_configs, configs=[]
-                )
-                persistent_mm_mock.return_value = partial(
-                    config_heuristic.preprocess_mm_configs,
-                    configs=[GemmConfig(256, 128, 64, 4, 8, 8)],
-                )
-
-                compiled_f = torch.compile(f)
-                out, code = run_and_get_code(compiled_f, a, b)
-
-                FileCheck().check("triton_tem_fused_mm").check(
-                    "triton_poi_fused__to_copy"
-                ).run(code[0])
-
-                torch.testing.assert_close(out, f(a, b), atol=1e-2, rtol=1e-2)
-

 class TestMaxAutotunePrecompile(TestCase):
     def test_precompilation_threads(self):
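
For context, here is a minimal standalone sketch of the upcast-epilogue pattern the reverted test exercised: a bf16 matmul whose result is cast to fp32, compiled with autotuning enabled. The shapes match the test above, but the compile mode and the CUDA availability guard are assumptions for illustration, not part of the reverted code.

# Minimal sketch (assumed setup, not the reverted test itself): a bf16 matmul
# whose result is upcast to fp32. Inductor lowers the matmul to a Triton GEMM
# template and must decide whether to fuse the fp32 cast epilogue into it; the
# reverted change benchmarked that fusion to catch shared-memory overflows when
# only one Triton config is available.
import torch

def f(a, b):
    return (a @ b).to(torch.float32)  # upcast epilogue (2-byte -> 4-byte dtype)

if torch.cuda.is_available():
    a = torch.randn(512, 1152, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(1152, 7680, device="cuda", dtype=torch.bfloat16)
    compiled_f = torch.compile(f, mode="max-autotune")
    out = compiled_f(a, b)
    torch.testing.assert_close(out, f(a, b), atol=1e-2, rtol=1e-2)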

torch/_inductor/scheduler.py

Lines changed: 2 additions & 24 deletions
@@ -2831,20 +2831,6 @@ def _any_atomic_add(self, node_list: Sequence[BaseSchedulerNode]) -> bool:
             for n in node_list
         )

-    def _template_upcast(
-        self, node1: BaseSchedulerNode, node2: BaseSchedulerNode
-    ) -> bool:
-        # Check if fusing an upcast onto a Triton template. If so, we want to benchmark
-        # the fusion to make sure that shared memory requirements are still met
-        return (
-            isinstance(node1.get_template_node(), ir.TritonTemplateBuffer)
-            and node1.node is not None
-            and node2.node is not None
-            and hasattr(node1.node, "get_dtype")
-            and hasattr(node2.node, "get_dtype")
-            and node1.node.get_dtype().itemsize < node2.node.get_dtype().itemsize
-        )
-
     def speedup_by_fusion(
         self, node1: BaseSchedulerNode, node2: BaseSchedulerNode
     ) -> Union[bool, Callable[[], bool]]:
@@ -2858,12 +2844,7 @@ def speedup_by_fusion(
             and isinstance(n.get_template_node(), ir.MultiTemplateBuffer)
             for n in (node1, node2)
         )
-
-        if (
-            not self._template_upcast(node1, node2)
-            and not config.benchmark_fusion
-            and not is_multi_template
-        ):
+        if not config.benchmark_fusion and not is_multi_template:
             return True

         if (
@@ -3094,10 +3075,7 @@ def benchmark_when_ready() -> bool:

             except NoTritonConfigsError:
                 return False
-            except RuntimeError as e:
-                if "out of resource" in str(e):
-                    return False
-                raise
+
             except CompilationError as e:
                 if "Loop-carried variable" in str(e):
                     return True
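
As a reference for the reverted heuristic, here is a minimal self-contained sketch of the dtype check that _template_upcast was built on. Plain torch.dtype values stand in for scheduler nodes, and the helper name is hypothetical; only the itemsize comparison comes from the removed code.

# Hypothetical standalone illustration of the removed check: an epilogue counts
# as an "upcast" when its output dtype is wider than the template's output dtype.
# Fusing such an epilogue into a Triton GEMM template can raise shared-memory
# usage, which is why the reverted code routed these fusions through benchmarking.
import torch

def is_upcast_epilogue(template_dtype: torch.dtype, epilogue_dtype: torch.dtype) -> bool:
    return template_dtype.itemsize < epilogue_dtype.itemsize

assert is_upcast_epilogue(torch.bfloat16, torch.float32)      # 2 bytes -> 4 bytes
assert not is_upcast_epilogue(torch.float32, torch.bfloat16)  # downcast, not flagged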
