
Commit 625108e

coconutruben authored and pytorchmergebot committed
[inductor] consolidate common GEMM triton param retrieval (pytorch#159383)
# Why

- Make loop iteration simpler
- Have a common spot where to make modifications that affect all the GEMM Triton templates, avoiding missed spots

# What

- Pull out the common logic of taking the BaseConfig objects and turning them into kwargs to feed into maybe_append_choice for the Triton GEMM templates

Differential Revision: [D79186962](https://our.internmc.facebook.com/intern/diff/D79186962)

Pull Request resolved: pytorch#159383
Approved by: https://github.com/jansel
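For orientation, the consolidated call-site pattern looks roughly like this sketch (names taken from the bmm.py diff below; the other Triton GEMM lowerings in this commit follow the same shape):

```python
# Before: each GEMM lowering fetched a device-specific config generator and
# expanded every config itself via mm_options()/mm_config_kwargs().
bmm_configs = V.choices.get_base_mm_configs(device_type)
for config in bmm_configs(
    m, n, k, **mm_config_kwargs(device_type, _is_large_block_for_cpu)
):
    bmm_template.maybe_append_choice(
        choices,
        input_nodes=(mat1, mat2),
        layout=layout,
        **mm_options(config, m, n, k, layout),
    )

# After: one shared entry point yields ready-made kwargs for every Triton GEMM template.
kernel_inputs = MMKernelInputs([mat1, mat2])
for kwargs in V.choices.get_mm_configs(
    kernel_inputs, layout, bmm_template.name, "bmm"
):
    bmm_template.maybe_append_choice(
        choices,
        input_nodes=kernel_inputs.nodes(),
        layout=layout,
        **kwargs,
    )
```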
1 parent 09e5a93 commit 625108e

10 files changed, +1345 −467 lines

test/inductor/test_max_autotune.py

Lines changed: 7 additions & 4 deletions
@@ -35,7 +35,10 @@
     TritonTemplate,
     TritonTemplateCaller,
 )
-from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig
+from torch._inductor.template_heuristics import (
+    CUDAMMTemplateConfigHeuristic,
+    GemmConfig,
+)
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -1173,7 +1176,7 @@ def f(a, b):
         # Force only decomposeK choice
         with (
             mock.patch(
-                "torch._inductor.kernel.mm.V.choices.get_base_mm_configs"
+                "torch._inductor.kernel.mm.V.choices.get_mm_configs"
             ) as base_mm_mock,
             mock.patch(
                 "torch._inductor.kernel.mm.use_decompose_k_choice"
@@ -1561,9 +1564,9 @@ def f(a, b):
         b = torch.randn(K, N, dtype=torch.float16, device="cuda", requires_grad=True)
 
         with mock.patch(
-            "torch._inductor.kernel.mm.V.choices.get_config_heuristics"
+            "torch._inductor.template_registry.get_template_heuristic"
         ) as config_mock:
-            config_heuristics = CUDAConfigHeuristic()
+            config_heuristics = CUDAMMTemplateConfigHeuristic()
 
             # Traditionally, this would be set of all possible configs
             # We mock out the code path for the sake of the unit test

torch/_inductor/choices.py

Lines changed: 34 additions & 52 deletions
@@ -9,6 +9,7 @@
 
 from . import config
 from .codecache import write_text
+from .kernel_inputs import KernelInputs  # noqa: TC001
 from .metrics import get_metric_table, is_metric_table_enabled
 from .runtime.hints import DeviceProperties, ReductionHint
 from .scheduler import BaseSchedulerNode, Scheduler, WhyNoFuse
@@ -20,6 +21,7 @@
     ROCmConfigHeuristic,
     XPUConfigHeuristic,
 )
+from .template_registry import get_template_heuristic
 from .virtualized import V
 
 
@@ -71,58 +73,6 @@ def get_config_heuristics(
         else:
             return BaseConfigHeuristic()
 
-    # GEMM configs
-    def get_base_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        if config.max_autotune_gemm_search_space != "EXHAUSTIVE":
-            return mm_heuristics.get_mm_configs()
-        else:
-            return mm_heuristics.get_exhaustive_mm_configs()
-
-    def get_extra_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_extra_mm_configs()
-
-    def get_int8_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_int8_mm_configs()
-
-    def get_mixed_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_mixed_mm_configs()
-
-    def get_persistent_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_persistent_mm_configs()
-
-    def get_scaled_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_scaled_mm_configs()
-
-    def get_scaled_persistent_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_scaled_persistent_mm_configs()
-
-    def get_mm_plus_mm_configs(
-        self, device_type: Optional[str] = "cuda"
-    ) -> partial[Generator[TritonConfig, None, None]]:
-        mm_heuristics = self.get_config_heuristics(device_type)
-        return mm_heuristics.get_mm_plus_mm_configs()
-
     # Conv configs
     def get_conv_configs(
         self, device_type: Optional[str] = "cuda"
@@ -131,6 +81,7 @@ def get_conv_configs(
         return conv_heuristics.get_conv_configs()
 
     # Flex attention configs
+    # TODO(coconutruben): break out flexattention/decode configs into the new retrieval mechanism
     def get_flex_attention_fwd_configs(
         self, head_dim: int, dtype: torch.dtype, device_type: Optional[str] = "cuda"
     ) -> list[Any]:
@@ -149,6 +100,37 @@ def get_flex_decode_configs(
         flex_heuristics = self.get_config_heuristics(device_type)
         return flex_heuristics.get_flex_decode_configs(head_dim, dtype)
 
+    def get_mm_configs(
+        self,
+        kernel_inputs: KernelInputs,
+        layout: Any,
+        template_name: str,
+        op_name: str,
+    ) -> Generator[dict[str, Any], None, None]:
+        """
+        Get generator of template parameters for MM templates using template-specific heuristics.
+
+        Args:
+            kernel_inputs: MMKernelInputs containing input tensor nodes and matrix indices
+            layout: Output layout
+            template_name: Template name (e.g., "bmm", "mm", "mm_persistent_tma")
+            op_name: Operation name (e.g., "bmm", "baddbmm", "addmm", "mm_plus_mm")
+
+        Yields:
+            Template parameter dictionaries ready for maybe_append_choice
+        """
+        input_tensors = kernel_inputs.nodes()
+        if len(input_tensors) < 2:
+            raise ValueError(f"Need at least 2 input tensors, got {len(input_tensors)}")
+
+        # Extract device_type from kernel_inputs
+        device_type = kernel_inputs.device_type
+        assert device_type is not None, "get_mm_configs requires a valid device type"
+        # Get the appropriate template-specific heuristic
+        heuristic = get_template_heuristic(template_name, device_type, op_name)
+
+        yield from heuristic.get_template_configs(kernel_inputs, layout, op_name)
+
     def triton_kernel_kwargs(
         self,
         kernel_cls: type[TritonKernel],
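To make the retrieval path concrete: get_mm_configs looks up the heuristic registered for (template_name, device_type, op_name) and forwards whatever parameter dicts it yields. Below is a minimal sketch of the shape such a heuristic takes, with illustrative (not authoritative) field and kwarg names; the real classes, e.g. CUDAMMTemplateConfigHeuristic, live in template_heuristics.py and are registered through template_registry.

```python
# Hypothetical heuristic for illustration only; the concrete classes and the
# exact kwarg names they emit are defined in torch/_inductor/template_heuristics.py.
class ExampleMMTemplateHeuristic:
    def __init__(self, gemm_configs):
        # assumed: a list of GemmConfig-style tuning entries
        self.gemm_configs = gemm_configs

    def get_template_configs(self, kernel_inputs, layout, op_name):
        # Turn BaseConfig objects into kwargs that maybe_append_choice can
        # consume directly -- the common logic this PR pulls out of each lowering.
        for cfg in self.gemm_configs:
            yield {
                "BLOCK_M": cfg.block_m,  # illustrative field names
                "BLOCK_N": cfg.block_n,
                "BLOCK_K": cfg.block_k,
                "num_stages": cfg.num_stages,
                "num_warps": cfg.num_warps,
            }
```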

torch/_inductor/kernel/bmm.py

Lines changed: 33 additions & 37 deletions
@@ -6,6 +6,7 @@
 from torch._inductor.codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
 
 from .. import ir, lowering as L
+from ..kernel_inputs import MMKernelInputs
 from ..select_algorithm import (
     autotune_select_algorithm,
     ExternKernelChoice,
@@ -26,8 +27,6 @@
     addmm_epilogue,
     is_batch_stride_largest,
     mm_args,
-    mm_config_kwargs,
-    mm_options,
 )
 
 
@@ -40,13 +39,6 @@ def bmm_grid(b, m, n, meta, *, cdiv):
     return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), b, 1)
 
 
-def _is_large_block_for_cpu(m, n, k):
-    # Thresholds are experimentally determined to reduce Triton CPU compile times
-    if m > 128 or n > 128 or k > 128:
-        return True
-    return m * n > 2**12
-
-
 bmm_template = TritonTemplate(
     name="bmm",
     grid=bmm_grid,
@@ -175,9 +167,14 @@ def may_require_contiguous(t, meta_t):
         meta_mat2 = V.graph.current_node.args[1]
         mat2 = may_require_contiguous(mat2, meta_mat2)
 
+    # TODO(coconutruben): integrate into MMKernelInputs when all callsites use that
     m, n, k, layout, mat1, mat2 = mm_args(
         mat1, mat2, layout=layout, out_dtype=out_dtype
     )
+    name = "bmm"
+
+    # Create MMKernelInputs for BMM at the top
+    kernel_inputs = MMKernelInputs([mat1, mat2])
 
     # below is for getting an overview logging info of inductor mms
     batch_size = mat1.get_size()[0]  # Extract batch dimension
@@ -195,63 +192,65 @@ def may_require_contiguous(t, meta_t):
 
     if out_dtype:
         assert mat1.get_device().type == "cuda", "out_dtype is only supported for CUDA"
-        aten_func = aten_bmm_dtype.bind((mat1, mat2), layout, out_dtype=out_dtype)
+        aten_func = aten_bmm_dtype.bind(
+            kernel_inputs.nodes(), layout, out_dtype=out_dtype
+        )
     else:
-        aten_func = aten_bmm.bind((mat1, mat2), layout)
+        aten_func = aten_bmm.bind(kernel_inputs.nodes(), layout)
 
     # options to tune from
     choices = [aten_func] if use_aten_gemm_kernels() else []
 
-    device_type = ir.get_device_type(mat1)
-    bmm_configs = V.choices.get_base_mm_configs(device_type)
-
-    dtype = mat1.get_dtype()
     if use_triton_template(layout):
         # TODO: add out_dtype support for Triton Template
         assert out_dtype is None, "out_dtype is not supported for Triton"
-        for config in bmm_configs(
-            m,
-            n,
-            k,
-            **mm_config_kwargs(device_type, _is_large_block_for_cpu, dtype.itemsize),
+
+        for kwargs in V.choices.get_mm_configs(
+            kernel_inputs, layout, bmm_template.name, name
         ):
             bmm_template.maybe_append_choice(
                 choices,
-                input_nodes=(mat1, mat2),
+                input_nodes=kernel_inputs.nodes(),
                 layout=layout,
-                **mm_options(config, m, n, k, layout),
+                **kwargs,
            )
     _, is_nonzero = _is_static_problem(layout)
     batch_stride_largest = is_batch_stride_largest(mat1, mat2, layout)
     if (
         batch_stride_largest
         and is_nonzero
         and use_cutlass_template(layout, m, n, k)
-        and _use_cutlass_for_op("bmm")
+        and _use_cutlass_for_op(name)
     ):
         from ..codegen.cuda.gemm_template import CUTLASS3xGemmTemplate
 
-        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2])  # type: ignore[arg-type]
+        CUTLASS3xGemmTemplate.add_cutlass_gemm_choices(
+            choices, layout, kernel_inputs.nodes()
+        )  # type: ignore[arg-type]
 
     if use_cpp_bmm_template(layout, mat1, mat2):
         from ..codegen.cpp_bmm_template import CppBmmTemplate
 
         CppBmmTemplate.add_choices(
             choices,
             layout,
-            [mat1, mat2],
+            kernel_inputs.nodes(),
         )
 
     if use_ck_gemm_template(layout, m, n, k):
-        CKGemmTemplate.add_ck_gemm_choices(choices, layout, [mat1, mat2])
+        CKGemmTemplate.add_ck_gemm_choices(choices, layout, kernel_inputs.nodes())
 
-    return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout)
+    return autotune_select_algorithm(name, choices, kernel_inputs.nodes(), layout)
 
 
 @L.register_lowering(aten.baddbmm)
 def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
+    # TODO(coconutruben): integrate into MMKernelInputs when all callsites use that
     m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout)
 
+    # Create MMKernelInputs for BadDBMM at the top
+    kernel_inputs = MMKernelInputs([inp, mat1, mat2])
+
     # below is for getting an overview logging info of inductor mms
     batch_size = mat1.get_size()[0]
     counters["aten_mm_info"][f"aten.baddbmm_{batch_size}_{m}_{n}_{k}"] += 1
@@ -266,29 +265,26 @@ def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
         inp.get_dtype(),
         layout,
     )
-
+    name = "baddbmm"
     # options to tune from
     choices = (
-        [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)]
+        [aten_baddbmm.bind(kernel_inputs.nodes(), layout, alpha=alpha, beta=beta)]
         if use_aten_gemm_kernels()
         else []
     )
 
-    device_type = ir.get_device_type(mat1)
-    bmm_configs = V.choices.get_base_mm_configs(device_type)
-
     if use_triton_template(layout):
-        for config in bmm_configs(
-            m, n, k, **mm_config_kwargs(device_type, _is_large_block_for_cpu)
+        for kwargs in V.choices.get_mm_configs(
+            kernel_inputs, layout, bmm_template.name, name
        ):
             bmm_template.maybe_append_choice(
                 choices,
-                input_nodes=(inp, mat1, mat2),
+                input_nodes=kernel_inputs.nodes(),
                 layout=layout,
-                **mm_options(config, m, n, k, layout),
+                **kwargs,
                 prefix_args=1,
                 epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta),
                 epilogue_fn_hash=str(["addmm_epilogue", layout.dtype, alpha, beta]),
             )
 
-    return autotune_select_algorithm("baddbmm", choices, [inp, mat1, mat2], layout)
+    return autotune_select_algorithm(name, choices, kernel_inputs.nodes(), layout)
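The bmm/baddbmm lowerings now build an MMKernelInputs once and thread it through the aten bindings, template choices, and autotuning. A rough sketch of the interface this code relies on, inferred only from the call sites above (the real class in kernel_inputs.py may carry additional state such as matrix-index bookkeeping):

```python
class MMKernelInputs:  # illustrative subset, not the actual implementation
    def __init__(self, input_nodes):
        self._nodes = list(input_nodes)

    def nodes(self):
        # the same node list handed to aten bindings, templates,
        # and autotune_select_algorithm
        return self._nodes

    @property
    def device_type(self):
        # e.g. "cuda" or "cpu"; used by V.choices.get_mm_configs
        # to select the registered heuristic
        return self._nodes[0].get_device().type
```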

torch/_inductor/kernel/conv.py

Lines changed: 0 additions & 9 deletions
@@ -29,7 +29,6 @@
2929
use_triton_template,
3030
)
3131
from ..virtualized import V
32-
from .mm_common import mm_config_kwargs
3332

3433

3534
if TYPE_CHECKING:
@@ -61,13 +60,6 @@ def conv3d_grid(n, c, d, h, w, meta, *, cdiv):
6160
)
6261

6362

64-
def _is_large_block_for_cpu(m, n, k):
65-
# Thresholds are experimentally determined to reduce Triton CPU compile times
66-
if m > 256 or n > 256 or k > 256:
67-
return True
68-
return m * n * k > 2**17
69-
70-
7163
LOOP_BODY_2D = """
7264
idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
7365
idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
@@ -603,7 +595,6 @@ def channels_last_conv():
603595
sympy_product([x.get_size()[0], *x.get_size()[2:]]),
604596
out_chan,
605597
in_chan,
606-
**mm_config_kwargs(device_type, _is_large_block_for_cpu),
607598
):
608599
if ndim == 2:
609600
conv2d_template.maybe_append_choice(
