
Commit f2dc4ba

chinmaydk99AMD authored and committed
Fixing multi-kernel autotune for different size hints on ROCm
1 parent 962f13f commit f2dc4ba

File tree

6 files changed, +55 -14 lines changed

test/inductor/test_multi_kernel.py

Lines changed: 9 additions & 7 deletions
@@ -16,7 +16,6 @@
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
-    skipIfRocm,
     skipIfXpu,
 )
 from torch.testing._internal.inductor_utils import (
@@ -108,8 +107,6 @@ def test_softmax(self, expect_multi_kernel=True):
         self.assertFalse(_contains_multi_kernel_code(wrapper_code))
 
     @requires_triton()
-    # TODO: bobrenjc93 to fix multi-kernel for ROCM
-    @skipIfRocm
     @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
     @skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2295")
     def test_triton_gemm(self):
@@ -133,13 +130,14 @@ def fn(x, y):
         # One for the first pass and one for the second pass.
         # We mainly care about the wrapper for the final pass here.
         wrapper_code = wrapper_code[-1]
-        self.assertEqual(ref, act)
+        if torch.version.hip:
+            self.assertEqual(ref, act, atol=1e-3, rtol=1e-3)
+        else:
+            self.assertEqual(ref, act)
         self.assertTrue(_contains_size_hint_multi_kernel_code(wrapper_code))
 
     @skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2295")
     @requires_triton()
-    # TODO: bobrenjc93 to fix multi-kernel for ROCM
-    @skipIfRocm
     @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
     def test_triton_relu_fused_gemm(self):
         def fn(x, y):
@@ -162,7 +160,11 @@ def fn(x, y):
         # One for the first pass and one for the second pass.
         # We mainly care about the wrapper for the final pass here.
         wrapper_code = wrapper_code[-1]
-        self.assertEqual(ref, act)
+        if torch.version.hip:
+            self.assertEqual(ref, act, atol=1e-3, rtol=1e-3)
+        else:
+            self.assertEqual(ref, act)
+
         self.assertTrue(_contains_size_hint_multi_kernel_code(wrapper_code))
 
     @parametrize("force_kernel", (0, 1))
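
For context, the assertion change above boils down to the following pattern: on ROCm builds (where torch.version.hip is set) the GEMM comparison uses a looser tolerance, while other builds keep the default comparison. A minimal standalone sketch, not taken from the test file itself (it uses torch.testing.assert_close rather than the test class's assertEqual):

import torch
from torch.testing import assert_close

def check_gemm_result(ref: torch.Tensor, act: torch.Tensor) -> None:
    # ROCm builds report torch.version.hip; autotuned Triton GEMM variants may
    # accumulate in a different order there, so allow a small tolerance.
    if torch.version.hip:
        assert_close(act, ref, atol=1e-3, rtol=1e-3)
    else:
        assert_close(act, ref)  # default tolerances elsewhere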

torch/_inductor/autotune_process.py

Lines changed: 7 additions & 1 deletion
@@ -368,7 +368,10 @@ class TensorMeta:
 
     @classmethod
     def from_irnodes(
-        cls, irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]
+        cls,
+        irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]],
+        *,
+        hint_override: Optional[int] = None,
     ) -> Union[TensorMeta, list[TensorMeta]]:
         if isinstance(irnodes, Sequence):
             result: list[Any] = [cls.from_irnodes(x) for x in irnodes]
@@ -390,14 +393,17 @@ def from_irnodes(
             sizes=V.graph.sizevars.size_hints(
                 node.get_size(),
                 fallback=config.unbacked_symint_fallback,
+                hint_override=hint_override,
             ),
             strides=V.graph.sizevars.size_hints(
                 node.get_stride(),
                 fallback=config.unbacked_symint_fallback,
+                hint_override=hint_override,
             ),
             offset=V.graph.sizevars.size_hint(
                 node.get_layout().offset,
                 fallback=config.unbacked_symint_fallback,
+                hint_override=hint_override,
             ),
             name=node.get_name(),
         )
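
The signature change above threads a keyword-only hint_override from TensorMeta.from_irnodes into the size-hint lookups, so benchmarking tensors can be shaped for a specific multi-kernel size bucket. A rough standalone sketch of that pass-through pattern (simplified helper names, not the real SizeVarAllocator API):

from typing import Optional, Sequence

def size_hint(size, *, fallback: int, hint_override: Optional[int] = None) -> int:
    # Concrete sizes are used as-is; symbolic sizes take the override when the
    # caller is benchmarking a particular size bucket, else the configured fallback.
    if isinstance(size, int):
        return size
    if hint_override is not None:
        return hint_override
    return fallback

def size_hints(sizes: Sequence, *, fallback: int,
               hint_override: Optional[int] = None) -> tuple:
    return tuple(size_hint(s, fallback=fallback, hint_override=hint_override)
                 for s in sizes)

# e.g. a dynamic dim ("s0" stands in for a symbolic size) benchmarked at the 4096 bucket:
print(size_hints([64, "s0"], fallback=8192, hint_override=4096))  # (64, 4096)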

torch/_inductor/codegen/triton_combo_kernel.py

Lines changed: 5 additions & 2 deletions
@@ -19,7 +19,7 @@
     SequentialComboKernelGrid,
 )
 from ..scheduler import BaseSchedulerNode
-from ..utils import Placeholder, triton_version_uses_attrs_dict
+from ..utils import is_rocm, Placeholder, triton_version_uses_attrs_dict
 from ..virtualized import V
 from .common import (
     ArgName,
@@ -742,10 +742,13 @@ def kernel_benchmark_extra_args(self) -> list[str]:
                 continue
             # pyrefly: ignore [missing-argument]
             if not tree.is_reduction or sub_kernel.inside_reduction:
+                meta_hint = sub_kernel.hint_override if is_rocm() else None
                 extra_args.append(
                     str(
                         V.graph.sizevars.size_hint(
-                            tree.numel, fallback=config.unbacked_symint_fallback
+                            tree.numel,
+                            fallback=config.unbacked_symint_fallback,
+                            hint_override=meta_hint,
                         )
                     )
                 )
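
The effect of the hunk above: when building the extra benchmark arguments for a combo kernel on ROCm, each sub-kernel's numel hint is resolved against the size bucket (hint_override) that sub-kernel was tuned for instead of the generic hint. A small self-contained sketch of that gating (SubKernel and _resolve_numel are hypothetical stand-ins, not inductor classes):

from dataclasses import dataclass
from typing import Optional

def _resolve_numel(numel, *, fallback: int, hint_override: Optional[int]) -> int:
    # stand-in for V.graph.sizevars.size_hint(...): ints pass through, symbolic
    # values resolve to the override (if any) or the fallback
    if isinstance(numel, int):
        return numel
    return hint_override if hint_override is not None else fallback

@dataclass
class SubKernel:                      # hypothetical stand-in for a combo sub-kernel
    numel: object                     # int or a symbolic placeholder
    hint_override: Optional[int] = None

def benchmark_extra_args(sub_kernels, *, on_rocm: bool, fallback: int = 8192) -> list[str]:
    args = []
    for sk in sub_kernels:
        meta_hint = sk.hint_override if on_rocm else None   # ROCm-only gating, as above
        args.append(str(_resolve_numel(sk.numel, fallback=fallback, hint_override=meta_hint)))
    return args

print(benchmark_extra_args([SubKernel("s0", 4096), SubKernel(256)], on_rocm=True))
# ['4096', '256']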

torch/_inductor/select_algorithm.py

Lines changed: 3 additions & 1 deletion
@@ -1495,10 +1495,12 @@ def call_kernel(
             wrapper.generate_workspace_deallocation(self.workspace_arg)
 
     def kernel_benchmark_extra_args(self) -> list[str]:
+        meta_hint = self.hint_override if torch.version.hip else None
         return [
             str(x)
             for x in self.grid_fn(
-                *V.graph.sizevars.size_hints(self.call_sizes), self.meta
+                *V.graph.sizevars.size_hints(self.call_sizes, hint_override=meta_hint),
+                self.meta,
             )
         ]
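
Downstream, those overridden size hints feed the template's grid_fn when the benchmark harness launches the kernel, so a different size bucket produces a different launch grid. A toy illustration only; the grid function below is a generic ceil-division launcher, not the exact one inductor generates:

import math

def grid_fn(m: int, n: int, meta: dict) -> tuple:
    # generic 2D tiling grid: one program per (BLOCK_M x BLOCK_N) tile
    return (math.ceil(m / meta["BLOCK_M"]) * math.ceil(n / meta["BLOCK_N"]), 1, 1)

meta = {"BLOCK_M": 128, "BLOCK_N": 64}
print(grid_fn(8192, 8192, meta))   # grid for the default hint
print(grid_fn(4096, 4096, meta))   # grid for an overridden 4096 bucket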

torch/_inductor/template_heuristics/triton.py

Lines changed: 27 additions & 3 deletions
@@ -646,15 +646,27 @@ def _get_exceeding_shared_memory_checker(
     If the device does not report available shared memory, returns None.
     """
 
+    from ..utils import get_gpu_shared_memory
+
+    sm_available = None
+
     try:
         device = torch.cuda.current_device()
         props = torch.cuda.get_device_properties(device)
         if not hasattr(props, "shared_memory_per_block_optin"):  # for NVidia GPUs
             return None
         sm_available = int(props.shared_memory_per_block_optin)
     except Exception:
-        # If CUDA is not available or properties cannot be queried, return None
-        return None
+        pass
+
+    # ROCm specific logic to get shared memory
+    if torch.version.hip and sm_available is None:
+        try:
+            sm_available = get_gpu_shared_memory()
+            if sm_available == 0:
+                return None
+        except Exception:
+            return None
 
     # TODO make a BaseDeviceConfigHeuristics to handle different device configuration in its own implementation.
     def exceeds(gemm_config: BaseConfig, dtype_size: int) -> bool:
@@ -1318,6 +1330,7 @@ def _finalize_mm_configs(
                 waves_per_eu,
                 matrix_instr_nonkdim,
                 kpack,
+                conf.hint_override,
             )
 
             # Check if gemm specific arg exists - add to key if does
@@ -1344,7 +1357,12 @@ def _finalize_mm_configs(
             }
             if group_m is not None:
                 kwargs["GROUP_M"] = group_m
-            yield self.triton_config(**kwargs)
+
+            tc = self.triton_config(**kwargs)
+            # Preserve hint_override for multi-kernel support
+            if hasattr(conf, "hint_override") and conf.hint_override is not None:
+                tc.hint_override = conf.hint_override
+            yield tc
 
     def get_flex_attn_fwd_configs(self, head_dim: int, dtype: Any) -> list[FlexConfig]:
         flex_attn_fwd_configs: list[FlexConfig] = []
@@ -1674,6 +1692,12 @@ def _convert_config_to_template_kwargs(
             group_m = triton_config.kwargs.get("GROUP_M", 8)
             options_dict["GROUP_M"] = group_m
 
+        # Keep ROCm multi-kernel size bucket attached to the config
+        if torch.version.hip and "hint_override" not in options_dict:
+            hint_override = getattr(triton_config, "hint_override", None)
+            if hint_override is not None:
+                options_dict["hint_override"] = hint_override
+
         return options_dict
 
     def _get_acc_type(self, dtype: torch.dtype) -> str:
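
The first hunk only changes how the per-block shared-memory limit is obtained (NVIDIA property first, then get_gpu_shared_memory() as the ROCm fallback); the checker that consumes it is unchanged. As a reminder of what that limit gates, here is a rough illustrative version of such a check; the (BLOCK_M + BLOCK_N) * BLOCK_K * dtype_size * num_stages estimate is an approximation for illustration, not quoted from this file:

from typing import Callable, Optional

def make_exceeds_shared_memory_checker(sm_available: Optional[int]) -> Optional[Callable]:
    # If the device limit is unknown, skip the filtering entirely (mirrors the
    # "return None" paths above).
    if not sm_available:
        return None

    def exceeds(block_m: int, block_n: int, block_k: int,
                num_stages: int, dtype_size: int) -> bool:
        # rough estimate: software-pipelined A and B tiles resident per stage
        approx_bytes = (block_m + block_n) * block_k * dtype_size * num_stages
        return approx_bytes > sm_available

    return exceeds

checker = make_exceeds_shared_memory_checker(65536)  # e.g. a 64 KiB reported limit
print(checker(128, 128, 64, 4, 2))   # True: ~128 KiB, config would be filtered out
print(checker(64, 64, 32, 2, 2))     # False: ~16 KiB fits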

torch/_inductor/utils.py

Lines changed: 4 additions & 0 deletions
@@ -3030,6 +3030,10 @@ def is_gpu(device: Optional[str]) -> bool:
     return device in GPU_TYPES
 
 
+def is_rocm() -> bool:
+    return torch.version.hip is not None
+
+
 def device_need_guard(device: str) -> bool:
     return device != "mps" and is_gpu(device)  # TODO: MPS does not expose streams now
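
is_rocm() is a thin convenience wrapper over the torch.version.hip check used throughout the rest of the commit. A trivial usage example (the 4096 bucket value is illustrative only):

from torch._inductor.utils import is_rocm

# Pass a per-bucket size hint only on ROCm, mirroring the combo-kernel change.
hint_override = 4096 if is_rocm() else None
print("ROCm build:", is_rocm(), "hint_override:", hint_override)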
