
Commit e5e987e

jataylo authored and jithunnair-amd committed
[release/2.7] [SWDEV-543214] Reland #2416 Fix warps runtime (#2421)
Relands #2416 with caching fix.

Upstream equivalent: pytorch#159146

Co-authored-by: Jithun Nair <[email protected]>
(cherry picked from commit f0aebdc)
(cherry picked from commit 9c429dd)
1 parent 6ac6cac commit e5e987e

1 file changed (+2 -0 lines)

torch/_inductor/runtime/coordinate_descent_tuner.py

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
 import itertools
 import logging
 from typing import Callable, Optional, TYPE_CHECKING
+from functools import lru_cache
 
 from .hints import TRITON_MAX_BLOCK
 from .runtime_utils import red_text, triton_config_to_hashable
@@ -60,6 +61,7 @@ def get_config_max(self, prefix: str) -> int:
         size_hint = self.size_hints.get(prefix) if self.size_hints is not None else None
         return min(max_block, size_hint) if size_hint is not None else max_block
 
+    @lru_cache(maxsize=1)
     def get_warpsmax(self):
         # Currently, CUDA has a maximum of 1024 threads, so 32 is the max
         # number of warps.
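
For context, a minimal sketch of what the change does. The decorator, the method name, and the comment come from the diff; the class name CoordescTuner and the return value below are illustrative assumptions, not copied from the repository. functools.lru_cache(maxsize=1) memoizes get_warpsmax, so the value is computed on the first call and served from the cache afterwards.

# Minimal sketch, not the actual PyTorch implementation: the class name and
# return value are assumptions; the decorator and method name match the diff.
from functools import lru_cache


class CoordescTuner:  # assumed stand-in for the tuner class in this file
    @lru_cache(maxsize=1)
    def get_warpsmax(self):
        # Currently, CUDA has a maximum of 1024 threads, so 32 is the max
        # number of warps.
        return 1024 // 32  # assumed body; the real method may compute this differently


tuner = CoordescTuner()
print(tuner.get_warpsmax())                          # 32, computed on the first call
print(tuner.get_warpsmax())                          # 32, served from the cache
print(CoordescTuner.get_warpsmax.cache_info().hits)  # 1 after the second call

With maxsize=1 the cache holds a single entry keyed on the call arguments (just self here), which is enough because the result does not depend on per-call state; one caveat of lru_cache on a method is that the cached entry keeps a reference to the instance for as long as it stays in the cache.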
