
Commit f5594ca

Add autotuner_fn argument to @helion.kernel for custom autotuners (#394)
1 parent bb8e31d commit f5594ca

File tree

5 files changed (+52, -32 lines)

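In short, `@helion.kernel` now accepts an `autotuner_fn` setting: a callable that receives the bound kernel and its arguments and returns the autotuner to run. The default, `default_autotuner_fn`, preserves the previous behavior (a `LocalAutotuneCache` around `DifferentialEvolutionSearch`). A minimal usage sketch, modeled on the new test in this commit; the kernel body and the choice of `StrictLocalAutotuneCache` are illustrative, and running it assumes a supported GPU device:

import torch

import helion
import helion.language as hl
from helion.autotuner import DifferentialEvolutionSearch
from helion.autotuner import StrictLocalAutotuneCache


# Illustrative: swap the default cache wrapper for the strict variant.
# Any callable that returns a BaseAutotuner works here.
@helion.kernel(
    autotuner_fn=lambda kernel, args: StrictLocalAutotuneCache(
        DifferentialEvolutionSearch(kernel, args)
    )
)
def add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    x, y = torch.broadcast_tensors(x, y)
    out = torch.empty_like(x)
    for tile in hl.tile(out.size()):
        out[tile] = x[tile] + y[tile]
    return out


out = add(torch.randn(16, device="cuda"), torch.randn(16, device="cuda"))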

helion/autotuner/base_cache.py

Lines changed: 2 additions & 1 deletion
@@ -13,6 +13,7 @@
 from torch._inductor.codecache import torch_key
 
 from .._utils import counters
+from .base_search import BaseAutotuner
 
 if TYPE_CHECKING:
     from ..runtime.config import Config
@@ -106,7 +107,7 @@ class StrictAutotuneCacheKey(LooseAutotuneCacheKey):
     triton_key: str = dataclasses.field(default_factory=triton_key_wrapper)
 
 
-class AutotuneCacheBase(abc.ABC):
+class AutotuneCacheBase(BaseAutotuner, abc.ABC):
     """
     Abstract base class that all autotune caches need to implement.
     Any user defined cache will need to extend this class, and

helion/autotuner/base_search.py

Lines changed: 12 additions & 1 deletion
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import abc
 import collections
 import contextlib
 import dataclasses
@@ -56,7 +57,17 @@
 )
 
 
-class BaseSearch:
+class BaseAutotuner(abc.ABC):
+    """
+    Abstract base class for all autotuners and classes that wrap autotuners, like caching.
+    """
+
+    @abc.abstractmethod
+    def autotune(self) -> Config:
+        raise NotImplementedError
+
+
+class BaseSearch(BaseAutotuner):
     """
     Base class for search algorithms. This class defines the interface and utilities for all
     search algorithms.
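`BaseAutotuner` is the new common interface: searches (`BaseSearch` and its subclasses) and wrappers such as the caches both expose `autotune()`. A hypothetical sketch of a conforming wrapper that relies only on the interface shown above; the class name and the timing logic are illustrative, not part of this commit:

import time

from helion.autotuner.base_search import BaseAutotuner
from helion.runtime.config import Config


class TimedAutotuner(BaseAutotuner):
    """Illustrative wrapper: delegates to an inner autotuner and reports how long tuning took."""

    def __init__(self, inner: BaseAutotuner) -> None:
        self.inner = inner

    def autotune(self) -> Config:
        start = time.perf_counter()
        config = self.inner.autotune()  # the only method BaseAutotuner requires
        print(f"autotuning finished in {time.perf_counter() - start:.1f}s")
        return config

A wrapper like this could be composed inside an autotuner_fn in the same way LocalAutotuneCache is composed in default_autotuner_fn below.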

helion/runtime/kernel.py

Lines changed: 1 addition & 11 deletions
@@ -467,17 +467,7 @@ def autotune(
             config = FiniteSearch(self, args, self.configs).autotune()
         else:
             self.settings.check_autotuning_disabled()
-
-            from ..autotuner import DifferentialEvolutionSearch
-            from ..autotuner import LocalAutotuneCache
-
-            config = LocalAutotuneCache(
-                DifferentialEvolutionSearch(
-                    self,
-                    args,
-                    **kwargs,  # pyright: ignore[reportArgumentType]
-                ),
-            ).autotune()
+            config = self.settings.autotuner_fn(self, args, **kwargs).autotune()
 
         self.set_config(config)
         return config

helion/runtime/settings.py

Lines changed: 20 additions & 0 deletions
@@ -8,6 +8,7 @@
 from typing import TYPE_CHECKING
 from typing import Literal
 from typing import Protocol
+from typing import Sequence
 from typing import cast
 
 import torch
@@ -19,9 +20,17 @@
 if TYPE_CHECKING:
     from contextlib import AbstractContextManager
 
+    from ..autotuner.base_search import BaseAutotuner
+    from .kernel import BoundKernel
+
 class _TLS(Protocol):
     default_settings: Settings | None
 
+class AutotunerFunction(Protocol):
+    def __call__(
+        self, bound_kernel: BoundKernel, args: Sequence[object], **kwargs: object
+    ) -> BaseAutotuner: ...
+
 
 _tls: _TLS = cast("_TLS", threading.local())
 
@@ -50,6 +59,15 @@ def __exit__(self, *args: object) -> None:
     return _RestoreContext()
 
 
+def default_autotuner_fn(
+    bound_kernel: BoundKernel, args: Sequence[object], **kwargs: object
+) -> BaseAutotuner:
+    from ..autotuner import DifferentialEvolutionSearch
+    from ..autotuner import LocalAutotuneCache
+
+    return LocalAutotuneCache(DifferentialEvolutionSearch(bound_kernel, args, **kwargs))  # pyright: ignore[reportArgumentType]
+
+
 @dataclasses.dataclass
 class _Settings:
     # see __slots__ below for the doc strings that show up in help(Settings)
@@ -76,6 +94,7 @@ class _Settings:
     ref_mode: RefMode = (
         RefMode.EAGER if os.environ.get("HELION_INTERPRET", "") == "1" else RefMode.OFF
     )
+    autotuner_fn: AutotunerFunction = default_autotuner_fn
 
 
 class Settings(_Settings):
@@ -97,6 +116,7 @@ class Settings(_Settings):
         "force_autotune": "If True, force autotuning even if a config is provided.",
         "allow_warp_specialize": "If True, allow warp specialization for tl.range calls on CUDA devices.",
         "ref_mode": "Reference mode for kernel execution. Can be RefMode.OFF or RefMode.EAGER.",
+        "autotuner_fn": "Function to create an autotuner",
     }
     assert __slots__.keys() == {field.name for field in dataclasses.fields(_Settings)}
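`AutotunerFunction` is the protocol an `autotuner_fn` must satisfy, and `default_autotuner_fn` above is the reference implementation. A sketch of a custom function that skips the caching layer, purely as an illustration of the contract (the function name is hypothetical):

from __future__ import annotations

from typing import Sequence

from helion.autotuner import DifferentialEvolutionSearch
from helion.autotuner.base_search import BaseAutotuner
from helion.runtime.kernel import BoundKernel


def uncached_autotuner_fn(
    bound_kernel: BoundKernel, args: Sequence[object], **kwargs: object
) -> BaseAutotuner:
    # Same search as the default, just without the LocalAutotuneCache wrapper.
    return DifferentialEvolutionSearch(bound_kernel, args, **kwargs)  # pyright: ignore[reportArgumentType]

Such a function can then be supplied per kernel via @helion.kernel(autotuner_fn=uncached_autotuner_fn), as the test diff below demonstrates with a lambda.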

test/test_cache.py

Lines changed: 17 additions & 19 deletions
@@ -1,20 +1,17 @@
 from __future__ import annotations
 
-from pathlib import Path
 import unittest
 
 import torch
 
+import helion
 from helion._testing import DEVICE
 from helion._testing import RefEagerTestDisabled
 from helion._testing import TestCase
-from helion._testing import import_path
 from helion._utils import counters
 from helion.autotuner import StrictLocalAutotuneCache
 from helion.autotuner.base_search import BaseSearch
-
-datadir = Path(__file__).parent / "data"
-basic_kernels = import_path(datadir / "basic_kernels.py")
+import helion.language as hl
 
 
 class BasicSearch(BaseSearch):
@@ -24,39 +21,40 @@ def autotune(self):
 
 class TestCache(RefEagerTestDisabled, TestCase):
     def test_basic(self):
+        @helion.kernel(
+            autotuner_fn=lambda k, a: StrictLocalAutotuneCache(BasicSearch(k, a))
+        )
+        def add(x, y):
+            x, y = torch.broadcast_tensors(x, y)
+            out = torch.empty_like(x)
+            for tile in hl.tile(out.size()):
+                out[tile] = x[tile] + y[tile]
+            return out
+
         a = torch.randn(16, device=DEVICE, dtype=torch.bfloat16)
         args_a = (a, a)
         b = torch.randn(16, device=DEVICE, dtype=torch.float16)
         args_b = (b, b)
 
-        bound_kernel = basic_kernels.add.bind(args_a)
-        config = StrictLocalAutotuneCache(BasicSearch(bound_kernel, args_a)).autotune()
-        bound_kernel.set_config(config)
-        result = bound_kernel(*args_a)
+        result = add(*args_a)
         torch.testing.assert_close(result, a + a)
 
         self.assertEqual(counters["autotune"]["cache_miss"], 1)
        self.assertEqual(counters["autotune"]["cache_hit"], 0)
         self.assertEqual(counters["autotune"]["cache_put"], 1)
 
-        basic_kernels.add.reset()
+        add.reset()
 
-        bound_kernel = basic_kernels.add.bind(args_a)
-        config = StrictLocalAutotuneCache(BasicSearch(bound_kernel, args_a)).autotune()
-        bound_kernel.set_config(config)
-        result = bound_kernel(*args_a)
+        result = add(*args_a)
         torch.testing.assert_close(result, a + a)
 
         self.assertEqual(counters["autotune"]["cache_miss"], 1)
         self.assertEqual(counters["autotune"]["cache_hit"], 1)
         self.assertEqual(counters["autotune"]["cache_put"], 1)
 
-        basic_kernels.add.reset()
+        add.reset()
 
-        bound_kernel = basic_kernels.add.bind(args_b)
-        config = StrictLocalAutotuneCache(BasicSearch(bound_kernel, args_b)).autotune()
-        bound_kernel.set_config(config)
-        result = bound_kernel(*args_b)
+        result = add(*args_b)
         torch.testing.assert_close(result, b + b)
 
         self.assertEqual(counters["autotune"]["cache_miss"], 2)
