[KERNEL] Add decorator to make caching play well with specialized kernel (#7634)

ThomasRaoux · web-flow · commit 2e359d31bf43 · 2025-07-23T17:10:02.000-07:00
Decorator idea and implementation from @apgoucher. This allows specialized kernels to work with preload.
diff --git a/python/triton_kernels/tests/test_specialize.py b/python/triton_kernels/tests/test_specialize.py
@@ -0,0 +1,84 @@
+import torch
+import importlib
+from triton_kernels.specialize import cacheable, specialize
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def template_kernel(o):
+    cst = 1.0
+    tl.store(o, cst)
+
+
+def retrieve_fn(module, name):
+    module = importlib.import_module(module)
+    fn = getattr(module, name)
+    return fn
+
+
+_specialized_kernel = None
+
+
+def get_specialized_kernel():
+    global _specialized_kernel
+    if _specialized_kernel is not None:
+        return _specialized_kernel
+    import types
+    spec_constants = {}
+    spec_tuples = {}
+    module = types.ModuleType("specialized_kernel")
+    module.specialized = specialize(template_kernel, module, spec_constants, spec_tuples)
+    _specialized_kernel = module.specialized
+    return _specialized_kernel
+
+
+@cacheable
+def cacheable_kernel():
+    return get_specialized_kernel()
+
+
+def test_cacheable(device):
+    specialized_kernel = get_specialized_kernel()
+
+    specialization_data = None
+    fn_name = None
+    module_name = None
+
+    def cache_hook(*args, **kwargs):
+        nonlocal specialization_data
+        nonlocal fn_name
+        nonlocal module_name
+        specialization_data = kwargs["compile"]["specialization_data"]
+        fn_name = kwargs["fn"].name
+        module_name = kwargs["fn"].module
+
+    triton.knobs.runtime.jit_cache_hook = cache_hook
+    o = torch.empty((1, ), dtype=torch.float32, device=device)
+    k = specialized_kernel[(1, )](o, )
+    hash = k.hash
+    assert o.item() == 1.0
+    assert module_name == "tests.test_specialize"
+    assert fn_name == "cacheable_kernel"
+
+    compile_count = 0
+
+    def count_hook(*args, **kwargs):
+        nonlocal compile_count
+        compile_count += 1
+
+    triton.knobs.runtime.jit_cache_hook = count_hook
+    # clear the cache
+    specialized_kernel.device_caches.clear()
+
+    # retrieve the kernel from name and preload it.
+    fn = retrieve_fn(module_name, fn_name)
+    assert fn == specialized_kernel
+    preload = fn.preload(specialization_data)
+    assert compile_count == 1
+    assert preload.hash == hash
+
+    # verify that we hit the cache.
+    compile_count = 0
+    specialized_kernel[(1, )](o, )
+    assert compile_count == 0
diff --git a/python/triton_kernels/triton_kernels/specialize.py b/python/triton_kernels/triton_kernels/specialize.py
@@ -5,6 +5,24 @@
 import triton
 
 
+def cacheable(f):
+    """
+    A decorator that allows you to write something of the form:
+
+    @cacheable
+    def my_kernel(): return (expression dynamically defining a kernel)
+
+    such that it interacts gracefully with triton cache and preload.
+    """
+
+    g = f()
+    g.fn.__name__ = f.__name__
+    g.fn.__module__ = f.__module__
+    g.fn.__qualname__ = f.__qualname__
+    g._fn_name = f"{f.__module__}.{f.__qualname__}"
+    return g
+
+
 def define_kernel(src, module, attrs=None, **extra_globals):
     """
     Dynamically create a Triton function or kernel from a src string,