
Commit 1f98379

Cache autotune timings to disk (#6261)

Some users had expressed a desire to cache autotune results, both to speed up local iteration, and to avoid re-tuning when scaling up to large numbers of GPUs.

This PR caches tuning timings in Triton's cache dir. Running locally on 03-matrix-multiplication.py, the time of later runs is greatly reduced:

```
% time python ./03-matrix-multiplication.py
...
real 1m59.055s

% time python ./03-matrix-multiplication.py
...
real 0m13.794s
```

The cache key consists of:

* system information (triton source, target info, env vars)
* kernel source code (with dependences)
* the values of the tuning keys (e.g. M/N/K in the matmul example)
* the set of configs requested for tuning (so that we'll re-tune if the user changes tunings)

If any configs have `pre_hook`s defined, we don't try caching at all, since the results could depend on arbitrary python code.

A sampling of one of the cache entries (from the matmul tutorial) is:

```
% jq . $TRITON_CACHE_DIR/X5O...Q/matmul_kernel.autotune.json
{
  "key": [
    3968,
    3968,
    3968,
    "torch.float8_e5m2",
    "torch.float8_e5m2",
    "torch.float16"
  ],
  "configs_timings": [
    [
      {
        "kwargs": {
          "BLOCK_SIZE_M": 128,
          "BLOCK_SIZE_N": 256,
          "BLOCK_SIZE_K": 64,
          "GROUP_SIZE_M": 8
        },
        "num_warps": 8,
        "num_ctas": 1,
        "num_stages": 3,
        "maxnreg": null,
        "pre_hook": null
      },
      [
        0.14316800236701965,
        0.1420159935951233,
        0.14431999623775482
      ]
    ],
...
```

It's not strictly necessary to encode the key, since it's part of the hashed path, but I think it makes it easier to understand the cache contents for any dev who needs to do so.

I considered a few different designs here:

* storing just the best config versus all timings (I like having the raw data available in the cache from a dev perspective, but I could relent on this, it's an easy change)
* allowing new configs to be added while re-using older cached ones (I got cold feet at the thought of mutating the cache)
* storing all key+config+timings in a single cache file (convenient for analysis, but also requires mutating the cache)
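
For reference, the change exposes this in two ways: a new `cache_results=` argument on `@triton.autotune` and a `TRITON_CACHE_AUTOTUNING=1` environment variable (see the diff below). A minimal sketch of opting in from user code; the kernel, its arguments, and the config values here are illustrative, not part of this PR:

```
import triton
import triton.language as tl


# Illustrative kernel and configs; only cache_results=True (or setting
# TRITON_CACHE_AUTOTUNING=1 in the environment) is what this PR adds.
@triton.autotune(
    configs=[
        triton.Config({"BLOCK_SIZE": 64}, num_warps=4),
        triton.Config({"BLOCK_SIZE": 128}, num_warps=8),
    ],
    key=["n_elements"],   # tuning key: re-tune (or hit the disk cache) per distinct value
    cache_results=True,   # persist autotune timings in Triton's cache dir
)
@triton.jit
def scale_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x * 2.0, mask=mask)
```

The first run still benchmarks every config; later runs with the same tuning key (and unchanged kernel, configs, and environment) load the timings from the cache file instead.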
1 parent 658b5b2 commit 1f98379

File tree

1 file changed (+84, -27 lines)


python/triton/runtime/autotuner.py

Lines changed: 84 additions & 27 deletions
```
@@ -4,6 +4,8 @@
 import os
 import time
 import inspect
+import hashlib
+import json
 from typing import Dict, Tuple, List, Optional
 
 from .jit import KernelInterface
@@ -13,22 +15,9 @@
 
 class Autotuner(KernelInterface):
 
-    def __init__(
-        self,
-        fn,
-        arg_names,
-        configs,
-        key,
-        reset_to_zero,
-        restore_value,
-        pre_hook=None,
-        post_hook=None,
-        prune_configs_by: Optional[Dict] = None,
-        warmup=None,
-        rep=None,
-        use_cuda_graph=False,
-        do_bench=None,
-    ):
+    def __init__(self, fn, arg_names, configs, key, reset_to_zero, restore_value, pre_hook=None, post_hook=None,
+                 prune_configs_by: Optional[Dict] = None, warmup=None, rep=None, use_cuda_graph=False, do_bench=None,
+                 cache_results=False):
         """
         :param prune_configs_by: a dict of functions that are used to prune configs, fields:
             'perf_model': performance model used to predicate running time with different configs, returns running time
@@ -42,6 +31,7 @@ def __init__(
         self.keys = key
         self.cache: Dict[Tuple, Config] = {}
         self.arg_names = arg_names
+        self.cache_results = cache_results or os.getenv("TRITON_CACHE_AUTOTUNING", None) == "1"
 
         # Reset to zero or restore values
         self.reset_to_zero = []
@@ -170,6 +160,50 @@ def kernel_call():
                print(f"Autotuning failed with {e}")
            return [float("inf"), float("inf"), float("inf")]
 
+    def check_disk_cache(self, tuning_key, configs, bench_fn):
+        # We can't serialize prehooks, so just give up and run the benchmarks.
+        if not tuning_key or any(cfg.pre_hook for cfg in configs):
+            bench_fn()
+            return
+
+        from triton._C.libtriton import get_cache_invalidating_env_vars
+        from triton.compiler.compiler import make_backend, triton_key
+        from triton.runtime.cache import get_cache_manager
+        from triton.runtime.jit import JITFunction
+
+        fn = self.fn
+        while not isinstance(fn, JITFunction):
+            fn = fn.fn
+
+        env_vars = get_cache_invalidating_env_vars()
+        cache_key = [
+            triton_key(),
+            make_backend(driver.active.get_current_target()).hash(),
+            fn.cache_key,
+            str(sorted(env_vars.items())),
+            str(tuning_key),
+        ] + [str(c) for c in configs]
+        cache_key = hashlib.sha256("-".join(cache_key).encode("utf-8")).hexdigest()
+        cache = get_cache_manager(cache_key)
+        file_name = f"{fn.__name__[:150]}.autotune.json"
+        path = cache.get_file(file_name)
+        if path:
+            with open(path, "r") as cached_configs:
+                timings = json.load(cached_configs)["configs_timings"]
+            timings = {Config(**config): timing for config, timing in timings}
+            self.cache[tuning_key] = builtins.min(timings, key=timings.get)
+            self.configs_timings = timings
+            return
+
+        bench_fn()
+        cache.put(
+            json.dumps({
+                "key":
+                tuning_key,
+                "configs_timings":
+                [(config.__dict__, timings) for config, timings in self.configs_timings.items() if not config.pre_hook],
+            }), file_name, binary=False)
+
     def run(self, *args, **kwargs):
         self.nargs = dict(zip(self.arg_names, args))
         used_cached_result = True
@@ -182,17 +216,24 @@ def run(self, *args, **kwargs):
                    key.append(str(arg.dtype))
            key = tuple(key)
            if key not in self.cache:
-                # prune configs
                used_cached_result = False
                pruned_configs = self.prune_configs(kwargs)
-                bench_start = time.time()
-                timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
-                bench_end = time.time()
-                self.bench_time = bench_end - bench_start
-                self.cache[key] = builtins.min(timings, key=timings.get)
-                full_nargs = {**self.nargs, **kwargs, **self.cache[key].all_kwargs()}
-                self.pre_hook(full_nargs, reset_only=True)
-                self.configs_timings = timings
+
+                def benchmark():
+                    bench_start = time.time()
+                    timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
+                    bench_end = time.time()
+                    self.bench_time = bench_end - bench_start
+                    self.cache[key] = builtins.min(timings, key=timings.get)
+                    full_nargs = {**self.nargs, **kwargs, **self.cache[key].all_kwargs()}
+                    self.pre_hook(full_nargs, reset_only=True)
+                    self.configs_timings = timings
+
+                if self.cache_results:
+                    self.check_disk_cache(key, pruned_configs, benchmark)
+                else:
+                    benchmark()
+
            config = self.cache[key]
        else:
            config = self.configs[0]
@@ -300,9 +341,23 @@ def __str__(self):
            res.append(f"maxnreg: {self.maxnreg}")
        return ", ".join(res)
 
+    def __hash__(self):
+        return hash((*self.all_kwargs().items(), self.pre_hook))
+
+    def __eq__(self, other):
+        self_tuple = tuple((
+            *self.all_kwargs().items(),
+            self.pre_hook,
+        ))
+        other_tuple = tuple((
+            *other.all_kwargs().items(),
+            other.pre_hook,
+        ))
+        return self_tuple == other_tuple
+
 
 def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, restore_value=None, pre_hook=None, post_hook=None,
-             warmup=None, rep=None, use_cuda_graph=False, do_bench=None):
+             warmup=None, rep=None, use_cuda_graph=False, do_bench=None, cache_results=False):
    """
    Decorator for auto-tuning a :code:`triton.jit`'d function.
 
@@ -356,12 +411,14 @@ def kernel(x_ptr, x_size, BLOCK_SIZE: tl.constexpr):
    :type rep: int
    :param do_bench: a benchmark function to measure the time of each run.
    :type do_bench: lambda fn, quantiles
+    :param cache_results: whether to cache autotune timings to disk. Defaults to False.
+    :type cache_results: bool
    """
 
    def decorator(fn):
        return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, restore_value, pre_hook=pre_hook,
                         post_hook=post_hook, prune_configs_by=prune_configs_by, warmup=warmup, rep=rep,
-                        use_cuda_graph=use_cuda_graph, do_bench=do_bench)
+                        use_cuda_graph=use_cuda_graph, do_bench=do_bench, cache_results=cache_results)
 
    return decorator
 
```
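Because the cache entries written by `check_disk_cache` are plain JSON, they can also be inspected outside the autotuner. A short sketch of loading one entry and reporting its fastest config, assuming the `key`/`configs_timings` layout shown in the commit message (the file path is a placeholder passed on the command line):

```
import json
import sys

# Placeholder: pass the path to a cache entry, e.g.
# $TRITON_CACHE_DIR/<hash>/<kernel_name>.autotune.json
path = sys.argv[1]

with open(path) as f:
    entry = json.load(f)

print("tuning key:", entry["key"])

# "configs_timings" is a list of [config_dict, timings] pairs; pick the pair
# with the smallest first timing value, roughly mirroring how the autotuner
# chooses its best config.
best_config, best_timings = min(entry["configs_timings"], key=lambda ct: ct[1][0])
print("fastest config:", best_config["kwargs"],
      f"num_warps={best_config['num_warps']}",
      f"num_stages={best_config['num_stages']}")
print("timings:", best_timings)
```

On a cache hit, the autotuner itself does essentially the same thing: it rebuilds `Config` objects from the stored dicts and takes the minimum over their timings (see `check_disk_cache` above).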