Commit 10a5601

use global cache with per-key future events for multi-thread sync on same key/kernel autotune
1 parent a12e370 commit 10a5601

File tree

1 file changed: +78 −46 lines changed


python/triton/runtime/autotuner.py

Lines changed: 78 additions & 46 deletions
@@ -17,12 +17,13 @@
 from triton._C.libtriton import get_cache_invalidating_env_vars
 
 
-class AutotunerThreadState:
-    """Per-thread autotune cache and metadata."""
+class CacheFuture:
 
     def __init__(self):
-        self.cache: Dict[Tuple, Config] = {}
-        self.configs_timings: Dict[Config, List[float]] | None = None
+        self.event = threading.Event()
+        self.config: Config | None = None
+        self.error: BaseException | None = None
+        self.used_cached_result: bool = True
         self.bench_time: float | None = None
 
 
@@ -44,7 +45,9 @@ def __init__(self, fn, arg_names, configs, key, reset_to_zero, restore_value, pr
         else:
             self.configs = configs
         self.keys = key
-        self._thread_state = threading.local()
+        self._cache: Dict[Tuple, Config] = {}
+        self._cache_lock = threading.RLock()
+        self._cache_futures: Dict[Tuple, CacheFuture] = {}
         self.arg_names = arg_names
         self.cache_results = cache_results or (knobs.autotuning.cache and not knobs.runtime.interpret)
 
@@ -135,13 +138,6 @@ def do_bench(self):
             return driver.active.get_benchmarker()
         return self._do_bench
 
-    def _get_thread_state(self) -> AutotunerThreadState:
-        state = getattr(self._thread_state, "value", None)
-        if state is None:
-            state = AutotunerThreadState()
-            self._thread_state.value = state
-        return state
-
     def _bench(self, nargs, *args, config, **meta):
         from ..compiler.errors import CompileTimeAssertionFailure
 
@@ -184,11 +180,11 @@ def kernel_call():
             print(f"Autotuning failed with {e}")
             return [float("inf"), float("inf"), float("inf")]
 
-    def check_disk_cache(self, tuning_key, configs, bench_fn, state: AutotunerThreadState):
+    def check_disk_cache(self, tuning_key, configs, bench_fn):
         # We can't serialize prehooks, so just give up and run the benchmarks.
         if not tuning_key or any(cfg.pre_hook for cfg in configs):
-            bench_fn()
-            return False
+            configs_timings, bench_time, best_config = bench_fn()
+            return False, bench_time, configs_timings, best_config
 
         from triton.compiler.compiler import make_backend
 
@@ -212,26 +208,82 @@ def check_disk_cache(self, tuning_key, configs, bench_fn, state: AutotunerThread
             with open(path, "r") as cached_configs:
                 timings = json.load(cached_configs)["configs_timings"]
                 timings = {Config(**config): timing for config, timing in timings}
-                state.cache[tuning_key] = builtins.min(timings, key=timings.get)
-                state.configs_timings = timings
-            return True
+                best_config = builtins.min(timings, key=timings.get)
+            return True, None, timings, best_config
 
-        bench_fn()
+        configs_timings, bench_time, best_config = bench_fn()
         cache.put(
             json.dumps({
                 "key":
                 tuning_key,
                 "configs_timings": [(config.__dict__, timings)
-                                    for config, timings in (state.configs_timings or {}).items()
+                                    for config, timings in (configs_timings or {}).items()
                                     if not config.pre_hook],
             }), file_name, binary=False)
-        return False
+        return False, bench_time, configs_timings, best_config
+
+    def _get_config_for_key(self, key, nargs, args, kwargs):
+        with self._cache_lock:
+            cached = self._cache.get(key)
+            if cached is not None:
+                return cached, True, None
+
+            future = self._cache_futures.get(key)
+            if future is None:
+                future = CacheFuture()
+                self._cache_futures[key] = future
+                runner = True
+            else:
+                runner = False
+
+        if not runner:
+            future.event.wait()
+            if future.error is not None:
+                raise future.error
+            return future.config, future.used_cached_result, future.bench_time
+
+        pruned_configs = self.prune_configs(kwargs, nargs)
+
+        def benchmark():
+            bench_start = time.time()
+            timings = {config: self._bench(nargs, *args, config=config, **kwargs) for config in pruned_configs}
+            bench_duration = time.time() - bench_start
+            best_config = builtins.min(timings, key=timings.get)
+            full_nargs_local = {**nargs, **kwargs, **best_config.all_kwargs()}
+            self.pre_hook(full_nargs_local, reset_only=True)
+            return timings, bench_duration, best_config
+
+        used_cached_result = False
+        bench_time = None
+
+        try:
+            if self.cache_results:
+                used_cached_result, bench_time, configs_timings, best_config = self.check_disk_cache(
+                    key, pruned_configs, benchmark)
+            else:
+                configs_timings, bench_time, best_config = benchmark()
+                used_cached_result = False
+
+            if best_config is not None:
+                with self._cache_lock:
+                    self._cache[key] = best_config
+
+            future.config = best_config
+            future.used_cached_result = used_cached_result
+            future.bench_time = bench_time
+            return best_config, used_cached_result, bench_time
+        except BaseException as exc:
+            future.error = exc
+            raise
+        finally:
+            future.event.set()
+            with self._cache_lock:
+                self._cache_futures.pop(key, None)
 
     def run(self, *args, **kwargs):
-        state = self._get_thread_state()
-        cache = state.cache
         nargs = dict(zip(self.arg_names, args))
         used_cached_result = True
+        bench_time = None
         key = None
         if len(self.configs) > 1:
             all_args = {**nargs, **kwargs}
@@ -241,34 +293,14 @@ def run(self, *args, **kwargs):
                 if hasattr(arg, "dtype"):
                     key_values.append(str(arg.dtype))
             key = tuple(key_values)
-            if key not in cache:
-                used_cached_result = False
-                pruned_configs = self.prune_configs(kwargs, nargs)
-
-                def benchmark():
-                    bench_start = time.time()
-                    timings = {config: self._bench(nargs, *args, config=config, **kwargs) for config in pruned_configs}
-                    bench_end = time.time()
-                    state.bench_time = bench_end - bench_start
-                    best_config = builtins.min(timings, key=timings.get)
-                    cache[key] = best_config
-                    full_nargs_local = {**nargs, **kwargs, **best_config.all_kwargs()}
-                    self.pre_hook(full_nargs_local, reset_only=True)
-                    state.configs_timings = timings
-
-                if self.cache_results:
-                    used_cached_result = self.check_disk_cache(key, pruned_configs, benchmark, state)
-                else:
-                    benchmark()
-
-            config = cache[key]
+            config, used_cached_result, bench_time = self._get_config_for_key(key, nargs, args, kwargs)
         else:
             config = self.configs[0]
         self.best_config = config
         if knobs.autotuning.print and key is not None and not used_cached_result:
-            bench_time = state.bench_time or 0.0
+            bench_time_value = bench_time or 0.0
             print(f"Triton autotuning for function {self.base_fn.__name__},\nwith key as {key},\n"
-                  f"finished after {bench_time:.2f}s,\nbest config selected: {self.best_config};")
+                  f"finished after {bench_time_value:.2f}s,\nbest config selected: {self.best_config};")
         full_nargs = {**nargs, **kwargs, **config.all_kwargs()}
         if config.pre_hook is not None:
             config.pre_hook(full_nargs)

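For context, here is a minimal, standalone sketch of the per-key future-event idiom that the new CacheFuture and _get_config_for_key above implement: the first thread to miss on a key becomes the runner and does the expensive work once, while any other thread asking for the same key blocks on that key's threading.Event and then reuses the runner's result or re-raises its error. This is an illustration only, not Triton code; the names KeyedCache, _Future, and compute are hypothetical stand-ins (compute plays the role of the autotune benchmark sweep).

import threading
import time


class _Future:
    """In-flight result for one key: waiters block on `event`."""

    def __init__(self):
        self.event = threading.Event()
        self.value = None
        self.error = None


class KeyedCache:
    """Global cache with one future per key (illustrative sketch)."""

    def __init__(self, compute):
        self._compute = compute      # expensive function (stand-in for the benchmark sweep)
        self._lock = threading.RLock()
        self._results = {}           # key -> finished result (the global cache)
        self._futures = {}           # key -> _Future for work currently in flight

    def get(self, key):
        with self._lock:
            if key in self._results:              # fast path: already computed
                return self._results[key]
            future = self._futures.get(key)
            if future is None:                    # nobody is working on this key: we run it
                future = self._futures[key] = _Future()
                runner = True
            else:                                 # someone else is already running it
                runner = False

        if not runner:
            future.event.wait()                   # block until the runner finishes
            if future.error is not None:
                raise future.error                # propagate the runner's failure
            return future.value

        try:
            value = self._compute(key)
            with self._lock:
                self._results[key] = value        # publish to the global cache
            future.value = value
            return value
        except BaseException as exc:
            future.error = exc
            raise
        finally:
            future.event.set()                    # wake waiters even on error
            with self._lock:
                self._futures.pop(key, None)      # in-flight entry is no longer needed


if __name__ == "__main__":
    # Eight threads ask for the same key; compute runs only once.
    calls = []

    def compute(key):
        calls.append(key)
        time.sleep(0.1)                           # pretend this is a benchmark sweep
        return f"best-config-for-{key}"

    cache = KeyedCache(compute)
    threads = [threading.Thread(target=cache.get, args=("kernel-key",)) for _ in range(8)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    assert len(calls) == 1

As in the commit, the result is written before the event is set and the in-flight future is dropped in a finally block, so late arrivals either hit the global cache or become a fresh runner if the previous attempt failed.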