[WIP] Improve autotune infra to catch more error cases

yf225 · yf225 · commit 84bdc4d800b0 · 2025-07-22T00:07:00.000-07:00
diff --git a/helion/_compiler/tile_dispatch.py b/helion/_compiler/tile_dispatch.py
@@ -94,7 +94,7 @@ def _add_reduction_strategies(self, fn: DeviceFunction, config: Config) -> None:
             reduction_loop = env.config_spec.reduction_loops.config_get(
                 config.reduction_loops, block_id, None
             )
-            if reduction_loop is None:
+            if reduction_loop is None or reduction_loop <= 1:
                 strategy: TileStrategy = PersistentReductionStrategy(fn, block_id)
             else:
                 strategy = LoopedReductionStrategy(fn, block_id, reduction_loop)
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -21,6 +21,7 @@
 
 from torch._inductor.runtime.triton_compat import OutOfResources
 from torch._inductor.runtime.triton_compat import PTXASError
+from triton.compiler.errors import CompilationError
 import torch.multiprocessing as mp
 from triton.testing import do_bench
 
@@ -43,7 +44,7 @@
     from . import ConfigSpec
 
 _expected_errors_regexp: re.Pattern[str] = re.compile(
-    r"|".join(map(re.escape, ["[CUDA]: invalid argument"]))
+    r"|".join(map(re.escape, ["[CUDA]: invalid argument", "exceeds triton maximum tensor numel"]))
 )
 
 
@@ -88,10 +89,13 @@ def benchmark(self, config: Config) -> float:
         Returns:
             The performance of the configuration in seconds.
         """
-        fn = self.kernel.compile_config(config, allow_print=False)
-        if self.start_precompile_and_check_for_hangs(config, fn)():
-            return self.benchmark_function(config, fn)
-        return inf
+        try:
+            fn = self.kernel.compile_config(config, allow_print=False)
+            if self.start_precompile_and_check_for_hangs(config, fn)():
+                return self.benchmark_function(config, fn)
+            return inf
+        except Exception as e:
+            return inf
 
     def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
         """
@@ -125,8 +129,10 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             self.log.debug("Benchmarking failed: OutOfResources")
         except PTXASError:
             self.log.warning(f"PTXASError compiling config: {config}")
+        except CompilationError:
+            self.log.debug("Benchmarking failed: Triton CompilationError")
         except Exception as e:
-            if not _expected_errors_regexp.search(str(e)):
+            if not _expected_errors_regexp.search(str(e)) and not "exceeds triton maximum tensor numel" in str(e):
                 raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
             self.log.debug(f"Benchmarking failed: {type(e).__name__}: {e}")
         return inf
@@ -149,6 +155,8 @@ def start_precompile_and_check_for_hangs(
         """
         if not self.settings.autotune_precompile:
             return PrecompileFuture.skip(self, config, True)
+        if fn is None:
+            return PrecompileFuture.skip(self, config, False)
         ctx = mp.get_context("fork")
 
         def extract_launcher(
@@ -188,7 +196,13 @@ def parallel_benchmark(self, configs: list[Config]) -> list[tuple[Config, float]
         Returns:
             A list of tuples containing configurations and their performance.
         """
-        fns = [self.kernel.compile_config(c, allow_print=False) for c in configs]
+        fns = []
+        for c in configs:
+            try:
+                compile_result = self.kernel.compile_config(c, allow_print=False)
+                fns.append(compile_result)
+            except Exception as e:
+                fns.append(None)
         if self.settings.autotune_precompile:
             is_workings = PrecompileFuture.wait_for_all(
                 [
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
@@ -411,8 +411,8 @@ def _flat_config(
         default = min(high, 4096)
         value = fn(BlockSizeFragment(low, high, default))
         assert isinstance(value, int)
-        if value >= self.size_hint:
-            return None  # max size becomes persistent reduction
+        if value >= self.size_hint or value < low:
+            return None  # max size or invalid value becomes persistent reduction
         return value
 
     def _normalize(self, name: str, value: object) -> int | None:

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ def _add_reduction_strategies(self, fn: DeviceFunction, config: Config) -> None:`
`94`	`94`	`reduction_loop = env.config_spec.reduction_loops.config_get(`
`95`	`95`	`config.reduction_loops, block_id, None`
`96`	`96`	`)`
`97`		`- if reduction_loop is None:`
	`97`	`+ if reduction_loop is None or reduction_loop <= 1:`
`98`	`98`	`strategy: TileStrategy = PersistentReductionStrategy(fn, block_id)`
`99`	`99`	`else:`
`100`	`100`	`strategy = LoopedReductionStrategy(fn, block_id, reduction_loop)`