
Commit ea6b4fc

20250722_benchmark_sweep
1 parent 24d453d commit ea6b4fc

4 files changed, +60 -10 lines changed

benchmarks/run.py

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@
 import sys
 from typing import Any
 from typing import Callable
+import time

 # Maps tritonbench op names to Helion kernel examples
 # Can map to a single kernel or a list of kernel variants

benchmarks/run_input_shard.sh

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+[[ -z "$RANK_OFFSET" ]] && { echo "Error: RANK_OFFSET is not set"; exit 1; }
+[[ -z "$SHARD" ]] && { echo "Error: SHARD is not set"; exit 1; }
+[[ -z "$WORLD_SIZE" ]] && { echo "Error: WORLD_SIZE is not set"; exit 1; }
+
+# Capture timestamp once for consistent filename
+TIMESTAMP=$(date +%s)
+OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_$((SHARD+1))_of_${WORLD_SIZE}.txt"
+
+# Retry until success
+attempt=0
+while true; do
+# while (( attempt < 10 )); do
+    attempt=$((attempt + 1))
+    echo "Attempt $attempt: Running benchmark for shard $((SHARD+1))/${WORLD_SIZE}..."
+
+    # TIMESTAMP=$(date +%s)
+    # OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_$((SHARD+1))_of_${WORLD_SIZE}.txt"
+
+    CUDA_VISIBLE_DEVICES=$((RANK_OFFSET+SHARD)) python benchmarks/run.py --input-shard $((SHARD+1))/${WORLD_SIZE} --metrics accuracy,tflops,gbps,speedup >"$OUTPUT_FILE" 2>&1
+
+    exit_code=$?
+    if [ $exit_code -eq 0 ]; then
+        echo "Success! Benchmark completed for shard $((SHARD+1))/${WORLD_SIZE}"
+        break
+    else
+        echo "Failed with exit code $exit_code. Retrying..."
+        sleep 10  # wait a few seconds before retrying
+    fi
+done
+
+# SHARD=0 RANK_OFFSET=4 WORLD_SIZE=4 bash benchmarks/run_input_shard.sh
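For reference, one way to drive this script across every shard at once is a small launcher like the sketch below. It is a hypothetical helper, not part of this commit; it assumes only the SHARD, RANK_OFFSET, and WORLD_SIZE environment variables that the script checks for above.

# Hypothetical launcher, not part of this commit: start one run_input_shard.sh
# per shard, each pinned to its own GPU via the env vars the script expects.
import os
import subprocess

WORLD_SIZE = 4   # total number of shards (assumed: one shard per GPU)
RANK_OFFSET = 0  # index of the first CUDA device to use

procs = []
for shard in range(WORLD_SIZE):
    env = dict(
        os.environ,
        SHARD=str(shard),
        RANK_OFFSET=str(RANK_OFFSET),
        WORLD_SIZE=str(WORLD_SIZE),
    )
    procs.append(subprocess.Popen(["bash", "benchmarks/run_input_shard.sh"], env=env))

for p in procs:
    p.wait()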

helion/autotuner/base_search.py

Lines changed: 26 additions & 8 deletions
@@ -16,6 +16,8 @@
 from typing import NamedTuple
 from typing import NoReturn

+from triton.compiler.errors import CompilationError
+
 if TYPE_CHECKING:
     from triton.runtime.jit import JITFunction

@@ -97,10 +99,13 @@ def benchmark(self, config: Config) -> float:
         Returns:
             The performance of the configuration in seconds.
         """
-        fn = self.kernel.compile_config(config, allow_print=False)
-        if self.start_precompile_and_check_for_hangs(config, fn)():
-            return self.benchmark_function(config, fn)
-        return inf
+        try:
+            fn = self.kernel.compile_config(config, allow_print=False)
+            if self.start_precompile_and_check_for_hangs(config, fn)():
+                return self.benchmark_function(config, fn)
+            return inf
+        except Exception as e:
+            return inf

     def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
         """
@@ -134,9 +139,11 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             self.log.debug("Benchmarking failed: OutOfResources")
         except PTXASError:
             self.log.warning(f"PTXASError compiling config: {config}")
+        except CompilationError:
+            self.log.debug("Benchmarking failed: Triton CompilationError")
         except Exception as e:
-            if not _expected_errors_regexp.search(str(e)):
-                raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
+            # if not _expected_errors_regexp.search(str(e)):
+            #     raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
             self.log.debug(f"Benchmarking failed: {type(e).__name__}: {e}")
         return inf

@@ -158,6 +165,8 @@ def start_precompile_and_check_for_hangs(
         """
         if not self.settings.autotune_precompile:
             return PrecompileFuture.skip(self, config, True)
+        if fn is None:
+            return PrecompileFuture.skip(self, config, False)
         ctx = mp.get_context("fork")

         def extract_launcher(
@@ -178,6 +187,8 @@ def extract_launcher(
             precompiler = make_precompiler(e.kernel)(*e.args, **e.kwargs)
             if precompiler is already_compiled:
                 return PrecompileFuture.skip(self, config, True)
+        except Exception as e:
+            return PrecompileFuture.skip(self, config, False)
         process: mp.Process = ctx.Process(target=precompiler)  # pyright: ignore[reportAssignmentType]
         process.start()
         return PrecompileFuture(
@@ -197,7 +208,13 @@ def parallel_benchmark(self, configs: list[Config]) -> list[tuple[Config, float]
         Returns:
             A list of tuples containing configurations and their performance.
         """
-        fns = [self.kernel.compile_config(c, allow_print=False) for c in configs]
+        fns = []
+        for c in configs:
+            try:
+                compile_result = self.kernel.compile_config(c, allow_print=False)
+                fns.append(compile_result)
+            except Exception as e:
+                fns.append(None)
         if self.settings.autotune_precompile:
             is_workings = PrecompileFuture.wait_for_all(
                 [
@@ -376,11 +393,12 @@ def population_statistics(population: list[PopulationMember]) -> str:
         working = [x for x in population if not math.isinf(x.perf)]
         return (
             f"failed={len(population) - len(working)} "
+        ) + (
             f"min={working[0].perf:.4f} "
             f"mid={working[len(working) // 2].perf:.4f} "
             f"max={working[-1].perf:.4f} "
             f"best={population[0].config!s}"
-        )
+        ) if len(working) > 0 else "all failed!"
     return (
         f"min={population[0].perf:.4f} "
         f"mid={population[len(population) // 2].perf:.4f} "

helion/autotuner/config_spec.py

Lines changed: 2 additions & 2 deletions
@@ -411,8 +411,8 @@ def _flat_config(
         default = min(high, 4096)
         value = fn(BlockSizeFragment(low, high, default))
         assert isinstance(value, int)
-        if value >= self.size_hint:
-            return None  # max size becomes persistent reduction
+        if value >= self.size_hint or value < low:
+            return None  # max size or invalid value becomes persistent reduction
         return value

     def _normalize(self, name: str, value: object) -> int | None:
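The extra `value < low` check means an out-of-range reduction block size is handled the same way as the maximum size. A standalone illustration of the guard follows; pick_reduction_block is a made-up name for illustration, not the Helion API.

# Standalone illustration of the guard added above: values at or above the
# size hint, or below the fragment's lower bound, fall back to a persistent
# reduction (signalled by returning None).
def pick_reduction_block(value: int, low: int, size_hint: int) -> int | None:
    if value >= size_hint or value < low:
        return None  # max size or invalid value becomes persistent reduction
    return value

assert pick_reduction_block(4096, low=16, size_hint=4096) is None  # max size
assert pick_reduction_block(8, low=16, size_hint=4096) is None     # below low -> invalid
assert pick_reduction_block(256, low=16, size_hint=4096) == 256    # in range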
