
Commit ea6b4fc

20250722_benchmark_sweep
1 parent 24d453d commit ea6b4fc

4 files changed, +60 -10 lines changed

benchmarks/run.py

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@
 import sys
 from typing import Any
 from typing import Callable
+import time

 # Maps tritonbench op names to Helion kernel examples
 # Can map to a single kernel or a list of kernel variants

benchmarks/run_input_shard.sh

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+[[ -z "$RANK_OFFSET" ]] && { echo "Error: RANK_OFFSET is not set"; exit 1; }
+[[ -z "$SHARD" ]] && { echo "Error: SHARD is not set"; exit 1; }
+[[ -z "$WORLD_SIZE" ]] && { echo "Error: WORLD_SIZE is not set"; exit 1; }
+
+# Capture timestamp once for consistent filename
+TIMESTAMP=$(date +%s)
+OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_$((SHARD+1))_of_${WORLD_SIZE}.txt"
+
+# Retry until success
+attempt=0
+while true; do
+# while (( attempt < 10 )); do
+    attempt=$((attempt + 1))
+    echo "Attempt $attempt: Running benchmark for shard $((SHARD+1))/${WORLD_SIZE}..."
+
+    # TIMESTAMP=$(date +%s)
+    # OUTPUT_FILE="benchmarks_autotune_${TIMESTAMP}_input_shard_$((SHARD+1))_of_${WORLD_SIZE}.txt"
+
+    CUDA_VISIBLE_DEVICES=$((RANK_OFFSET+SHARD)) python benchmarks/run.py --input-shard $((SHARD+1))/${WORLD_SIZE} --metrics accuracy,tflops,gbps,speedup >"$OUTPUT_FILE" 2>&1
+
+    exit_code=$?
+    if [ $exit_code -eq 0 ]; then
+        echo "Success! Benchmark completed for shard $((SHARD+1))/${WORLD_SIZE}"
+        break
+    else
+        echo "Failed with exit code $exit_code. Retrying..."
+        sleep 10  # wait a few seconds before retrying
+    fi
+done
+
+# SHARD=0 RANK_OFFSET=4 WORLD_SIZE=4 bash benchmarks/run_input_shard.sh
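For reference, one way to drive this script across every shard at once is a small launcher like the sketch below. It is a hypothetical helper, not part of this commit; it assumes only the SHARD, RANK_OFFSET, and WORLD_SIZE environment variables that the script checks for above.

# Hypothetical launcher, not part of this commit: start one run_input_shard.sh
# per shard, each pinned to its own GPU via the env vars the script expects.
import os
import subprocess

WORLD_SIZE = 4   # total number of shards (assumed: one shard per GPU)
RANK_OFFSET = 0  # index of the first CUDA device to use

procs = []
for shard in range(WORLD_SIZE):
    env = dict(
        os.environ,
        SHARD=str(shard),
        RANK_OFFSET=str(RANK_OFFSET),
        WORLD_SIZE=str(WORLD_SIZE),
    )
    procs.append(subprocess.Popen(["bash", "benchmarks/run_input_shard.sh"], env=env))

for p in procs:
    p.wait()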

helion/autotuner/base_search.py

Lines changed: 26 additions & 8 deletions
@@ -16,6 +16,8 @@
 from typing import NamedTuple
 from typing import NoReturn

+from triton.compiler.errors import CompilationError
+
 if TYPE_CHECKING:
     from triton.runtime.jit import JITFunction

@@ -97,10 +99,13 @@ def benchmark(self, config: Config) -> float:
         Returns:
             The performance of the configuration in seconds.
         """
-        fn = self.kernel.compile_config(config, allow_print=False)
-        if self.start_precompile_and_check_for_hangs(config, fn)():
-            return self.benchmark_function(config, fn)
-        return inf
+        try:
+            fn = self.kernel.compile_config(config, allow_print=False)
+            if self.start_precompile_and_check_for_hangs(config, fn)():
+                return self.benchmark_function(config, fn)
+            return inf
+        except Exception as e:
+            return inf

     def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
         """
@@ -134,9 +139,11 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float:
             self.log.debug("Benchmarking failed: OutOfResources")
         except PTXASError:
             self.log.warning(f"PTXASError compiling config: {config}")
+        except CompilationError:
+            self.log.debug("Benchmarking failed: Triton CompilationError")
         except Exception as e:
-            if not _expected_errors_regexp.search(str(e)):
-                raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
+            # if not _expected_errors_regexp.search(str(e)):
+            #     raise exc.TritonError(f"{type(e).__qualname__}: {e}", config) from e
             self.log.debug(f"Benchmarking failed: {type(e).__name__}: {e}")
         return inf

@@ -158,6 +165,8 @@ def start_precompile_and_check_for_hangs(
         """
         if not self.settings.autotune_precompile:
             return PrecompileFuture.skip(self, config, True)
+        if fn is None:
+            return PrecompileFuture.skip(self, config, False)
         ctx = mp.get_context("fork")

         def extract_launcher(
@@ -178,6 +187,8 @@ def extract_launcher(
             precompiler = make_precompiler(e.kernel)(*e.args, **e.kwargs)
             if precompiler is already_compiled:
                 return PrecompileFuture.skip(self, config, True)
+        except Exception as e:
+            return PrecompileFuture.skip(self, config, False)
         process: mp.Process = ctx.Process(target=precompiler)  # pyright: ignore[reportAssignmentType]
         process.start()
         return PrecompileFuture(
@@ -197,7 +208,13 @@ def parallel_benchmark(self, configs: list[Config]) -> list[tuple[Config, float]
         Returns:
             A list of tuples containing configurations and their performance.
         """
-        fns = [self.kernel.compile_config(c, allow_print=False) for c in configs]
+        fns = []
+        for c in configs:
+            try:
+                compile_result = self.kernel.compile_config(c, allow_print=False)
+                fns.append(compile_result)
+            except Exception as e:
+                fns.append(None)
         if self.settings.autotune_precompile:
             is_workings = PrecompileFuture.wait_for_all(
                 [
@@ -376,11 +393,12 @@ def population_statistics(population: list[PopulationMember]) -> str:
         working = [x for x in population if not math.isinf(x.perf)]
         return (
             f"failed={len(population) - len(working)} "
+        ) + (
             f"min={working[0].perf:.4f} "
             f"mid={working[len(working) // 2].perf:.4f} "
             f"max={working[-1].perf:.4f} "
             f"best={population[0].config!s}"
-        )
+        ) if len(working) > 0 else "all failed!"
     return (
         f"min={population[0].perf:.4f} "
         f"mid={population[len(population) // 2].perf:.4f} "

helion/autotuner/config_spec.py

Lines changed: 2 additions & 2 deletions
@@ -411,8 +411,8 @@ def _flat_config(
         default = min(high, 4096)
         value = fn(BlockSizeFragment(low, high, default))
         assert isinstance(value, int)
-        if value >= self.size_hint:
-            return None  # max size becomes persistent reduction
+        if value >= self.size_hint or value < low:
+            return None  # max size or invalid value becomes persistent reduction
         return value

     def _normalize(self, name: str, value: object) -> int | None:
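The extra `value < low` check means an out-of-range reduction block size is handled the same way as the maximum size. A standalone illustration of the guard follows; pick_reduction_block is a made-up name for illustration, not the Helion API.

# Standalone illustration of the guard added above: values at or above the
# size hint, or below the fragment's lower bound, fall back to a persistent
# reduction (signalled by returning None).
def pick_reduction_block(value: int, low: int, size_hint: int) -> int | None:
    if value >= size_hint or value < low:
        return None  # max size or invalid value becomes persistent reduction
    return value

assert pick_reduction_block(4096, low=16, size_hint=4096) is None  # max size
assert pick_reduction_block(8, low=16, size_hint=4096) is None     # below low -> invalid
assert pick_reduction_block(256, low=16, size_hint=4096) == 256    # in range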
