Skip to content

Commit 4b2d3d8

Browse files
Improve timeout handling: skip timed-out algorithms and cap big benchmarks at 15000 (#718)
* Fix hanging issue in autotune timeout mechanism

  - Replace manual polling loop with Julia's built-in timedwait() function
  - Avoid using Base.throwto() which can cause hangs with sleeping tasks
  - Let timed-out tasks continue in background rather than trying to kill them
  - Use Channels for clean communication between tasks
  - Close channels properly to prevent resource leaks

  This fixes the issue where autotune would hang indefinitely when trying to interrupt tasks that exceeded the timeout limit.

* Improve timeout handling: skip timed-out algorithms and cap big benchmarks

  - Algorithms that time out are automatically excluded from larger matrix sizes
  - This prevents wasting time on algorithms that are already too slow
  - Cap :big benchmark sizes at 15000 instead of 20000 for stability
  - 20000x20000 matrices often cause issues even on powerful computers
  - Add tracking of timed-out algorithms per element type
  - Report number of skipped algorithms in summary output
  - Update documentation to reflect these improvements

  This makes the autotuning process much more efficient by not repeatedly testing algorithms that have already proven to be too slow.

---------

Co-authored-by: ChrisRackauckas <[email protected]>
1 parent 47e7223 commit 4b2d3d8

File tree

3 files changed

+41
-6
lines changed

3 files changed

+41
-6
lines changed

docs/src/tutorials/autotune.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ Control which matrix size ranges to test:
7373
# :small - 20×20 to 100×100 (small problems)
7474
# :medium - 100×100 to 300×300 (typical problems)
7575
# :large - 300×300 to 1000×1000 (larger problems)
76-
# :big - 10000×1000 to 20000x20000 (GPU/HPC scale)
76+
# :big - 1000×1000 to 15000×15000 (GPU/HPC scale, capped at 15000 for stability)
7777

7878
# Default: test tiny through large
7979
results = autotune_setup() # uses [:tiny, :small, :medium, :large]
@@ -161,8 +161,11 @@ When an algorithm exceeds the `maxtime` limit:
161161
- The test is skipped to prevent hanging
162162
- The result is recorded as `NaN` in the benchmark data
163163
- A warning is displayed indicating the timeout
164+
- **The algorithm is automatically excluded from all larger matrix sizes for that element type** to save time (each skipped test is also recorded as `NaN`)
164165
- The benchmark continues with the next algorithm
165166

167+
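This timeout handling ensures that the benchmark does not keep spending time on an algorithm at progressively larger matrix sizes once it has proven too slow at a smaller size. Timeouts are tracked separately for each element type, so an algorithm that times out for one element type is still benchmarked for the others.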
168+
166169
### Missing Algorithm Handling
167170

168171
By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement:

lib/LinearSolveAutotune/src/LinearSolveAutotune.jl

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ Run a comprehensive benchmark of all available LU factorization methods and opti
182182
183183
# Arguments
184184
185-
- `sizes = [:small, :medium, :large]`: Size categories to test. Options: :small (5-20), :medium (20-300), :large (300-1000), :big (10000-100000)
185+
- `sizes = [:tiny, :small, :medium, :large]`: Size categories to test. Options: :tiny (5-20), :small (20-100), :medium (100-300), :large (300-1000), :big (1000-15000)
186186
- `set_preferences::Bool = true`: Update LinearSolve preferences with optimal algorithms
187187
- `samples::Int = 5`: Number of benchmark samples per algorithm/size
188188
- `seconds::Float64 = 0.5`: Maximum time per benchmark
@@ -265,12 +265,19 @@ function autotune_setup(;
265265

266266
# Display results table - filter out NaN values
267267
successful_results = filter(row -> row.success && !isnan(row.gflops), results_df)
268-
timeout_results = filter(row -> isnan(row.gflops), results_df)
268+
timeout_results = filter(row -> isnan(row.gflops) && !contains(get(row, :error, ""), "Skipped"), results_df)
269+
skipped_results = filter(row -> contains(get(row, :error, ""), "Skipped"), results_df)
269270

270271
if nrow(timeout_results) > 0
271272
@info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)"
272273
end
273274

275+
if nrow(skipped_results) > 0
276+
# Count unique algorithms that were skipped
277+
skipped_algs = unique([row.algorithm for row in eachrow(skipped_results)])
278+
@info "$(length(skipped_algs)) algorithms skipped for larger matrices after timing out"
279+
end
280+
274281
if nrow(successful_results) > 0
275282
@info "Benchmark completed successfully!"
276283

lib/LinearSolveAutotune/src/benchmarking.jl

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
9595
# Initialize results DataFrame
9696
results_data = []
9797

98+
# Track algorithms that have timed out (per element type)
99+
timed_out_algorithms = Dict{String, Set{String}}() # eltype => Set of algorithm names
100+
98101
# Calculate total number of benchmarks for progress bar
99102
total_benchmarks = 0
100103
for eltype in eltypes
@@ -109,6 +112,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
109112

110113
try
111114
for eltype in eltypes
115+
# Initialize timed out set for this element type
116+
timed_out_algorithms[string(eltype)] = Set{String}()
117+
112118
# Filter algorithms for this element type
113119
compatible_algs, compatible_names = filter_compatible_algorithms(algorithms, alg_names, eltype)
114120

@@ -137,6 +143,23 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
137143
end
138144

139145
for (alg, name) in zip(compatible_algs, compatible_names)
146+
# Skip this algorithm if it has already timed out for this element type
147+
if name in timed_out_algorithms[string(eltype)]
148+
# Still need to update progress bar
149+
ProgressMeter.next!(progress)
150+
# Record as skipped due to previous timeout
151+
push!(results_data,
152+
(
153+
size = n,
154+
algorithm = name,
155+
eltype = string(eltype),
156+
gflops = NaN,
157+
success = false,
158+
error = "Skipped: timed out on smaller matrix"
159+
))
160+
continue
161+
end
162+
140163
# Update progress description
141164
ProgressMeter.update!(progress,
142165
desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
@@ -210,7 +233,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
210233
if timed_out_flag
211234
# Task timed out
212235
timed_out = true
213-
@warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
236+
# Add to timed out set so it's skipped for larger matrices
237+
push!(timed_out_algorithms[string(eltype)], name)
238+
@warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices."
214239
success = false
215240
error_msg = "Timed out (exceeded $(maxtime)s)"
216241
gflops = NaN
@@ -301,7 +326,7 @@ Size categories:
301326
- `:small` - 20:20:100 (for small problems)
302327
- `:medium` - 100:50:300 (for typical problems)
303328
- `:large` - 300:100:1000 (for larger problems)
304-
- `:big` - vcat(1000:2000:10000, 10000:5000:20000) (for very large/GPU problems)
329+
- `:big` - vcat(1000:2000:10000, 10000:5000:15000) (for very large/GPU problems, capped at 15000)
305330
"""
306331
function get_benchmark_sizes(size_categories::Vector{Symbol})
307332
sizes = Int[]
@@ -316,7 +341,7 @@ function get_benchmark_sizes(size_categories::Vector{Symbol})
316341
elseif category == :large
317342
append!(sizes, 300:100:1000)
318343
elseif category == :big
319-
append!(sizes, vcat(1000:2000:10000, 10000:5000:20000))
344+
append!(sizes, vcat(1000:2000:10000, 10000:5000:15000)) # Capped at 15000
320345
else
321346
@warn "Unknown size category: $category. Skipping."
322347
end

0 commit comments

Comments
 (0)