Skip to content

Commit 3fd10c0

Browse files
Revert timeout handling in autotune to use maxtime blocking (#719)
* Revert timeout handling to use maxtime blocking instead of killing
  - Remove async task creation and timedwait mechanism
  - Simply run solve() and measure elapsed time
  - If algorithm exceeds maxtime, block it for larger matrices
  - Store max allowed size per algorithm per eltype
  - Skip blocked algorithms for larger sizes with informative message
  - Keep NaN handling for timed out/blocked runs
  - Blocked list automatically resets when switching eltypes

  This approach is more stable than trying to kill running tasks, which can cause Julia to hang. Instead, we let slow algorithms complete but prevent them from running on larger matrices.

* Update Project.toml

---------

Co-authored-by: ChrisRackauckas <[email protected]>
1 parent 57460fd commit 3fd10c0

File tree

2 files changed

+48
-88
lines changed

2 files changed

+48
-88
lines changed

lib/LinearSolveAutotune/src/LinearSolveAutotune.jl

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,10 @@ function Base.show(io::IO, results::AutotuneResults)
104104
println(io, "📏 Matrix Sizes: ", minimum(sizes), "×", minimum(sizes),
105105
" to ", maximum(sizes), "×", maximum(sizes))
106106

107-
# Report timeouts if any
108-
timeout_results = filter(row -> isnan(row.gflops), results.results_df)
109-
if nrow(timeout_results) > 0
110-
println(io, "⏱️ Timed Out: ", nrow(timeout_results), " tests exceeded time limit")
107+
# Report tests that exceeded maxtime if any
108+
exceeded_results = filter(row -> isnan(row.gflops) && contains(get(row, :error, ""), "Exceeded maxtime"), results.results_df)
109+
if nrow(exceeded_results) > 0
110+
println(io, "⏱️ Exceeded maxtime: ", nrow(exceeded_results), " tests exceeded time limit")
111111
end
112112

113113
# Call to action - reordered
@@ -265,17 +265,17 @@ function autotune_setup(;
265265

266266
# Display results table - filter out NaN values
267267
successful_results = filter(row -> row.success && !isnan(row.gflops), results_df)
268-
timeout_results = filter(row -> isnan(row.gflops) && !contains(get(row, :error, ""), "Skipped"), results_df)
268+
exceeded_maxtime_results = filter(row -> isnan(row.gflops) && contains(get(row, :error, ""), "Exceeded maxtime"), results_df)
269269
skipped_results = filter(row -> contains(get(row, :error, ""), "Skipped"), results_df)
270270

271-
if nrow(timeout_results) > 0
272-
@info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)"
271+
if nrow(exceeded_maxtime_results) > 0
272+
@info "$(nrow(exceeded_maxtime_results)) tests exceeded maxtime limit ($(maxtime)s)"
273273
end
274274

275275
if nrow(skipped_results) > 0
276276
# Count unique algorithms that were skipped
277277
skipped_algs = unique([row.algorithm for row in eachrow(skipped_results)])
278-
@info "$(length(skipped_algs)) algorithms skipped for larger matrices after timing out"
278+
@info "$(length(skipped_algs)) algorithms skipped for larger matrices after exceeding maxtime"
279279
end
280280

281281
if nrow(successful_results) > 0

lib/LinearSolveAutotune/src/benchmarking.jl

Lines changed: 40 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
9595
# Initialize results DataFrame
9696
results_data = []
9797

98-
# Track algorithms that have timed out (per element type)
99-
timed_out_algorithms = Dict{String, Set{String}}() # eltype => Set of algorithm names
98+
# Track algorithms that have exceeded maxtime (per element type and size)
99+
# Structure: eltype => algorithm_name => max_size_tested
100+
blocked_algorithms = Dict{String, Dict{String, Int}}() # eltype => Dict(algorithm_name => max_size)
100101

101102
# Calculate total number of benchmarks for progress bar
102103
total_benchmarks = 0
@@ -112,8 +113,8 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
112113

113114
try
114115
for eltype in eltypes
115-
# Initialize timed out set for this element type
116-
timed_out_algorithms[string(eltype)] = Set{String}()
116+
# Initialize blocked algorithms dict for this element type
117+
blocked_algorithms[string(eltype)] = Dict{String, Int}()
117118

118119
# Filter algorithms for this element type
119120
compatible_algs, compatible_names = filter_compatible_algorithms(algorithms, alg_names, eltype)
@@ -143,32 +144,35 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
143144
end
144145

145146
for (alg, name) in zip(compatible_algs, compatible_names)
146-
# Skip this algorithm if it has already timed out for this element type
147-
if name in timed_out_algorithms[string(eltype)]
148-
# Still need to update progress bar
149-
ProgressMeter.next!(progress)
150-
# Record as skipped due to previous timeout
151-
push!(results_data,
152-
(
153-
size = n,
154-
algorithm = name,
155-
eltype = string(eltype),
156-
gflops = NaN,
157-
success = false,
158-
error = "Skipped: timed out on smaller matrix"
159-
))
160-
continue
147+
# Skip this algorithm if it has exceeded maxtime for a smaller or equal size matrix
148+
if haskey(blocked_algorithms[string(eltype)], name)
149+
max_allowed_size = blocked_algorithms[string(eltype)][name]
150+
if n > max_allowed_size
151+
# Still need to update progress bar
152+
ProgressMeter.next!(progress)
153+
# Record as skipped due to exceeding maxtime on smaller matrix
154+
push!(results_data,
155+
(
156+
size = n,
157+
algorithm = name,
158+
eltype = string(eltype),
159+
gflops = NaN,
160+
success = false,
161+
error = "Skipped: exceeded maxtime on size $max_allowed_size matrix"
162+
))
163+
continue
164+
end
161165
end
162166

163167
# Update progress description
164168
ProgressMeter.update!(progress,
165169
desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
166170

167-
gflops = NaN # Use NaN for timed out runs
171+
gflops = NaN # Use NaN for failed/timed out runs
168172
success = true
169173
error_msg = ""
170174
passed_correctness = true
171-
timed_out = false
175+
exceeded_maxtime = false
172176

173177
try
174178
# Create the linear problem for this test
@@ -179,69 +183,25 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
179183
# Time the warmup run and correctness check
180184
start_time = time()
181185

182-
# Warmup run and correctness check with simple timeout
186+
# Warmup run and correctness check - no interruption, just timing
183187
warmup_sol = nothing
184-
timed_out_flag = false
185188

186-
# Try to run with a timeout - simpler approach without async
187-
try
188-
# Create a task for the solve
189-
done_channel = Channel(1)
190-
error_channel = Channel(1)
191-
192-
warmup_task = @async begin
193-
try
194-
result = solve(prob, alg)
195-
put!(done_channel, result)
196-
catch e
197-
put!(error_channel, e)
198-
end
199-
end
200-
201-
# Wait for completion or timeout
202-
timeout_occurred = false
203-
result = nothing
204-
205-
# Use timedwait which is more reliable than manual polling
206-
wait_result = timedwait(() -> istaskdone(warmup_task), maxtime)
207-
208-
if wait_result === :timed_out
209-
timeout_occurred = true
210-
timed_out_flag = true
211-
# Don't try to kill the task - just mark it as timed out
212-
# The task will continue running in background but we move on
213-
else
214-
# Task completed - get the result
215-
if isready(done_channel)
216-
warmup_sol = take!(done_channel)
217-
elseif isready(error_channel)
218-
throw(take!(error_channel))
219-
end
220-
end
221-
222-
# Close channels to prevent resource leaks
223-
close(done_channel)
224-
close(error_channel)
225-
226-
catch e
227-
# If an error occurred during solve, re-throw it
228-
if !timed_out_flag
229-
throw(e)
230-
end
231-
end
189+
# Simply run the solve and measure time
190+
warmup_sol = solve(prob, alg)
191+
elapsed_time = time() - start_time
232192

233-
if timed_out_flag
234-
# Task timed out
235-
timed_out = true
236-
# Add to timed out set so it's skipped for larger matrices
237-
push!(timed_out_algorithms[string(eltype)], name)
238-
@warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices."
193+
# Check if we exceeded maxtime
194+
if elapsed_time > maxtime
195+
exceeded_maxtime = true
196+
# Block this algorithm for larger matrices
197+
# Store the last size that was allowed to complete
198+
blocked_algorithms[string(eltype)][name] = n
199+
@warn "Algorithm $name exceeded maxtime ($(round(elapsed_time, digits=2))s > $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices."
239200
success = false
240-
error_msg = "Timed out (exceeded $(maxtime)s)"
201+
error_msg = "Exceeded maxtime ($(round(elapsed_time, digits=2))s)"
241202
gflops = NaN
242203
else
243-
# Successful completion
244-
elapsed_time = time() - start_time
204+
# Successful completion within time limit
245205

246206
# Check correctness if reference solution is available
247207
if check_correctness && reference_solution !== nothing
@@ -259,8 +219,8 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
259219
end
260220
end
261221

262-
# Only benchmark if correctness check passed and we have time remaining
263-
if passed_correctness && !timed_out
222+
# Only benchmark if correctness check passed and we didn't exceed maxtime
223+
if passed_correctness && !exceeded_maxtime
264224
# Check if we have enough time remaining for benchmarking
265225
# Allow at least 2x the warmup time for benchmarking
266226
remaining_time = maxtime - elapsed_time

0 commit comments

Comments
 (0)