Skip to content

Commit 47e7223

Browse files
Fix hanging issue in autotune timeout mechanism (#717)
- Replace manual polling loop with Julia's built-in timedwait() function
- Avoid using Base.throwto() which can cause hangs with sleeping tasks
- Let timed-out tasks continue in background rather than trying to kill them
- Use Channels for clean communication between tasks
- Close channels properly to prevent resource leaks

This fixes the issue where autotune would hang indefinitely when trying to interrupt tasks that exceeded the timeout limit.

Co-authored-by: ChrisRackauckas <[email protected]>
1 parent a0f36af commit 47e7223

File tree

1 file changed

+45
-37
lines changed

1 file changed

+45
-37
lines changed

lib/LinearSolveAutotune/src/benchmarking.jl

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -156,58 +156,66 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
156156
# Time the warmup run and correctness check
157157
start_time = time()
158158

159-
# Create a channel for communication between tasks
160-
result_channel = Channel(1)
161-
162-
# Warmup run and correctness check with timeout
163-
warmup_task = @async begin
164-
try
165-
result = solve(prob, alg)
166-
put!(result_channel, result)
167-
catch e
168-
put!(result_channel, e)
169-
end
170-
end
159+
# Warmup run and correctness check with simple timeout
160+
warmup_sol = nothing
161+
timed_out_flag = false
171162

172-
# Timer task to enforce timeout
173-
timer_task = @async begin
174-
sleep(maxtime)
175-
if !istaskdone(warmup_task)
163+
# Try to run with a timeout - run the solve in an @async task and wait on it with timedwait
164+
try
165+
# Create a task for the solve
166+
done_channel = Channel(1)
167+
error_channel = Channel(1)
168+
169+
warmup_task = @async begin
176170
try
177-
Base.throwto(warmup_task, InterruptException())
178-
catch
179-
# Task might be in non-interruptible state
171+
result = solve(prob, alg)
172+
put!(done_channel, result)
173+
catch e
174+
put!(error_channel, e)
180175
end
181-
put!(result_channel, :timeout)
182176
end
183-
end
184-
185-
# Wait for result or timeout
186-
warmup_sol = nothing
187-
result = take!(result_channel)
188-
189-
# Clean up timer task if still running
190-
if !istaskdone(timer_task)
191-
try
192-
Base.throwto(timer_task, InterruptException())
193-
catch
194-
# Timer task might have already finished
177+
178+
# Wait for completion or timeout
179+
timeout_occurred = false
180+
result = nothing
181+
182+
# Use timedwait which is more reliable than manual polling
183+
wait_result = timedwait(() -> istaskdone(warmup_task), maxtime)
184+
185+
if wait_result === :timed_out
186+
timeout_occurred = true
187+
timed_out_flag = true
188+
# Don't try to kill the task - just mark it as timed out
189+
# The task will continue running in background but we move on
190+
else
191+
# Task completed - get the result
192+
if isready(done_channel)
193+
warmup_sol = take!(done_channel)
194+
elseif isready(error_channel)
195+
throw(take!(error_channel))
196+
end
197+
end
198+
199+
# Close channels to prevent resource leaks
200+
close(done_channel)
201+
close(error_channel)
202+
203+
catch e
204+
# If an error occurred during solve, re-throw it
205+
if !timed_out_flag
206+
throw(e)
195207
end
196208
end
197209

198-
if result === :timeout
210+
if timed_out_flag
199211
# Task timed out
200212
timed_out = true
201213
@warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
202214
success = false
203215
error_msg = "Timed out (exceeded $(maxtime)s)"
204216
gflops = NaN
205-
elseif result isa Exception
206-
# Task threw an error
207-
throw(result)
208217
else
209218
# Successful completion
210-
warmup_sol = result
211219
elapsed_time = time() - start_time
212220

213221
# Check correctness if reference solution is available

0 commit comments

Comments
 (0)