Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/src/tutorials/autotune.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ Control which matrix size ranges to test:
# :small - 20×20 to 100×100 (small problems)
# :medium - 100×100 to 300×300 (typical problems)
# :large - 300×300 to 1000×1000 (larger problems)
# :big - 10000×1000 to 20000x20000 (GPU/HPC scale)
# :big - 1000×1000 to 15000×15000 (GPU/HPC scale, capped at 15000 for stability)

# Default: test tiny through large
results = autotune_setup() # uses [:tiny, :small, :medium, :large]
Expand Down Expand Up @@ -161,8 +161,11 @@ When an algorithm exceeds the `maxtime` limit:
- The test is skipped to prevent hanging
- The result is recorded as `NaN` in the benchmark data
- A warning is displayed indicating the timeout
- **The algorithm is automatically excluded from all larger matrix sizes** to save time
- The benchmark continues with the next algorithm

This intelligent timeout handling ensures that slow algorithms don't waste time on progressively larger matrices once they've proven too slow on smaller ones.

### Missing Algorithm Handling

By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement:
Expand Down
11 changes: 9 additions & 2 deletions lib/LinearSolveAutotune/src/LinearSolveAutotune.jl
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ Run a comprehensive benchmark of all available LU factorization methods and opti

# Arguments

- `sizes = [:small, :medium, :large]`: Size categories to test. Options: :small (5-20), :medium (20-300), :large (300-1000), :big (10000-100000)
- `sizes = [:tiny, :small, :medium, :large]`: Size categories to test. Options: :tiny (5-20), :small (20-100), :medium (100-300), :large (300-1000), :big (1000-15000)
- `set_preferences::Bool = true`: Update LinearSolve preferences with optimal algorithms
- `samples::Int = 5`: Number of benchmark samples per algorithm/size
- `seconds::Float64 = 0.5`: Maximum time per benchmark
Expand Down Expand Up @@ -265,12 +265,19 @@ function autotune_setup(;

# Display results table - filter out NaN values
successful_results = filter(row -> row.success && !isnan(row.gflops), results_df)
timeout_results = filter(row -> isnan(row.gflops), results_df)
timeout_results = filter(row -> isnan(row.gflops) && !contains(get(row, :error, ""), "Skipped"), results_df)
skipped_results = filter(row -> contains(get(row, :error, ""), "Skipped"), results_df)

if nrow(timeout_results) > 0
@info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)"
end

if nrow(skipped_results) > 0
# Count unique algorithms that were skipped
skipped_algs = unique([row.algorithm for row in eachrow(skipped_results)])
@info "$(length(skipped_algs)) algorithms skipped for larger matrices after timing out"
end

if nrow(successful_results) > 0
@info "Benchmark completed successfully!"

Expand Down
113 changes: 73 additions & 40 deletions lib/LinearSolveAutotune/src/benchmarking.jl
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
# Initialize results DataFrame
results_data = []

# Track algorithms that have timed out (per element type)
timed_out_algorithms = Dict{String, Set{String}}() # eltype => Set of algorithm names

# Calculate total number of benchmarks for progress bar
total_benchmarks = 0
for eltype in eltypes
Expand All @@ -109,6 +112,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;

try
for eltype in eltypes
# Initialize timed out set for this element type
timed_out_algorithms[string(eltype)] = Set{String}()

# Filter algorithms for this element type
compatible_algs, compatible_names = filter_compatible_algorithms(algorithms, alg_names, eltype)

Expand Down Expand Up @@ -137,6 +143,23 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
end

for (alg, name) in zip(compatible_algs, compatible_names)
# Skip this algorithm if it has already timed out for this element type
if name in timed_out_algorithms[string(eltype)]
# Still need to update progress bar
ProgressMeter.next!(progress)
# Record as skipped due to previous timeout
push!(results_data,
(
size = n,
algorithm = name,
eltype = string(eltype),
gflops = NaN,
success = false,
error = "Skipped: timed out on smaller matrix"
))
continue
end

# Update progress description
ProgressMeter.update!(progress,
desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
Expand All @@ -156,58 +179,68 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
# Time the warmup run and correctness check
start_time = time()

# Create a channel for communication between tasks
result_channel = Channel(1)

# Warmup run and correctness check with timeout
warmup_task = @async begin
try
result = solve(prob, alg)
put!(result_channel, result)
catch e
put!(result_channel, e)
end
end
# Warmup run and correctness check with simple timeout
warmup_sol = nothing
timed_out_flag = false

# Timer task to enforce timeout
timer_task = @async begin
sleep(maxtime)
if !istaskdone(warmup_task)
# Run the solve in a background task so we can stop waiting once `maxtime` elapses
try
# Create a task for the solve
done_channel = Channel(1)
error_channel = Channel(1)

warmup_task = @async begin
try
Base.throwto(warmup_task, InterruptException())
catch
# Task might be in non-interruptible state
result = solve(prob, alg)
put!(done_channel, result)
catch e
put!(error_channel, e)
end
put!(result_channel, :timeout)
end
end

# Wait for result or timeout
warmup_sol = nothing
result = take!(result_channel)

# Clean up timer task if still running
if !istaskdone(timer_task)
try
Base.throwto(timer_task, InterruptException())
catch
# Timer task might have already finished

# Wait for completion or timeout
timeout_occurred = false
result = nothing

# Use timedwait which is more reliable than manual polling
wait_result = timedwait(() -> istaskdone(warmup_task), maxtime)

if wait_result === :timed_out
timeout_occurred = true
timed_out_flag = true
# Don't try to kill the task - just mark it as timed out
# The task will continue running in background but we move on
else
# Task completed - get the result
if isready(done_channel)
warmup_sol = take!(done_channel)
elseif isready(error_channel)
throw(take!(error_channel))
end
end

# Close channels to prevent resource leaks
close(done_channel)
close(error_channel)

catch e
# If an error occurred during solve, re-throw it
if !timed_out_flag
throw(e)
end
end

if result === :timeout
if timed_out_flag
# Task timed out
timed_out = true
@warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
# Add to timed out set so it's skipped for larger matrices
push!(timed_out_algorithms[string(eltype)], name)
@warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices."
success = false
error_msg = "Timed out (exceeded $(maxtime)s)"
gflops = NaN
elseif result isa Exception
# Task threw an error
throw(result)
else
# Successful completion
warmup_sol = result
elapsed_time = time() - start_time

# Check correctness if reference solution is available
Expand Down Expand Up @@ -293,7 +326,7 @@ Size categories:
- `:small` - 20:20:100 (for small problems)
- `:medium` - 100:50:300 (for typical problems)
- `:large` - 300:100:1000 (for larger problems)
- `:big` - vcat(1000:2000:10000, 10000:5000:20000) (for very large/GPU problems)
- `:big` - vcat(1000:2000:10000, 10000:5000:15000) (for very large/GPU problems, capped at 15000)
"""
function get_benchmark_sizes(size_categories::Vector{Symbol})
sizes = Int[]
Expand All @@ -308,7 +341,7 @@ function get_benchmark_sizes(size_categories::Vector{Symbol})
elseif category == :large
append!(sizes, 300:100:1000)
elseif category == :big
append!(sizes, vcat(1000:2000:10000, 10000:5000:20000))
append!(sizes, vcat(1000:2000:10000, 10000:5000:15000)) # Capped at 15000
else
@warn "Unknown size category: $category. Skipping."
end
Expand Down
Loading