diff --git a/docs/src/tutorials/autotune.md b/docs/src/tutorials/autotune.md index f829360a7..1317590dc 100644 --- a/docs/src/tutorials/autotune.md +++ b/docs/src/tutorials/autotune.md @@ -73,7 +73,7 @@ Control which matrix size ranges to test: # :small - 20×20 to 100×100 (small problems) # :medium - 100×100 to 300×300 (typical problems) # :large - 300×300 to 1000×1000 (larger problems) -# :big - 10000×1000 to 20000x20000 (GPU/HPC scale) +# :big - 1000×1000 to 15000×15000 (GPU/HPC scale, capped at 15000 for stability) # Default: test tiny through large results = autotune_setup() # uses [:tiny, :small, :medium, :large] @@ -161,8 +161,11 @@ When an algorithm exceeds the `maxtime` limit: - The test is skipped to prevent hanging - The result is recorded as `NaN` in the benchmark data - A warning is displayed indicating the timeout +- **The algorithm is automatically excluded from all larger matrix sizes** to save time - The benchmark continues with the next algorithm +This intelligent timeout handling ensures that slow algorithms don't waste time on progressively larger matrices once they've proven too slow on smaller ones. + ### Missing Algorithm Handling By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement: diff --git a/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl b/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl index 39276726c..0dea674fb 100644 --- a/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl +++ b/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl @@ -182,7 +182,7 @@ Run a comprehensive benchmark of all available LU factorization methods and opti # Arguments - - `sizes = [:small, :medium, :large]`: Size categories to test. Options: :small (5-20), :medium (20-300), :large (300-1000), :big (10000-100000) + - `sizes = [:tiny, :small, :medium, :large]`: Size categories to test. 
Options: :tiny (5-20), :small (20-100), :medium (100-300), :large (300-1000), :big (1000-15000) - `set_preferences::Bool = true`: Update LinearSolve preferences with optimal algorithms - `samples::Int = 5`: Number of benchmark samples per algorithm/size - `seconds::Float64 = 0.5`: Maximum time per benchmark @@ -265,12 +265,19 @@ function autotune_setup(; # Display results table - filter out NaN values successful_results = filter(row -> row.success && !isnan(row.gflops), results_df) - timeout_results = filter(row -> isnan(row.gflops), results_df) + timeout_results = filter(row -> isnan(row.gflops) && !contains(get(row, :error, ""), "Skipped"), results_df) + skipped_results = filter(row -> contains(get(row, :error, ""), "Skipped"), results_df) if nrow(timeout_results) > 0 @info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)" end + if nrow(skipped_results) > 0 + # Count unique algorithms that were skipped + skipped_algs = unique([row.algorithm for row in eachrow(skipped_results)]) + @info "$(length(skipped_algs)) algorithms skipped for larger matrices after timing out" + end + if nrow(successful_results) > 0 @info "Benchmark completed successfully!" 
diff --git a/lib/LinearSolveAutotune/src/benchmarking.jl b/lib/LinearSolveAutotune/src/benchmarking.jl index 21d868647..c7167cd59 100644 --- a/lib/LinearSolveAutotune/src/benchmarking.jl +++ b/lib/LinearSolveAutotune/src/benchmarking.jl @@ -95,6 +95,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes; # Initialize results DataFrame results_data = [] + # Track algorithms that have timed out (per element type) + timed_out_algorithms = Dict{String, Set{String}}() # eltype => Set of algorithm names + # Calculate total number of benchmarks for progress bar total_benchmarks = 0 for eltype in eltypes @@ -109,6 +112,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes; try for eltype in eltypes + # Initialize timed out set for this element type + timed_out_algorithms[string(eltype)] = Set{String}() + # Filter algorithms for this element type compatible_algs, compatible_names = filter_compatible_algorithms(algorithms, alg_names, eltype) @@ -137,6 +143,23 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes; end for (alg, name) in zip(compatible_algs, compatible_names) + # Skip this algorithm if it has already timed out for this element type + if name in timed_out_algorithms[string(eltype)] + # Still need to update progress bar + ProgressMeter.next!(progress) + # Record as skipped due to previous timeout + push!(results_data, + ( + size = n, + algorithm = name, + eltype = string(eltype), + gflops = NaN, + success = false, + error = "Skipped: timed out on smaller matrix" + )) + continue + end + # Update progress description ProgressMeter.update!(progress, desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ") @@ -156,58 +179,68 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes; # Time the warmup run and correctness check start_time = time() - # Create a channel for communication between tasks - result_channel = Channel(1) - - # Warmup run and correctness check with 
timeout - warmup_task = @async begin - try - result = solve(prob, alg) - put!(result_channel, result) - catch e - put!(result_channel, e) - end - end + # Warmup run and correctness check with simple timeout + warmup_sol = nothing + timed_out_flag = false - # Timer task to enforce timeout - timer_task = @async begin - sleep(maxtime) - if !istaskdone(warmup_task) + # Try to run with a timeout - simpler approach without async + try + # Create a task for the solve + done_channel = Channel(1) + error_channel = Channel(1) + + warmup_task = @async begin try - Base.throwto(warmup_task, InterruptException()) - catch - # Task might be in non-interruptible state + result = solve(prob, alg) + put!(done_channel, result) + catch e + put!(error_channel, e) end - put!(result_channel, :timeout) end - end - - # Wait for result or timeout - warmup_sol = nothing - result = take!(result_channel) - - # Clean up timer task if still running - if !istaskdone(timer_task) - try - Base.throwto(timer_task, InterruptException()) - catch - # Timer task might have already finished + + # Wait for completion or timeout + timeout_occurred = false + result = nothing + + # Use timedwait which is more reliable than manual polling + wait_result = timedwait(() -> istaskdone(warmup_task), maxtime) + + if wait_result === :timed_out + timeout_occurred = true + timed_out_flag = true + # Don't try to kill the task - just mark it as timed out + # The task will continue running in background but we move on + else + # Task completed - get the result + if isready(done_channel) + warmup_sol = take!(done_channel) + elseif isready(error_channel) + throw(take!(error_channel)) + end + end + + # Close channels to prevent resource leaks + close(done_channel) + close(error_channel) + + catch e + # If an error occurred during solve, re-throw it + if !timed_out_flag + throw(e) end end - if result === :timeout + if timed_out_flag # Task timed out timed_out = true - @warn "Algorithm $name timed out (exceeded $(maxtime)s) for 
size $n, eltype $eltype. Recording as NaN." + # Add to timed out set so it's skipped for larger matrices + push!(timed_out_algorithms[string(eltype)], name) + @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices." success = false error_msg = "Timed out (exceeded $(maxtime)s)" gflops = NaN - elseif result isa Exception - # Task threw an error - throw(result) else # Successful completion - warmup_sol = result elapsed_time = time() - start_time # Check correctness if reference solution is available @@ -293,7 +326,7 @@ Size categories: - `:small` - 20:20:100 (for small problems) - `:medium` - 100:50:300 (for typical problems) - `:large` - 300:100:1000 (for larger problems) -- `:big` - vcat(1000:2000:10000, 10000:5000:20000) (for very large/GPU problems) +- `:big` - vcat(1000:2000:10000, 10000:5000:15000) (for very large/GPU problems, capped at 15000) """ function get_benchmark_sizes(size_categories::Vector{Symbol}) sizes = Int[] @@ -308,7 +341,7 @@ function get_benchmark_sizes(size_categories::Vector{Symbol}) elseif category == :large append!(sizes, 300:100:1000) elseif category == :big - append!(sizes, vcat(1000:2000:10000, 10000:5000:20000)) + append!(sizes, vcat(1000:2000:10000, 10000:5000:15000)) # Capped at 15000 else @warn "Unknown size category: $category. Skipping." end