Improve timeout handling: skip timed-out algorithms and cap big benchmarks at 15000 (#718)

ChrisRackauckas-Claude · ChrisRackauckas · web-flow · commit 4b2d3d85b150 · 2025-08-11T13:31:09.000-04:00
* Fix hanging issue in autotune timeout mechanism

- Replace manual polling loop with Julia's built-in timedwait() function
- Avoid using Base.throwto() which can cause hangs with sleeping tasks
- Let timed-out tasks continue in the background rather than trying to kill them
- Use Channels for clean communication between tasks
- Close channels properly to prevent resource leaks

This fixes the issue where autotune would hang indefinitely when trying
to interrupt tasks that exceeded the timeout limit.

* Improve timeout handling: skip timed-out algorithms and cap big benchmarks

- Algorithms that time out are automatically excluded from larger matrix sizes (per element type)
- This prevents wasting time on algorithms that are already too slow
- Cap :big benchmark sizes at 15000 instead of 20000 for stability
- 20000x20000 matrices often cause issues even on powerful computers
- Add tracking of timed-out algorithms per element type
- Report number of skipped algorithms in summary output
- Update documentation to reflect these improvements

This makes the autotuning process much more efficient by not repeatedly
testing algorithms that have already proven to be too slow.

---------

Co-authored-by: ChrisRackauckas <accounts@chrisrackauckas.com>
diff --git a/docs/src/tutorials/autotune.md b/docs/src/tutorials/autotune.md
@@ -73,7 +73,7 @@ Control which matrix size ranges to test:
 # :small  - 20×20 to 100×100 (small problems)  
 # :medium - 100×100 to 300×300 (typical problems)
 # :large  - 300×300 to 1000×1000 (larger problems)
-# :big    - 10000×1000 to 20000x20000 (GPU/HPC scale)
+# :big    - 1000×1000 to 15000×15000 (GPU/HPC scale, capped at 15000 for stability)
 
 # Default: test tiny through large
 results = autotune_setup()  # uses [:tiny, :small, :medium, :large]
@@ -161,8 +161,11 @@ When an algorithm exceeds the `maxtime` limit:
 - The test is skipped to prevent hanging
 - The result is recorded as `NaN` in the benchmark data
 - A warning is displayed indicating the timeout
+- **The algorithm is automatically excluded from all larger matrix sizes for the same element type** to save time
 - The benchmark continues with the next algorithm
 
+This ensures that an algorithm which has already proven too slow on a smaller matrix is not re-run on progressively larger ones; each skipped size is still recorded as `NaN` in the benchmark data.
+
 ### Missing Algorithm Handling
 
 By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement:
diff --git a/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl b/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl
@@ -182,7 +182,7 @@ Run a comprehensive benchmark of all available LU factorization methods and opti
 
 # Arguments
 
-  - `sizes = [:small, :medium, :large]`: Size categories to test. Options: :small (5-20), :medium (20-300), :large (300-1000), :big (10000-100000)
+  - `sizes = [:tiny, :small, :medium, :large]`: Size categories to test. Options: :tiny (5-20), :small (20-100), :medium (100-300), :large (300-1000), :big (1000-15000)
   - `set_preferences::Bool = true`: Update LinearSolve preferences with optimal algorithms
   - `samples::Int = 5`: Number of benchmark samples per algorithm/size
   - `seconds::Float64 = 0.5`: Maximum time per benchmark
@@ -265,12 +265,19 @@ function autotune_setup(;
 
     # Display results table - filter out NaN values
     successful_results = filter(row -> row.success && !isnan(row.gflops), results_df)
-    timeout_results = filter(row -> isnan(row.gflops), results_df)
+    timeout_results = filter(row -> isnan(row.gflops) && !contains(get(row, :error, ""), "Skipped"), results_df)
+    skipped_results = filter(row -> contains(get(row, :error, ""), "Skipped"), results_df)
     
     if nrow(timeout_results) > 0
         @info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)"
     end
     
+    if nrow(skipped_results) > 0
+        # Count unique algorithms that were skipped
+        skipped_algs = unique([row.algorithm for row in eachrow(skipped_results)])
+        @info "$(length(skipped_algs)) algorithms skipped for larger matrices after timing out"
+    end
+    
     if nrow(successful_results) > 0
         @info "Benchmark completed successfully!"
 
diff --git a/lib/LinearSolveAutotune/src/benchmarking.jl b/lib/LinearSolveAutotune/src/benchmarking.jl
@@ -95,6 +95,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
     # Initialize results DataFrame
     results_data = []
     
+    # Track algorithms that have timed out (per element type)
+    timed_out_algorithms = Dict{String, Set{String}}()  # eltype => Set of algorithm names
+    
     # Calculate total number of benchmarks for progress bar
     total_benchmarks = 0
     for eltype in eltypes
@@ -109,6 +112,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
 
     try
         for eltype in eltypes
+            # Initialize timed out set for this element type
+            timed_out_algorithms[string(eltype)] = Set{String}()
+            
             # Filter algorithms for this element type
             compatible_algs, compatible_names = filter_compatible_algorithms(algorithms, alg_names, eltype)
             
@@ -137,6 +143,23 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
                 end
 
                 for (alg, name) in zip(compatible_algs, compatible_names)
+                    # Skip this algorithm if it has already timed out for this element type
+                    if name in timed_out_algorithms[string(eltype)]
+                        # Still need to update progress bar
+                        ProgressMeter.next!(progress)
+                        # Record as skipped due to previous timeout
+                        push!(results_data,
+                            (
+                                size = n,
+                                algorithm = name,
+                                eltype = string(eltype),
+                                gflops = NaN,
+                                success = false,
+                                error = "Skipped: timed out on smaller matrix"
+                            ))
+                        continue
+                    end
+                    
                     # Update progress description
                     ProgressMeter.update!(progress, 
                         desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
@@ -210,7 +233,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
                         if timed_out_flag
                             # Task timed out
                             timed_out = true
-                            @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
+                            # Add to timed out set so it's skipped for larger matrices
+                            push!(timed_out_algorithms[string(eltype)], name)
+                            @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices."
                             success = false
                             error_msg = "Timed out (exceeded $(maxtime)s)"
                             gflops = NaN
@@ -301,7 +326,7 @@ Size categories:
 - `:small` - 20:20:100 (for small problems)
 - `:medium` - 100:50:300 (for typical problems)
 - `:large` - 300:100:1000 (for larger problems)
-- `:big` - vcat(1000:2000:10000, 10000:5000:20000) (for very large/GPU problems)
+- `:big` - vcat(1000:2000:10000, 10000:5000:15000) (for very large/GPU problems, capped at 15000)
 """
 function get_benchmark_sizes(size_categories::Vector{Symbol})
     sizes = Int[]
@@ -316,7 +341,7 @@ function get_benchmark_sizes(size_categories::Vector{Symbol})
         elseif category == :large
             append!(sizes, 300:100:1000)
         elseif category == :big
-            append!(sizes, vcat(1000:2000:10000, 10000:5000:20000))
+            append!(sizes, vcat(1000:2000:10000, 10000:5000:15000))  # Capped at 15000
         else
             @warn "Unknown size category: $category. Skipping."
         end