SciML · ChrisRackauckas · Aug 11, 2025 · Aug 11, 2025 · Aug 11, 2025
diff --git a/docs/src/tutorials/autotune.md b/docs/src/tutorials/autotune.md
@@ -73,7 +73,7 @@ Control which matrix size ranges to test:
 # :small  - 20×20 to 100×100 (small problems)  
 # :medium - 100×100 to 300×300 (typical problems)
 # :large  - 300×300 to 1000×1000 (larger problems)
-# :big    - 10000×1000 to 20000x20000 (GPU/HPC scale)
+# :big    - 1000×1000 to 15000×15000 (GPU/HPC scale, capped at 15000 for stability)
 
 # Default: test tiny through large
 results = autotune_setup()  # uses [:tiny, :small, :medium, :large]
@@ -161,8 +161,11 @@ When an algorithm exceeds the `maxtime` limit:
 - The test is skipped to prevent hanging
 - The result is recorded as `NaN` in the benchmark data
 - A warning is displayed indicating the timeout
+- **The algorithm is automatically excluded from all larger matrix sizes for that element type** to save time
 - The benchmark continues with the next algorithm
 
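+This timeout handling is tracked per element type: once an algorithm has timed out, it is skipped for the remaining matrix sizes of that element type (recorded as `NaN` and reported as skipped), so slow algorithms don't waste time on progressively larger matrices.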
+
 ### Missing Algorithm Handling
 
 By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement:

diff --git a/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl b/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl
@@ -182,7 +182,7 @@ Run a comprehensive benchmark of all available LU factorization methods and opti
 
 # Arguments
 
-  - `sizes = [:small, :medium, :large]`: Size categories to test. Options: :small (5-20), :medium (20-300), :large (300-1000), :big (10000-100000)
+  - `sizes = [:tiny, :small, :medium, :large]`: Size categories to test. Options: :tiny (5-20), :small (20-100), :medium (100-300), :large (300-1000), :big (1000-15000)
   - `set_preferences::Bool = true`: Update LinearSolve preferences with optimal algorithms
   - `samples::Int = 5`: Number of benchmark samples per algorithm/size
   - `seconds::Float64 = 0.5`: Maximum time per benchmark
@@ -265,12 +265,19 @@ function autotune_setup(;
 
     # Display results table - filter out NaN values
     successful_results = filter(row -> row.success && !isnan(row.gflops), results_df)
-    timeout_results = filter(row -> isnan(row.gflops), results_df)
+    timeout_results = filter(row -> isnan(row.gflops) && !contains(get(row, :error, ""), "Skipped"), results_df)
+    skipped_results = filter(row -> contains(get(row, :error, ""), "Skipped"), results_df)
 
     if nrow(timeout_results) > 0
         @info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)"
     end
 
+    if nrow(skipped_results) > 0
+        # Count unique algorithms that were skipped
+        skipped_algs = unique([row.algorithm for row in eachrow(skipped_results)])
+        @info "$(length(skipped_algs)) algorithms skipped for larger matrices after timing out"
+    end
+
     if nrow(successful_results) > 0
         @info "Benchmark completed successfully!"
 

diff --git a/lib/LinearSolveAutotune/src/benchmarking.jl b/lib/LinearSolveAutotune/src/benchmarking.jl
@@ -95,6 +95,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
     # Initialize results DataFrame
     results_data = []
 
+    # Track algorithms that have timed out (per element type)
+    timed_out_algorithms = Dict{String, Set{String}}()  # eltype => Set of algorithm names
+
     # Calculate total number of benchmarks for progress bar
     total_benchmarks = 0
     for eltype in eltypes
@@ -109,6 +112,9 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
 
     try
         for eltype in eltypes
+            # Initialize timed out set for this element type
+            timed_out_algorithms[string(eltype)] = Set{String}()
+
             # Filter algorithms for this element type
             compatible_algs, compatible_names = filter_compatible_algorithms(algorithms, alg_names, eltype)
 
@@ -137,6 +143,23 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
                 end
 
                 for (alg, name) in zip(compatible_algs, compatible_names)
+                    # Skip this algorithm if it has already timed out for this element type
+                    if name in timed_out_algorithms[string(eltype)]
+                        # Still need to update progress bar
+                        ProgressMeter.next!(progress)
+                        # Record as skipped due to previous timeout
+                        push!(results_data,
+                            (
+                                size = n,
+                                algorithm = name,
+                                eltype = string(eltype),
+                                gflops = NaN,
+                                success = false,
+                                error = "Skipped: timed out on smaller matrix"
+                            ))
+                        continue
+                    end
+
                     # Update progress description
                     ProgressMeter.update!(progress, 
                         desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
@@ -156,58 +179,68 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
                         # Time the warmup run and correctness check
                         start_time = time()
 
-                        # Create a channel for communication between tasks
-                        result_channel = Channel(1)
-
-                        # Warmup run and correctness check with timeout
-                        warmup_task = @async begin
-                            try
-                                result = solve(prob, alg)
-                                put!(result_channel, result)
-                            catch e
-                                put!(result_channel, e)
-                            end
-                        end
+                        # Warmup run and correctness check with simple timeout
+                        warmup_sol = nothing
+                        timed_out_flag = false
 
-                        # Timer task to enforce timeout
-                        timer_task = @async begin
-                            sleep(maxtime)
-                            if !istaskdone(warmup_task)
+                        # Run the solve in a background task and wait at most `maxtime` seconds for it
+                        try
+                            # Create a task for the solve
+                            done_channel = Channel(1)
+                            error_channel = Channel(1)
+
+                            warmup_task = @async begin
                                 try
-                                    Base.throwto(warmup_task, InterruptException())
-                                catch
-                                    # Task might be in non-interruptible state
+                                    result = solve(prob, alg)
+                                    put!(done_channel, result)
+                                catch e
+                                    put!(error_channel, e)
                                 end
-                                put!(result_channel, :timeout)
                             end
-                        end
-
-                        # Wait for result or timeout
-                        warmup_sol = nothing
-                        result = take!(result_channel)
-
-                        # Clean up timer task if still running
-                        if !istaskdone(timer_task)
-                            try
-                                Base.throwto(timer_task, InterruptException())
-                            catch
-                                # Timer task might have already finished
+
+                            # Wait for completion or timeout
+                            timeout_occurred = false
+                            result = nothing
+
+                            # timedwait polls the predicate until it returns true or maxtime elapses
+                            wait_result = timedwait(() -> istaskdone(warmup_task), maxtime)
+
+                            if wait_result === :timed_out
+                                timeout_occurred = true
+                                timed_out_flag = true
+                                # Don't try to kill the task - just mark it as timed out
+                                # The task will continue running in background but we move on
+                            else
+                                # Task completed - get the result
+                                if isready(done_channel)
+                                    warmup_sol = take!(done_channel)
+                                elseif isready(error_channel)
+                                    throw(take!(error_channel))
+                                end
+                            end
+
+                            # Close channels to prevent resource leaks
+                            close(done_channel)
+                            close(error_channel)
+
+                        catch e
+                            # If an error occurred during solve, re-throw it
+                            if !timed_out_flag
+                                throw(e)
                             end
                         end
 
-                        if result === :timeout
+                        if timed_out_flag
                             # Task timed out
                             timed_out = true
-                            @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
+                            # Add to timed out set so it's skipped for larger matrices
+                            push!(timed_out_algorithms[string(eltype)], name)
+                            @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Will skip for larger matrices."
                             success = false
                             error_msg = "Timed out (exceeded $(maxtime)s)"
                             gflops = NaN
-                        elseif result isa Exception
-                            # Task threw an error
-                            throw(result)
                         else
                             # Successful completion
-                            warmup_sol = result
                             elapsed_time = time() - start_time
 
                             # Check correctness if reference solution is available
@@ -293,7 +326,7 @@ Size categories:
 - `:small` - 20:20:100 (for small problems)
 - `:medium` - 100:50:300 (for typical problems)
 - `:large` - 300:100:1000 (for larger problems)
-- `:big` - vcat(1000:2000:10000, 10000:5000:20000) (for very large/GPU problems)
+- `:big` - vcat(1000:2000:10000, 10000:5000:15000) (for very large/GPU problems, capped at 15000)
 """
 function get_benchmark_sizes(size_categories::Vector{Symbol})
     sizes = Int[]
@@ -308,7 +341,7 @@ function get_benchmark_sizes(size_categories::Vector{Symbol})
         elseif category == :large
             append!(sizes, 300:100:1000)
         elseif category == :big
-            append!(sizes, vcat(1000:2000:10000, 10000:5000:20000))
+            append!(sizes, vcat(1000:2000:10000, 10000:5000:15000))  # Capped at 15000
         else
             @warn "Unknown size category: $category. Skipping."
         end