Add maxtime parameter to LinearSolveAutotune for timeout handling (#716)

ChrisRackauckas-Claude · ChrisRackauckas · web-flow · commit a0f36afcc794 · 2025-08-11T11:43:03.000-04:00
* Add maxtime parameter to LinearSolveAutotune

- Added maxtime parameter with 100s default to autotune_setup() and benchmark_algorithms()
- Implements timeout handling during accuracy checks and benchmarking
- Records timed out runs as NaN in results
- Updated docstrings and documentation to explain the new parameter
- Prevents hanging on slow algorithms or large matrices

* Improve timeout handling: properly kill timed-out tasks

- Use Channel-based communication between warmup and timer tasks
- Properly interrupt timed-out tasks with Base.throwto()
- Clean up timer task when warmup completes successfully
- Handle exceptions from warmup task properly
- Prevents resource leaks from hanging tasks

* Update lib/LinearSolveAutotune/src/benchmarking.jl

* Make analysis tools robust to NaN values from timeouts

- Filter out NaN values when computing mean, max, and std statistics
- Exclude NaN values from plots to avoid visualization errors
- Report number of timed-out tests in summary output
- Ensure categorize_results excludes NaN values when selecting best algorithms
- All aggregation functions now properly handle NaN values that indicate timeouts

This ensures the autotuning system works correctly even when some tests time out,
which is expected behavior for large matrix sizes or slow algorithms.

---------

Co-authored-by: ChrisRackauckas <accounts@chrisrackauckas.com>
diff --git a/docs/src/tutorials/autotune.md b/docs/src/tutorials/autotune.md
@@ -132,6 +132,37 @@ results = autotune_setup(
 )
 ```
 
+### Time Limits for Algorithm Tests
+
+Control the maximum time allowed for each algorithm test (including the accuracy check):
+
+```julia
+# Default: 100 seconds maximum per algorithm test
+results = autotune_setup()  # maxtime = 100.0
+
+# Quick timeout for fast exploration
+results = autotune_setup(maxtime = 10.0)
+
+# Extended timeout for slow algorithms or large matrices
+results = autotune_setup(
+    maxtime = 300.0,  # 5 minutes per test
+    sizes = [:large, :big]
+)
+
+# Conservative timeout for production benchmarking
+results = autotune_setup(
+    maxtime = 200.0,
+    samples = 10,
+    seconds = 2.0
+)
+```
+
+When an algorithm exceeds the `maxtime` limit:
+- The test is skipped to prevent hanging
+- The result is recorded as `NaN` in the benchmark data
+- A warning is displayed indicating the timeout
+- The benchmark continues with the next algorithm
+
 ### Missing Algorithm Handling
 
 By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement:
diff --git a/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl b/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl
@@ -78,13 +78,13 @@ function Base.show(io::IO, results::AutotuneResults)
     println(io, "  • Julia: ", get(results.sysinfo, "julia_version", "Unknown"))
     println(io, "  • Threads: ", get(results.sysinfo, "num_threads", "Unknown"), " (BLAS: ", get(results.sysinfo, "blas_num_threads", "Unknown"), ")")
     
-    # Results summary
-    successful_results = filter(row -> row.success, results.results_df)
+    # Results summary - filter out NaN values
+    successful_results = filter(row -> row.success && !isnan(row.gflops), results.results_df)
     if nrow(successful_results) > 0
         println(io, "\n🏆 Top Performing Algorithms:")
         summary = combine(groupby(successful_results, :algorithm),
-            :gflops => mean => :avg_gflops,
-            :gflops => maximum => :max_gflops,
+            :gflops => (x -> mean(filter(!isnan, x))) => :avg_gflops,
+            :gflops => (x -> maximum(filter(!isnan, x))) => :max_gflops,
             nrow => :num_tests)
         sort!(summary, :avg_gflops, rev = true)
         
@@ -104,6 +104,12 @@ function Base.show(io::IO, results::AutotuneResults)
     println(io, "📏 Matrix Sizes: ", minimum(sizes), "×", minimum(sizes), 
             " to ", maximum(sizes), "×", maximum(sizes))
     
+    # Report timeouts if any
+    timeout_results = filter(row -> isnan(row.gflops), results.results_df)
+    if nrow(timeout_results) > 0
+        println(io, "⏱️  Timed Out: ", nrow(timeout_results), " tests exceeded time limit")
+    end
+    
     # Call to action - reordered
     println(io, "\n" * "="^60)
     println(io, "🚀 For comprehensive results, consider running:")
@@ -158,7 +164,8 @@ end
         seconds::Float64 = 0.5,
         eltypes = (Float32, Float64, ComplexF32, ComplexF64),
         skip_missing_algs::Bool = false,
-        include_fastlapack::Bool = false)
+        include_fastlapack::Bool = false,
+        maxtime::Float64 = 100.0)
 
 Run a comprehensive benchmark of all available LU factorization methods and optionally:
 
@@ -182,6 +189,8 @@ Run a comprehensive benchmark of all available LU factorization methods and opti
   - `eltypes = (Float32, Float64, ComplexF32, ComplexF64)`: Element types to benchmark
   - `skip_missing_algs::Bool = false`: If false, error when expected algorithms are missing; if true, warn instead
   - `include_fastlapack::Bool = false`: If true, includes FastLUFactorization in benchmarks
+  - `maxtime::Float64 = 100.0`: Maximum time in seconds for each algorithm test (including accuracy check). 
+    If exceeded, the run is skipped and recorded as NaN
 
 # Returns
 
@@ -216,7 +225,8 @@ function autotune_setup(;
         seconds::Float64 = 0.5,
         eltypes = (Float64,),
         skip_missing_algs::Bool = false,
-        include_fastlapack::Bool = false)
+        include_fastlapack::Bool = false,
+        maxtime::Float64 = 100.0)
     @info "Starting LinearSolve.jl autotune setup..."
     @info "Configuration: sizes=$sizes, set_preferences=$set_preferences"
     @info "Element types to benchmark: $(join(eltypes, ", "))"
@@ -249,18 +259,25 @@ function autotune_setup(;
 
     # Run benchmarks
     @info "Running benchmarks (this may take several minutes)..."
+    @info "Maximum time per algorithm test: $(maxtime)s"
     results_df = benchmark_algorithms(matrix_sizes, all_algs, all_names, eltypes;
-        samples = samples, seconds = seconds, sizes = sizes)
+        samples = samples, seconds = seconds, sizes = sizes, maxtime = maxtime)
 
-    # Display results table
-    successful_results = filter(row -> row.success, results_df)
+    # Display results table - filter out NaN values
+    successful_results = filter(row -> row.success && !isnan(row.gflops), results_df)
+    timeout_results = filter(row -> isnan(row.gflops), results_df)
+    
+    if nrow(timeout_results) > 0
+        @info "$(nrow(timeout_results)) tests timed out (exceeded $(maxtime)s limit)"
+    end
+    
     if nrow(successful_results) > 0
         @info "Benchmark completed successfully!"
 
-        # Create summary table for display
+        # Create summary table for display - handle NaN values
         summary = combine(groupby(successful_results, :algorithm),
-            :gflops => mean => :avg_gflops,
-            :gflops => maximum => :max_gflops,
+            :gflops => (x -> mean(filter(!isnan, x))) => :avg_gflops,
+            :gflops => (x -> maximum(filter(!isnan, x))) => :max_gflops,
             nrow => :num_tests)
         sort!(summary, :avg_gflops, rev = true)
 
diff --git a/lib/LinearSolveAutotune/src/benchmarking.jl b/lib/LinearSolveAutotune/src/benchmarking.jl
@@ -73,14 +73,19 @@ end
 
 """
     benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes; 
-                        samples=5, seconds=0.5, sizes=[:small, :medium])
+                        samples=5, seconds=0.5,
+                        sizes=[:tiny, :small, :medium, :large], maxtime=100.0)
 
 Benchmark the given algorithms across different matrix sizes and element types.
 Returns a DataFrame with results including element type information.
+
+# Arguments
+- `maxtime::Float64 = 100.0`: Maximum time in seconds for each algorithm test (including accuracy check). 
+  If the accuracy check exceeds this time, the run is skipped and recorded as NaN.
 """
 function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
         samples = 5, seconds = 0.5, sizes = [:tiny, :small, :medium, :large],
-        check_correctness = true, correctness_tol = 1e0)
+        check_correctness = true, correctness_tol = 1e0, maxtime = 100.0)
 
     # Set benchmark parameters
     old_params = BenchmarkTools.DEFAULT_PARAMETERS
@@ -136,52 +141,120 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
                     ProgressMeter.update!(progress, 
                         desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
                     
-                    gflops = 0.0
+                    gflops = NaN  # Use NaN for timed out runs
                     success = true
                     error_msg = ""
                     passed_correctness = true
+                    timed_out = false
 
                     try
                         # Create the linear problem for this test
                         prob = LinearProblem(copy(A), copy(b);
                             u0 = copy(u0),
                             alias = LinearAliasSpecifier(alias_A = true, alias_b = true))
 
-                        # Warmup run and correctness check
-                        warmup_sol = solve(prob, alg)
+                        # Time the warmup run and correctness check
+                        start_time = time()
                         
-                        # Check correctness if reference solution is available
-                        if check_correctness && reference_solution !== nothing
-                            # Compute relative error
-                            rel_error = norm(warmup_sol.u - reference_solution.u) / norm(reference_solution.u)
-                            
-                            if rel_error > correctness_tol
-                                passed_correctness = false
-                                @warn "Algorithm $name failed correctness check for size $n, eltype $eltype. " *
-                                      "Relative error: $(round(rel_error, sigdigits=3)) > tolerance: $correctness_tol. " *
-                                      "Algorithm will be excluded from results."
-                                success = false
-                                error_msg = "Failed correctness check (rel_error = $(round(rel_error, sigdigits=3)))"
+                        # Create a channel for communication between tasks
+                        result_channel = Channel(1)
+                        
+                        # Warmup run and correctness check with timeout
+                        warmup_task = @async begin
+                            try
+                                result = solve(prob, alg)
+                                put!(result_channel, result)
+                            catch e
+                                put!(result_channel, e)
+                            end
+                        end
+                        
+                        # Timer task to enforce timeout
+                        timer_task = @async begin
+                            sleep(maxtime)
+                            if !istaskdone(warmup_task)
+                                try
+                                    Base.throwto(warmup_task, InterruptException())
+                                catch
+                                    # Task might be in non-interruptible state
+                                end
+                                put!(result_channel, :timeout)
+                            end
+                        end
+                        
+                        # Wait for result or timeout
+                        warmup_sol = nothing
+                        result = take!(result_channel)
+                        
+                        # Clean up timer task if still running
+                        if !istaskdone(timer_task)
+                            try
+                                Base.throwto(timer_task, InterruptException())
+                            catch
+                                # Timer task might have already finished
                             end
                         end
                         
-                        # Only benchmark if correctness check passed
-                        if passed_correctness
-                            # Actual benchmark
-                            bench = @benchmark solve($prob, $alg) setup=(prob = LinearProblem(
-                                copy($A), copy($b);
-                                u0 = copy($u0),
-                                alias = LinearAliasSpecifier(alias_A = true, alias_b = true)))
-
-                            # Calculate GFLOPs
-                            min_time_sec = minimum(bench.times) / 1e9
-                            flops = luflop(n, n)
-                            gflops = flops / min_time_sec / 1e9
+                        if result === :timeout
+                            # Task timed out
+                            timed_out = true
+                            @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
+                            success = false
+                            error_msg = "Timed out (exceeded $(maxtime)s)"
+                            gflops = NaN
+                        elseif result isa Exception
+                            # Task threw an error
+                            throw(result)
+                        else
+                            # Successful completion
+                            warmup_sol = result
+                            elapsed_time = time() - start_time
+                            
+                            # Check correctness if reference solution is available
+                            if check_correctness && reference_solution !== nothing
+                                # Compute relative error
+                                rel_error = norm(warmup_sol.u - reference_solution.u) / norm(reference_solution.u)
+                                
+                                if rel_error > correctness_tol
+                                    passed_correctness = false
+                                    @warn "Algorithm $name failed correctness check for size $n, eltype $eltype. " *
+                                          "Relative error: $(round(rel_error, sigdigits=3)) > tolerance: $correctness_tol. " *
+                                          "Algorithm will be excluded from results."
+                                    success = false
+                                    error_msg = "Failed correctness check (rel_error = $(round(rel_error, sigdigits=3)))"
+                                    gflops = 0.0
+                                end
+                            end
+                            
+                            # Only benchmark if correctness check passed and we have time remaining
+                            if passed_correctness && !timed_out
+                                # Check if we have enough time remaining for benchmarking
+                                # Allow at least 2x the warmup time for benchmarking
+                                remaining_time = maxtime - elapsed_time
+                                if remaining_time < 2 * elapsed_time
+                                    @warn "Algorithm $name: insufficient time remaining for benchmarking (warmup took $(round(elapsed_time, digits=2))s). Recording as NaN."
+                                    gflops = NaN
+                                    success = false
+                                    error_msg = "Insufficient time for benchmarking"
+                                else
+                                    # Actual benchmark
+                                    bench = @benchmark solve($prob, $alg) setup=(prob = LinearProblem(
+                                        copy($A), copy($b);
+                                        u0 = copy($u0),
+                                        alias = LinearAliasSpecifier(alias_A = true, alias_b = true)))
+
+                                    # Calculate GFLOPs
+                                    min_time_sec = minimum(bench.times) / 1e9
+                                    flops = luflop(n, n)
+                                    gflops = flops / min_time_sec / 1e9
+                                end
+                            end
                         end
 
                     catch e
                         success = false
                         error_msg = string(e)
+                        gflops = NaN
                         # Don't warn for each failure, just record it
                     end
 
@@ -252,8 +325,8 @@ Categorize the benchmark results into size ranges and find the best algorithm fo
 For complex types, avoids RFLUFactorization if possible due to known issues.
 """
 function categorize_results(df::DataFrame)
-    # Filter successful results
-    successful_df = filter(row -> row.success, df)
+    # Filter successful results and exclude NaN values
+    successful_df = filter(row -> row.success && !isnan(row.gflops), df)
 
     if nrow(successful_df) == 0
         @warn "No successful benchmark results found!"
@@ -293,8 +366,9 @@ function categorize_results(df::DataFrame)
                 continue
             end
 
-            # Calculate average GFLOPs for each algorithm in this range
-            avg_results = combine(groupby(range_df, :algorithm), :gflops => mean => :avg_gflops)
+            # Calculate average GFLOPs for each algorithm in this range, excluding NaN values
+            avg_results = combine(groupby(range_df, :algorithm), 
+                :gflops => (x -> mean(filter(!isnan, x))) => :avg_gflops)
             
             # Sort by performance
             sort!(avg_results, :avg_gflops, rev=true)
diff --git a/lib/LinearSolveAutotune/src/plotting.jl b/lib/LinearSolveAutotune/src/plotting.jl
@@ -44,7 +44,7 @@ function create_benchmark_plots(df::DataFrame; title_base = "LinearSolve.jl LU F
 
         # Plot each algorithm for this element type
         for alg in algorithms
-            alg_df = filter(row -> row.algorithm == alg, eltype_df)
+            alg_df = filter(row -> row.algorithm == alg && !isnan(row.gflops), eltype_df)
             if nrow(alg_df) > 0
                 # Sort by size for proper line plotting
                 sort!(alg_df, :size)
diff --git a/lib/LinearSolveAutotune/src/telemetry.jl b/lib/LinearSolveAutotune/src/telemetry.jl
@@ -365,9 +365,10 @@ function format_detailed_results_markdown(df::DataFrame)
         end
         
         # Create a summary table with average performance per algorithm for this element type
+        # Filter out NaN values when computing statistics
         summary = combine(groupby(eltype_df, :algorithm), 
-                         :gflops => mean => :avg_gflops, 
-                         :gflops => std => :std_gflops,
+                         :gflops => (x -> mean(filter(!isnan, x))) => :avg_gflops, 
+                         :gflops => (x -> std(filter(!isnan, x))) => :std_gflops,
                          nrow => :num_tests)
         sort!(summary, :avg_gflops, rev = true)