Add maxtime parameter to LinearSolveAutotune

ChrisRackauckas · ChrisRackauckas · commit dbe9b1561993 · 2025-08-11T11:17:02.000-04:00
- Add a maxtime parameter (default 100 s) to autotune_setup() and benchmark_algorithms()
- Implement timeout handling during accuracy checks and benchmarking
- Record timed-out runs as NaN in results
- Update docstrings and documentation to explain the new parameter
- Prevent hanging on slow algorithms or large matrices
diff --git a/docs/src/tutorials/autotune.md b/docs/src/tutorials/autotune.md
@@ -132,6 +132,37 @@ results = autotune_setup(
 )
 ```
 
+### Time Limits for Algorithm Tests
+
+Control the maximum time allowed for each algorithm test (including accuracy check):
+
+```julia
+# Default: 100 seconds maximum per algorithm test
+results = autotune_setup()  # maxtime = 100.0
+
+# Quick timeout for fast exploration
+results = autotune_setup(maxtime = 10.0)
+
+# Extended timeout for slow algorithms or large matrices
+results = autotune_setup(
+    maxtime = 300.0,  # 5 minutes per test
+    sizes = [:large, :big]
+)
+
+# Conservative timeout for production benchmarking
+results = autotune_setup(
+    maxtime = 200.0,
+    samples = 10,
+    seconds = 2.0
+)
+```
+
+When an algorithm exceeds the `maxtime` limit:
+- Benchmarking of that algorithm is skipped to prevent hanging
+- The result is recorded as `NaN` in the benchmark data
+- A warning is displayed indicating the timeout
+- The benchmark continues with the next algorithm
+
 ### Missing Algorithm Handling
 
 By default, autotune expects all algorithms to be available to ensure complete benchmarking. You can relax this requirement:
diff --git a/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl b/lib/LinearSolveAutotune/src/LinearSolveAutotune.jl
@@ -158,7 +158,8 @@ end
         seconds::Float64 = 0.5,
         eltypes = (Float32, Float64, ComplexF32, ComplexF64),
         skip_missing_algs::Bool = false,
-        include_fastlapack::Bool = false)
+        include_fastlapack::Bool = false,
+        maxtime::Float64 = 100.0)
 
 Run a comprehensive benchmark of all available LU factorization methods and optionally:
 
@@ -182,6 +183,8 @@ Run a comprehensive benchmark of all available LU factorization methods and opti
   - `eltypes = (Float32, Float64, ComplexF32, ComplexF64)`: Element types to benchmark
   - `skip_missing_algs::Bool = false`: If false, error when expected algorithms are missing; if true, warn instead
   - `include_fastlapack::Bool = false`: If true, includes FastLUFactorization in benchmarks
+  - `maxtime::Float64 = 100.0`: Maximum time in seconds for each algorithm test (including accuracy check).
+    If exceeded, the run is skipped and recorded as NaN.
 
 # Returns
 
@@ -216,7 +219,8 @@ function autotune_setup(;
         seconds::Float64 = 0.5,
         eltypes = (Float64,),
         skip_missing_algs::Bool = false,
-        include_fastlapack::Bool = false)
+        include_fastlapack::Bool = false,
+        maxtime::Float64 = 100.0)
     @info "Starting LinearSolve.jl autotune setup..."
     @info "Configuration: sizes=$sizes, set_preferences=$set_preferences"
     @info "Element types to benchmark: $(join(eltypes, ", "))"
@@ -249,8 +253,9 @@ function autotune_setup(;
 
     # Run benchmarks
     @info "Running benchmarks (this may take several minutes)..."
+    @info "Maximum time per algorithm test: $(maxtime)s"
     results_df = benchmark_algorithms(matrix_sizes, all_algs, all_names, eltypes;
-        samples = samples, seconds = seconds, sizes = sizes)
+        samples = samples, seconds = seconds, sizes = sizes, maxtime = maxtime)
 
     # Display results table
     successful_results = filter(row -> row.success, results_df)
diff --git a/lib/LinearSolveAutotune/src/benchmarking.jl b/lib/LinearSolveAutotune/src/benchmarking.jl
@@ -73,14 +73,19 @@ end
 
 """
     benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes; 
-                        samples=5, seconds=0.5, sizes=[:small, :medium])
+                        samples=5, seconds=0.5, sizes=[:tiny, :small, :medium, :large],
+                        maxtime=100.0)
 
 Benchmark the given algorithms across different matrix sizes and element types.
 Returns a DataFrame with results including element type information.
+
+# Arguments
+- `maxtime::Float64 = 100.0`: Maximum time in seconds for each algorithm test (including accuracy check).
+  If the accuracy check exceeds this time, or leaves too little of it for benchmarking, the run is skipped and recorded as NaN.
 """
 function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
         samples = 5, seconds = 0.5, sizes = [:tiny, :small, :medium, :large],
-        check_correctness = true, correctness_tol = 1e0)
+        check_correctness = true, correctness_tol = 1e0, maxtime = 100.0)
 
     # Set benchmark parameters
     old_params = BenchmarkTools.DEFAULT_PARAMETERS
@@ -136,52 +141,90 @@ function benchmark_algorithms(matrix_sizes, algorithms, alg_names, eltypes;
                     ProgressMeter.update!(progress, 
                         desc="Benchmarking $name on $(n)×$(n) $eltype matrix: ")
                     
-                    gflops = 0.0
+                    gflops = NaN  # Use NaN for timed out runs
                     success = true
                     error_msg = ""
                     passed_correctness = true
+                    timed_out = false
 
                     try
                         # Create the linear problem for this test
                         prob = LinearProblem(copy(A), copy(b);
                             u0 = copy(u0),
                             alias = LinearAliasSpecifier(alias_A = true, alias_b = true))
 
-                        # Warmup run and correctness check
-                        warmup_sol = solve(prob, alg)
+                        # Time the warmup run and correctness check
+                        start_time = time()
                         
-                        # Check correctness if reference solution is available
-                        if check_correctness && reference_solution !== nothing
-                            # Compute relative error
-                            rel_error = norm(warmup_sol.u - reference_solution.u) / norm(reference_solution.u)
-                            
-                            if rel_error > correctness_tol
-                                passed_correctness = false
-                                @warn "Algorithm $name failed correctness check for size $n, eltype $eltype. " *
-                                      "Relative error: $(round(rel_error, sigdigits=3)) > tolerance: $correctness_tol. " *
-                                      "Algorithm will be excluded from results."
-                                success = false
-                                error_msg = "Failed correctness check (rel_error = $(round(rel_error, sigdigits=3)))"
-                            end
+                        # Warmup run and correctness check with timeout
+                        warmup_task = @async begin
+                            solve(prob, alg)
+                        end
+                        
+                        # Wait for warmup to complete or timeout
+                        warmup_sol = nothing
+                        timeout_wait = maxtime
+                        while !istaskdone(warmup_task) && (time() - start_time) < timeout_wait
+                            sleep(0.1)
                         end
                         
-                        # Only benchmark if correctness check passed
-                        if passed_correctness
-                            # Actual benchmark
-                            bench = @benchmark solve($prob, $alg) setup=(prob = LinearProblem(
-                                copy($A), copy($b);
-                                u0 = copy($u0),
-                                alias = LinearAliasSpecifier(alias_A = true, alias_b = true)))
-
-                            # Calculate GFLOPs
-                            min_time_sec = minimum(bench.times) / 1e9
-                            flops = luflop(n, n)
-                            gflops = flops / min_time_sec / 1e9
+                        if !istaskdone(warmup_task)
+                            # Task timed out
+                            timed_out = true
+                            @warn "Algorithm $name timed out (exceeded $(maxtime)s) for size $n, eltype $eltype. Recording as NaN."
+                            success = false
+                            error_msg = "Timed out (exceeded $(maxtime)s)"
+                            gflops = NaN
+                        else
+                            # Get the result
+                            warmup_sol = fetch(warmup_task)
+                            elapsed_time = time() - start_time
+                            
+                            # Check correctness if reference solution is available
+                            if check_correctness && reference_solution !== nothing
+                                # Compute relative error
+                                rel_error = norm(warmup_sol.u - reference_solution.u) / norm(reference_solution.u)
+                                
+                                if rel_error > correctness_tol
+                                    passed_correctness = false
+                                    @warn "Algorithm $name failed correctness check for size $n, eltype $eltype. " *
+                                          "Relative error: $(round(rel_error, sigdigits=3)) > tolerance: $correctness_tol. " *
+                                          "Algorithm will be excluded from results."
+                                    success = false
+                                    error_msg = "Failed correctness check (rel_error = $(round(rel_error, sigdigits=3)))"
+                                    gflops = 0.0
+                                end
+                            end
+                            
+                            # Only benchmark if correctness check passed and we have time remaining
+                            if passed_correctness && !timed_out
+                                # Check if we have enough time remaining for benchmarking
+                                # Allow at least 2x the warmup time for benchmarking
+                                remaining_time = maxtime - elapsed_time
+                                if remaining_time < 2 * elapsed_time
+                                    @warn "Algorithm $name: insufficient time remaining for benchmarking (warmup took $(round(elapsed_time, digits=2))s). Recording as NaN."
+                                    gflops = NaN
+                                    success = false
+                                    error_msg = "Insufficient time for benchmarking"
+                                else
+                                    # Actual benchmark
+                                    bench = @benchmark solve($prob, $alg) setup=(prob = LinearProblem(
+                                        copy($A), copy($b);
+                                        u0 = copy($u0),
+                                        alias = LinearAliasSpecifier(alias_A = true, alias_b = true)))
+
+                                    # Calculate GFLOPs
+                                    min_time_sec = minimum(bench.times) / 1e9
+                                    flops = luflop(n, n)
+                                    gflops = flops / min_time_sec / 1e9
+                                end
+                            end
                         end
 
                     catch e
                         success = false
                         error_msg = string(e)
+                        gflops = 0.0
                         # Don't warn for each failure, just record it
                     end