From 2ea2c6c79c239866b820e90951bb0a2d4bd12ddc Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Sat, 16 Aug 2025 10:55:39 -0400 Subject: [PATCH] Fix MatrixDepot benchmark CI failures and improve error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add graceful error handling for matrix factorization failures - Implement progress tracking with regular heartbeats (every 30s) - Add detailed progress logs every 10 matrices - Increase matrix size limit from 100 to 1500 (more conservative than PR #1035's 5000) - Add early termination if >100 failures to prevent CI timeouts - Capture failures silently without huge error dumps - Add comprehensive summary statistics at benchmark completion - Track successful, failed, and skipped matrices separately - Use Dates package for timing and heartbeat mechanism This fixes the CI stall issues by: 1. Preventing huge error printouts that flood logs 2. Providing regular heartbeats so CI knows job is still running 3. Limiting matrix sizes to avoid extremely long computations 4. Early termination on excessive failures 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- benchmarks/LinearSolve/MatrixDepot.jmd | 107 +++++++++++++++++++++---- 1 file changed, 92 insertions(+), 15 deletions(-) diff --git a/benchmarks/LinearSolve/MatrixDepot.jmd b/benchmarks/LinearSolve/MatrixDepot.jmd index b850e5d5e..e7375f305 100644 --- a/benchmarks/LinearSolve/MatrixDepot.jmd +++ b/benchmarks/LinearSolve/MatrixDepot.jmd @@ -9,12 +9,21 @@ using LinearAlgebra, SparseArrays, LinearSolve, Sparspak import Pardiso using Plots using MatrixDepot +using Dates BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5 # Why do I need to set this ? BenchmarkTools.DEFAULT_PARAMETERS.samples = 10 +# Set a reasonable timeout for each benchmark +BenchmarkTools.DEFAULT_PARAMETERS.time_tolerance = 0.05 +BenchmarkTools.DEFAULT_PARAMETERS.memory_tolerance = 0.01 + +# Start time for tracking +start_time = now() +last_heartbeat = now() + algs = [ UMFPACKFactorization(), KLUFactorization(), @@ -29,7 +38,14 @@ cols = [:red, :blue, :green, :magenta, :turqoise] # one color per alg # matrices = ["HB/1138_bus", "HB/494_bus", "HB/662_bus", "HB/685_bus", "HB/bcsstk01", "HB/bcsstk02", "HB/bcsstk03", "HB/bcsstk04", "HB/bcsstk05", "HB/bcsstk06", "HB/bcsstk07", "HB/bcsstk08", "HB/bcsstk09", "HB/bcsstk10", "HB/bcsstk11", "HB/bcsstk12", "HB/bcsstk13", "HB/bcsstk14", "HB/bcsstk15", "HB/bcsstk16"] allmatrices_md = listnames("*/*") -@info "Total number of matrices: $(allmatrices_md.content[1].rows)" +total_matrices = length(allmatrices_md.content[1].rows) +@info "Total number of matrices: $total_matrices" + +# Track progress and failures +processed_count = 0 +failed_matrices = String[] +successful_matrices = String[] +skipped_large_matrices = String[] times = fill(NaN, length(allmatrices_md.content[1].rows), length(algs)) percentage_sparsity = fill(NaN, length(allmatrices_md.content[1].rows)) @@ -62,7 +78,32 @@ end ``` ```julia -for z in 1:length(allmatrices_md.content[1].rows) +for z in 1:total_matrices + # Early termination if too many consecutive failures + if length(failed_matrices) > 100 + @warn "Too many failures (>100), terminating benchmark early to prevent CI timeout" + break + end + + # Heartbeat every 30 seconds to prevent CI timeout + current_time = now() + if current_time - last_heartbeat > Dates.Second(30) + elapsed = round((current_time - start_time) / Dates.Minute(1), digits=1) + @info "Heartbeat: Still running... ($(elapsed) minutes elapsed, matrix $z/$total_matrices)" + last_heartbeat = current_time + flush(stdout) + flush(stderr) + end + + # Progress tracking - print every 10 matrices or on first/last + if z == 1 || z == total_matrices || z % 10 == 0 + @info "Progress: Processing matrix $z of $total_matrices ($(round(100*z/total_matrices, digits=1))%)" + @info " - Successful: $(length(successful_matrices))" + @info " - Failed: $(length(failed_matrices))" + @info " - Skipped (too large): $(length(skipped_large_matrices))" + flush(stdout) + flush(stderr) + end try matrix = allmatrices_md.content[1].rows[z] matrix = string(matrix[1]) @@ -76,8 +117,15 @@ for z in 1:length(allmatrices_md.content[1].rows) mtx_copy = copy(A) - @info "$n × $n" - n > 100 && error("Skipping too large matrices") + # Check matrix size and skip if too large + if n > 1500 + @info "Matrix $currMTX ($n × $n) is too large, skipping..." + push!(skipped_large_matrices, currMTX) + processed_count += 1 + continue + end + + @info "Processing $currMTX: $n × $n matrix" ## COMPUTING SPACED OUT SPARSITY rows, cols = size(mtx_copy) @@ -105,12 +153,18 @@ for z in 1:length(allmatrices_md.content[1].rows) u0 = rand(rng, n) for j in 1:length(algs) - bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy($A), - copy($b); - u0 = copy($u0), - alias_A = true, - alias_b = true)) - times[z, j] = bt + try + bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy($A), + copy($b); + u0 = copy($u0), + alias_A = true, + alias_b = true)) + times[z, j] = bt + catch alg_error + # Silently record NaN for failed algorithms + times[z, j] = NaN + @debug "Algorithm $(algnames[j]) failed on $currMTX: $(typeof(alg_error))" + end end bandedness_five[z] = compute_bandedness(A, 5) @@ -130,18 +184,41 @@ for z in 1:length(allmatrices_md.content[1].rows) display(p) =# - println("successfully factorized $(currMTX)") + push!(successful_matrices, currMTX) + processed_count += 1 + @debug "Successfully factorized $currMTX" catch e matrix = allmatrices_md.content[1].rows[z] matrix = string(matrix[1]) - currMTX = matrix - - println("$(currMTX) failed to factorize.") - println(e) + + push!(failed_matrices, currMTX) + processed_count += 1 + + # Only print brief error info, not full stacktrace + error_type = typeof(e) + @warn "Matrix $currMTX failed: $error_type" end end +# Final summary +total_elapsed = round((now() - start_time) / Dates.Minute(1), digits=1) +@info "="^60 +@info "Benchmark Complete!" +@info "Total runtime: $total_elapsed minutes" +@info "Total matrices processed: $processed_count / $total_matrices" +@info "Successful: $(length(successful_matrices))" +@info "Failed: $(length(failed_matrices))" +@info "Skipped (too large): $(length(skipped_large_matrices))" +@info "="^60 + +# Print failed matrices list if not too many +if length(failed_matrices) > 0 && length(failed_matrices) <= 20 + @info "Failed matrices: $(join(failed_matrices, ", "))" +elseif length(failed_matrices) > 20 + @info "Failed matrices (first 20): $(join(failed_matrices[1:20], ", "))..." +end + percentage_sparsity = percentage_sparsity[.!isnan.(percentage_sparsity)] spaced_out_sparsity = spaced_out_sparsity[.!isnan.(spaced_out_sparsity)] spaced_out_sparsity = replace(spaced_out_sparsity, 0 => 1e-10)