Commit 76eac8a
Add LinearSolveAutotune sublibrary for algorithm benchmarking and optimization
This PR implements the `autotune_setup` function as requested in the design document, providing comprehensive benchmarking of all available LU factorization algorithms with automatic optimization and preference setting.

## Features

- **Comprehensive Benchmarking**: Tests all available LU algorithms (CPU + GPU)
- **Intelligent Categorization**: Finds optimal algorithms for size ranges 0-128, 128-256, 256-512, 512+
- **Preferences Integration**: Automatically sets LinearSolve preferences based on results
- **Hardware Detection**: Auto-detects CUDA, Metal, MKL, Apple Accelerate availability
- **Visualization**: Creates performance plots using Plots.jl
- **Telemetry**: Optional GitHub sharing to issue #669 for community data collection
- **Configurable**: Supports large matrix sizes and custom sampling parameters

## Usage

```julia
using LinearSolve
include("lib/LinearSolveAutotune/src/LinearSolveAutotune.jl")
using .LinearSolveAutotune

# Basic autotune
results = autotune_setup()

# Custom configuration
results = autotune_setup(
    large_matrices = true,
    samples = 10,
    telemetry = false,
    make_plot = true
)
```

## Implementation Details

- Built as a sublibrary in `/lib/LinearSolveAutotune/`
- Modular design with separate files for algorithms, benchmarking, GPU detection, etc.
- Uses existing LinearSolve benchmarking patterns and `luflop` calculations
- Integrates with Preferences.jl for persistent algorithm selection
- Follows SciML formatting standards

## Future Integration

This sets up the foundation for the planned enhancement in `default.jl:176-193`, where preferences will influence default algorithm selection, making LinearSolve.jl automatically optimize itself based on system-specific performance characteristics.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent 7fd84cf commit 76eac8a
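To make the "Future Integration" note concrete, here is a hypothetical sketch of how size-banded preferences could feed default algorithm selection. The helper name `preferred_lu_name` is invented for illustration, and this is not the actual `default.jl` hook, which is not part of this commit:

```julia
using Preferences, UUIDs

# Hypothetical sketch only; not code from this commit.
# UUID taken from lib/LinearSolveAutotune/Project.toml in this diff.
const AUTOTUNE_UUID = UUID("67398393-80e8-4254-b7e4-1b9a36a3c5b6")

# The size bands mirror the commit's 0-128 / 128-256 / 256-512 / 512+ ranges;
# the key name for the 512+ band is a guess, since the stored preferences
# shown below only include the first three.
function preferred_lu_name(n::Integer)
    key = n <= 128 ? "best_algorithm_0_128" :
          n <= 256 ? "best_algorithm_128_256" :
          n <= 512 ? "best_algorithm_256_512" :
          "best_algorithm_512_plus"
    load_preference(AUTOTUNE_UUID, key, nothing)  # e.g. "LUFactorization", or nothing if unset
end
```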

9 files changed, +842 −0 lines changed
Lines changed: 5 additions & 0 deletions

```toml
[LinearSolveAutotune]
autotune_timestamp = "2025-08-03T19:50:58.753"
best_algorithm_0_128 = "LUFactorization"
best_algorithm_128_256 = "LUFactorization"
best_algorithm_256_512 = "LUFactorization"
```
lib/LinearSolveAutotune/Project.toml

Lines changed: 41 additions & 0 deletions
```toml
name = "LinearSolveAutotune"
uuid = "67398393-80e8-4254-b7e4-1b9a36a3c5b6"
authors = ["SciML"]
version = "0.1.0"

[deps]
LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GitHub = "bc5e4493-9b4d-5f90-b8aa-2b2bcaad7a26"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[compat]
LinearSolve = "3"
BenchmarkTools = "1"
DataFrames = "1"
GitHub = "5"
Plots = "1"
PrettyTables = "2"
Preferences = "1"
Statistics = "1"
Random = "1"
LinearAlgebra = "1"
Printf = "1"
Dates = "1"
CUDA = "5"
Metal = "1"
julia = "1.10"
```
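Because the sublibrary carries its own Project.toml, an alternative to the `include`-based loading shown in the commit message is to `develop` it as a local package. A sketch, assuming the working directory is the LinearSolve.jl repository root:

```julia
using Pkg
# Add the sublibrary to the active environment as a local dev dependency.
Pkg.develop(path = "lib/LinearSolveAutotune")

using LinearSolveAutotune
results = autotune_setup()
```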
lib/LinearSolveAutotune/src/LinearSolveAutotune.jl

Lines changed: 174 additions & 0 deletions
````julia
module LinearSolveAutotune

using LinearSolve
using BenchmarkTools
using DataFrames
using PrettyTables
using Preferences
using Statistics
using Random
using LinearAlgebra
using Printf
using Dates

# Optional dependencies for telemetry and plotting
using GitHub
using Plots

export autotune_setup

include("algorithms.jl")
include("gpu_detection.jl")
include("benchmarking.jl")
include("plotting.jl")
include("telemetry.jl")
include("preferences.jl")

"""
    autotune_setup(;
        large_matrices::Bool = false,
        telemetry::Bool = true,
        make_plot::Bool = true,
        set_preferences::Bool = true,
        samples::Int = 5,
        seconds::Float64 = 0.5)

Run a comprehensive benchmark of all available LU factorization methods and optionally:

- Create performance plots
- Upload results to GitHub telemetry
- Set Preferences for optimal algorithm selection
- Support both CPU and GPU algorithms based on hardware detection

# Arguments

- `large_matrices::Bool = false`: Include larger matrix sizes for GPU benchmarking
- `telemetry::Bool = true`: Share results to GitHub issue for community data
- `make_plot::Bool = true`: Generate performance plots
- `set_preferences::Bool = true`: Update LinearSolve preferences with optimal algorithms
- `samples::Int = 5`: Number of benchmark samples per algorithm/size
- `seconds::Float64 = 0.5`: Maximum time per benchmark

# Returns

- `DataFrame`: Detailed benchmark results with performance data
- `Plot`: Performance visualization (if `make_plot = true`)

# Examples

```julia
using LinearSolve
using LinearSolveAutotune

# Basic autotune with default settings
results = autotune_setup()

# Custom autotune for GPU systems with larger matrices
results = autotune_setup(large_matrices = true, samples = 10, seconds = 1.0)

# Autotune without telemetry sharing
results = autotune_setup(telemetry = false)
```
"""
function autotune_setup(;
        large_matrices::Bool = false,
        telemetry::Bool = true,
        make_plot::Bool = true,
        set_preferences::Bool = true,
        samples::Int = 5,
        seconds::Float64 = 0.5)
    @info "Starting LinearSolve.jl autotune setup..."
    @info "Configuration: large_matrices=$large_matrices, telemetry=$telemetry, make_plot=$make_plot, set_preferences=$set_preferences"

    # Get system information
    system_info = get_system_info()
    @info "System detected: $(system_info["os"]) $(system_info["arch"]) with $(system_info["num_cores"]) cores"

    # Get available algorithms
    cpu_algs, cpu_names = get_available_algorithms()
    @info "Found $(length(cpu_algs)) CPU algorithms: $(join(cpu_names, ", "))"

    # Add GPU algorithms if available
    gpu_algs, gpu_names = get_gpu_algorithms()
    if !isempty(gpu_algs)
        @info "Found $(length(gpu_algs)) GPU algorithms: $(join(gpu_names, ", "))"
    end

    # Combine all algorithms
    all_algs = vcat(cpu_algs, gpu_algs)
    all_names = vcat(cpu_names, gpu_names)

    if isempty(all_algs)
        error("No algorithms found! This shouldn't happen.")
    end

    # Get benchmark sizes
    sizes = collect(get_benchmark_sizes(large_matrices))
    @info "Benchmarking $(length(sizes)) matrix sizes from $(minimum(sizes)) to $(maximum(sizes))"

    # Run benchmarks
    @info "Running benchmarks (this may take several minutes)..."
    results_df = benchmark_algorithms(sizes, all_algs, all_names;
        samples = samples, seconds = seconds, large_matrices = large_matrices)

    # Display results table
    successful_results = filter(row -> row.success, results_df)
    if nrow(successful_results) > 0
        @info "Benchmark completed successfully!"

        # Create summary table for display
        summary = combine(groupby(successful_results, :algorithm),
            :gflops => mean => :avg_gflops,
            :gflops => maximum => :max_gflops,
            nrow => :num_tests)
        sort!(summary, :avg_gflops, rev = true)

        println("\n" * "="^60)
        println("BENCHMARK RESULTS SUMMARY")
        println("="^60)
        pretty_table(summary,
            header = ["Algorithm", "Avg GFLOPs", "Max GFLOPs", "Tests"],
            formatters = ft_printf("%.2f", [2, 3]),
            crop = :none)
    else
        @warn "No successful benchmark results!"
        return results_df, nothing
    end

    # Categorize results and find best algorithms per size range
    categories = categorize_results(results_df)

    # Set preferences if requested
    if set_preferences && !isempty(categories)
        set_algorithm_preferences(categories)
    end

    # Create plot if requested
    plot_obj = nothing
    plot_files = nothing
    if make_plot
        @info "Creating performance plots..."
        plot_obj = create_benchmark_plot(results_df)
        if plot_obj !== nothing
            plot_files = save_benchmark_plot(plot_obj)
        end
    end

    # Upload telemetry if requested
    if telemetry && nrow(successful_results) > 0
        @info "Preparing telemetry data for GitHub..."
        markdown_content = format_results_for_github(results_df, system_info, categories)
        upload_to_github(markdown_content, plot_files)
    end

    @info "Autotune setup completed!"

    # Return results and plot
    if make_plot && plot_obj !== nothing
        return results_df, plot_obj
    else
        return results_df
    end
end

end
````
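One caveat visible in the code above: the return shape depends on `make_plot` (and on whether any benchmark succeeded), so a caller may receive either a `DataFrame` or a tuple. A defensive usage sketch, not part of the commit:

```julia
# Handle autotune_setup's varying return shape.
out = autotune_setup(make_plot = true)
results_df = out isa Tuple ? first(out) : out
plot_obj = out isa Tuple ? last(out) : nothing
```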
lib/LinearSolveAutotune/src/algorithms.jl

Lines changed: 98 additions & 0 deletions
```julia
# Algorithm detection and creation functions

"""
    get_available_algorithms()

Returns a list of available LU factorization algorithms based on the system and loaded packages.
"""
function get_available_algorithms()
    algs = []
    alg_names = String[]

    # Core algorithms always available
    push!(algs, LUFactorization())
    push!(alg_names, "LUFactorization")

    push!(algs, GenericLUFactorization())
    push!(alg_names, "GenericLUFactorization")

    # MKL if available
    if LinearSolve.usemkl
        push!(algs, MKLLUFactorization())
        push!(alg_names, "MKLLUFactorization")
    end

    # Apple Accelerate if available
    if LinearSolve.appleaccelerate_isavailable()
        push!(algs, AppleAccelerateLUFactorization())
        push!(alg_names, "AppleAccelerateLUFactorization")
    end

    # RecursiveFactorization if loaded
    try
        if LinearSolve.userecursivefactorization(nothing)
            push!(algs, RFLUFactorization())
            push!(alg_names, "RFLUFactorization")
        end
    catch
        # RFLUFactorization not available
    end

    # SimpleLU always available
    push!(algs, SimpleLUFactorization())
    push!(alg_names, "SimpleLUFactorization")

    return algs, alg_names
end

"""
    get_gpu_algorithms()

Returns GPU-specific algorithms if GPU hardware and packages are available.
"""
function get_gpu_algorithms()
    gpu_algs = []
    gpu_names = String[]

    # CUDA algorithms
    if is_cuda_available()
        try
            push!(gpu_algs, CudaOffloadFactorization())
            push!(gpu_names, "CudaOffloadFactorization")
        catch
            # CUDA extension not loaded
        end
    end

    # Metal algorithms for Apple Silicon
    if is_metal_available()
        try
            push!(gpu_algs, MetalLUFactorization())
            push!(gpu_names, "MetalLUFactorization")
        catch
            # Metal extension not loaded
        end
    end

    return gpu_algs, gpu_names
end

"""
    luflop(m, n = m; innerflop = 2)

Calculate the number of floating point operations for LU factorization.
From the existing LinearSolve benchmarks.
"""
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
            isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                innerflop
            end
        end
        invflop + scaleflop + updateflop
    end
end
```
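A sketch of how an operation count from `luflop` converts into a GFLOPs rate, as in the `:gflops` column used by the module. The single `@elapsed` sample here is illustrative only; the package's actual benchmarking (in `benchmarking.jl`, not shown in this excerpt) uses BenchmarkTools:

```julia
using LinearAlgebra

# Illustrative only: one crude timing sample for an n×n LU solve.
n = 256
A = rand(n, n)
b = rand(n)
t = @elapsed(lu(A) \ b)

# luflop returns an O(n^3) operation count for dense LU;
# dividing by time in seconds and 1e9 yields GFLOPs.
gflops = luflop(n) / t / 1e9
```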
