SciML · ChrisRackauckas-Claude · Nov 12, 2023 · Nov 12, 2023 · Aug 3, 2025 · Aug 3, 2025
diff --git a/Project.toml b/Project.toml
@@ -5,27 +5,33 @@ version = "3.24.0"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
+LAPACK_jll = "51474c39-65e3-53ba-86ba-03b1b862ec14"
 LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
+RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 SciMLOperators = "c0aeaf25-5076-4817-a8d5-81caf7dfa961"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
+blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
+libflame_jll = "8e9d65e3-b2b8-5a9c-baa2-617b4576f0b9"
 
 [weakdeps]
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
@@ -42,11 +48,13 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
-RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4"
+libflame_jll = "8e9d65e3-b2b8-5a9c-baa2-617b4576f0b9"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac"
 
 [extensions]
+LinearSolveBLISExt = ["blis_jll", "LAPACK_jll"]
+LinearSolveBLISFlameExt = ["blis_jll", "libflame_jll", "LAPACK_jll"]
 LinearSolveBandedMatricesExt = "BandedMatrices"
 LinearSolveBlockDiagonalsExt = "BlockDiagonals"
 LinearSolveCUDAExt = "CUDA"
@@ -70,6 +78,7 @@ AllocCheck = "0.2"
 Aqua = "0.8"
 ArrayInterface = "7.7"
 BandedMatrices = "1.5"
+BenchmarkTools = "1.6.0"
 BlockDiagonals = "0.1.42, 0.2"
 CUDA = "5"
 CUDSS = "0.1, 0.2, 0.3, 0.4"
@@ -91,6 +100,7 @@ KernelAbstractions = "0.9.27"
 Krylov = "0.10"
 KrylovKit = "0.8, 0.9, 0.10"
 KrylovPreconditioners = "0.3"
+LAPACK_jll = "3"
 LazyArrays = "1.8, 2"
 Libdl = "1.10"
 LinearAlgebra = "1.10"
@@ -100,6 +110,7 @@ Metal = "1"
 MultiFloats = "1"
 Pardiso = "0.5.7, 1"
 Pkg = "1"
+Plots = "1.40.17"
 PrecompileTools = "1.2"
 Preferences = "1.4"
 Random = "1"
@@ -118,7 +129,9 @@ StaticArraysCore = "1.4.2"
 Test = "1"
 UnPack = "1"
 Zygote = "0.7"
+blis_jll = "0.9.0"
 julia = "1.10"
+libflame_jll = "5.2.0"
 
 [extras]
 AllocCheck = "9b6a8646-10ed-4001-bbdc-1d2f46dfbb1a"

diff --git a/README_benchmark.md b/README_benchmark.md
@@ -0,0 +1,110 @@
+# LinearSolve.jl BLIS Benchmark
+
+This directory contains a comprehensive benchmark script for testing the performance of various LU factorization algorithms in LinearSolve.jl, including the new BLIS integration.
+
+## Quick Start
+
+```bash
+julia --project benchmark_blis.jl
+```
+
+This will:
+1. Automatically detect available implementations (BLIS, MKL, Apple Accelerate, etc.)
+2. Run benchmarks on matrix sizes from 4×4 to 256×256  
+3. Generate a performance plot saved as `lu_factorization_benchmark.png`
+4. Display results in both console output and a summary table
+
+**Note**: The PNG plot file is a binary artifact and is not included alongside this README; it will be generated locally when you run the benchmark.
+
+## What Gets Benchmarked
+
+The script automatically detects and includes algorithms based on what's available, following LinearSolve.jl's detection patterns:
+
+- **LU (OpenBLAS)**: Default BLAS-based LU factorization
+- **RecursiveFactorization**: High-performance pure Julia implementation  
+- **BLIS**: New BLIS-based implementation (requires `blis_jll` and `LAPACK_jll`)
+- **Intel MKL**: Intel's optimized library (automatically detected on x86_64/i686, excludes EPYC CPUs by default)
+- **Apple Accelerate**: Apple's framework (macOS only, checks for Accelerate.framework availability)
+- **FastLU**: FastLapackInterface.jl implementation (if available)
+
+### Detection Logic
+
+The benchmark uses the same detection patterns as LinearSolve.jl:
+
+- **MKL**: Enabled on x86_64/i686 architectures, disabled on AMD EPYC by default
+- **Apple Accelerate**: Checks for macOS and verifies Accelerate.framework can be loaded with required symbols
+- **BLIS**: Attempts to load `blis_jll` and `LAPACK_jll`, then verifies that the BLIS extension has loaded
+- **FastLU**: Attempts to load the FastLapackInterface.jl package
+
+## Requirements
+
+### Essential Dependencies
+```julia
+using Pkg
+Pkg.add(["BenchmarkTools", "Plots", "RecursiveFactorization"])
+```
+
+### Optional Dependencies for Full Testing
+```julia
+# For BLIS support
+Pkg.add(["blis_jll", "LAPACK_jll"])
+
+# For FastLU support  
+Pkg.add("FastLapackInterface")
+```
+
+## Sample Output
+
+```
+============================================================
+LinearSolve.jl LU Factorization Benchmark with BLIS
+============================================================
+
+System Information:
+  Julia Version: 1.11.6
+  OS: Linux x86_64
+  CPU Threads: 1
+  BLAS Threads: 1
+  BLAS Config: LBTConfig([ILP64] libopenblas64_.so)
+
+Available Implementations:
+  BLIS: true
+  MKL: false  
+  Apple Accelerate: false
+
+Results Summary (GFLOPs):
+------------------------------------------------------------
+Size    LU (OpenBLAS)   RecursiveFactorization  BLIS
+4       0.05            0.09                    0.03
+8       0.28            0.43                    0.09
+16      0.61            1.28                    0.31
+32      1.67            4.17                    1.09
+64      4.0             9.52                    2.5
+128     9.87            16.86                   8.1
+256     17.33           28.16                   9.62
+```
+
+## Performance Notes
+
+- **RecursiveFactorization** typically performs best for smaller matrices (< 500×500)
+- **BLIS** provides an alternative BLAS implementation with different performance characteristics
+- **Apple Accelerate** and **Intel MKL** may show significant advantages on supported platforms
+- Single-threaded benchmarks are used for consistent comparison
+
+## Customization
+
+You can modify the benchmark by editing `benchmark_blis.jl`:
+
+- **Matrix sizes**: Change the `sizes` parameter in `benchmark_lu_factorizations()`
+- **Benchmark parameters**: Adjust `BenchmarkTools` settings (samples, evaluations)
+- **Algorithms**: Add/remove algorithms in `build_algorithm_list()`
+
+## Understanding the Results
+
+- **GFLOPs**: Billions of floating-point operations per second, i.e. GFLOP/s (higher is better)
+- **Performance scaling**: Look for algorithms that maintain high GFLOPs as matrix size increases
+- **Platform differences**: Results vary significantly between systems based on hardware and BLAS libraries
+
+## Integration with SciMLBenchmarks
+
+This benchmark follows the same structure as the [official SciMLBenchmarks LU factorization benchmark](https://docs.sciml.ai/SciMLBenchmarksOutput/stable/LinearSolve/LUFactorization/), making it easy to compare results and contribute to the broader benchmark suite.