diff --git a/Project.toml b/Project.toml index ad009a58c..c9759eeb9 100644 --- a/Project.toml +++ b/Project.toml @@ -17,6 +17,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +OpenBLAS_jll = "4536629a-c528-5b80-bd46-f80d51c5b363" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Preferences = "21216c6a-2e73-6563-6e65-726566657250" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" @@ -112,6 +113,7 @@ MPI = "0.20" Markdown = "1.10" Metal = "1.4" MultiFloats = "2.3" +OpenBLAS_jll = "0.3" Pardiso = "1" Pkg = "1.10" PrecompileTools = "1.2" diff --git a/docs/src/basics/algorithm_selection.md b/docs/src/basics/algorithm_selection.md index f7ae4feda..37058c4ef 100644 --- a/docs/src/basics/algorithm_selection.md +++ b/docs/src/basics/algorithm_selection.md @@ -71,9 +71,10 @@ sol = solve(LinearProblem(A_small, rand(50)), SimpleLUFactorization()) A_medium = rand(200, 200) sol = solve(LinearProblem(A_medium, rand(200)), RFLUFactorization()) -# Large matrices (> 500×500): MKLLUFactorization or AppleAccelerate +# Large matrices (> 500×500): MKLLUFactorization, OpenBLASLUFactorization, or AppleAccelerate A_large = rand(1000, 1000) sol = solve(LinearProblem(A_large, rand(1000)), MKLLUFactorization()) +# Alternative: OpenBLASLUFactorization() for direct OpenBLAS calls ``` ### Sparse Matrices @@ -141,7 +142,7 @@ Is A symmetric positive definite? → CholeskyFactorization Is A symmetric indefinite? → BunchKaufmanFactorization Is A sparse? → UMFPACKFactorization or KLUFactorization Is A small dense? → RFLUFactorization or SimpleLUFactorization -Is A large dense? → MKLLUFactorization or AppleAccelerateLUFactorization +Is A large dense? → MKLLUFactorization, OpenBLASLUFactorization, or AppleAccelerateLUFactorization Is A GPU array? → QRFactorization or LUFactorization Is A an operator/function? 
→ KrylovJL_GMRES Is the system overdetermined? → QRFactorization or KrylovJL_LSMR diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md index 71a12edc8..d67e9fcb1 100644 --- a/docs/src/solvers/solvers.md +++ b/docs/src/solvers/solvers.md @@ -17,9 +17,11 @@ the best choices, with SVD being the slowest but most precise. For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around 150x150 matrices, though this can be dependent on the exact details of the hardware. After this point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers -that `AppleAccelerateLUFactorization` is generally always the fastest. `LUFactorization` will -use your base system BLAS which can be fast or slow depending on the hardware configuration. -`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times. +that `AppleAccelerateLUFactorization` is generally always the fastest. `OpenBLASLUFactorization` +provides direct OpenBLAS calls without going through libblastrampoline and can be faster than +`LUFactorization` in some configurations. `LUFactorization` will use your base system BLAS which +can be fast or slow depending on the hardware configuration. `SimpleLUFactorization` will be fast +only on very small matrices but can cut down on compile times. For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000 @@ -226,6 +228,12 @@ MKLLUFactorization MKL32MixedLUFactorization ``` +### OpenBLAS + +```@docs +OpenBLASLUFactorization +``` + ### AppleAccelerate.jl !!! 
note diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl index eca4f0692..fa20b7bd1 100644 --- a/src/LinearSolve.jl +++ b/src/LinearSolve.jl @@ -59,6 +59,14 @@ else const usemkl = false end +# OpenBLAS_jll is a standard library, but allow users to disable it via preferences +if Preferences.@load_preference("LoadOpenBLAS_JLL", true) + using OpenBLAS_jll: OpenBLAS_jll + const useopenblas = OpenBLAS_jll.is_available() +else + const useopenblas = false +end + @reexport using SciMLBase @@ -345,6 +353,7 @@ include("extension_algs.jl") include("factorization.jl") include("appleaccelerate.jl") include("mkl.jl") +include("openblas.jl") include("simplelu.jl") include("simplegmres.jl") include("iterative_wrappers.jl") @@ -462,6 +471,7 @@ export MKLPardisoFactorize, MKLPardisoIterate export PanuaPardisoFactorize, PanuaPardisoIterate export PardisoJL export MKLLUFactorization +export OpenBLASLUFactorization export MKL32MixedLUFactorization export AppleAccelerateLUFactorization export AppleAccelerate32MixedLUFactorization diff --git a/src/openblas.jl b/src/openblas.jl new file mode 100644 index 000000000..ea6cb4210 --- /dev/null +++ b/src/openblas.jl @@ -0,0 +1,271 @@ +""" +```julia +OpenBLASLUFactorization() +``` + +A direct wrapper over OpenBLAS's LU factorization (`getrf!` and `getrs!`). +This solver makes direct calls to OpenBLAS_jll without going through Julia's +libblastrampoline, which can provide performance benefits in certain configurations. 
+ +## Performance Characteristics + + - Pre-allocates workspace to avoid allocations during solving + - Makes direct `ccall`s to OpenBLAS routines + - Can be faster than `LUFactorization` when OpenBLAS is well-optimized for the hardware + - Supports `Float32`, `Float64`, `ComplexF32`, and `ComplexF64` element types + +## When to Use + + - When you want to ensure OpenBLAS is used regardless of the system BLAS configuration + - When benchmarking shows better performance than `LUFactorization` on your specific hardware + - When you need consistent behavior across different systems (always uses OpenBLAS) + +## Example + +```julia +using LinearSolve, LinearAlgebra + +A = rand(100, 100) +b = rand(100) +prob = LinearProblem(A, b) +sol = solve(prob, OpenBLASLUFactorization()) +``` +""" +struct OpenBLASLUFactorization <: AbstractFactorization end + +# OpenBLAS methods - OpenBLAS_jll is always available as a standard library + +function openblas_getrf!(A::AbstractMatrix{<:ComplexF64}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(zgetrf_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF64}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function openblas_getrf!(A::AbstractMatrix{<:ComplexF32}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(cgetrf_), OpenBLAS_jll.libopenblas), Cvoid, + 
(Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF32}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function openblas_getrf!(A::AbstractMatrix{<:Float64}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(dgetrf_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function openblas_getrf!(A::AbstractMatrix{<:Float32}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(sgetrf_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function openblas_getrs!(trans::AbstractChar, + A::AbstractMatrix{<:ComplexF64}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:ComplexF64}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), 
but needs to be $n")) + end + nrhs = size(B, 2) + ccall((@blasfunc(zgetrs_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF64}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{ComplexF64}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +function openblas_getrs!(trans::AbstractChar, + A::AbstractMatrix{<:ComplexF32}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:ComplexF32}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall((@blasfunc(cgetrs_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF32}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{ComplexF32}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +function openblas_getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float64}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:Float64}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall((@blasfunc(dgetrs_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, 
Ref{BlasInt}, + Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, nrhs, A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +function openblas_getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float32}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:Float32}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall((@blasfunc(sgetrs_), OpenBLAS_jll.libopenblas), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, nrhs, A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +default_alias_A(::OpenBLASLUFactorization, ::Any, ::Any) = false +default_alias_b(::OpenBLASLUFactorization, ::Any, ::Any) = false + +const PREALLOCATED_OPENBLAS_LU = let + A = rand(0, 0) + ArrayInterface.lu_instance(A), Ref{BlasInt}() +end + +function LinearSolve.init_cacheval(alg::OpenBLASLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::LinearVerbosity, + assumptions::OperatorAssumptions) + PREALLOCATED_OPENBLAS_LU +end + +function LinearSolve.init_cacheval(alg::OpenBLASLUFactorization, + A::AbstractMatrix{<:Union{Float32, ComplexF32, ComplexF64}}, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::LinearVerbosity, + assumptions::OperatorAssumptions) + A = rand(eltype(A), 0, 0) + ArrayInterface.lu_instance(A), Ref{BlasInt}() +end + +function SciMLBase.solve!(cache::LinearCache,
alg::OpenBLASLUFactorization; + kwargs...) + A = cache.A + A = convert(AbstractMatrix, A) + if cache.isfresh + cacheval = @get_cacheval(cache, :OpenBLASLUFactorization) + res = openblas_getrf!(A; ipiv = cacheval[1].ipiv, info = cacheval[2]) + fact = LU(res[1:3]...), res[4] + cache.cacheval = fact + + if !LinearAlgebra.issuccess(fact[1]) + return SciMLBase.build_linear_solution( + alg, cache.u, nothing, cache; retcode = ReturnCode.Failure) + end + cache.isfresh = false + end + + A, info = @get_cacheval(cache, :OpenBLASLUFactorization) + require_one_based_indexing(cache.u, cache.b) + m, n = size(A, 1), size(A, 2) + if m > n + Bc = copy(cache.b) + openblas_getrs!('N', A.factors, A.ipiv, Bc; info) + copyto!(cache.u, 1, Bc, 1, n) + else + copyto!(cache.u, cache.b) + openblas_getrs!('N', A.factors, A.ipiv, cache.u; info) + end + + SciMLBase.build_linear_solution( + alg, cache.u, nothing, cache; retcode = ReturnCode.Success) +end diff --git a/test/basictests.jl b/test/basictests.jl index 8aa7547a9..f7f115245 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -287,6 +287,11 @@ end push!(test_algs, MKLLUFactorization()) end + # Test OpenBLAS if available + if LinearSolve.useopenblas + push!(test_algs, OpenBLASLUFactorization()) + end + # Test BLIS if extension is available if Base.get_extension(LinearSolve, :LinearSolveBLISExt) !== nothing push!(test_algs, BLISLUFactorization()) diff --git a/test/preferences.jl b/test/preferences.jl index bd6cf4516..d3118610e 100644 --- a/test/preferences.jl +++ b/test/preferences.jl @@ -190,6 +190,14 @@ using Preferences println("✅ MKLLUFactorization confirmed working") end + # Test OpenBLAS if available + if LinearSolve.useopenblas + sol_openblas = solve(prob, OpenBLASLUFactorization()) + @test sol_openblas.retcode == ReturnCode.Success + @test norm(A * sol_openblas.u - b) < 1e-8 + println("✅ OpenBLASLUFactorization confirmed working") + end + # Test Apple Accelerate if available if LinearSolve.appleaccelerate_isavailable() 
sol_apple = solve(prob, AppleAccelerateLUFactorization()) diff --git a/test/resolve.jl b/test/resolve.jl index 3fe0a2fea..932df173a 100644 --- a/test/resolve.jl +++ b/test/resolve.jl @@ -19,7 +19,8 @@ for alg in vcat(InteractiveUtils.subtypes(AbstractDenseFactorization), ]) && (!(alg == AppleAccelerateLUFactorization) || LinearSolve.appleaccelerate_isavailable()) && - (!(alg == MKLLUFactorization) || LinearSolve.usemkl) + (!(alg == MKLLUFactorization) || LinearSolve.usemkl) && + (!(alg == OpenBLASLUFactorization) || LinearSolve.useopenblas) A = [1.0 2.0; 3.0 4.0] alg in [KLUFactorization, UMFPACKFactorization, SparspakFactorization] && (A = sparse(A))