diff --git a/Project.toml b/Project.toml index 7b67bcff0..69369115b 100644 --- a/Project.toml +++ b/Project.toml @@ -30,6 +30,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [weakdeps] BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" +blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" @@ -40,6 +41,7 @@ HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771" IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77" +LAPACK_jll = "51474c39-65e3-53ba-86ba-03b1b862ec14" Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2" RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4" @@ -47,6 +49,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac" [extensions] +LinearSolveBLISExt = ["blis_jll", "LAPACK_jll"] LinearSolveBandedMatricesExt = "BandedMatrices" LinearSolveBlockDiagonalsExt = "BlockDiagonals" LinearSolveCUDAExt = "CUDA" @@ -71,6 +74,7 @@ Aqua = "0.8" ArrayInterface = "7.7" BandedMatrices = "1.5" BlockDiagonals = "0.1.42, 0.2" +blis_jll = "0.9.0" CUDA = "5" CUDSS = "0.1, 0.2, 0.3, 0.4" ChainRulesCore = "1.22" @@ -91,6 +95,7 @@ KernelAbstractions = "0.9.27" Krylov = "0.10" KrylovKit = "0.8, 0.9, 0.10" KrylovPreconditioners = "0.3" +LAPACK_jll = "3" LazyArrays = "1.8, 2" Libdl = "1.10" LinearAlgebra = "1.10" diff --git a/ext/LinearSolveBLISExt.jl b/ext/LinearSolveBLISExt.jl new file mode 100644 index 000000000..e816c8d5f --- /dev/null +++ b/ext/LinearSolveBLISExt.jl @@ -0,0 +1,250 @@ +module LinearSolveBLISExt + +using Libdl +using blis_jll +using LAPACK_jll +using LinearAlgebra +using LinearSolve + +using LinearAlgebra: BlasInt, LU +using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1, + @blasfunc, chkargsok +using LinearSolve: ArrayInterface, BLISLUFactorization, @get_cacheval, LinearCache, SciMLBase + +const global libblis = blis_jll.blis +const global liblapack = LAPACK_jll.liblapack + +function getrf!(A::AbstractMatrix{<:ComplexF64}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(zgetrf_), liblapack), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF64}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function getrf!(A::AbstractMatrix{<:ComplexF32}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(cgetrf_), liblapack), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF32}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function getrf!(A::AbstractMatrix{<:Float64}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(dgetrf_), liblapack), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function getrf!(A::AbstractMatrix{<:Float32}; + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))), + info = Ref{BlasInt}(), + check = false) + require_one_based_indexing(A) + check && chkfinite(A) + chkstride1(A) + m, n = size(A) + lda = max(1, stride(A, 2)) + if isempty(ipiv) + ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))) + end + ccall((@blasfunc(sgetrf_), liblapack), Cvoid, + (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, + Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}), + m, n, A, lda, ipiv, info) + chkargsok(info[]) + A, ipiv, info[], info #Error code is stored in LU factorization type +end + +function getrs!(trans::AbstractChar, + A::AbstractMatrix{<:ComplexF64}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:ComplexF64}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall(("zgetrs_", liblapack), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF64}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{ComplexF64}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +function getrs!(trans::AbstractChar, + A::AbstractMatrix{<:ComplexF32}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:ComplexF32}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall(("cgetrs_", liblapack), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF32}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{ComplexF32}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +function getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float64}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:Float64}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall(("dgetrs_", liblapack), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +function getrs!(trans::AbstractChar, + A::AbstractMatrix{<:Float32}, + ipiv::AbstractVector{BlasInt}, + B::AbstractVecOrMat{<:Float32}; + info = Ref{BlasInt}()) + require_one_based_indexing(A, ipiv, B) + LinearAlgebra.LAPACK.chktrans(trans) + chkstride1(A, B, ipiv) + n = LinearAlgebra.checksquare(A) + if n != size(B, 1) + throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n")) + end + if n != length(ipiv) + throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n")) + end + nrhs = size(B, 2) + ccall(("sgetrs_", liblapack), Cvoid, + (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt}, + Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong), + trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, + 1) + LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[])) + B +end + +default_alias_A(::BLISLUFactorization, ::Any, ::Any) = false +default_alias_b(::BLISLUFactorization, ::Any, ::Any) = false + +const PREALLOCATED_BLIS_LU = begin + A = rand(0, 0) + luinst = ArrayInterface.lu_instance(A), Ref{BlasInt}() +end + +function LinearSolve.init_cacheval(alg::BLISLUFactorization, A, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::Bool, + assumptions::OperatorAssumptions) + PREALLOCATED_BLIS_LU +end + +function LinearSolve.init_cacheval(alg::BLISLUFactorization, A::AbstractMatrix{<:Union{Float32,ComplexF32,ComplexF64}}, b, u, Pl, Pr, + maxiters::Int, abstol, reltol, verbose::Bool, + assumptions::OperatorAssumptions) + A = rand(eltype(A), 0, 0) + ArrayInterface.lu_instance(A), Ref{BlasInt}() +end + +function SciMLBase.solve!(cache::LinearCache, alg::BLISLUFactorization; + kwargs...) + A = cache.A + A = convert(AbstractMatrix, A) + if cache.isfresh + cacheval = @get_cacheval(cache, :BLISLUFactorization) + res = getrf!(A; ipiv = cacheval[1].ipiv, info = cacheval[2]) + fact = LU(res[1:3]...), res[4] + cache.cacheval = fact + cache.isfresh = false + end + + y = ldiv!(cache.u, @get_cacheval(cache, :BLISLUFactorization)[1], cache.b) + SciMLBase.build_linear_solution(alg, y, nothing, cache) + + #= + A, info = @get_cacheval(cache, :BLISLUFactorization) + LinearAlgebra.require_one_based_indexing(cache.u, cache.b) + m, n = size(A, 1), size(A, 2) + if m > n + Bc = copy(cache.b) + getrs!('N', A.factors, A.ipiv, Bc; info) + return copyto!(cache.u, 1, Bc, 1, n) + else + copyto!(cache.u, cache.b) + getrs!('N', A.factors, A.ipiv, cache.u; info) + end + + SciMLBase.build_linear_solution(alg, cache.u, nothing, cache) + =# +end + +end \ No newline at end of file diff --git a/src/extension_algs.jl b/src/extension_algs.jl index 938e1bd11..952057a15 100644 --- a/src/extension_algs.jl +++ b/src/extension_algs.jl @@ -439,3 +439,5 @@ A wrapper over Apple's Metal GPU library. Direct calls to Metal in a way that pr to avoid allocations and automatically offloads to the GPU. """ struct MetalLUFactorization <: AbstractFactorization end + +struct BLISLUFactorization <: AbstractFactorization end diff --git a/test/basictests.jl b/test/basictests.jl index 433213423..c1ff0c5be 100644 --- a/test/basictests.jl +++ b/test/basictests.jl @@ -4,6 +4,13 @@ using IterativeSolvers, KrylovKit, MKL_jll, KrylovPreconditioners using Test import Random +# Try to load BLIS extension +try + using blis_jll, LAPACK_jll +catch LoadError + # BLIS dependencies not available, tests will be skipped +end + const Dual64 = ForwardDiff.Dual{Nothing, Float64, 1} n = 8 @@ -228,6 +235,11 @@ end push!(test_algs, MKLLUFactorization()) end + # Test BLIS if extension is available + if Base.get_extension(LinearSolve, :LinearSolveBLISExt) !== nothing + push!(test_algs, BLISLUFactorization()) + end + @testset "Concrete Factorizations" begin for alg in test_algs @testset "$alg" begin