diff --git a/Project.toml b/Project.toml
index 196ed7fc4..0cc456b0d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,7 +9,6 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
-FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
@@ -35,6 +34,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
+FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -52,6 +52,7 @@ LinearSolveCUDAExt = "CUDA"
 LinearSolveCUDSSExt = "CUDSS"
 LinearSolveEnzymeExt = "EnzymeCore"
 LinearSolveFastAlmostBandedMatricesExt = "FastAlmostBandedMatrices"
+LinearSolveFastLapackInterfaceExt = "FastLapackInterface"
 LinearSolveHYPREExt = "HYPRE"
 LinearSolveIterativeSolversExt = "IterativeSolvers"
 LinearSolveKernelAbstractionsExt = "KernelAbstractions"
@@ -126,6 +127,7 @@ BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
+FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
@@ -150,4 +152,4 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Aqua", "Test", "IterativeSolvers", "InteractiveUtils", "JET", "KrylovKit", "KrylovPreconditioners", "Pkg", "Random", "SafeTestsets", "MultiFloats", "ForwardDiff", "HYPRE", "MPI", "BlockDiagonals", "Enzyme", "FiniteDiff", "BandedMatrices", "FastAlmostBandedMatrices", "StaticArrays", "AllocCheck", "StableRNGs", "Zygote", "RecursiveFactorization", "Sparspak"]
+test = ["Aqua", "Test", "IterativeSolvers", "InteractiveUtils", "JET", "KrylovKit", "KrylovPreconditioners", "Pkg", "Random", "SafeTestsets", "MultiFloats", "ForwardDiff", "HYPRE", "MPI", "BlockDiagonals", "Enzyme", "FiniteDiff", "BandedMatrices", "FastAlmostBandedMatrices", "StaticArrays", "AllocCheck", "StableRNGs", "Zygote", "RecursiveFactorization", "Sparspak", "FastLapackInterface"]
diff --git a/docs/pages.jl b/docs/pages.jl
index 078f7b10c..c336a2dc0 100644
--- a/docs/pages.jl
+++ b/docs/pages.jl
@@ -2,8 +2,8 @@ pages = ["index.md",
     "Tutorials" => Any["tutorials/linear.md",
-    "tutorials/caching_interface.md",
-    "tutorials/accelerating_choices.md"],
+        "tutorials/caching_interface.md",
+        "tutorials/accelerating_choices.md"],
     "Basics" => Any["basics/LinearProblem.md",
         "basics/common_solver_opts.md",
         "basics/OperatorAssumptions.md",
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
index c6a8709f5..5b003572e 100644
--- a/docs/src/solvers/solvers.md
+++ b/docs/src/solvers/solvers.md
@@ -84,6 +84,10 @@ use `Krylov_GMRES()`.
 
 ### RecursiveFactorization.jl
 
+!!! note
+
+    Using this solver requires adding the package RecursiveFactorization.jl, i.e. `using RecursiveFactorization`
+
 ```@docs
 RFLUFactorization
 ```
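The note added above is the entire setup cost on the user's side. A minimal sketch of the resulting workflow (the matrix and vector here are illustrative placeholders, not from the PR):

```julia
using LinearSolve, RecursiveFactorization

A = rand(100, 100)
b = rand(100)
prob = LinearProblem(A, b)

# Once RecursiveFactorization.jl is loaded, RFLUFactorization can be requested
# explicitly (and is also considered by the default algorithm selection).
sol = solve(prob, RFLUFactorization())
sol.u  # the solution vector
```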
@@ -123,7 +127,13 @@ FastLapackInterface.jl is a package that allows for a lower-level interface to the LAPACK
 calls to allow for preallocating workspaces to decrease the overhead of the wrappers.
 LinearSolve.jl provides a wrapper to these routines in a way where an initialized solver
 has a non-allocating LU factorization. In theory, this post-initialized solve should always
-be faster than the Base.LinearAlgebra version.
+be faster than the `Base.LinearAlgebra` version. In practice, with the way we currently wrap
+these solvers, we do not observe a performance benefit; in fact, benchmarks tend to show that
+this wrapping inhibits performance.
+
+!!! note
+
+    Using this solver requires adding the package FastLapackInterface.jl, i.e. `using FastLapackInterface`
 
 ```@docs
 FastLUFactorization
 ```
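To make the intended benefit of the wrapper concrete, here is a minimal caching-interface sketch (sizes are illustrative; the second solve reuses the preallocated workspace):

```julia
using LinearSolve, FastLapackInterface

A = rand(200, 200)
b = rand(200)
prob = LinearProblem(A, b)

# `init` allocates the LAPACK workspace once up front
cache = init(prob, FastLUFactorization())
sol1 = solve!(cache)

# updating `b` and re-solving reuses the existing factorization workspace
cache.b = rand(200)
sol2 = solve!(cache)
```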
@@ -157,10 +167,6 @@ KrylovJL
 ```
 
 ### MKL.jl
 
-!!! note
-
-    Using this solver requires adding the package MKL_jll.jl, i.e. `using MKL_jll`
-
 ```@docs
 MKLLUFactorization
 ```
diff --git a/docs/src/tutorials/accelerating_choices.md b/docs/src/tutorials/accelerating_choices.md
index 1f1e4b01e..b8f0cf7f0 100644
--- a/docs/src/tutorials/accelerating_choices.md
+++ b/docs/src/tutorials/accelerating_choices.md
@@ -1,6 +1,7 @@
 # Accelerating your Linear Solves
 
 !!! note
+
     This section is essential if you wish to achieve maximum performance with LinearSolve.jl,
     especially on v7 and above. Please ensure the tips of this section are adhered to when
     optimizing code and benchmarking.
@@ -16,7 +17,7 @@ scenarios, so let's dive in.
 
 ## Understanding Performance of Dense Linear Solves
 
 The performance of dense linear solvers is highly dependent on the size of the matrix
-and the chosen architecture to run on, i.e. the CPU. 
+and the chosen architecture to run on, i.e. the CPU.
 [This issue](https://github.com/SciML/LinearSolve.jl/issues/357) gathered benchmark data
 from many different users and is summarized in the following graphs:
@@ -25,33 +26,34 @@ from many different users and is summarized in the following graphs:
 
 Now one thing that is immediate is for example that AppleAccelerate generally does well
 on Apple M-series chips, MKL generally does well on Intel, etc. And we know this in
 LinearSolve.jl, in fact we automatically default to different BLASes based on the CPU
-architecture already as part of the design! So that covers most of the variation, but 
+architecture already as part of the design! So that covers most of the variation, but
 there are a few major tips to note when fine tuning the results to your system:
 
-1. One of the best methods for size 150x150 matrices and below is RecursiveFactorization.jl.
-   This is a pure Julia BLAS system, but it has a high load time overhead, and thus as of
-   v7 it's no longer loaded by default! Thus if your matrices are in this range and you would
-   value better run times at the cost of compile and load times, it is recommended you add
-   `using RecursiveFactorization`. The defaulting algorithm will then consider it in its list
-   and will automatically (in an architecture-specific way) insert it as it feels necesssary.
-2. One of the major factors that can inhibit BLAS performance on LU factorization is multithreading.
-   In many of these plots you can see a giant dip in GFLOPs (higher is better) when a certain size
-   threshold is hit. This is because, for the number of chosen threads, there was not enough work
-   and thus when the threading threshold is hit you get a hit to the performance due to the added
-   overhead. The threading performance can be a per-system thing, and it can be greatly influenced
-   by the number of cores on your system and the number of threads you allow. Thus for example,
-   OpenBLAS' LU factorization seems to generally be really bad at guessing the thread switch point
-   for CPUs with really high core/thread counts. If this is the case, you may want to investigate
-   decreasing your number of BLAS threads, i.e. via `BLAS.set_num_threads(i)`. Note that
-   RecursiveFactorization.jl uses your Julia thread pool instead of the BLAS threads.
-3. The switch points between algorithms can be fairly inexact. LinearSolve.jl tried to keep a tab
-   on where they are per platform and keep updated, but it can be a moving battle. You may be
-   able to eek out some performance by testing between the various options on your platform, i.e.
-   RFLUFactorization vs LUFactorization vs AppleAccelerateLUFactorization (M-series) vs
-   MKLFactorization (X86) and hardcoding the choice for your problem if the default did not make
-   the right guess.
+ 1. One of the best methods for size 150x150 matrices and below is RecursiveFactorization.jl.
+    This is a pure Julia BLAS system, but it has a high load time overhead, and thus as of
+    v7 it's no longer loaded by default! Thus if your matrices are in this range and you would
+    value better run times at the cost of compile and load times, it is recommended you add
+    `using RecursiveFactorization`. The defaulting algorithm will then consider it in its list
+    and will automatically (in an architecture-specific way) insert it as it feels necessary.
+ 2. One of the major factors that can inhibit BLAS performance on LU factorization is multithreading.
+    In many of these plots you can see a giant dip in GFLOPs (higher is better) when a certain size
+    threshold is hit. This is because, for the number of chosen threads, there was not enough work
+    and thus when the threading threshold is hit you get a hit to the performance due to the added
+    overhead. The threading performance can be a per-system thing, and it can be greatly influenced
+    by the number of cores on your system and the number of threads you allow. Thus for example,
+    OpenBLAS' LU factorization seems to generally be really bad at guessing the thread switch point
+    for CPUs with really high core/thread counts. If this is the case, you may want to investigate
+    decreasing your number of BLAS threads, i.e. via `BLAS.set_num_threads(i)`. Note that
+    RecursiveFactorization.jl uses your Julia thread pool instead of the BLAS threads.
+ 3. The switch points between algorithms can be fairly inexact. LinearSolve.jl tries to keep tabs
+    on where they are per platform and keep them updated, but it can be a moving battle. You may be
+    able to eke out some performance by testing between the various options on your platform, i.e.
+    RFLUFactorization vs LUFactorization vs AppleAccelerateLUFactorization (M-series) vs
+    MKLFactorization (X86) and hardcoding the choice for your problem if the default did not make
+    the right guess. A short sketch of this workflow is shown after this section.
 
 !!! warn
+
     As noted, RecursiveFactorization.jl is one of the fastest linear solvers for smaller dense
     matrices but requires `using RecursiveFactorization` in order to be used in the default
     solver setups! Thus it's recommended that any optimized code or benchmarks sets this up.
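Tips 2 and 3 in concrete form might look like the following sketch (`MKLLUFactorization` assumes an x86 CPU; the matrix size and thread count are illustrative):

```julia
using LinearSolve, LinearAlgebra

# Tip 2: cap the BLAS thread pool if the library guesses the switch point badly
BLAS.set_num_threads(4)

A = rand(250, 250)
b = rand(250)
prob = LinearProblem(A, b)

# Tip 3: hardcode a platform-appropriate factorization instead of the default guess
sol = solve(prob, MKLLUFactorization())
```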
@@ -69,17 +71,18 @@ LinearSolve.jl just uses a very simple "if small then use KLU and if large use UMFPACK"
 is validated by this plot, but leaves a lot to be desired. In particular, the following
 rules should be thought about:
 
-1. Pardiso is a great solver, you should try `using Pardiso` and using `MKLPardiso()` in many
-   scenarios.
-2. The more structured a sparsity pattern is, the worse KLU is in comparison to the other
-   algorithms.
-3. A Krylov subspace method with proper preconditioning will be better than direct solvers
-   when the matrices get large enough. You could always precondition a sparse matrix with
-   iLU as an easy choice, though the tolerance would need to be tuned in a problem-specific
-   way.
+ 1. Pardiso is a great solver; you should try `using Pardiso` and the `MKLPardiso()` algorithm
+    in many scenarios.
+ 2. The more structured a sparsity pattern is, the worse KLU is in comparison to the other
+    algorithms.
+ 3. A Krylov subspace method with proper preconditioning will be better than direct solvers
+    when the matrices get large enough. You could always precondition a sparse matrix with
+    iLU as an easy choice, though the tolerance would need to be tuned in a problem-specific
+    way. A sketch of this pattern is shown below.
 
 !!! note
+
     UMFPACK does better when the BLAS is not OpenBLAS. Try `using MKL` on Intel and AMD Ryzen
     platforms and UMFPACK will be faster! LinearSolve.jl cannot default to this as this changes
     global settings and thus only defaults to MKL locally, and thus cannot change the setting
-    within UMFPACK.
\ No newline at end of file
+    within UMFPACK.
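As a sketch of rule 3 (this assumes IncompleteLU.jl for the preconditioner; the drop tolerance `τ` is arbitrary here and must be tuned per problem, and the matrix is a synthetic diagonally-dominant example):

```julia
using LinearSolve, LinearAlgebra, SparseArrays, IncompleteLU

n = 10_000
A = sprand(n, n, 1e-4) + 100I  # illustrative sparse system
b = rand(n)
prob = LinearProblem(A, b)

# incomplete LU preconditioner; τ trades fill-in against accuracy
Pl = ilu(A, τ = 0.1)
sol = solve(prob, KrylovJL_GMRES(); Pl = Pl)
```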
diff --git a/ext/LinearSolveFastLapackInterfaceExt.jl b/ext/LinearSolveFastLapackInterfaceExt.jl
new file mode 100644
index 000000000..45b690037
--- /dev/null
+++ b/ext/LinearSolveFastLapackInterfaceExt.jl
@@ -0,0 +1,86 @@
+module LinearSolveFastLapackInterfaceExt
+
+using LinearSolve, LinearAlgebra
+using FastLapackInterface
+
+struct WorkspaceAndFactors{W, F}
+    workspace::W
+    factors::F
+end
+
+function LinearSolve.init_cacheval(::FastLUFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    ws = LUWs(A)
+    return WorkspaceAndFactors(
+        ws, LinearSolve.ArrayInterface.lu_instance(convert(AbstractMatrix, A)))
+end
+
+function SciMLBase.solve!(
+        cache::LinearSolve.LinearCache, alg::FastLUFactorization; kwargs...)
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+    ws_and_fact = LinearSolve.@get_cacheval(cache, :FastLUFactorization)
+    if cache.isfresh
+        # we will fail here if A is a different *size* than in a previous version of the same cache.
+        # it may instead be desirable to resize the workspace.
+        LinearSolve.@set! ws_and_fact.factors = LinearAlgebra.LU(LAPACK.getrf!(
+            ws_and_fact.workspace,
+            A)...)
+        cache.cacheval = ws_and_fact
+        cache.isfresh = false
+    end
+    y = ldiv!(cache.u, cache.cacheval.factors, cache.b)
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+function LinearSolve.init_cacheval(
+        alg::FastQRFactorization{NoPivot}, A::AbstractMatrix, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    ws = QRWYWs(A; blocksize = alg.blocksize)
+    return WorkspaceAndFactors(ws,
+        LinearSolve.ArrayInterface.qr_instance(convert(AbstractMatrix, A)))
+end
+function LinearSolve.init_cacheval(
+        ::FastQRFactorization{ColumnNorm}, A::AbstractMatrix, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    ws = QRpWs(A)
+    return WorkspaceAndFactors(ws,
+        LinearSolve.ArrayInterface.qr_instance(convert(AbstractMatrix, A)))
+end
+
+function LinearSolve.init_cacheval(alg::FastQRFactorization, A, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    return LinearSolve.init_cacheval(alg, convert(AbstractMatrix, A), b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+end
+
+function SciMLBase.solve!(cache::LinearSolve.LinearCache, alg::FastQRFactorization{P};
+        kwargs...) where {P}
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+    ws_and_fact = LinearSolve.@get_cacheval(cache, :FastQRFactorization)
+    if cache.isfresh
+        # we will fail here if A is a different *size* than in a previous version of the same cache.
+        # it may instead be desirable to resize the workspace.
+        if P === NoPivot
+            LinearSolve.@set! ws_and_fact.factors = LinearAlgebra.QRCompactWY(LAPACK.geqrt!(
+                ws_and_fact.workspace,
+                A)...)
+        else
+            LinearSolve.@set! ws_and_fact.factors = LinearAlgebra.QRPivoted(LAPACK.geqp3!(
+                ws_and_fact.workspace,
+                A)...)
+        end
+        cache.cacheval = ws_and_fact
+        cache.isfresh = false
+    end
+    y = ldiv!(cache.u, cache.cacheval.factors, cache.b)
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+end
+
+end
diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl
index 97cf8b4ae..c66e3c1b9 100644
--- a/src/LinearSolve.jl
+++ b/src/LinearSolve.jl
@@ -14,7 +14,6 @@ using SciMLOperators
 using SciMLOperators: AbstractSciMLOperator, IdentityOperator
 using Setfield
 using UnPack
-using FastLapackInterface
 using DocStringExtensions
 using EnumX
 using Markdown
diff --git a/src/extension_algs.jl b/src/extension_algs.jl
index 9b10e6397..938e1bd11 100644
--- a/src/extension_algs.jl
+++ b/src/extension_algs.jl
@@ -107,6 +107,31 @@ function RFLUFactorization(; pivot = Val(true), thread = Val(true), throwerror = true)
     RFLUFactorization(pivot, thread; throwerror)
 end
 
+# There's no options like pivot here.
+# But I'm not sure it makes sense as a GenericFactorization
+# since it just uses `LAPACK.getrf!`.
+"""
+`FastLUFactorization()`
+
+The FastLapackInterface.jl version of the LU factorization. Notably,
+this version does not allow for choice of pivoting method.
+"""
+struct FastLUFactorization <: AbstractDenseFactorization end
+
+"""
+`FastQRFactorization()`
+
+The FastLapackInterface.jl version of the QR factorization.
+"""
+struct FastQRFactorization{P} <: AbstractDenseFactorization
+    pivot::P
+    blocksize::Int
+end
+
+# is 36 or 16 better here? LinearAlgebra and FastLapackInterface use 36,
+# but QRFactorization uses 16.
+FastQRFactorization() = FastQRFactorization(NoPivot(), 36)
+
 """
 ```julia
 MKLPardisoFactorize(; nprocs::Union{Int, Nothing} = nothing,
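For reference, the two pivoting variants of the struct added above can be constructed directly. A sketch (problem setup is illustrative; `ColumnNorm` and `NoPivot` come from LinearAlgebra):

```julia
using LinearSolve, LinearAlgebra, FastLapackInterface

A = rand(100, 100)
b = rand(100)
prob = LinearProblem(A, b)

# default constructor: no pivoting, blocksize 36
sol = solve(prob, FastQRFactorization())

# column-norm pivoted variant with the same blocksize
sol_pivoted = solve(prob, FastQRFactorization(ColumnNorm(), 36))
```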
diff --git a/src/factorization.jl b/src/factorization.jl
index 3224faec8..92ba11ec3 100644
--- a/src/factorization.jl
+++ b/src/factorization.jl
@@ -1026,108 +1026,6 @@ function SciMLBase.solve!(cache::LinearCache, alg::DiagonalFactorization;
     SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
 end
 
-## FastLAPACKFactorizations
-
-struct WorkspaceAndFactors{W, F}
-    workspace::W
-    factors::F
-end
-
-# There's no options like pivot here.
-# But I'm not sure it makes sense as a GenericFactorization
-# since it just uses `LAPACK.getrf!`.
-"""
-`FastLUFactorization()`
-
-The FastLapackInterface.jl version of the LU factorization. Notably,
-this version does not allow for choice of pivoting method.
-"""
-struct FastLUFactorization <: AbstractDenseFactorization end
-
-function init_cacheval(::FastLUFactorization, A, b, u, Pl, Pr,
-        maxiters::Int, abstol, reltol, verbose::Bool,
-        assumptions::OperatorAssumptions)
-    ws = LUWs(A)
-    return WorkspaceAndFactors(ws, ArrayInterface.lu_instance(convert(AbstractMatrix, A)))
-end
-
-function SciMLBase.solve!(cache::LinearCache, alg::FastLUFactorization; kwargs...)
-    A = cache.A
-    A = convert(AbstractMatrix, A)
-    ws_and_fact = @get_cacheval(cache, :FastLUFactorization)
-    if cache.isfresh
-        # we will fail here if A is a different *size* than in a previous version of the same cache.
-        # it may instead be desirable to resize the workspace.
-        @set! ws_and_fact.factors = LinearAlgebra.LU(LAPACK.getrf!(ws_and_fact.workspace,
-            A)...)
-        cache.cacheval = ws_and_fact
-        cache.isfresh = false
-    end
-    y = ldiv!(cache.u, cache.cacheval.factors, cache.b)
-    SciMLBase.build_linear_solution(alg, y, nothing, cache)
-end
-
-"""
-`FastQRFactorization()`
-
-The FastLapackInterface.jl version of the QR factorization.
-"""
-struct FastQRFactorization{P} <: AbstractDenseFactorization
-    pivot::P
-    blocksize::Int
-end
-
-# is 36 or 16 better here? LinearAlgebra and FastLapackInterface use 36,
-# but QRFactorization uses 16.
-FastQRFactorization() = FastQRFactorization(NoPivot(), 36)
-
-function init_cacheval(alg::FastQRFactorization{NoPivot}, A::AbstractMatrix, b, u, Pl, Pr,
-        maxiters::Int, abstol, reltol, verbose::Bool,
-        assumptions::OperatorAssumptions)
-    ws = QRWYWs(A; blocksize = alg.blocksize)
-    return WorkspaceAndFactors(ws,
-        ArrayInterface.qr_instance(convert(AbstractMatrix, A)))
-end
-function init_cacheval(::FastQRFactorization{ColumnNorm}, A::AbstractMatrix, b, u, Pl, Pr,
-        maxiters::Int, abstol, reltol, verbose::Bool,
-        assumptions::OperatorAssumptions)
-    ws = QRpWs(A)
-    return WorkspaceAndFactors(ws,
-        ArrayInterface.qr_instance(convert(AbstractMatrix, A)))
-end
-
-function init_cacheval(alg::FastQRFactorization, A, b, u, Pl, Pr,
-        maxiters::Int, abstol, reltol, verbose::Bool,
-        assumptions::OperatorAssumptions)
-    return init_cacheval(alg, convert(AbstractMatrix, A), b, u, Pl, Pr,
-        maxiters::Int, abstol, reltol, verbose::Bool,
-        assumptions::OperatorAssumptions)
-end
-
-function SciMLBase.solve!(cache::LinearCache, alg::FastQRFactorization{P};
-        kwargs...) where {P}
-    A = cache.A
-    A = convert(AbstractMatrix, A)
-    ws_and_fact = @get_cacheval(cache, :FastQRFactorization)
-    if cache.isfresh
-        # we will fail here if A is a different *size* than in a previous version of the same cache.
-        # it may instead be desirable to resize the workspace.
-        if P === NoPivot
-            @set! ws_and_fact.factors = LinearAlgebra.QRCompactWY(LAPACK.geqrt!(
-                ws_and_fact.workspace,
-                A)...)
-        else
-            @set! ws_and_fact.factors = LinearAlgebra.QRPivoted(LAPACK.geqp3!(
-                ws_and_fact.workspace,
-                A)...)
-        end
-        cache.cacheval = ws_and_fact
-        cache.isfresh = false
-    end
-    y = ldiv!(cache.u, cache.cacheval.factors, cache.b)
-    SciMLBase.build_linear_solution(alg, y, nothing, cache)
-end
-
 ## SparspakFactorization is here since it's MIT licensed, not GPL
 
 """
diff --git a/test/basictests.jl b/test/basictests.jl
index 0a49da52a..3b5f9ad48 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -1,5 +1,5 @@
 using LinearSolve, LinearAlgebra, SparseArrays, MultiFloats, ForwardDiff
-using SciMLOperators, RecursiveFactorization, Sparspak
+using SciMLOperators, RecursiveFactorization, Sparspak, FastLapackInterface
 using IterativeSolvers, KrylovKit, MKL_jll, KrylovPreconditioners
 using Test
 import Random
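Beyond loading the package in the test environment, a minimal correctness check of the new extension path could look like the following sketch (not part of the PR's test suite; the names mirror the algorithms defined above):

```julia
using Test, LinearSolve, LinearAlgebra, FastLapackInterface

@testset "FastLapackInterface extension algorithms" begin
    A = rand(50, 50)
    b = rand(50)
    prob = LinearProblem(A, b)

    # both extension-backed factorizations should reproduce the backslash solve
    @test solve(prob, FastLUFactorization()).u ≈ A \ b
    @test solve(prob, FastQRFactorization()).u ≈ A \ b
end
```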