
Commit 83e88c2

add benchmarks and update solver text
1 parent: 54b7bec

4 files changed: +213 −5 lines

benchmarks/applelu.jl
51 additions & 0 deletions

```julia
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal

nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

# Approximate flop count of the LU factorization of an m×n matrix,
# used to convert measured solve times into GFLOPs.
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
            isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                innerflop
            end
        end
        invflop + scaleflop + updateflop
    end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(),
    AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

# Sweep small-to-medium sizes to locate the crossover points between algorithms.
ns = 4:8:500
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N",
    title = "GFLOPs for NxN LU Factorization",
    label = string(Symbol(parameterless_type(algs[1]))), legend = :outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metallubench.png")
savefig("metallubench.pdf")
```

benchmarks/cudalu.jl
51 additions & 0 deletions

```julia
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, CUDA, MKL_jll

nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

# Approximate flop count of the LU factorization of an m×n matrix,
# used to convert measured solve times into GFLOPs.
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
            isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                innerflop
            end
        end
        invflop + scaleflop + updateflop
    end
end

algs = [MKLLUFactorization(), CudaOffloadFactorization()]
res = [Float32[] for i in 1:length(algs)]

# Sweep large sizes to locate the CPU → GPU offload crossover.
ns = 200:400:20000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N",
    title = "GFLOPs for NxN LU Factorization",
    label = string(Symbol(parameterless_type(algs[1]))), legend = :outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("cudaoffloadlubench.png")
savefig("cudaoffloadlubench.pdf")
```

benchmarks/metallu.jl
51 additions & 0 deletions

```julia
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal

nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

# Approximate flop count of the LU factorization of an m×n matrix,
# used to convert measured solve times into GFLOPs.
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
            isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                innerflop
            end
        end
        invflop + scaleflop + updateflop
    end
end

algs = [AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

# Sweep large sizes to locate the CPU → Metal GPU offload crossover.
ns = 200:400:20000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N",
    title = "GFLOPs for NxN LU Factorization",
    label = string(Symbol(parameterless_type(algs[1]))), legend = :outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metal_large_lubench.png")
savefig("metal_large_lubench.pdf")
```

docs/src/solvers/solvers.md
60 additions & 5 deletions
````diff
@@ -7,15 +7,37 @@ Solves for ``Au=b`` in the problem defined by `prob` using the algorithm
 
 ## Recommended Methods
 
+### Dense Matrices
+
 The default algorithm `nothing` is good for picking an algorithm that will work,
 but one may need to change this to receive more performance or precision. If
 more precision is necessary, `QRFactorization()` and `SVDFactorization()` are
 the best choices, with SVD being the slowest but most precise.
 
-For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations.
-`FastLUFactorization` will be faster than `LUFactorization` which is the Base.LinearAlgebra
-(`\` default) implementation of LU factorization. `SimpleLUFactorization` will be fast
-on very small matrices.
+For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
+150×150 matrices, though this can depend on the exact details of the hardware. After this
+point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers,
+`AppleAccelerateLUFactorization` is generally the fastest. `LUFactorization` will use your
+base system BLAS, which can be fast or slow depending on the hardware configuration.
+`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
+
+For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
+on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 × 20,000
+matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a
+much smaller cutoff, possibly around size 1,000 × 1,000 matrices, though this is highly dependent
+on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
+CUDA offload supports Float64, but most consumer GPU hardware will be much faster on Float32
+(many are >32x faster for Float32 operations than for Float64), and thus for most hardware
+this is only recommended for Float32 matrices.
+
+!!! note
+
+    Performance details for dense LU-factorizations can be highly dependent on the hardware
+    configuration. For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
+    If one is looking to best optimize their system, we suggest running the performance
+    tuning benchmark.
+
+### Sparse Matrices
 
 For sparse LU-factorizations, `KLUFactorization` if there is less structure
 to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
````
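Concretely, these recommendations are applied by passing the chosen algorithm as the second argument of `solve`; a minimal sketch (the size and the algorithm picks are illustrative):

```julia
using LinearAlgebra, LinearSolve

A = rand(100, 100)
b = rand(100)
prob = LinearProblem(A, b)

sol = solve(prob)                        # default `nothing`: picks a working method
sol = solve(prob, RFLUFactorization())   # fast for dense matrices below ~150×150
sol = solve(prob, QRFactorization())     # trades speed for more precision
sol.u                                    # the solution vector
```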
````diff
@@ -31,12 +53,25 @@ As sparse matrices get larger, iterative solvers tend to get more efficient than
 factorization methods if a lower tolerance of the solution is required.
 
 Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
-with CPUs and GPUs, and thus is the generally preferred form for Krylov methods.
+with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
+choice of Krylov method should be the one most specialized to the operator's properties:
+for example, if it is positive definite, use `KrylovJL_CG()`, but if it has no special
+properties, use `KrylovJL_GMRES()`.
 
 Finally, a user can pass a custom function for handling the linear solve using
 `LinearSolveFunction()` if existing solvers are not optimally suited for their application.
 The interface is detailed [here](@ref custom).
 
+### Lazy SciMLOperators
+
+If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
+then using a Krylov method is preferred in order to not concretize the matrix.
+Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
+with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
+choice of Krylov method should be the one most specialized to the operator's properties:
+for example, if it is positive definite, use `KrylovJL_CG()`, but if it has no special
+properties, use `KrylovJL_GMRES()`.
+
 ## Full List of Methods
 
 ### RecursiveFactorization.jl
````
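As a concrete instance of this guidance, a sketch contrasting the two Krylov choices on a sparse system; the shifted-Laplacian test matrix is illustrative:

```julia
using LinearAlgebra, SparseArrays, LinearSolve

n = 1000
# Symmetric positive definite: shifted 1D Laplacian (strictly diagonally dominant).
A = spdiagm(-1 => -ones(n - 1), 0 => fill(3.0, n), 1 => -ones(n - 1))
b = rand(n)

sol = solve(LinearProblem(A, b), KrylovJL_CG())      # exploits positive definiteness
sol = solve(LinearProblem(A, b), KrylovJL_GMRES())   # no structural assumptions
```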
````diff
@@ -121,6 +156,26 @@ KrylovJL
 MKLLUFactorization
 ```
 
+### AppleAccelerate.jl
+
+!!! note
+
+    Using this solver requires a Mac with Apple Accelerate. This should come standard in most "modern" Mac computers.
+
+```@docs
+AppleAccelerateLUFactorization
+```
+
+### Metal.jl
+
+!!! note
+
+    Using this solver requires adding the package Metal.jl, i.e. `using Metal`. This package is only compatible with Mac M-series computers with a Metal-compatible GPU.
+
+```@docs
+MetalLUFactorization
+```
+
 ### Pardiso.jl
 
 !!! note
````
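Usage of the two solver entries added above mirrors the rest of the list; a short sketch, noting that the Metal path needs `using Metal` and Float32 inputs (the 4096×4096 size is illustrative):

```julia
using LinearAlgebra, LinearSolve
using Metal   # needed for MetalLUFactorization; Mac M-series only

A = rand(Float32, 4096, 4096)
b = rand(Float32, 4096)
prob = LinearProblem(A, b)

sol = solve(prob, AppleAccelerateLUFactorization())  # Accelerate-backed LAPACK on macOS
sol = solve(prob, MetalLUFactorization())            # GPU offload; pays off at large N
```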
