Commit 3881230

Merge pull request #361 from SciML/metal
Setup Accelerate and MKL for 32-bit, MKL getrf, and Metal.jl integration
2 parents 9224384 + c3a93d9 · commit 3881230

File tree

11 files changed (+375 -9 lines)

Project.toml

Lines changed: 5 additions & 2 deletions
```diff
@@ -30,17 +30,19 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 [weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
-MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
 IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
+MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"
 
 [extensions]
 LinearSolveCUDAExt = "CUDA"
 LinearSolveHYPREExt = "HYPRE"
 LinearSolveIterativeSolversExt = "IterativeSolvers"
-LinearSolveMKLExt = "MKL_jll"
 LinearSolveKrylovKitExt = "KrylovKit"
+LinearSolveMKLExt = "MKL_jll"
+LinearSolveMetalExt = "Metal"
 LinearSolvePardisoExt = "Pardiso"
 
 [compat]
@@ -74,6 +76,7 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 MultiFloats = "bdf0d083-296b-4888-a5b6-7498122e68a5"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
```
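The `[weakdeps]`/`[extensions]` pairing means the new backends add no load-time cost: each extension module is only compiled and loaded once its trigger package is imported. A minimal sketch of the resulting user-side behavior (assuming Julia 1.9+ package extensions and an Apple machine for the Metal path):

```julia
using LinearSolve   # core package only; no GPU extension code is loaded yet

# Importing the weak dependency triggers the matching extension,
# here LinearSolveMetalExt, which defines the Metal solve path.
using Metal

# MetalLUFactorization() is now backed by the loaded extension.
```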

benchmarks/applelu.jl

Lines changed: 51 additions & 0 deletions
```julia
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal

# Pin BLAS threads to the core count and keep each benchmark short.
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

# Flop count credited to an LU factorization of an m × n matrix.
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(),
    AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 4:8:500
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)  # seconds → GFLOPs
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N",
    title = "GFLOPs for NxN LU Factorization",
    label = string(Symbol(parameterless_type(algs[1]))), legend = :outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metallubench.png")
savefig("metallubench.pdf")
```
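For reference, the count that `luflop` credits to an m × n LU factorization (a restatement of the code above, not part of the commit) is

```math
\mathrm{luflop}(m, n) = \sum_{k=1}^{\min(m,n)} \left( 1 + \sum_{i=k+1}^{m} i + 2\,(m-k)(n-k) \right),
```

which grows like ``n^3`` for square matrices, so `luflop(n) / bt / 1e9` in the loop is a GFLOP-rate estimate for each solver.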

benchmarks/cudalu.jl

Lines changed: 51 additions & 0 deletions
```julia
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, CUDA, MKL_jll

nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [MKLLUFactorization(), CudaOffloadFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:400:10000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N",
    title = "GFLOPs for NxN LU Factorization",
    label = string(Symbol(parameterless_type(algs[1]))), legend = :outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("cudaoffloadlubench.png")
savefig("cudaoffloadlubench.pdf")
```
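A minimal usage sketch of the offload path benchmarked here (assumptions: a CUDA-capable NVIDIA GPU, and CUDA.jl installed so the LinearSolveCUDAExt extension loads):

```julia
using LinearSolve, CUDA  # loading CUDA activates the extension

A = rand(Float32, 2000, 2000)
b = rand(Float32, 2000)

# The factorization is offloaded to the GPU; the solution comes back on the CPU.
sol = solve(LinearProblem(A, b), CudaOffloadFactorization())
sol.u
```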

benchmarks/metallu.jl

Lines changed: 52 additions & 0 deletions
```julia
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal

nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:600:15000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        GC.gc()  # encourage release of GC-managed GPU buffers between sizes
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N",
    title = "GFLOPs for NxN LU Factorization",
    label = string(Symbol(parameterless_type(algs[1]))), legend = :outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metal_large_lubench.png")
savefig("metal_large_lubench.pdf")
```
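As a minimal usage sketch of the Metal path being benchmarked (assuming an Apple M-series machine; the Metal path only supports Float32 per this PR's docs):

```julia
using LinearSolve, Metal  # loading Metal activates LinearSolveMetalExt

A = rand(Float32, 4096, 4096)
b = rand(Float32, 4096)

# LU factorization runs on the Apple GPU; per the docs added in this PR the
# crossover versus AppleAccelerateLUFactorization is around 20,000 × 20,000.
sol = solve(LinearProblem(A, b), MetalLUFactorization())
sol.u
```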

docs/src/solvers/solvers.md

Lines changed: 60 additions & 5 deletions
````diff
@@ -7,15 +7,37 @@ Solves for ``Au=b`` in the problem defined by `prob` using the algorithm
 
 ## Recommended Methods
 
+### Dense Matrices
+
 The default algorithm `nothing` is good for picking an algorithm that will work,
 but one may need to change this to receive more performance or precision. If
 more precision is necessary, `QRFactorization()` and `SVDFactorization()` are
 the best choices, with SVD being the slowest but most precise.
 
-For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations.
-`FastLUFactorization` will be faster than `LUFactorization` which is the Base.LinearAlgebra
-(`\` default) implementation of LU factorization. `SimpleLUFactorization` will be fast
-on very small matrices.
+For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
+150x150 matrices, though this can depend on the exact details of the hardware. Beyond this
+point, `MKLLUFactorization` is usually faster. Note that on Mac computers,
+`AppleAccelerateLUFactorization` is generally the fastest. `LUFactorization` will
+use your base system BLAS, which can be fast or slow depending on the hardware configuration.
+`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
+
+For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be
+used to offload on Mac hardware, becoming faster at around size 20,000 x 20,000 matrices
+(and it only supports Float32). `CudaOffloadFactorization` can be more efficient at a
+much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent
+on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
+CUDA offload supports Float64, but most consumer GPU hardware is much faster on Float32
+(often >32x faster for Float32 operations than Float64 operations), and thus for most hardware
+this is only recommended for Float32 matrices.
+
+!!! note
+
+    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
+    For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
+    If one is looking to best optimize their system, we suggest running the performance
+    tuning benchmark.
+
+### Sparse Matrices
 
 For sparse LU-factorizations, `KLUFactorization` if there is less structure
 to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
@@ -31,12 +53,25 @@ As sparse matrices get larger, iterative solvers tend to get more efficient than
 factorization methods if a lower tolerance of the solution is required.
 
 Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
-with CPUs and GPUs, and thus is the generally preferred form for Krylov methods.
+with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
+Krylov method chosen should be the one most specialized to the operator's properties:
+for example, use `Krylov_CG()` if the operator is positive definite, and `Krylov_GMRES()`
+if it has no good properties.
 
 Finally, a user can pass a custom function for handling the linear solve using
 `LinearSolveFunction()` if existing solvers are not optimally suited for their application.
 The interface is detailed [here](@ref custom).
 
+### Lazy SciMLOperators
+
+If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
+then using a Krylov method is preferred in order to not concretize the matrix.
+Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
+with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
+Krylov method chosen should be the one most specialized to the operator's properties:
+for example, use `Krylov_CG()` if the operator is positive definite, and `Krylov_GMRES()`
+if it has no good properties.
+
 ## Full List of Methods
 
 ### RecursiveFactorization.jl
@@ -121,6 +156,26 @@ KrylovJL
 MKLLUFactorization
 ```
 
+### AppleAccelerate.jl
+
+!!! note
+
+    Using this solver requires a Mac with Apple Accelerate. This should come standard in most "modern" Mac computers.
+
+```@docs
+AppleAccelerateLUFactorization
+```
+
+### Metal.jl
+
+!!! note
+
+    Using this solver requires adding the package Metal.jl, i.e. `using Metal`. This package is only compatible with Mac M-Series computers with a Metal-compatible GPU.
+
+```@docs
+MetalLUFactorization
+```
+
 ### Pardiso.jl
 
 !!! note
````
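To make the dense recommendations above concrete, here is a small, hedged sketch (solver names as used in this PR; `KrylovJL_GMRES()` is LinearSolve's existing Krylov.jl wrapper, used here as the general-purpose Krylov choice):

```julia
using LinearSolve

A = rand(100, 100)
b = rand(100)
prob = LinearProblem(A, b)

# Small dense matrix: RFLUFactorization tends to win below ~150×150.
sol = solve(prob, RFLUFactorization())

# Past that crossover, or when the operator is lazy and should not be
# concretized, a Krylov method such as GMRES is the general-purpose choice.
sol = solve(prob, KrylovJL_GMRES())
sol.u
```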

ext/LinearSolveMKLExt.jl

Lines changed: 74 additions & 0 deletions
```diff
@@ -27,6 +27,63 @@ function getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, BlasInt, min(siz
     A, ipiv, info[], info #Error code is stored in LU factorization type
 end
 
+function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
+    require_one_based_indexing(A)
+    check && chkfinite(A)
+    chkstride1(A)
+    m, n = size(A)
+    lda = max(1, stride(A, 2))
+    if isempty(ipiv)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
+    end
+    ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
+            Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
+        m, n, A, lda, ipiv, info)
+    chkargsok(info[])
+    A, ipiv, info[], info #Error code is stored in LU factorization type
+end
+
+function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float64}; info = Ref{BlasInt}())
+    require_one_based_indexing(A, ipiv, B)
+    LinearAlgebra.LAPACK.chktrans(trans)
+    chkstride1(A, B, ipiv)
+    n = LinearAlgebra.checksquare(A)
+    if n != size(B, 1)
+        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
+    end
+    if n != length(ipiv)
+        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
+    end
+    nrhs = size(B, 2)
+    ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, 1)
+    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
+    B
+end
+
+function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float32}; info = Ref{BlasInt}())
+    require_one_based_indexing(A, ipiv, B)
+    LinearAlgebra.LAPACK.chktrans(trans)
+    chkstride1(A, B, ipiv)
+    n = LinearAlgebra.checksquare(A)
+    if n != size(B, 1)
+        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
+    end
+    if n != length(ipiv)
+        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
+    end
+    nrhs = size(B, 2)
+    ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info, 1)
+    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
+    B
+end
+
 default_alias_A(::MKLLUFactorization, ::Any, ::Any) = false
 default_alias_b(::MKLLUFactorization, ::Any, ::Any) = false
 
@@ -47,8 +104,25 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization;
         cache.cacheval = fact
         cache.isfresh = false
     end
+
     y = ldiv!(cache.u, @get_cacheval(cache, :MKLLUFactorization)[1], cache.b)
     SciMLBase.build_linear_solution(alg, y, nothing, cache)
+
+    #=
+    A, info = @get_cacheval(cache, :MKLLUFactorization)
+    LinearAlgebra.require_one_based_indexing(cache.u, cache.b)
+    m, n = size(A, 1), size(A, 2)
+    if m > n
+        Bc = copy(cache.b)
+        getrs!('N', A.factors, A.ipiv, Bc; info)
+        return copyto!(cache.u, 1, Bc, 1, n)
+    else
+        copyto!(cache.u, cache.b)
+        getrs!('N', A.factors, A.ipiv, cache.u; info)
+    end
+
+    SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
+    =#
 end
 
 end
```
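For intuition about the getrf!/getrs! pairing above, here is a hedged sketch (not part of the commit) of how the two LAPACK-style wrappers combine to solve `Ax = b`, assuming both functions are in scope as in the extension:

```julia
using LinearAlgebra

A = rand(Float64, 4, 4)
b = rand(Float64, 4)

# getrf! overwrites its argument with the packed LU factors of A
# (partial pivoting), returning the pivot vector and the info code.
LU, ipiv, code, info = getrf!(copy(A))

# getrs! then performs the permuted forward/back substitution in place,
# turning the right-hand-side buffer into the solution.
x = getrs!('N', LU, ipiv, copy(b); info)

@assert A * x ≈ b
```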
