Skip to content

Commit daf7212

Browse files
committed
format
1 parent 974a36f commit daf7212

15 files changed

+180
-87
lines changed

benchmarks/applelu.jl

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@ function luflop(m, n = m; innerflop = 2)
1818
end
1919
end
2020

21-
algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()]
21+
algs = [
22+
LUFactorization(),
23+
GenericLUFactorization(),
24+
RFLUFactorization(),
25+
AppleAccelerateLUFactorization(),
26+
MetalLUFactorization(),
27+
]
2228
res = [Float32[] for i in 1:length(algs)]
2329

2430
ns = 4:8:500
@@ -28,10 +34,14 @@ for i in 1:length(ns)
2834
rng = MersenneTwister(123)
2935
global A = rand(rng, Float32, n, n)
3036
global b = rand(rng, Float32, n)
31-
global u0= rand(rng, Float32, n)
32-
37+
global u0 = rand(rng, Float32, n)
38+
3339
for j in 1:length(algs)
34-
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
40+
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
41+
copy(b);
42+
u0 = copy(u0),
43+
alias_A = true,
44+
alias_b = true))
3545
push!(res[j], luflop(n) / bt / 1e9)
3646
end
3747
end
@@ -41,11 +51,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
4151
parameterless_type(x) = __parameterless_type(typeof(x))
4252
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
4353

44-
p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
54+
p = plot(ns,
55+
res[1];
56+
ylabel = "GFLOPs",
57+
xlabel = "N",
58+
title = "GFLOPs for NxN LU Factorization",
59+
label = string(Symbol(parameterless_type(algs[1]))),
60+
legend = :outertopright)
4561
for i in 2:length(res)
4662
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
4763
end
4864
p
4965

5066
savefig("metallubench.png")
51-
savefig("metallubench.pdf")
67+
savefig("metallubench.pdf")

benchmarks/cudalu.jl

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,14 @@ for i in 1:length(ns)
2828
rng = MersenneTwister(123)
2929
global A = rand(rng, Float32, n, n)
3030
global b = rand(rng, Float32, n)
31-
global u0= rand(rng, Float32, n)
32-
31+
global u0 = rand(rng, Float32, n)
32+
3333
for j in 1:length(algs)
34-
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
34+
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
35+
copy(b);
36+
u0 = copy(u0),
37+
alias_A = true,
38+
alias_b = true))
3539
push!(res[j], luflop(n) / bt / 1e9)
3640
end
3741
end
@@ -41,11 +45,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
4145
parameterless_type(x) = __parameterless_type(typeof(x))
4246
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
4347

44-
p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
48+
p = plot(ns,
49+
res[1];
50+
ylabel = "GFLOPs",
51+
xlabel = "N",
52+
title = "GFLOPs for NxN LU Factorization",
53+
label = string(Symbol(parameterless_type(algs[1]))),
54+
legend = :outertopright)
4555
for i in 2:length(res)
4656
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
4757
end
4858
p
4959

5060
savefig("cudaoffloadlubench.png")
51-
savefig("cudaoffloadlubench.pdf")
61+
savefig("cudaoffloadlubench.pdf")

benchmarks/lu.jl

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,14 @@ function luflop(m, n = m; innerflop = 2)
1818
end
1919
end
2020

21-
algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), MKLLUFactorization(), FastLUFactorization(), SimpleLUFactorization()]
21+
algs = [
22+
LUFactorization(),
23+
GenericLUFactorization(),
24+
RFLUFactorization(),
25+
MKLLUFactorization(),
26+
FastLUFactorization(),
27+
SimpleLUFactorization(),
28+
]
2229
res = [Float64[] for i in 1:length(algs)]
2330

2431
ns = 4:8:500
@@ -28,10 +35,14 @@ for i in 1:length(ns)
2835
rng = MersenneTwister(123)
2936
global A = rand(rng, n, n)
3037
global b = rand(rng, n)
31-
global u0= rand(rng, n)
32-
38+
global u0 = rand(rng, n)
39+
3340
for j in 1:length(algs)
34-
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
41+
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
42+
copy(b);
43+
u0 = copy(u0),
44+
alias_A = true,
45+
alias_b = true))
3546
push!(res[j], luflop(n) / bt / 1e9)
3647
end
3748
end
@@ -41,11 +52,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
4152
parameterless_type(x) = __parameterless_type(typeof(x))
4253
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
4354

44-
p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
55+
p = plot(ns,
56+
res[1];
57+
ylabel = "GFLOPs",
58+
xlabel = "N",
59+
title = "GFLOPs for NxN LU Factorization",
60+
label = string(Symbol(parameterless_type(algs[1]))),
61+
legend = :outertopright)
4562
for i in 2:length(res)
4663
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
4764
end
4865
p
4966

5067
savefig("lubench.png")
51-
savefig("lubench.pdf")
68+
savefig("lubench.pdf")

benchmarks/metallu.jl

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,14 @@ for i in 1:length(ns)
2828
rng = MersenneTwister(123)
2929
global A = rand(rng, Float32, n, n)
3030
global b = rand(rng, Float32, n)
31-
global u0= rand(rng, Float32, n)
32-
31+
global u0 = rand(rng, Float32, n)
32+
3333
for j in 1:length(algs)
34-
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
34+
bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
35+
copy(b);
36+
u0 = copy(u0),
37+
alias_A = true,
38+
alias_b = true))
3539
GC.gc()
3640
push!(res[j], luflop(n) / bt / 1e9)
3741
end
@@ -42,11 +46,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
4246
parameterless_type(x) = __parameterless_type(typeof(x))
4347
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
4448

45-
p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
49+
p = plot(ns,
50+
res[1];
51+
ylabel = "GFLOPs",
52+
xlabel = "N",
53+
title = "GFLOPs for NxN LU Factorization",
54+
label = string(Symbol(parameterless_type(algs[1]))),
55+
legend = :outertopright)
4656
for i in 2:length(res)
4757
plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
4858
end
4959
p
5060

5161
savefig("metal_large_lubench.png")
52-
savefig("metal_large_lubench.pdf")
62+
savefig("metal_large_lubench.pdf")

docs/src/solvers/solvers.md

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,15 @@ but one may need to change this to receive more performance or precision. If
1414
more precision is necessary, `QRFactorization()` and `SVDFactorization()` are
1515
the best choices, with SVD being the slowest but most precise.
1616

17-
For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
17+
For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
1818
150x150 matrices, though this can be dependent on the exact details of the hardware. After this
1919
point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers
2020
`AppleAccelerateLUFactorization` is generally the fastest. `LUFactorization` will
21-
use your base system BLAS which can be fast or slow depending on the hardware configuration.
21+
use your base system BLAS which can be fast or slow depending on the hardware configuration.
2222
`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
2323

2424
For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
25-
on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
25+
on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
2626
matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a
2727
much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent
2828
on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
@@ -31,9 +31,9 @@ CUDA offload supports Float64 but most consumer GPU hardware will be much faster
3131
this is only recommended for Float32 matrices.
3232

3333
!!! note
34-
35-
Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
36-
For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
34+
35+
Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
36+
For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
3737
If one is looking to best optimize their system, we suggest running the performance
3838
tuning benchmark.
3939

@@ -65,19 +65,19 @@ The interface is detailed [here](@ref custom).
6565
### Lazy SciMLOperators
6666

6767
If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
68-
then using a Krylov method is preferred in order to not concretize the matrix.
68+
then using a Krylov method is preferred in order to not concretize the matrix.
6969
Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
7070
with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
7171
choice of Krylov method should be the one most constrained to the type of operator one
7272
has; for example, use `Krylov_CG()` if it is positive definite, but if it has no special properties then
7373
use `Krylov_GMRES()`.
7474

7575
!!! tip
76-
76+
7777
If your materialized operator is a uniform block diagonal matrix, then you can use
7878
`SimpleGMRES(; blocksize = <known block size>)` to further improve performance.
7979
This often shows up in neural networks, where the Jacobian with respect to the inputs (almost always)
80-
is a Uniform Block Diagonal matrix of Block Size = size of the input divided by the
80+
is a Uniform Block Diagonal matrix of Block Size = size of the input divided by the
8181
batch size.
8282

8383
## Full List of Methods

ext/LinearSolveBlockDiagonalsExt.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ using LinearSolve, BlockDiagonals
44

55
function LinearSolve.init_cacheval(alg::SimpleGMRES{false}, A::BlockDiagonal, b, args...;
66
kwargs...)
7-
@assert ndims(A) == 2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
7+
@assert ndims(A)==2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
88
# We need to perform this check even when `zeroinit == true`, since the type of the
99
# cache is dependent on whether we are able to use the specialized dispatch.
1010
bsizes = blocksizes(A)

ext/LinearSolveKernelAbstractionsExt.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ using GPUArraysCore
99
function LinearSolve._fast_sym_givens!(c, s, R, nr::Int, inner_iter::Int, bsize::Int, Hbis)
1010
backend = get_backend(Hbis)
1111
kernel! = __fast_sym_givens_kernel!(backend)
12-
kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange=bsize)
12+
kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange = bsize)
1313
return c, s, R
1414
end
1515

ext/LinearSolveMKLExt.jl

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,49 +2,60 @@ module LinearSolveMKLExt
22

33
using MKL_jll
44
using LinearAlgebra: BlasInt, LU
5-
using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
6-
@blasfunc, chkargsok
5+
using LinearAlgebra.LAPACK: require_one_based_indexing,
6+
chkfinite, chkstride1,
7+
@blasfunc, chkargsok
78
using LinearAlgebra
89
const usemkl = MKL_jll.is_available()
910

1011
using LinearSolve
1112
using LinearSolve: ArrayInterface, MKLLUFactorization, @get_cacheval, LinearCache, SciMLBase
1213

13-
function getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
14+
function getrf!(A::AbstractMatrix{<:Float64};
15+
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
16+
info = Ref{BlasInt}(),
17+
check = false)
1418
require_one_based_indexing(A)
1519
check && chkfinite(A)
1620
chkstride1(A)
1721
m, n = size(A)
18-
lda = max(1,stride(A, 2))
22+
lda = max(1, stride(A, 2))
1923
if isempty(ipiv)
20-
ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
24+
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
2125
end
2226
ccall((@blasfunc(dgetrf_), MKL_jll.libmkl_rt), Cvoid,
23-
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
27+
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
2428
Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
25-
m, n, A, lda, ipiv, info)
29+
m, n, A, lda, ipiv, info)
2630
chkargsok(info[])
2731
A, ipiv, info[], info #Error code is stored in LU factorization type
2832
end
2933

30-
function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
34+
function getrf!(A::AbstractMatrix{<:Float32};
35+
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
36+
info = Ref{BlasInt}(),
37+
check = false)
3138
require_one_based_indexing(A)
3239
check && chkfinite(A)
3340
chkstride1(A)
3441
m, n = size(A)
35-
lda = max(1,stride(A, 2))
42+
lda = max(1, stride(A, 2))
3643
if isempty(ipiv)
37-
ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
44+
ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
3845
end
3946
ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
40-
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
47+
(Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
4148
Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
42-
m, n, A, lda, ipiv, info)
49+
m, n, A, lda, ipiv, info)
4350
chkargsok(info[])
4451
A, ipiv, info[], info #Error code is stored in LU factorization type
4552
end
4653

47-
function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float64}; info = Ref{BlasInt}())
54+
function getrs!(trans::AbstractChar,
55+
A::AbstractMatrix{<:Float64},
56+
ipiv::AbstractVector{BlasInt},
57+
B::AbstractVecOrMat{<:Float64};
58+
info = Ref{BlasInt}())
4859
require_one_based_indexing(A, ipiv, B)
4960
LinearAlgebra.LAPACK.chktrans(trans)
5061
chkstride1(A, B, ipiv)
@@ -57,14 +68,19 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::Abstrac
5768
end
5869
nrhs = size(B, 2)
5970
ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
60-
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
61-
Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
62-
trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
71+
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
72+
Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
73+
trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
74+
1)
6375
LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
6476
B
6577
end
6678

67-
function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float32}; info = Ref{BlasInt}())
79+
function getrs!(trans::AbstractChar,
80+
A::AbstractMatrix{<:Float32},
81+
ipiv::AbstractVector{BlasInt},
82+
B::AbstractVecOrMat{<:Float32};
83+
info = Ref{BlasInt}())
6884
require_one_based_indexing(A, ipiv, B)
6985
LinearAlgebra.LAPACK.chktrans(trans)
7086
chkstride1(A, B, ipiv)
@@ -77,9 +93,10 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::Abstrac
7793
end
7894
nrhs = size(B, 2)
7995
ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
80-
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
81-
Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
82-
trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
96+
(Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
97+
Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
98+
trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
99+
1)
83100
LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
84101
B
85102
end
@@ -125,4 +142,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization;
125142
=#
126143
end
127144

128-
end
145+
end

ext/LinearSolveMetalExt.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MetalLUFactorization;
2828
SciMLBase.build_linear_solution(alg, y, nothing, cache)
2929
end
3030

31-
end
31+
end

src/LinearSolve.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ PrecompileTools.@recompile_invalidations begin
2828
import InteractiveUtils
2929

3030
using LinearAlgebra: BlasInt, LU
31-
using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
32-
@blasfunc, chkargsok
31+
using LinearAlgebra.LAPACK: require_one_based_indexing,
32+
chkfinite, chkstride1,
33+
@blasfunc, chkargsok
3334

3435
import GPUArraysCore
3536
import Preferences

0 commit comments

Comments
 (0)