Working version for GMRES with blocksize

avik-pal · avik-pal · commit 8547c19a95fa · 2023-09-15T19:37:11.000-04:00
diff --git a/ext/LinearSolveBlockDiagonalsExt.jl b/ext/LinearSolveBlockDiagonalsExt.jl
@@ -18,7 +18,7 @@ function LinearSolve.init_cacheval(alg::SimpleGMRES{false}, A::BlockDiagonal, b,
     end
     # Can't help but perform dynamic dispatch here
     return LinearSolve._init_cacheval(Val(uniform_blocks), alg, A, b, u, Pl, Pr, maxiters,
-        abstol, reltol, verbose, assumptions; zeroinit)
+        abstol, reltol, verbose, assumptions; zeroinit, blocksize = usize)
 end
 
 end
diff --git a/ext/LinearSolveNNlibExt.jl b/ext/LinearSolveNNlibExt.jl
@@ -1,5 +1,79 @@
 module LinearSolveNNlibExt
 
-using LinearSolve, NNlib
+using LinearAlgebra, LinearSolve, NNlib
+import LinearSolve: SimpleGMRESCache, SimpleGMRES, OperatorAssumptions, _no_preconditioner,
+    _init_cacheval, _norm2, LinearCache
+import UnPack: @unpack
+
+"""
+    SciMLBase.solve!(cache::SimpleGMRESCache{true, T}, lincache::LinearCache)
+
+Restarted GMRES for the batched cache layout: the trailing dimension of `Q`, `H` and
+`βe₁` indexes independent blocks, and every Arnoldi step below is applied to all of them
+at once through broadcasts and `batched_mul!`.
+
+# Arguments
+- `cache`: GMRES workspace; `cache.x` and `cache.r` are overwritten with the iterate and
+  the residual `b - A * x`.
+- `lincache`: the owning `LinearCache`; `lincache.alg` and `lincache` itself are passed
+  on to `SciMLBase.build_linear_solution`.
+
+# Returns
+The value of `SciMLBase.build_linear_solution`, tagged `ReturnCode.Success` once the
+largest per-block residual 2-norm is below `abstol`, or `ReturnCode.MaxIters` after
+`maxiters ÷ M + 1` cycles of `M` Arnoldi steps without reaching it.
+"""
+function SciMLBase.solve!(cache::SimpleGMRESCache{true, T}, lincache::LinearCache) where {T}
+    @unpack M, maxiters, ϵ, Q, H, x, r, βe₁, A, b, β, abstol, blocksize = cache
+    res_norm = β
+
+    # FIXME: The performance for this is quite bad when compared to the KrylovJL_GMRES
+    #        version
+    for _ in 1:((maxiters ÷ M) + 1)
+        for j in 1:M
+            Qⱼ₊₁ = @view(Q[:, j + 1, :])
+            mul!(vec(Qⱼ₊₁), A, vec(@view(Q[:, j, :])))  # Q(:,j+1) <- A Q(:, j)
+            # Orthogonalize against the earlier basis vectors; one dot product per block
+            for i in 1:j
+                H[i, j, :] .= vec(sum(@view(Q[:, i, :]) .* Qⱼ₊₁; dims = 1))
+                Qⱼ₊₁ .-= H[i:i, j, :] .* @view(Q[:, i, :])
+            end
+            H[j + 1, j, :] .= vec(_norm2(Qⱼ₊₁, 1))
+            # Skip the normalization for blocks whose new direction is negligible (same
+            # `> ϵ` guard as the non-blocked solver); dividing by a zero norm would fill
+            # that block's basis with NaN/Inf.
+            hⱼ₊₁ = H[j + 1, j:j, :]
+            Qⱼ₊₁ ./= ifelse.(hⱼ₊₁ .> ϵ, hⱼ₊₁, one(T))
+
+            # FIXME: Figure out a way to avoid the allocation
+            # Using views doesn't work very well with LinearSolve
+            y = similar(b, j, 1, size(H, 3))
+            for bidx in 1:size(y, 3)
+                y[:, :, bidx] .= @view(H[1:(j + 1), 1:j, bidx]) \ @view(βe₁[1:(j + 1), bidx])
+            end
+
+            # Update the solution
+            batched_mul!(reshape(x, blocksize, 1, :), @view(Q[:, 1:j, :]), y)
+            mul!(r, A, x, T(-1), T(0))  # r <- -A x
+            r .+= b  # r <- b - A x
+            res_norm = _norm2(reshape(r, blocksize, :), 1)  # one 2-norm per block
+
+            if maximum(res_norm) < abstol
+                return SciMLBase.build_linear_solution(lincache.alg, x, r, lincache;
+                    retcode = ReturnCode.Success)
+            end
+        end
+
+        # Restart
+        # NOTE(review): `βe₁` is not refreshed with the new `res_norm` and `x` is
+        #               overwritten rather than incremented by the update above; confirm
+        #               the restart is intended to work this way.
+        Q[:, 1, :] = reshape(r, blocksize, :) ./ res_norm
+        fill!(H, zero(T))
+    end
+
+    return SciMLBase.build_linear_solution(lincache.alg, x, r, lincache;
+        retcode = ReturnCode.MaxIters)
+end
 
 end
diff --git a/src/simplegmres.jl b/src/simplegmres.jl
@@ -59,14 +59,15 @@ _no_preconditioner(::IdentityOperator) = true
 _no_preconditioner(::UniformScaling) = true
 _no_preconditioner(_) = false
 
-function init_cacheval(alg::SimpleGMRES{false}, args...; kwargs...)
-    return _init_cacheval(Val(false), alg, args...; kwargs...)
+_norm2(x) = norm(x, 2)  # 2-norm of the whole array
+_norm2(x, dims) = .√(sum(abs2, x; dims))  # 2-norm along `dims`; reduced dims keep size 1
+
+function init_cacheval(alg::SimpleGMRES{UDB}, args...; kwargs...) where {UDB}
+    return _init_cacheval(Val(UDB), alg, args...; kwargs...)  # dispatch on UDB via Val
 end
 
-# TODO: We can check if `A` is a block diagonal matrix with uniformly sized square blocks
-#       and use the specialized dispatch
 function _init_cacheval(::Val{false}, alg::SimpleGMRES, A, b, u, Pl, Pr, maxiters::Int,
-    abstol, ::Any, ::Bool, ::OperatorAssumptions; zeroinit = true)
+    abstol, ::Any, ::Bool, ::OperatorAssumptions; zeroinit = true, kwargs...)
     if zeroinit
         return SimpleGMRESCache{false}(0, 0, maxiters, alg.blocksize, zero(eltype(u)),
             similar(b, 0, 0), similar(b, 0, 0), u, similar(b, 0), similar(b, 0),
@@ -75,6 +76,7 @@ function _init_cacheval(::Val{false}, alg::SimpleGMRES, A, b, u, Pl, Pr, maxiter
 
     @assert _no_preconditioner(Pl)&&_no_preconditioner(Pr) "Preconditioning not supported! Use KrylovJL_GMRES instead."
     N = LinearAlgebra.checksquare(A)
+    @assert N == length(b) "The size of `A` and `b` must match."
     T = eltype(u)
     M = min(maxiters, alg.restart)
     ϵ = eps(T)
@@ -87,7 +89,7 @@ function _init_cacheval(::Val{false}, alg::SimpleGMRES, A, b, u, Pl, Pr, maxiter
 
     mul!(@view(Q[:, 1]), A, u, T(-1), T(0))  # r0 <- A u
     axpy!(T(1), b, @view(Q[:, 1]))  # r0 <- r0 - b
-    β = norm(@view(Q[:, 1]), 2)
+    β = _norm2(@view(Q[:, 1]))
     Q[:, 1] ./= β
 
     x = u
@@ -100,6 +102,45 @@ function _init_cacheval(::Val{false}, alg::SimpleGMRES, A, b, u, Pl, Pr, maxiter
         β, abstol)
 end
 
+# Build the cache for the uniform-block GMRES path (`Val{true}`): `Q`, `H` and `βe₁` get
+# a trailing dimension of `N ÷ blocksize`, one slice per block. With `zeroinit = true`
+# only an empty placeholder cache (carrying `blocksize`, `A`, `b`, `u`) is returned.
+function _init_cacheval(::Val{true}, alg::SimpleGMRES, A, b, u, Pl, Pr, maxiters::Int,
+    abstol, ::Any, ::Bool, ::OperatorAssumptions; zeroinit = true,
+    blocksize = alg.blocksize)
+    if zeroinit
+        return SimpleGMRESCache{true}(0, 0, maxiters, blocksize, zero(eltype(u)),
+            similar(b, 0, 0, 0), similar(b, 0, 0, 0), u, similar(b, 0), similar(b, 0, 0),
+            A, b, similar(b, 0, 0), abstol)
+    end
+
+    @assert _no_preconditioner(Pl)&&_no_preconditioner(Pr) "Preconditioning not supported! Use KrylovJL_GMRES instead."
+    N = LinearAlgebra.checksquare(A)
+    @assert mod(N, blocksize)==0 "The blocksize must divide the size of the matrix."
+    @assert N==length(b) "The size of `A` and `b` must match."
+    T = eltype(u)
+    M = min(maxiters, alg.restart)  # Arnoldi steps per restart cycle
+    ϵ = eps(T)
+    bsize = N ÷ blocksize  # number of blocks
+
+    # Initialize the Cache
+    ## Use `b` since `A` might be an operator
+    Q = similar(b, blocksize, M + 1, bsize)
+    H = fill!(similar(b, M + 1, M, bsize), zero(T))
+
+    mul!(vec(@view(Q[:, 1, :])), A, u, T(-1), T(0))  # r0 <- -A u
+    axpy!(T(1), b, vec(@view(Q[:, 1, :])))  # r0 <- b - A u
+    β = _norm2(@view(Q[:, 1, :]), 1)  # 1 × bsize: one residual norm per block
+    Q[:, 1, :] ./= β
+
+    x = u  # the iterate aliases `u`
+    r = similar(b)
+    βe₁ = fill!(similar(b, M + 1, bsize), zero(T))
+    βe₁[1, :] .= vec(β)  # Avoid the scalar indexing error
+    return SimpleGMRESCache{true}(M, N, maxiters, blocksize, ϵ, Q, H, x, r, βe₁, A, b,
+        β, abstol)
+end
+
 default_alias_A(::SimpleGMRES, ::Any, ::Any) = false
 default_alias_b(::SimpleGMRES, ::Any, ::Any) = false
 
@@ -111,25 +152,25 @@ function SciMLBase.solve!(cache::LinearCache, alg::SimpleGMRES; kwargs...)
         cache.cacheval = solver
         cache.isfresh = false
     end
-    return SciMLBase.solve!(cache.cacheval)
+    return SciMLBase.solve!(cache.cacheval, cache)
 end
 
-function SciMLBase.solve!(cache::SimpleGMRESCache{false, T}) where {T}
+function SciMLBase.solve!(cache::SimpleGMRESCache{false, T},
+    lincache::LinearCache) where {T}
     @unpack M, N, maxiters, ϵ, Q, H, x, r, βe₁, A, b, β, abstol = cache
-    norm2 = Base.Fix2(norm, 2)
     res_norm = β
 
     # FIXME: The performance for this is quite bad when compared to the KrylovJL_GMRES
     #        version
-    for _ in 1:(maxiters ÷ M + 1)
+    for _ in 1:((maxiters ÷ M) + 1)
         for j in 1:M
             Qⱼ₊₁ = @view(Q[:, j + 1])
             mul!(Qⱼ₊₁, A, @view(Q[:, j]))  # Q(:,j+1) <- A Q(:, j)
             for i in 1:j
                 H[i, j] = dot(@view(Q[:, i]), Qⱼ₊₁)
                 axpy!(-H[i, j], @view(Q[:, i]), Qⱼ₊₁)
             end
-            H[j + 1, j] = norm2(Qⱼ₊₁)
+            H[j + 1, j] = _norm2(Qⱼ₊₁)
             H[j + 1, j] > ϵ && (Qⱼ₊₁ ./= H[j + 1, j])
 
             # FIXME: Figure out a way to avoid the allocation
@@ -140,10 +181,10 @@ function SciMLBase.solve!(cache::SimpleGMRESCache{false, T}) where {T}
             mul!(x, @view(Q[:, 1:j]), y)
             mul!(r, A, x, T(-1), T(0))
             axpy!(T(1), b, r)
-            res_norm = norm2(r)
+            res_norm = _norm2(r)
 
             if res_norm < abstol
-                return SciMLBase.build_linear_solution(nothing, x, r, nothing;
+                return SciMLBase.build_linear_solution(lincache.alg, x, r, lincache;
                     retcode = ReturnCode.Success)
             end
         end
@@ -153,6 +194,6 @@ function SciMLBase.solve!(cache::SimpleGMRESCache{false, T}) where {T}
         fill!(H, zero(T))
     end
 
-    return SciMLBase.build_linear_solution(nothing, x, r, nothing;
+    return SciMLBase.build_linear_solution(lincache.alg, x, r, lincache;
         retcode = ReturnCode.MaxIters)
 end