Implement GMRES as an iterator (#143)

haampie · andreasnoack · commit d48a419e1b26 · 2017-07-24T23:05:02.000-04:00
diff --git a/benchmark/benchmark-linear-systems.jl b/benchmark/benchmark-linear-systems.jl
@@ -59,11 +59,10 @@ function gmres(; n = 100_000, tol = 1e-5, restart::Int = 15, maxiter::Int = 210)
 
     println("Matrix of size ", n, " with ~", nnz(A) / n, " nonzeros per row")
     println("Tolerance = ", tol, "; restart = ", restart, "; max #iterations = ", maxiter)
-    
-    impr = @benchmark IterativeSolvers.improved_gmres($A, $b, tol = $tol, restart = $restart, maxiter = $maxiter, log = false)
-    old = @benchmark IterativeSolvers.gmres($A, $b, tol = $tol, restart = $restart, maxiter = $maxiter, log = false)
 
-    impr, old
+    IterativeSolvers.gmres(A, b, tol = tol, restart = restart, maxiter = maxiter, verbose = true)
+
+    @benchmark IterativeSolvers.gmres($A, $b, tol = $tol, restart = $restart, maxiter = $maxiter)
 end
 
 function bicgstabl()
diff --git a/src/gmres.jl b/src/gmres.jl
@@ -1,143 +1,169 @@
+import Base: start, next, done
+
 export gmres, gmres!
 
-gmres(A, b; kwargs...) = gmres!(zeros(b), A, b; kwargs...)
+type ArnoldiDecomp{T, matT}
+    A::matT
+    V::Matrix{T} # Orthonormal basis vectors
+    H::Matrix{T} # Hessenberg matrix
+end
 
-function gmres!(x, A, b;
-  Pl = Identity(),
-  Pr = Identity(),
-  tol = sqrt(eps(real(eltype(b)))),
-  restart::Int = min(20, length(b)),
-  maxiter::Int = restart,
-  log::Bool = false,
-  kwargs...
+ArnoldiDecomp{matT}(A::matT, order::Int, T::Type) = ArnoldiDecomp{T, matT}(
+    A,
+    zeros(T, size(A, 1), order + 1),
+    zeros(T, order + 1, order)
 )
-    history = ConvergenceHistory(partial = !log, restart = restart)
-    history[:tol] = tol
-    log && reserve!(history, :resnorm, maxiter)
-    gmres_method!(history, x, A, b; Pl = Pl, Pr = Pr, tol = tol, maxiter = maxiter, restart = restart, log = log, kwargs...)
-    log && shrink!(history)
-    log ? (x, history) : x
+
+type Residual{numT, resT}
+    current::resT # Current, absolute, preconditioned residual
+    accumulator::resT # Used to compute the residual on the go
+    nullvec::Vector{numT} # Vector in the null space of H to compute residuals
+    β::resT # the initial residual
 end
 
-function gmres_method!(history::ConvergenceHistory, x, A, b;
-    Pl = Identity(),
-    Pr = Identity(),
-    tol = sqrt(eps(real(eltype(b)))),
-    restart::Int = min(20, length(b)),
-    outer::Int = 1,
-    maxiter::Int = restart,
-    verbose::Bool = false,
-    log = false
+Residual(order, T::Type) = Residual{T, real(T)}(
+    one(real(T)),
+    one(real(T)),
+    ones(T, order + 1),
+    one(real(T))
 )
-    T = eltype(b)
 
-    # Approximate solution
-    arnoldi = ArnoldiDecomp(A, restart, T)
-    residual = Residual(restart, T)
+type GMRESIterable{preclT, precrT, vecT <: AbstractVector, arnoldiT <: ArnoldiDecomp, residualT <: Residual, resT <: Real}
+    Pl::preclT
+    Pr::precrT
+    x::vecT
+    b::vecT
+    Ax::vecT # Some room to work in.
+
+    arnoldi::arnoldiT
+    residual::residualT
+
+    mv_products::Int
+    restart::Int
+    k::Int
+    maxiter::Int
+    reltol::resT
+    β::resT
+end
 
-    # Workspace vector to reduce the # allocs.
-    reserved_vec = similar(b)
-    β = residual.current = init!(arnoldi, x, b, Pl, reserved_vec)
-    init_residual!(residual, β)
+converged(g::GMRESIterable) = g.residual.current ≤ g.reltol
 
-    # Log the first mvp for computing the initial residual
-    if log
-        history.mvps += 1
-    end
+start(::GMRESIterable) = 0
 
-    # Stopping criterion is based on |r0| / |rk|
-    reltol = residual.current * tol
+done(g::GMRESIterable, iteration::Int) = iteration ≥ g.maxiter || converged(g)
 
-    # Total iterations (not reset after restart)
-    total_iter = 1
+function next(g::GMRESIterable, iteration::Int)
 
-    while total_iter ≤ maxiter
+    # Arnoldi step: expand
+    expand!(g.arnoldi, g.Pl, g.Pr, g.k, g.Ax)
+    g.mv_products += 1
 
-        # We already have the initial residual
-        if total_iter > 1
+    # Orthogonalize V[:, k + 1] w.r.t. V[:, 1 : k]
+    g.arnoldi.H[g.k + 1, g.k] = orthogonalize_and_normalize!(
+        view(g.arnoldi.V, :, 1 : g.k),
+        view(g.arnoldi.V, :, g.k + 1),
+        view(g.arnoldi.H, 1 : g.k, g.k)
+    )
 
-            # Set the first basis vector
-            β = init!(arnoldi, x, b, Pl, reserved_vec)
+    # Implicitly computes the residual
+    update_residual!(g.residual, g.arnoldi, g.k)
 
-            # And initialize the residual
-            init_residual!(residual, β)
-            
-            if log
-                history.mvps += 1
-            end
-        end
+    g.k += 1
 
-        # Inner iterations k = 1, ..., restart
-        k = 1
-
-        while residual.current > reltol && k ≤ restart && total_iter ≤ maxiter
-
-            # Arnoldi step: expand
-            expand!(arnoldi, Pl, Pr, k)
-
-            # Orthogonalize V[:, k + 1] w.r.t. V[:, 1 : k]
-            arnoldi.H[k + 1, k] = orthogonalize_and_normalize!(
-                view(arnoldi.V, :, 1 : k),
-                view(arnoldi.V, :, k + 1),
-                view(arnoldi.H, 1 : k, k)
-            )
-
-            # Implicitly computes the residual
-            update_residual!(residual, arnoldi, k)
-            
-            if log
-                nextiter!(history, mvps = 1)
-                push!(history, :resnorm, residual.current)
-            end
-            
-            verbose && @printf("%3d\t%3d\t%1.2e\n", mod(total_iter, restart), k, residual.current)
-
-            k += 1
-            total_iter += 1
-        end
+    # Computation of x only at the end of the iterations
+    # and at restart.
+    if g.k == g.restart + 1 || done(g, iteration + 1)
 
         # Solve the projected problem Hy = β * e1 in the least-squares sense
-        rhs = solve_least_squares!(arnoldi, β, k)
+        rhs = solve_least_squares!(g.arnoldi, g.β, g.k)
 
         # And improve the solution x ← x + Pr \ (V * y)
-        update_solution!(x, view(rhs, 1 : k - 1), arnoldi, Pr, k)
-    
-        # Converged?
-        if residual.current ≤ reltol
-            setconv(history, true)
-            break
+        update_solution!(g.x, view(rhs, 1 : g.k - 1), g.arnoldi, g.Pr, g.k, g.Ax)
+
+        g.k = 1
+
+        # Restart only when another iteration follows; uses the same check as the enclosing `if`.
+        if !done(g, iteration + 1) # was done(g, iteration): restarted (extra mat-vec) on the final step
+
+            # Set the first basis vector
+            g.β = init!(g.arnoldi, g.x, g.b, g.Pl, g.Ax)
+
+            # And initialize the residual
+            init_residual!(g.residual, g.β)
+
+            g.mv_products += 1 # init! computed A * x for the new residual
         end
     end
 
-    verbose && @printf("\n")
-    x
+    g.residual.current, iteration + 1
 end
 
-type ArnoldiDecomp{T}
-    A
-    V::Matrix{T} # Orthonormal basis vectors
-    H::Matrix{T} # Hessenberg matrix
-end
+gmres_iterable(A, b; kwargs...) = gmres_iterable!(zeros(b), A, b; initially_zero = true, kwargs...)
 
-ArnoldiDecomp(A, order::Int, T::Type) = ArnoldiDecomp{T}(
-    A,
-    zeros(T, size(A, 1), order + 1),
-    zeros(T, order + 1, order)
+function gmres_iterable!(x, A, b;
+    Pl = Identity(),
+    Pr = Identity(),
+    tol = sqrt(eps(real(eltype(b)))),
+    restart::Int = min(20, length(b)),
+    maxiter::Int = restart,
+    initially_zero = false
 )
+    T = eltype(b)
 
-type Residual{numT, resT}
-    current::resT # Current relative residual
-    accumulator::resT # Used to compute the residual on the go
-    nullvec::Vector{numT} # Vector in the null space of H to compute residuals
-    β::resT # the initial residual
+    # Storage for the Krylov basis V, the Hessenberg matrix H, and the residual tracker
+    arnoldi = ArnoldiDecomp(A, restart, T)
+    residual = Residual(restart, T)
+    mv_products = initially_zero ? 0 : 1 # init! below skips A * x when x is known to be zero
+
+    # Workspace vector to reduce the # allocs.
+    Ax = similar(b)
+    residual.current = init!(arnoldi, x, b, Pl, Ax, initially_zero = initially_zero)
+    init_residual!(residual, residual.current)
+    # Absolute threshold tol * |r0|; convert tol so reltol and β share GMRESIterable's resT
+    reltol = convert(real(T), tol) * residual.current
+
+    GMRESIterable(Pl, Pr, x, b, Ax,
+        arnoldi, residual,
+        mv_products, restart, 1, maxiter, reltol, residual.current
+    )
 end
 
-Residual(order, T::Type) = Residual{T, real(T)}(
-    one(real(T)),
-    one(real(T)),
-    ones(T, order + 1),
-    one(real(T))
+gmres(A, b; kwargs...) = gmres!(zeros(b), A, b; initially_zero = true, kwargs...)
+
+function gmres!(x, A, b;
+  Pl = Identity(),
+  Pr = Identity(),
+  tol = sqrt(eps(real(eltype(b)))),
+  restart::Int = min(20, length(b)),
+  maxiter::Int = restart,
+  log::Bool = false,
+  initially_zero = false,
+  verbose::Bool = false
 )
+    history = ConvergenceHistory(partial = !log, restart = restart)
+    history[:tol] = tol
+    log && reserve!(history, :resnorm, maxiter)
+
+    iterable = gmres_iterable!(x, A, b; Pl = Pl, Pr = Pr, tol = tol, maxiter = maxiter, restart = restart, initially_zero = initially_zero)
+
+    verbose && @printf("=== gmres ===\n%4s\t%4s\t%7s\n","rest","iter","resnorm")
+
+    for (iteration, residual) = enumerate(iterable)
+        if log
+            nextiter!(history)
+            history.mvps = iterable.mv_products
+            push!(history, :resnorm, residual)
+        end
+
+        verbose && @printf("%3d\t%3d\t%1.2e\n", 1 + div(iteration - 1, restart), 1 + mod(iteration - 1, restart), residual)
+    end
+
+    verbose && println()
+    setconv(history, converged(iterable))
+    log && shrink!(history)
+
+    log ? (x, history) : x
+end
 
 function update_residual!(r::Residual, arnoldi::ArnoldiDecomp, k::Int)
     # Cheaply computes the current residual
@@ -146,15 +172,20 @@ function update_residual!(r::Residual, arnoldi::ArnoldiDecomp, k::Int)
     r.current = r.β / √r.accumulator
 end
 
-function init!{T}(arnoldi::ArnoldiDecomp{T}, x, b, Pl, reserved_vec)
+function init!{T}(arnoldi::ArnoldiDecomp{T}, x, b, Pl, Ax; initially_zero::Bool = false)
     # Initialize the Krylov subspace with the initial residual vector
     # This basically does V[1] = Pl \ (b - A * x) and then normalize
     
     first_col = view(arnoldi.V, :, 1)
 
     copy!(first_col, b)
-    A_mul_B!(reserved_vec, arnoldi.A, x)
-    @blas! first_col -= one(T) * reserved_vec
+
+    # Potentially save one MV product
+    if !initially_zero
+        A_mul_B!(Ax, arnoldi.A, x)
+        @blas! first_col -= one(T) * Ax
+    end
+
     A_ldiv_B!(Pl, first_col)
 
     # Normalize
@@ -179,33 +210,37 @@ function solve_least_squares!{T}(arnoldi::ArnoldiDecomp{T}, β, k::Int)
     rhs
 end
 
-function update_solution!{T}(x, y, arnoldi::ArnoldiDecomp{T}, Pr::Identity, k::Int)
+function update_solution!{T}(x, y, arnoldi::ArnoldiDecomp{T}, Pr::Identity, k::Int, Ax)
     # Update x ← x + V * y
 
     # TODO: find the SugarBLAS alternative
     BLAS.gemv!('N', one(T), view(arnoldi.V, :, 1 : k - 1), y, one(T), x)
 end
 
-function update_solution!{T}(x, y, arnoldi::ArnoldiDecomp{T}, Pr, k::Int)
-    # Allocates a temporary while computing x ← x + Pr \ (V * y)
-    tmp = view(arnoldi.V, :, 1 : k - 1) * y
-    @blas! x += one(T) * (Pr \ tmp)
+function update_solution!{T}(x, y, arnoldi::ArnoldiDecomp{T}, Pr, k::Int, Ax)
+    # Computes x ← x + Pr \ (V * y), using Ax as workspace
+    A_mul_B!(Ax, view(arnoldi.V, :, 1 : k - 1), y)
+    A_ldiv_B!(Pr, Ax)
+    @blas! x += one(T) * Ax
 end
 
-function expand!(arnoldi::ArnoldiDecomp, Pl::Identity, Pr::Identity, k::Int)
+function expand!(arnoldi::ArnoldiDecomp, Pl::Identity, Pr::Identity, k::Int, Ax)
     # Simply expands by A * v without allocating
     A_mul_B!(view(arnoldi.V, :, k + 1), arnoldi.A, view(arnoldi.V, :, k))
 end
 
-function expand!(arnoldi::ArnoldiDecomp, Pl, Pr::Identity, k::Int)
+function expand!(arnoldi::ArnoldiDecomp, Pl, Pr::Identity, k::Int, Ax)
     # Expands by Pl \ (A * v) without allocating
-    A_mul_B!(view(arnoldi.V, :, k + 1), arnoldi.A, view(arnoldi.V, :, k))
-    A_ldiv_B!(Pl, view(arnoldi.V, :, k + 1))
+    nextV = view(arnoldi.V, :, k + 1)
+    A_mul_B!(nextV, arnoldi.A, view(arnoldi.V, :, k))
+    A_ldiv_B!(Pl, nextV)
 end
 
-function expand!(arnoldi::ArnoldiDecomp, Pl, Pr, k::Int)
-    # Expands by Pl \ (A * (Pr \ v)). Allocates one vector.
-    A_ldiv_B!(view(arnoldi.V, :, k + 1), Pr, view(arnoldi.V, :, k))
-    copy!(view(arnoldi.V, :, k + 1), arnoldi.A * view(arnoldi.V, :, k + 1))
-    A_ldiv_B!(Pl, view(arnoldi.V, :, k + 1))
+function expand!(arnoldi::ArnoldiDecomp, Pl, Pr, k::Int, Ax)
+    # Expands by Pl \ (A * (Pr \ v)). Avoids allocation by using Ax.
+    nextV = view(arnoldi.V, :, k + 1)
+    A_ldiv_B!(nextV, Pr, view(arnoldi.V, :, k))
+    A_mul_B!(Ax, arnoldi.A, nextV)
+    copy!(nextV,  Ax)
+    A_ldiv_B!(Pl, nextV)
 end