JuliaSmoothOptimizers
diff --git a/‎docs/src/custom_workspaces.md‎
Lines changed: 42 additions & 1 deletion b/‎docs/src/custom_workspaces.md‎
Lines changed: 42 additions & 1 deletion
diff --git a/‎src/bilq.jl‎
Lines changed: 15 additions & 15 deletions b/‎src/bilq.jl‎
Lines changed: 15 additions & 15 deletions
diff --git a/‎src/bilqr.jl‎
Lines changed: 24 additions & 24 deletions b/‎src/bilqr.jl‎
Lines changed: 24 additions & 24 deletions
diff --git a/‎src/block_krylov_utils.jl‎
Lines changed: 1 addition & 1 deletion b/‎src/block_krylov_utils.jl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/cg_lanczos.jl‎
Lines changed: 6 additions & 6 deletions b/‎src/cg_lanczos.jl‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎src/cg_lanczos_shift.jl‎
Lines changed: 6 additions & 6 deletions b/‎src/cg_lanczos_shift.jl‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎src/cgls_lanczos_shift.jl‎
Lines changed: 12 additions & 12 deletions b/‎src/cgls_lanczos_shift.jl‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎src/craig.jl‎
Lines changed: 8 additions & 8 deletions b/‎src/craig.jl‎
Lines changed: 8 additions & 8 deletions
@@ -188,6 +188,17 @@ function Krylov.kscal!(n::Integer, s::T, x::HaloVector{T}) where T <: FloatOrCom
     return x
 end
 
+function Krylov.kdiv!(n::Integer, x::HaloVector{T}, s::T) where T <: FloatOrComplex  # In-place division of the visited entries of x.data by the scalar s; `n` is not used by this method.
+    mx, nx = size(x.data)  # extents of x.data along its two indexed dimensions
+    _x = x.data
+    for i = 1:mx-1  # only indices 1:mx-1 (and 1:nx-1 below) are visited; NOTE(review): presumably the other entries are halo cells — confirm against the HaloVector definition
+        for j = 1:nx-1
+            _x[i,j] = _x[i,j] / s  # genuine element-wise division, not a multiplication by 1/s
+        end
+    end
+    return x  # the mutated argument is returned
+end
+
 function Krylov.kaxpy!(n::Integer, s::T, x::HaloVector{T}, y::HaloVector{T}) where T <: FloatOrComplex
     mx, nx = size(x.data)
     _x = x.data
@@ -224,6 +235,30 @@ function Krylov.kcopy!(n::Integer, y::HaloVector{T}, x::HaloVector{T}) where T <
     return y
 end
 
+function Krylov.kscalcopy!(n::Integer, y::HaloVector{T}, s::T, x::HaloVector{T}) where T <: FloatOrComplex  # Scale-and-copy in one sweep: y.data[i,j] = s * x.data[i,j] on the visited entries; `n` is not used by this method.
+    mx, nx = size(x.data)  # loop bounds are taken from x only; assumes y.data has matching extents and indexing — TODO confirm
+    _x = x.data
+    _y = y.data
+    for i = 1:mx-1  # only indices 1:mx-1 (and 1:nx-1 below) are visited; NOTE(review): presumably the other entries are halo cells — confirm against the HaloVector definition
+        for j = 1:nx-1
+            _y[i,j] = s * _x[i,j]  # x is only read; the matching entry of y is overwritten
+        end
+    end
+    return y  # the destination vector is returned
+end
+
+function Krylov.kdivcopy!(n::Integer, y::HaloVector{T}, x::HaloVector{T}, s::T) where T <: FloatOrComplex  # Divide-and-copy in one sweep: y.data[i,j] = x.data[i,j] / s on the visited entries; `n` is not used by this method.
+    mx, nx = size(x.data)  # loop bounds are taken from x only; assumes y.data has matching extents and indexing — TODO confirm
+    _x = x.data
+    _y = y.data
+    for i = 1:mx-1  # only indices 1:mx-1 (and 1:nx-1 below) are visited; NOTE(review): presumably the other entries are halo cells — confirm against the HaloVector definition
+        for j = 1:nx-1
+            _y[i,j] = _x[i,j] / s  # genuine element-wise division; x is only read, y is overwritten
+        end
+    end
+    return y  # the destination vector is returned
+end
+
 function Krylov.kfill!(x::HaloVector{T}, val::T) where T <: FloatOrComplex
     mx, nx = size(x.data)
     _x = x.data
@@ -251,7 +286,13 @@ function Krylov.kref!(n::Integer, x::HaloVector{T}, y::HaloVector{T}, c::T, s::T
 end
 ```
 
-Note that `Krylov.kref!` is only required for `minres_qlp`.
+By default, `kdiv!(n, x, s)` calls `kscal!(n, t, x)` with `t = 1/s`, so a separate implementation isn't required.
+However, this approach may introduce numerical issues when `s` is very small.
+We do this because computing $x \leftarrow t \times x$ can often leverage SIMD or fused multiply-add (FMA) instructions on certain architectures, capabilities that a direct element-wise division $x \leftarrow x/s$ typically lacks.
+Thus, the implementation of `kdiv!` provides flexibility, allowing users to choose a trade-off between speed and numerical precision by overloading the function if needed.
+The operations provided by `kdivcopy!` and `kscalcopy!` could be implemented by combining `kcopy!` with `kdiv!` or `kscal!`, but doing so requires two separate memory passes, which can be suboptimal for performance.
+To address this limitation, `kdivcopy!` and `kscalcopy!` fuse the copy and scaling/division operations into a single memory pass.
+Note that `Krylov.kref!` is only required for the function `minres_qlp`.
 
 ### 2D Poisson equation solver with Krylov methods
 
 
@@ -191,19 +191,19 @@ kwargs_bilq = (:c, :transfer_to_bicg, :M, :N, :ldiv, :atol, :rtol, :itmax, :time
     (verbose > 0) && @printf(iostream, "%5s  %8s  %7s  %5s\n", "k", "αₖ", "‖rₖ‖", "timer")
     kdisplay(iter, verbose) && @printf(iostream, "%5d  %8.1e  %7.1e  %.2fs\n", iter, cᴴb, bNorm, start_time |> ktimer)
 
-    βₖ = √(abs(cᴴb))            # β₁γ₁ = cᴴ(b - Ax₀)
-    γₖ = cᴴb / βₖ               # β₁γ₁ = cᴴ(b - Ax₀)
-    kfill!(vₖ₋₁, zero(FC))      # v₀ = 0
-    kfill!(uₖ₋₁, zero(FC))      # u₀ = 0
-    vₖ .= r₀ ./ βₖ              # v₁ = (b - Ax₀) / β₁
-    uₖ .= c ./ conj(γₖ)         # u₁ = c / γ̄₁
-    cₖ₋₁ = cₖ = -one(T)         # Givens cosines used for the LQ factorization of Tₖ
-    sₖ₋₁ = sₖ = zero(FC)        # Givens sines used for the LQ factorization of Tₖ
-    kfill!(d̅, zero(FC))         # Last column of D̅ₖ = Vₖ(Qₖ)ᴴ
-    ζₖ₋₁ = ζbarₖ = zero(FC)     # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
-    ζₖ₋₂ = ηₖ = zero(FC)        # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
-    δbarₖ₋₁ = δbarₖ = zero(FC)  # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
-    norm_vₖ = bNorm / βₖ        # ‖vₖ‖ is used for residual norm estimates
+    βₖ = √(abs(cᴴb))               # β₁γ₁ = cᴴ(b - Ax₀)
+    γₖ = cᴴb / βₖ                  # β₁γ₁ = cᴴ(b - Ax₀)
+    kfill!(vₖ₋₁, zero(FC))         # v₀ = 0
+    kfill!(uₖ₋₁, zero(FC))         # u₀ = 0
+    kdivcopy!(n, vₖ, r₀, βₖ)       # v₁ = (b - Ax₀) / β₁
+    kdivcopy!(n, uₖ, c, conj(γₖ))  # u₁ = c / γ̄₁
+    cₖ₋₁ = cₖ = -one(T)            # Givens cosines used for the LQ factorization of Tₖ
+    sₖ₋₁ = sₖ = zero(FC)           # Givens sines used for the LQ factorization of Tₖ
+    kfill!(d̅, zero(FC))            # Last column of D̅ₖ = Vₖ(Qₖ)ᴴ
+    ζₖ₋₁ = ζbarₖ = zero(FC)        # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
+    ζₖ₋₂ = ηₖ = zero(FC)           # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
+    δbarₖ₋₁ = δbarₖ = zero(FC)     # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
+    norm_vₖ = bNorm / βₖ           # ‖vₖ‖ is used for residual norm estimates
 
     # Stopping criterion.
     solved_lq = bNorm ≤ ε
@@ -321,8 +321,8 @@ kwargs_bilq = (:c, :transfer_to_bicg, :M, :N, :ldiv, :atol, :rtol, :itmax, :time
       kcopy!(n, uₖ₋₁, uₖ)  # uₖ₋₁ ← uₖ
 
       if pᴴq ≠ 0
-        vₖ .= q ./ βₖ₊₁        # βₖ₊₁vₖ₊₁ = q
-        uₖ .= p ./ conj(γₖ₊₁)  # γ̄ₖ₊₁uₖ₊₁ = p
+        kdivcopy!(n, vₖ, q, βₖ₊₁)        # vₖ₊₁ = q / βₖ₊₁
+        kdivcopy!(n, uₖ, p, conj(γₖ₊₁))  # uₖ₊₁ = p / γ̄ₖ₊₁
       end
 
       # Compute ⟨vₖ,vₖ₊₁⟩ and ‖vₖ₊₁‖
 
@@ -175,24 +175,24 @@ kwargs_bilqr = (:transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose, :hi
     end
 
     # Set up workspace.
-    βₖ = √(abs(cᴴb))            # β₁γ₁ = (c - Aᴴy₀)ᴴ(b - Ax₀)
-    γₖ = cᴴb / βₖ               # β₁γ₁ = (c - Aᴴy₀)ᴴ(b - Ax₀)
-    kfill!(vₖ₋₁, zero(FC))      # v₀ = 0
-    kfill!(uₖ₋₁, zero(FC))      # u₀ = 0
-    vₖ .= r₀ ./ βₖ              # v₁ = (b - Ax₀) / β₁
-    uₖ .= s₀ ./ conj(γₖ)        # u₁ = (c - Aᴴy₀) / γ̄₁
-    cₖ₋₁ = cₖ = -one(T)         # Givens cosines used for the LQ factorization of Tₖ
-    sₖ₋₁ = sₖ = zero(FC)        # Givens sines used for the LQ factorization of Tₖ
-    kfill!(d̅, zero(FC))         # Last column of D̅ₖ = Vₖ(Qₖ)ᴴ
-    ζₖ₋₁ = ζbarₖ = zero(FC)     # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
-    ζₖ₋₂ = ηₖ = zero(FC)        # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
-    δbarₖ₋₁ = δbarₖ = zero(FC)  # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
-    ψbarₖ₋₁ = ψₖ₋₁ = zero(FC)   # ψₖ₋₁ and ψbarₖ are the last components of h̅ₖ = Qₖγ̄₁e₁
-    norm_vₖ = bNorm / βₖ        # ‖vₖ‖ is used for residual norm estimates
-    ϵₖ₋₃ = λₖ₋₂ = zero(FC)      # Components of Lₖ₋₁
-    kfill!(wₖ₋₃, zero(FC))      # Column k-3 of Wₖ = Uₖ(Lₖ)⁻ᴴ
-    kfill!(wₖ₋₂, zero(FC))      # Column k-2 of Wₖ = Uₖ(Lₖ)⁻ᴴ
-    τₖ = zero(T)                # τₖ is used for the dual residual norm estimate
+    βₖ = √(abs(cᴴb))                # β₁γ₁ = (c - Aᴴy₀)ᴴ(b - Ax₀)
+    γₖ = cᴴb / βₖ                   # β₁γ₁ = (c - Aᴴy₀)ᴴ(b - Ax₀)
+    kfill!(vₖ₋₁, zero(FC))          # v₀ = 0
+    kfill!(uₖ₋₁, zero(FC))          # u₀ = 0
+    kdivcopy!(n, vₖ, r₀, βₖ)        # v₁ = (b - Ax₀) / β₁
+    kdivcopy!(n, uₖ, s₀, conj(γₖ))  # u₁ = (c - Aᴴy₀) / γ̄₁
+    cₖ₋₁ = cₖ = -one(T)             # Givens cosines used for the LQ factorization of Tₖ
+    sₖ₋₁ = sₖ = zero(FC)            # Givens sines used for the LQ factorization of Tₖ
+    kfill!(d̅, zero(FC))             # Last column of D̅ₖ = Vₖ(Qₖ)ᴴ
+    ζₖ₋₁ = ζbarₖ = zero(FC)         # ζₖ₋₁ and ζbarₖ are the last components of z̅ₖ = (L̅ₖ)⁻¹β₁e₁
+    ζₖ₋₂ = ηₖ = zero(FC)            # ζₖ₋₂ and ηₖ are used to update ζₖ₋₁ and ζbarₖ
+    δbarₖ₋₁ = δbarₖ = zero(FC)      # Coefficients of Lₖ₋₁ and L̅ₖ modified over the course of two iterations
+    ψbarₖ₋₁ = ψₖ₋₁ = zero(FC)       # ψₖ₋₁ and ψbarₖ are the last components of h̅ₖ = Qₖγ̄₁e₁
+    norm_vₖ = bNorm / βₖ            # ‖vₖ‖ is used for residual norm estimates
+    ϵₖ₋₃ = λₖ₋₂ = zero(FC)          # Components of Lₖ₋₁
+    kfill!(wₖ₋₃, zero(FC))          # Column k-3 of Wₖ = Uₖ(Lₖ)⁻ᴴ
+    kfill!(wₖ₋₂, zero(FC))          # Column k-2 of Wₖ = Uₖ(Lₖ)⁻ᴴ
+    τₖ = zero(T)                    # τₖ is used for the dual residual norm estimate
 
     # Stopping criterion.
     solved_lq = bNorm == 0
@@ -355,23 +355,22 @@ kwargs_bilqr = (:transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose, :hi
         # w₁ = u₁ / δ̄₁
         if iter == 2
           wₖ₋₁ = wₖ₋₂
-          kaxpy!(n, one(FC), uₖ₋₁, wₖ₋₁)
-          wₖ₋₁ .= uₖ₋₁ ./ conj(δₖ₋₁)
+          kdivcopy!(n, wₖ₋₁, uₖ₋₁, conj(δₖ₋₁))
         end
         # w₂ = (u₂ - λ̄₁w₁) / δ̄₂
         if iter == 3
           wₖ₋₁ = wₖ₋₃
           kaxpy!(n, one(FC), uₖ₋₁, wₖ₋₁)
           kaxpy!(n, -conj(λₖ₋₂), wₖ₋₂, wₖ₋₁)
-          wₖ₋₁ .= wₖ₋₁ ./ conj(δₖ₋₁)
+          kdiv!(n, wₖ₋₁, conj(δₖ₋₁))
         end
         # wₖ₋₁ = (uₖ₋₁ - λ̄ₖ₋₂wₖ₋₂ - ϵ̄ₖ₋₃wₖ₋₃) / δ̄ₖ₋₁
         if iter ≥ 4
           kscal!(n, -conj(ϵₖ₋₃), wₖ₋₃)
           wₖ₋₁ = wₖ₋₃
           kaxpy!(n, one(FC), uₖ₋₁, wₖ₋₁)
           kaxpy!(n, -conj(λₖ₋₂), wₖ₋₂, wₖ₋₁)
-          wₖ₋₁ .= wₖ₋₁ ./ conj(δₖ₋₁)
+          kdiv!(n, wₖ₋₁, conj(δₖ₋₁))
         end
 
         if iter ≥ 3
@@ -405,9 +404,10 @@ kwargs_bilqr = (:transfer_to_bicg, :atol, :rtol, :itmax, :timemax, :verbose, :hi
       kcopy!(n, vₖ₋₁, vₖ)  # vₖ₋₁ ← vₖ
       kcopy!(n, uₖ₋₁, uₖ)  # uₖ₋₁ ← uₖ
 
+
       if pᴴq ≠ zero(FC)
-        vₖ .= q ./ βₖ₊₁        # βₖ₊₁vₖ₊₁ = q
-        uₖ .= p ./ conj(γₖ₊₁)  # γ̄ₖ₊₁uₖ₊₁ = p
+        kdivcopy!(n, vₖ, q, βₖ₊₁)        # vₖ₊₁ = q / βₖ₊₁
+        kdivcopy!(n, uₖ, p, conj(γₖ₊₁))  # uₖ₊₁ = p / γ̄ₖ₊₁
       end
 
       # Update ϵₖ₋₃, λₖ₋₂, δbarₖ₋₁, cₖ₋₁, sₖ₋₁, γₖ and βₖ.
 
@@ -26,7 +26,7 @@ function gs!(Q::AbstractMatrix{FC}, R::AbstractMatrix{FC}, v::AbstractVector{FC}
   kfill!(R, zero(FC))
   for j = 1:k
     qⱼ = view(Q,:,j)
-    aⱼ .= qⱼ
+    kcopy!(n, aⱼ, qⱼ)
     for i = 1:j-1
       qᵢ = view(Q,:,i)
       R[i,j] = kdot(n, qᵢ, aⱼ)    # rᵢⱼ = ⟨qᵢ , aⱼ⟩
 
@@ -157,9 +157,9 @@ kwargs_cg_lanczos = (:M, :ldiv, :check_curvature, :atol, :rtol, :itmax, :timemax
 
     # Initialize Lanczos process.
     # β₁Mv₁ = b
-    kscal!(n, one(FC) / β, v)           # v₁  ←  v₁ / β₁
-    MisI || kscal!(n, one(FC) / β, Mv)  # Mv₁ ← Mv₁ / β₁
-    kcopy!(n, Mv_prev, Mv)              # Mv_prev ← Mv
+    kdiv!(n, v, β)           # v₁  ←  v₁ / β₁
+    MisI || kdiv!(n, Mv, β)  # Mv₁ ← Mv₁ / β₁
+    kcopy!(n, Mv_prev, Mv)   # Mv_prev ← Mv
 
     iter = 0
     itmax == 0 && (itmax = 2 * n)
@@ -191,7 +191,7 @@ kwargs_cg_lanczos = (:M, :ldiv, :check_curvature, :atol, :rtol, :itmax, :timemax
 
       # Check curvature. Exit fast if requested.
       # It is possible to show that σₖ² (δₖ - ωₖ₋₁ / γₖ₋₁) = pₖᴴ A pₖ.
-      γ = one(T) / (δ - ω / γ)  # γₖ = 1 / (δₖ - ωₖ₋₁ / γₖ₋₁)
+      γ = inv(δ - ω / γ)  # γₖ = 1 / (δₖ - ωₖ₋₁ / γₖ₋₁)
       indefinite |= (γ ≤ 0)
       (check_curvature & indefinite) && continue
 
@@ -203,8 +203,8 @@ kwargs_cg_lanczos = (:M, :ldiv, :check_curvature, :atol, :rtol, :itmax, :timemax
       kcopy!(n, Mv, Mv_next)              # Mvₖ ← Mvₖ₊₁
       MisI || mulorldiv!(v, M, Mv, ldiv)  # vₖ₊₁ = M⁻¹ * Mvₖ₊₁
       β = knorm_elliptic(n, v, Mv)        # βₖ₊₁ = vₖ₊₁ᴴ M vₖ₊₁
-      kscal!(n, one(FC) / β, v)           # vₖ₊₁  ←  vₖ₊₁ / βₖ₊₁
-      MisI || kscal!(n, one(FC) / β, Mv)  # Mvₖ₊₁ ← Mvₖ₊₁ / βₖ₊₁
+      kdiv!(n, v, β)                      # vₖ₊₁  ←  vₖ₊₁ / βₖ₊₁
+      MisI || kdiv!(n, Mv, β)             # Mvₖ₊₁ ← Mvₖ₊₁ / βₖ₊₁
       Anorm2 += β_prev^2 + β^2 + δ^2      # Use ‖Tₖ₊₁‖₂ as increasing approximation of ‖A‖₂.
       β_prev = β
 
 
@@ -160,9 +160,9 @@ kwargs_cg_lanczos_shift = (:M, :ldiv, :check_curvature, :atol, :rtol, :itmax, :t
 
     # Initialize Lanczos process.
     # β₁Mv₁ = b
-    kscal!(n, one(FC) / β, v)           # v₁  ←  v₁ / β₁
-    MisI || kscal!(n, one(FC) / β, Mv)  # Mv₁ ← Mv₁ / β₁
-    kcopy!(n, Mv_prev, Mv)              # Mv_prev ← Mv
+    kdiv!(n, v, β)           # v₁  ←  v₁ / β₁
+    MisI || kdiv!(n, Mv, β)  # Mv₁ ← Mv₁ / β₁
+    kcopy!(n, Mv_prev, Mv)   # Mv_prev ← Mv
 
     # Initialize some constants used in recursions below.
     ρ = one(T)
@@ -206,15 +206,15 @@ kwargs_cg_lanczos_shift = (:M, :ldiv, :check_curvature, :atol, :rtol, :itmax, :t
       kcopy!(n, Mv, Mv_next)              # Mvₖ ← Mvₖ₊₁
       MisI || mulorldiv!(v, M, Mv, ldiv)  # vₖ₊₁ = M⁻¹ * Mvₖ₊₁
       β = knorm_elliptic(n, v, Mv)        # βₖ₊₁ = vₖ₊₁ᴴ M vₖ₊₁
-      kscal!(n, one(FC) / β, v)           # vₖ₊₁  ←  vₖ₊₁ / βₖ₊₁
-      MisI || kscal!(n, one(FC) / β, Mv)  # Mvₖ₊₁ ← Mvₖ₊₁ / βₖ₊₁
+      kdiv!(n, v, β)                      # vₖ₊₁  ←  vₖ₊₁ / βₖ₊₁
+      MisI || kdiv!(n, Mv, β)             # Mvₖ₊₁ ← Mvₖ₊₁ / βₖ₊₁
 
       # Check curvature: vₖᴴ(A + sᵢI)vₖ = vₖᴴAvₖ + sᵢ‖vₖ‖² = δₖ + ρₖ * sᵢ with ρₖ = ‖vₖ‖².
       # It is possible to show that σₖ² (δₖ + ρₖ * sᵢ - ωₖ₋₁ / γₖ₋₁) = pₖᴴ (A + sᵢ I) pₖ.
       MisI || (ρ = kdotr(n, v, v))
       for i = 1 : nshifts
         δhat[i] = δ + ρ * shifts[i]
-        γ[i] = 1 / (δhat[i] - ω[i] / γ[i])
+        γ[i] = inv(δhat[i] - ω[i] / γ[i])
       end
       for i = 1 : nshifts
         indefinite[i] |= γ[i] ≤ 0
 
@@ -161,8 +161,8 @@ kwargs_cgls_lanczos_shift = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose
 
     # Initialize Lanczos process.
     # β₁v₁ = b
-    kscal!(n, one(FC) / β, v)  # v₁ ← v₁ / β₁
-    kscal!(m, one(FC) / β, u)
+    kdiv!(n, v, β)  # v₁ ← v₁ / β₁
+    kdiv!(m, u, β)
 
     # Initialize some constants used in recursions below.
     ρ = one(T)
@@ -196,21 +196,21 @@ kwargs_cgls_lanczos_shift = (:M, :ldiv, :atol, :rtol, :itmax, :timemax, :verbose
     while ! (solved || tired || user_requested_exit || overtimed)
 
       # Form next Lanczos vector.
-      mul!(u_next, A, v)              # u_nextₖ ← Avₖ
-      δ = kdotr(m, u_next, u_next)    # δₖ = vₖᵀAᴴAvₖ
-      kaxpy!(m, -δ, u, u_next)        # uₖ₊₁ = u_nextₖ - δₖuₖ - βₖuₖ₋₁
+      mul!(u_next, A, v)            # u_nextₖ ← Avₖ
+      δ = kdotr(m, u_next, u_next)  # δₖ = vₖᵀAᴴAvₖ
+      kaxpy!(m, -δ, u, u_next)      # uₖ₊₁ = u_nextₖ - δₖuₖ - βₖuₖ₋₁
       kaxpy!(m, -β, u_prev, u_next)
-      mul!(v, Aᴴ, u_next)             # vₖ₊₁ = Aᴴuₖ₊₁
-      β = knorm_elliptic(n, v, v)     # βₖ₊₁ = vₖ₊₁ᵀ M vₖ₊₁
-      kscal!(n, one(FC) / β, v)       # vₖ₊₁  ←  vₖ₊₁ / βₖ₊₁
-      kscal!(m, one(FC) / β, u_next)  # uₖ₊₁ = uₖ₊₁ / βₖ₊₁
-      kcopy!(m, u_prev, u)            # u_prev ← u
-      kcopy!(m, u, u_next)            # u ← u_next
+      mul!(v, Aᴴ, u_next)           # vₖ₊₁ = Aᴴuₖ₊₁
+      β = knorm_elliptic(n, v, v)   # βₖ₊₁ = vₖ₊₁ᵀ M vₖ₊₁
+      kdiv!(n, v, β)                # vₖ₊₁ = vₖ₊₁ / βₖ₊₁
+      kdiv!(m, u_next, β)           # uₖ₊₁ = uₖ₊₁ / βₖ₊₁
+      kcopy!(m, u_prev, u)          # u_prev ← u
+      kcopy!(m, u, u_next)          # u ← u_next
 
       MisI || (ρ = kdotr(n, v, v))
       for i = 1 : nshifts
         δhat[i] = δ + ρ * shifts[i]
-        γ[i] = 1 / (δhat[i] - ω[i] / γ[i])
+        γ[i] = inv(δhat[i] - ω[i] / γ[i])
       end
 
       # Compute next CG iterate for each shifted system that has not yet converged.
 
@@ -223,8 +223,8 @@ kwargs_craig = (:M, :N, :ldiv, :transfer_to_lsqr, :sqd, :λ, :btol, :conlim, :at
 
     # Initialize Golub-Kahan process.
     # β₁Mu₁ = b.
-    kscal!(m, one(FC) / β₁, u)
-    MisI || kscal!(m, one(FC) / β₁, Mu)
+    kdiv!(m, u, β₁)
+    MisI || kdiv!(m, Mu, β₁)
 
     kfill!(Nv, zero(FC))
     kfill!(w, zero(FC))  # Used to update y.
@@ -275,8 +275,8 @@ kwargs_craig = (:M, :N, :ldiv, :transfer_to_lsqr, :sqd, :λ, :btol, :conlim, :at
         inconsistent = true
         continue
       end
-      kscal!(n, one(FC) / α, v)
-      NisI || kscal!(n, one(FC) / α, Nv)
+      kdiv!(n, v, α)
+      NisI || kdiv!(n, Nv, α)
 
       Anorm² += α * α + λ * λ
 
@@ -315,8 +315,8 @@ kwargs_craig = (:M, :N, :ldiv, :transfer_to_lsqr, :sqd, :λ, :btol, :conlim, :at
       MisI || mulorldiv!(u, M, Mu, ldiv)
       β = knorm_elliptic(m, u, Mu)
       if β ≠ 0
-        kscal!(m, one(FC) / β, u)
-        MisI || kscal!(m, one(FC) / β, Mu)
+        kdiv!(m, u, β)
+        MisI || kdiv!(m, Mu, β)
       end
 
       # Finish  updates from the first Givens rotation.
@@ -358,8 +358,8 @@ kwargs_craig = (:M, :N, :ldiv, :transfer_to_lsqr, :sqd, :λ, :btol, :conlim, :at
       solved_resid_lim = rNorm ≤ btol + atol * Anorm * xNorm / β₁
       solved = solved_mach | solved_lim | solved_resid_tol | solved_resid_lim
 
-      ill_cond_mach = one(T) + one(T) / Acond ≤ one(T)
-      ill_cond_lim = 1 / Acond ≤ ctol
+      ill_cond_mach = one(T) + inv(Acond) ≤ one(T)
+      ill_cond_lim = inv(Acond) ≤ ctol
       ill_cond = ill_cond_mach | ill_cond_lim
 
       user_requested_exit = callback(solver) :: Bool