
Commit 3259cd2

Add LineSearchTestCase (#177)

* Add LineSearchTestCase. Also includes the failing case in PR #174.
* Add caching to all line search algorithms
* Add to docs
* Test caching for all algorithms

Co-authored-by: Mateusz Baran <[email protected]>
1 parent ded667a commit 3259cd2

File tree

15 files changed (+238 −25 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
 docs/build
 docs/src/examples/generated
 /docs/Manifest.toml
+Manifest.toml

Project.toml

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,6 @@
 name = "LineSearches"
 uuid = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
-version = "7.2.0"
+version = "7.3.0"

 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -13,6 +13,8 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 DoubleFloats = "1"
 NLSolversBase = "7"
 NaNMath = "1"
+Optim = "1"
+OptimTestProblems = "2"
 Parameters = "0.10, 0.11, 0.12"
 julia = "1.6"

docs/src/index.md

Lines changed: 8 additions & 0 deletions
@@ -46,6 +46,14 @@ using LineSearches
 ```
 to load the package.

+## Debugging
+
+If you suspect a method of suboptimal performance or find that your code errors,
+create a [`LineSearchCache`](@ref) to record intermediate values for later
+inspection and analysis. If you're using this via Optim.jl, configure it inside
+the method, e.g., `Newton(linesearch=LineSearches.MoreThuente(; cache))`. The
+values stored in the cache reflect the most recent line search performed during
+the optimization.

 ## References
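To make the workflow above concrete, here is a minimal sketch. It assumes the zero-argument `LineSearchCache{Float64}()` constructor from `types.jl` (not shown in this diff) and Optim.jl's stock `optimize` API; the objective `f` is a toy function chosen for illustration:

```julia
using Optim, LineSearches

cache = LineSearchCache{Float64}()
ls = LineSearches.MoreThuente(; cache)

f(x) = sum(abs2, x .- [1.0, 2.0])  # toy objective, for illustration only
res = optimize(f, zeros(2), Newton(linesearch = ls); autodiff = :forward)

# The cache reflects the most recent line search performed:
@show cache.alphas cache.values cache.slopes
```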

docs/src/reference/linesearch.md

Lines changed: 6 additions & 0 deletions
@@ -11,3 +11,9 @@ MoreThuente
 Static
 StrongWolfe
 ```
+
+## Debugging
+
+```@docs
+LineSearchCache
+```

src/LineSearches.jl

Lines changed: 23 additions & 4 deletions
@@ -1,5 +1,3 @@
-__precompile__()
-
 module LineSearches

 using Printf
@@ -9,13 +7,14 @@ using Parameters, NaNMath
 import NLSolversBase
 import NLSolversBase: AbstractObjective

-export LineSearchException
+export LineSearchException, LineSearchCache

-export BackTracking, HagerZhang, Static, MoreThuente, StrongWolfe
+export AbstractLineSearch, BackTracking, HagerZhang, Static, MoreThuente, StrongWolfe

 export InitialHagerZhang, InitialStatic, InitialPrevious,
        InitialQuadratic, InitialConstantChange

+
 function make_ϕ(df, x_new, x, s)
     function ϕ(α)
         # Move a distance of alpha in the direction of s
@@ -91,6 +90,26 @@ end

 include("types.jl")

+# The following don't extend `empty!` and `push!` because we want implementations for `nothing`,
+# and that would be piracy.
+emptycache!(cache::LineSearchCache) = begin
+    empty!(cache.alphas)
+    empty!(cache.values)
+    empty!(cache.slopes)
+end
+emptycache!(::Nothing) = nothing
+pushcache!(cache::LineSearchCache, α, val, slope) = begin
+    push!(cache.alphas, α)
+    push!(cache.values, val)
+    push!(cache.slopes, slope)
+end
+pushcache!(cache::LineSearchCache, α, val) = begin
+    push!(cache.alphas, α)
+    push!(cache.values, val)
+end
+pushcache!(::Nothing, α, val, slope) = nothing
+pushcache!(::Nothing, α, val) = nothing
+
 # Line Search Methods
 include("backtracking.jl")
 include("strongwolfe.jl")
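As a concrete illustration of the helpers above, here is a small usage sketch (again assuming the `LineSearchCache{Float64}()` constructor from `types.jl`). Because the `::Nothing` methods are no-ops, call sites need no `if cache !== nothing` guards for these two operations:

```julia
using LineSearches

cache = LineSearchCache{Float64}()
LineSearches.pushcache!(cache, 0.5, 0.25, -1.0)  # records α, ϕ(α), ϕ'(α)
LineSearches.emptycache!(cache)                  # clears all three vectors

# With caching disabled, the same calls silently do nothing:
LineSearches.pushcache!(nothing, 0.5, 0.25, -1.0)
LineSearches.emptycache!(nothing)
```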

src/backtracking.jl

Lines changed: 8 additions & 2 deletions
@@ -8,13 +8,14 @@ there exists a factor ρ = ρ(c₁) such that α' ≦ ρ α.

 This is a modification of the algorithm described in Nocedal Wright (2nd ed), Sec. 3.5.
 """
-@with_kw struct BackTracking{TF, TI}
+@with_kw struct BackTracking{TF, TI} <: AbstractLineSearch
     c_1::TF = 1e-4
     ρ_hi::TF = 0.5
     ρ_lo::TF = 0.1
     iterations::TI = 1_000
     order::TI = 3
     maxstep::TF = Inf
+    cache::Union{Nothing,LineSearchCache{TF}} = nothing
 end
 BackTracking{TF}(args...; kwargs...) where TF = BackTracking{TF,Int}(args...; kwargs...)

@@ -37,7 +38,9 @@ end

 # TODO: Should we deprecate the interface that only uses the ϕ argument?
 function (ls::BackTracking)(ϕ, αinitial::Tα, ϕ_0, dϕ_0) where Tα
-    @unpack c_1, ρ_hi, ρ_lo, iterations, order = ls
+    @unpack c_1, ρ_hi, ρ_lo, iterations, order, cache = ls
+    emptycache!(cache)
+    pushcache!(cache, 0, ϕ_0, dϕ_0) # backtracking doesn't use the slope except here

     iterfinitemax = -log2(eps(real(Tα)))

@@ -68,6 +71,8 @@ function (ls::BackTracking)(ϕ, αinitial::Tα, ϕ_0, dϕ_0) where Tα

         ϕx_1 = ϕ(α_2)
     end
+    pushcache!(cache, αinitial, ϕx_1)
+    # TODO: check if value is finite (maybe iterfinite > iterfinitemax)

     # Backtrack until we satisfy sufficient decrease condition
     while ϕx_1 > ϕ_0 + c_1 * α_2 * dϕ_0
@@ -112,6 +117,7 @@ function (ls::BackTracking)(ϕ, αinitial::Tα, ϕ_0, dϕ_0) where Tα

         # Evaluate f(x) at proposed position
         ϕx_0, ϕx_1 = ϕx_1, ϕ(α_2)
+        pushcache!(cache, α_2, ϕx_1)
     end

     return α_2, ϕx_1
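The functional interface in the hunk above can also be exercised directly, without an optimizer. A hedged sketch with a toy one-dimensional slice ϕ(α) = f(x + αs):

```julia
using LineSearches

cache = LineSearchCache{Float64}()
ls = BackTracking(; cache)

ϕ(α) = (α - 1)^2                  # toy slice; ϕ(0) = 1, ϕ'(0) = -2
α, ϕα = ls(ϕ, 1.0, ϕ(0.0), -2.0)  # αinitial = 1.0

@show α ϕα cache.alphas cache.values
```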

src/hagerzhang.jl

Lines changed: 14 additions & 7 deletions
@@ -80,7 +80,7 @@ Conjugate gradient line search implementation from:
   conjugate gradient method with guaranteed descent. ACM
   Transactions on Mathematical Software 32: 113–137.
 """
-@with_kw struct HagerZhang{T, Tm}
+@with_kw struct HagerZhang{T, Tm} <: AbstractLineSearch
     delta::T = DEFAULTDELTA # c_1 Wolfe sufficient decrease condition
     sigma::T = DEFAULTSIGMA # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent)
     alphamax::T = Inf
@@ -91,6 +91,7 @@ Conjugate gradient line search implementation from:
     psi3::T = 0.1
     display::Int = 0
     mayterminate::Tm = Ref{Bool}(false)
+    cache::Union{Nothing,LineSearchCache{T}} = nothing
 end
 HagerZhang{T}(args...; kwargs...) where T = HagerZhang{T, Base.RefValue{Bool}}(args...; kwargs...)

@@ -109,9 +110,11 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
                           phi_0::Real,
                           dphi_0::Real) where T # Should c and phi_0 be same type?
     @unpack delta, sigma, alphamax, rho, epsilon, gamma,
-        linesearchmax, psi3, display, mayterminate = ls
+        linesearchmax, psi3, display, mayterminate, cache = ls
+    emptycache!(cache)

     zeroT = convert(T, 0)
+    pushcache!(cache, zeroT, phi_0, dphi_0)
     if !(isfinite(phi_0) && isfinite(dphi_0))
         throw(LineSearchException("Value and slope at step length = 0 must be finite.", T(0)))
     end
@@ -124,9 +127,13 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
     # Prevent values of x_new = x+αs that are likely to make
     # ϕ(x_new) infinite
     iterfinitemax::Int = ceil(Int, -log2(eps(T)))
-    alphas = [zeroT] # for bisection
-    values = [phi_0]
-    slopes = [dphi_0]
+    if cache !== nothing
+        @unpack alphas, values, slopes = cache
+    else
+        alphas = [zeroT] # for bisection
+        values = [phi_0]
+        slopes = [dphi_0]
+    end
     if display & LINESEARCH > 0
         println("New linesearch")
     end
@@ -203,10 +210,10 @@ function (ls::HagerZhang)(ϕ, ϕdϕ,
     else
         # We're still going downhill, expand the interval and try again.
         # Reaching this branch means that dphi_c < 0 and phi_c <= phi_0 + ϵ_k
-        # So cold = c has a lower objective than phi_0 up to epsilon.
+        # So cold = c has a lower objective than phi_0 up to epsilon.
         # This makes it a viable step to return if bracketing fails.

-        # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c.
+        # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c.
         # Going back to the loop with c = cold will only result in infinite cycling.
         # So returning (cold, phi_cold) and exiting the line search is the best move.
         cold = c
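For `HagerZhang`, a similar hedged sketch using the `(ϕ, ϕdϕ, c, phi_0, dphi_0)` calling convention visible above; `ϕdϕ` returns a `(value, slope)` pair and the quadratic is illustrative:

```julia
using LineSearches

cache = LineSearchCache{Float64}()
ls = HagerZhang(; cache)

ϕ(α) = (α - 1)^2
ϕdϕ(α) = ((α - 1)^2, 2 * (α - 1))  # (value, slope) of the toy slice

α, ϕα = ls(ϕ, ϕdϕ, 0.5, ϕ(0.0), -2.0)

# With a cache attached, the bisection arrays alias the cache's vectors,
# so every trial point survives the call:
@show cache.alphas cache.values cache.slopes
```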

src/morethuente.jl

Lines changed: 8 additions & 2 deletions
@@ -138,13 +138,14 @@ The line search implementation from:
   Line search algorithms with guaranteed sufficient decrease.
   ACM Transactions on Mathematical Software (TOMS) 20.3 (1994): 286-307.
 """
-@with_kw struct MoreThuente{T}
+@with_kw struct MoreThuente{T} <: AbstractLineSearch
     f_tol::T = 1e-4 # c_1 Wolfe sufficient decrease condition
     gtol::T = 0.9 # c_2 Wolfe curvature condition (Recommend 0.1 for GradientDescent)
     x_tol::T = 1e-8
     alphamin::T = 1e-16
     alphamax::T = 65536.0
     maxfev::Int = 100
+    cache::Union{Nothing,LineSearchCache{T}} = nothing
 end

 function (ls::MoreThuente)(df::AbstractObjective, x::AbstractArray{T},
@@ -161,13 +162,15 @@ function (ls::MoreThuente)(ϕdϕ,
                            alpha::T,
                            ϕ_0,
                            dϕ_0) where T
-    @unpack f_tol, gtol, x_tol, alphamin, alphamax, maxfev = ls
+    @unpack f_tol, gtol, x_tol, alphamin, alphamax, maxfev, cache = ls
+    emptycache!(cache)

     iterfinitemax = -log2(eps(T))
     info = 0
     info_cstep = 1 # Info from step

     zeroT = convert(T, 0)
+    pushcache!(cache, zeroT, ϕ_0, dϕ_0)

     #
     # Check the input parameters for errors.
@@ -236,7 +239,9 @@ function (ls::MoreThuente)(ϕdϕ,
         # Make stmax = (3/2)*alpha < 2alpha in the first iteration below
         stx = (convert(T, 7)/8)*alpha
     end
+    pushcache!(cache, alpha, f, dg)
     # END: Ensure that the initial step provides finite function values
+    # TODO: check if value is finite (maybe iterfinite > iterfinitemax)

     while true
         #
@@ -282,6 +287,7 @@ function (ls::MoreThuente)(ϕdϕ,
         # and compute the directional derivative.
         #
         f, dg = ϕdϕ(alpha)
+        pushcache!(cache, alpha, f, dg)
         nfev += 1 # This includes calls to f() and g!()

         if isapprox(dg, 0, atol=eps(T)) # Should add atol value to MoreThuente
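Unlike `StrongWolfe` below, the `MoreThuente` functor needs only the combined `ϕdϕ` callable. A brief sketch under the same toy-slice assumptions as in the earlier examples:

```julia
using LineSearches

cache = LineSearchCache{Float64}()
ls = MoreThuente(; cache)

ϕdϕ(α) = ((α - 1)^2, 2 * (α - 1))  # (value, slope) of the toy slice
α, ϕα = ls(ϕdϕ, 0.5, 1.0, -2.0)    # ϕ_0 = ϕ(0) = 1, dϕ_0 = -2

@show α cache.alphas cache.values
```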

src/static.jl

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@

 `Static` is intended for methods with well-scaled updates; i.e. Newton, on well-behaved problems.
 """
-struct Static end
+struct Static <: AbstractLineSearch end

 function (ls::Static)(df::AbstractObjective, x, s, α, x_new = similar(x), ϕ_0 = nothing, dϕ_0 = nothing)
     ϕ = make_ϕ(df, x_new, x, s)

src/strongwolfe.jl

Lines changed: 18 additions & 4 deletions
@@ -14,10 +14,11 @@ use `MoreThuente`, `HagerZhang` or `BackTracking`.
 * `c_2 = 0.9` : second (strong) Wolfe condition
 * `ρ = 2.0` : bracket growth
 """
-@with_kw struct StrongWolfe{T}
+@with_kw struct StrongWolfe{T} <: AbstractLineSearch
     c_1::T = 1e-4
     c_2::T = 0.9
     ρ::T = 2.0
+    cache::Union{Nothing,LineSearchCache{T}} = nothing
 end

 """
@@ -49,9 +50,11 @@ Both `alpha` and `ϕ(alpha)` are returned.
 """
 function (ls::StrongWolfe)(ϕ, dϕ, ϕdϕ,
                            alpha0::T, ϕ_0, dϕ_0) where T<:Real
-    @unpack c_1, c_2, ρ = ls
+    @unpack c_1, c_2, ρ, cache = ls
+    emptycache!(cache)

     zeroT = convert(T, 0)
+    pushcache!(cache, zeroT, ϕ_0, dϕ_0)

     # Step-sizes
     a_0 = zeroT
@@ -71,17 +74,21 @@ function (ls::StrongWolfe)(ϕ, dϕ, ϕdϕ,

     while a_i < a_max
         ϕ_a_i = ϕ(a_i)
+        pushcache!(cache, a_i, ϕ_a_i)

         # Test Wolfe conditions
         if (ϕ_a_i > ϕ_0 + c_1 * a_i * dϕ_0) ||
            (ϕ_a_i >= ϕ_a_iminus1 && i > 1)
             a_star = zoom(a_iminus1, a_i,
                           dϕ_0, ϕ_0,
-                          ϕ, dϕ, ϕdϕ)
+                          ϕ, dϕ, ϕdϕ, cache)
             return a_star, ϕ(a_star)
         end

         dϕ_a_i = dϕ(a_i)
+        if cache !== nothing
+            push!(cache.slopes, dϕ_a_i)
+        end

         # Check condition 2
         if abs(dϕ_a_i) <= -c_2 * dϕ_0
@@ -91,7 +98,7 @@ function (ls::StrongWolfe)(ϕ, dϕ, ϕdϕ,
         # Check condition 3
         if dϕ_a_i >= zeroT # FIXME untested!
             a_star = zoom(a_i, a_iminus1,
-                          dϕ_0, ϕ_0, ϕ, dϕ, ϕdϕ)
+                          dϕ_0, ϕ_0, ϕ, dϕ, ϕdϕ, cache)
             return a_star, ϕ(a_star)
         end

@@ -117,6 +124,7 @@ function zoom(a_lo::T,
               ϕ,
               dϕ,
               ϕdϕ,
+              cache,
               c_1::Real = convert(T, 1)/10^4,
               c_2::Real = convert(T, 9)/10) where T

@@ -133,8 +141,10 @@ function zoom(a_lo::T,
         iteration += 1

         ϕ_a_lo, ϕprime_a_lo = ϕdϕ(a_lo)
+        pushcache!(cache, a_lo, ϕ_a_lo, ϕprime_a_lo)

         ϕ_a_hi, ϕprime_a_hi = ϕdϕ(a_hi)
+        pushcache!(cache, a_hi, ϕ_a_hi, ϕprime_a_hi)

         # Interpolate a_j
         if a_lo < a_hi
@@ -150,6 +160,7 @@ function zoom(a_lo::T,

         # Evaluate ϕ(a_j)
         ϕ_a_j = ϕ(a_j)
+        pushcache!(cache, a_j, ϕ_a_j)

         # Check Armijo
         if (ϕ_a_j > ϕ_0 + c_1 * a_j * dϕ_0) ||
@@ -158,6 +169,9 @@ function zoom(a_lo::T,
         else
             # Evaluate ϕprime(a_j)
             ϕprime_a_j = dϕ(a_j)
+            if cache !== nothing
+                push!(cache.slopes, ϕprime_a_j)
+            end

             if abs(ϕprime_a_j) <= -c_2 * dϕ_0
                 return a_j
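Note that `StrongWolfe` and `zoom` record slopes only at points where the derivative is actually evaluated, so `cache.slopes` may end up shorter than `cache.alphas` after a call. A hedged sketch, with the same toy slice as in the earlier examples:

```julia
using LineSearches

cache = LineSearchCache{Float64}()
ls = StrongWolfe(; cache)

ϕ(α) = (α - 1)^2
dϕ(α) = 2 * (α - 1)
ϕdϕ(α) = (ϕ(α), dϕ(α))

α, ϕα = ls(ϕ, dϕ, ϕdϕ, 0.25, ϕ(0.0), dϕ(0.0))
@show length(cache.alphas) length(cache.slopes)
```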
