diff --git a/src/hagerzhang.jl b/src/hagerzhang.jl index 9d1be56..0076a20 100644 --- a/src/hagerzhang.jl +++ b/src/hagerzhang.jl @@ -7,24 +7,11 @@ # Code comments such as "HZ, stage X" or "HZ, eqs Y" are with # reference to a particular point in this paper. # -# There are some modifications and/or extensions from what's in the -# paper (these may or may not be extensions of the cg_descent code -# that can be downloaded from Hager's site; his code has undergone -# numerous revisions since publication of the paper): -# linesearch: the Wolfe conditions are checked only after alpha is -# generated either by quadratic interpolation or secant -# interpolation, not when alpha is generated by bisection or -# expansion. This increases the likelihood that alpha will be a -# good approximation of the minimum. +# Enhancements: # -# linesearch: In step I2, we multiply by psi2 only if the convexity -# test failed, not if the function-value test failed. This -# prevents one from going uphill further when you already know -# you're already higher than the point at alpha=0. +# - checks for Inf/NaN function values # -# both: checks for Inf/NaN function values -# -# both: support maximum value of alpha (equivalently, c). This +# - support maximum value of alpha (equivalently, c). This # facilitates using these routines for constrained minimization # when you can calculate the distance along the path to the # disallowed region. (When you can't easily calculate that @@ -36,35 +23,9 @@ # below. The default value for alphamax is Inf. See alphamaxfunc # for cgdescent and alphamax for HagerZhang. - - -# TODO: Remove these bitfield things and create a proper -# tracing functionality instead - -# Display flags are represented as a bitfield -# (not exported, but can use via LineSearches.ITER, for example) -const one64 = convert(UInt64, 1) -const FINAL = one64 -const ITER = one64 << 1 -const PARAMETERS = one64 << 2 -const GRADIENT = one64 << 3 -const SEARCHDIR = one64 << 4 -const ALPHA = one64 << 5 -const BETA = one64 << 6 -# const ALPHAGUESS = one64 << 7 TODO: not needed -const BRACKET = one64 << 8 -const LINESEARCH = one64 << 9 -const UPDATE = one64 << 10 -const SECANT2 = one64 << 11 -const BISECT = one64 << 12 -const BARRIERCOEF = one64 << 13 -display_nextbit = 14 - - const DEFAULTDELTA = 0.1 # Values taken from HZ paper (Nocedal & Wright recommends 0.01?) const DEFAULTSIGMA = 0.9 # Values taken from HZ paper (Nocedal & Wright recommends 0.1 for GradientDescent) - # NOTE: # [1] The type `T` in the `HagerZhang{T}` need not be the same `T` as in # `hagerzhang!{T}`; in the latter, `T` comes from the input vector `x`. @@ -86,10 +47,12 @@ Conjugate gradient line search implementation from: alphamax::T = Inf rho::T = 5.0 epsilon::T = 1e-6 + epsilonk::Union{Nothing,Base.RefValue{T}} = nothing gamma::T = 0.66 + theta::T = 0.5 linesearchmax::Int = 50 psi3::T = 0.1 - display::Int = 0 + display::Int = 0 # non-functional mayterminate::Tm = Ref{Bool}(false) cache::Union{Nothing,LineSearchCache{T}} = nothing end @@ -103,427 +66,150 @@ function (ls::HagerZhang)(df::AbstractObjective, x::AbstractArray{T}, end (ls::HagerZhang)(ϕ, dϕ, ϕdϕ, c, phi_0, dphi_0) = ls(ϕ, ϕdϕ, c, phi_0, dphi_0) - -# TODO: Should we deprecate the interface that only uses the ϕ and ϕd\phi arguments? -function (ls::HagerZhang)(ϕ, ϕdϕ, - c::T, - phi_0::Real, - dphi_0::Real) where T # Should c and phi_0 be same type? - @unpack delta, sigma, alphamax, rho, epsilon, gamma, - linesearchmax, psi3, display, mayterminate, cache = ls - emptycache!(cache) - - zeroT = convert(T, 0) - pushcache!(cache, zeroT, phi_0, dphi_0) - if !(isfinite(phi_0) && isfinite(dphi_0)) - throw(LineSearchException("Value and slope at step length = 0 must be finite.", T(0))) - end - if dphi_0 >= eps(T) * abs(phi_0) - throw(LineSearchException("Search direction is not a direction of descent.", T(0))) - elseif dphi_0 >= 0 - return zeroT, phi_0 - end - - # Prevent values of x_new = x+αs that are likely to make - # ϕ(x_new) infinite - iterfinitemax::Int = ceil(Int, -log2(eps(T))) - if cache !== nothing - @unpack alphas, values, slopes = cache +(ls::HagerZhang)(ϕ, ϕdϕ, c, phi_0, dphi_0) = ls(ϕdϕ, c, phi_0, dphi_0) + +# TODO: Should we deprecate the interface that only uses the ϕ and ϕdϕ arguments? +function (ls::HagerZhang)(ϕdϕ, + c::Tα, + phi_0::Tϕ, + dphi_0::Real) where {Tα, Tϕ} + @unpack alphamax, gamma, linesearchmax, cache = ls + if cache === nothing + cache = LineSearchCache{Tα}() # this won't be seen by the user, but it's used internally else - alphas = [zeroT] # for bisection - values = [phi_0] - slopes = [dphi_0] - end - if display & LINESEARCH > 0 - println("New linesearch") - end - - - phi_lim = phi_0 + epsilon * abs(phi_0) - @assert c >= 0 - c <= eps(T) && return zeroT, phi_0 - @assert isfinite(c) && c <= alphamax - phi_c, dphi_c = ϕdϕ(c) - iterfinite = 1 - while !(isfinite(phi_c) && isfinite(dphi_c)) && iterfinite < iterfinitemax - mayterminate[] = false - iterfinite += 1 - c *= psi3 - phi_c, dphi_c = ϕdϕ(c) - end - if !(isfinite(phi_c) && isfinite(dphi_c)) - @warn("Failed to achieve finite new evaluation point, using alpha=0") - mayterminate[] = false # reset in case another initial guess is used next - return zeroT, phi_0 + eltype(cache) === Tα || throw(LineSearchException("Element type of line search cache doesn't match location type.", zero(Tα))) + emptycache!(cache) end - push!(alphas, c) - push!(values, phi_c) - push!(slopes, dphi_c) + ϵₖ = Tα(ls.epsilonk !== nothing ? ls.epsilonk[] : ls.epsilon * abs(phi_0)) + γ = Tα(ls.gamma) - # If c was generated by quadratic interpolation, check whether it - # satisfies the Wolfe conditions - if mayterminate[] && - satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) - if display & LINESEARCH > 0 - println("Wolfe condition satisfied on point alpha = ", c) - end - mayterminate[] = false # reset in case another initial guess is used next - return c, phi_c # phi_c - end - # Initial bracketing step (HZ, stages B0-B3) - isbracketed = false - ia = 1 - ib = 2 - @assert length(alphas) == 2 - iter = 1 - cold = -one(T) - while !isbracketed && iter < linesearchmax - if display & BRACKET > 0 - println("bracketing: ia = ", ia, - ", ib = ", ib, - ", c = ", c, - ", phi_c = ", phi_c, - ", dphi_c = ", dphi_c) - end - if dphi_c >= zeroT - # We've reached the upward slope, so we have b; examine - # previous values to find a - ib = length(alphas) - for i = (ib - 1):-1:1 - if values[i] <= phi_lim - ia = i - break - end - end - isbracketed = true - elseif values[end] > phi_lim - # The value is higher, but the slope is downward, so we must - # have crested over the peak. Use bisection. - ib = length(alphas) - ia = 1 - if c ≉ alphas[ib] || slopes[ib] >= zeroT - error("c = ", c) - end - # ia, ib = bisect(phi, lsr, ia, ib, phi_lim) # TODO: Pass options - ia, ib = bisect!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, display) - isbracketed = true - else - # We'll still going downhill, expand the interval and try again. - # Reaching this branch means that dphi_c < 0 and phi_c <= phi_0 + ϵ_k - # So cold = c has a lower objective than phi_0 up to epsilon. - # This makes it a viable step to return if bracketing fails. - - # Bracketing can fail if no cold < c <= alphamax can be found with finite phi_c and dphi_c. - # Going back to the loop with c = cold will only result in infinite cycling. - # So returning (cold, phi_cold) and exiting the line search is the best move. - cold = c - phi_cold = phi_c - if nextfloat(cold) >= alphamax - mayterminate[] = false # reset in case another initial guess is used next - return cold, phi_cold - end - c *= rho - if c > alphamax - c = alphamax - if display & BRACKET > 0 - println("bracket: exceeding alphamax, using c = alphamax = ", alphamax, - ", cold = ", cold) - end - end - phi_c, dphi_c = ϕdϕ(c) - iterfinite = 1 - while !(isfinite(phi_c) && isfinite(dphi_c)) && c > nextfloat(cold) && iterfinite < iterfinitemax - alphamax = c # shrinks alphamax, assumes that steps >= c can never have finite phi_c and dphi_c - iterfinite += 1 - if display & BRACKET > 0 - println("bracket: non-finite value, bisection") - end - c = (cold + c) / 2 - phi_c, dphi_c = ϕdϕ(c) - end - if !(isfinite(phi_c) && isfinite(dphi_c)) - if display & BRACKET > 0 - println("Warning: failed to expand interval to bracket with finite values. If this happens frequently, check your function and gradient.") - println("c = ", c, - ", alphamax = ", alphamax, - ", phi_c = ", phi_c, - ", dphi_c = ", dphi_c) - end - return cold, phi_cold - end - push!(alphas, c) - push!(values, phi_c) - push!(slopes, dphi_c) - end - iter += 1 - end - while iter < linesearchmax - a = alphas[ia] - b = alphas[ib] - @assert b > a - if display & LINESEARCH > 0 - println("linesearch: ia = ", ia, - ", ib = ", ib, - ", a = ", a, - ", b = ", b, - ", phi(a) = ", values[ia], - ", phi(b) = ", values[ib]) - end - if b - a <= eps(b) - mayterminate[] = false # reset in case another initial guess is used next - return a, values[ia] # lsr.value[ia] - end - iswolfe, iA, iB = secant2!(ϕdϕ, alphas, values, slopes, ia, ib, phi_lim, delta, sigma, display) - if iswolfe - mayterminate[] = false # reset in case another initial guess is used next - return alphas[iA], values[iA] # lsr.value[iA] - end - A = alphas[iA] - B = alphas[iB] - @assert B > A - if B - A < gamma * (b - a) - if display & LINESEARCH > 0 - println("Linesearch: secant succeeded") - end - if nextfloat(values[ia]) >= values[ib] && nextfloat(values[iA]) >= values[iB] - # It's so flat, secant didn't do anything useful, time to quit - if display & LINESEARCH > 0 - println("Linesearch: secant suggests it's flat") - end - mayterminate[] = false # reset in case another initial guess is used next - return A, values[iA] - end - ia = iA - ib = iB - else - # Secant is converging too slowly, use bisection - if display & LINESEARCH > 0 - println("Linesearch: secant failed, using bisection") - end - c = (A + B) / convert(T, 2) - - phi_c, dphi_c = ϕdϕ(c) - @assert isfinite(phi_c) && isfinite(dphi_c) - push!(alphas, c) - push!(values, phi_c) - push!(slopes, dphi_c) - - ia, ib = update!(ϕdϕ, alphas, values, slopes, iA, iB, length(alphas), phi_lim, display) + pushcache!(cache, zero(Tα), phi_0, dphi_0) + if !(isfinite(phi_0) && isfinite(dphi_0)) + throw(LineSearchException("Value and slope at step length = 0 must be finite.", zero(Tα))) + end + c > zero(Tα) || throw(LineSearchException("Line search step `c` must be positive.", zero(Tα))) + if dphi_0 * eps(c) >= abs(phi_0) + throw(LineSearchException("Search direction is not a direction of descent.", zero(Tα))) + elseif dphi_0 >= zero(dphi_0) + return zero(Tα), phi_0 + end + + j = 0 + aⱼ, bⱼ, terminate = bracket(ϕdϕ, c, phi_0, dphi_0, ϵₖ, ls, cache) # L0 + while !terminate && j < linesearchmax + a, b, terminate = secant²(ϕdϕ, aⱼ, bⱼ, phi_0, dphi_0, ϵₖ, ls, cache) # L1 + terminate && break + if b - a > γ * (bⱼ - aⱼ) # L2 + c = (a + b) / 2 + a, b, terminate = update(ϕdϕ, a, b, c, phi_0, dphi_0, ϵₖ, ls, cache) end - iter += 1 + aⱼ, bⱼ = a, b # L3 + j += 1 end + j >= linesearchmax && throw(LineSearchException("Line search failed to converge, reached maximum iterations $(linesearchmax).", aⱼ)) - throw(LineSearchException("Linesearch failed to converge, reached maximum iterations $(linesearchmax).", - alphas[ia])) - - -end - -# Check Wolfe & approximate Wolfe -function satisfies_wolfe(c::T, - phi_c::Real, - dphi_c::Real, - phi_0::Real, - dphi_0::Real, - phi_lim::Real, - delta::Real, - sigma::Real) where T<:Number - wolfe1 = delta * dphi_0 >= (phi_c - phi_0) / c && - dphi_c >= sigma * dphi_0 - wolfe2 = (2 * delta - 1) * dphi_0 >= dphi_c >= sigma * dphi_0 && - phi_c <= phi_lim - return wolfe1 || wolfe2 + ls.mayterminate[] = false + return cache.alphas[end], cache.values[end] end -# HZ, stages S1-S4 -function secant(a::Real, b::Real, dphi_a::Real, dphi_b::Real) - return (a * dphi_b - b * dphi_a) / (dphi_b - dphi_a) +function satisfies_wolfe(ls::HagerZhang, + α::T, + ϕ_α::Real, + dϕ_α::Real, + ϕ_0::Real, + dϕ_0::Real, + ϵₖ::Real) where T<:Number + δ, σ = ls.delta, ls.sigma + δ * dϕ_0 * α >= ϕ_α - ϕ_0 && dϕ_α >= σ * dϕ_0 && return true # T1 + return (2 * δ - 1) * dϕ_0 >= dϕ_α >= σ * dϕ_0 && ϕ_α <= ϕ_0 + ϵₖ # T2 end -function secant(alphas, values, slopes, ia::Integer, ib::Integer) - return secant(alphas[ia], alphas[ib], slopes[ia], slopes[ib]) -end -# phi -function secant2!(ϕdϕ, - alphas, - values, - slopes, - ia::Integer, - ib::Integer, - phi_lim::Real, - delta::Real = DEFAULTDELTA, - sigma::Real = DEFAULTSIGMA, - display::Integer = 0) - phi_0 = values[1] - dphi_0 = slopes[1] - a = alphas[ia] - b = alphas[ib] - dphi_a = slopes[ia] - dphi_b = slopes[ib] - T = eltype(slopes) - zeroT = convert(T, 0) - if !(dphi_a < zeroT && dphi_b >= zeroT) - error(string("Search direction is not a direction of descent; ", - "this error may indicate that user-provided derivatives are inaccurate. ", - @sprintf "(dphi_a = %f; dphi_b = %f)" dphi_a dphi_b)) - end - c = secant(a, b, dphi_a, dphi_b) - if display & SECANT2 > 0 - println("secant2: a = ", a, ", b = ", b, ", c = ", c) - end - @assert isfinite(c) - # phi_c = phi(tmpc, c) # Replace - phi_c, dphi_c = ϕdϕ(c) - @assert isfinite(phi_c) && isfinite(dphi_c) - push!(alphas, c) - push!(values, phi_c) - push!(slopes, dphi_c) - - ic = length(alphas) - if satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) - if display & SECANT2 > 0 - println("secant2: first c satisfied Wolfe conditions") - end - return true, ic, ic - end - - iA, iB = update!(ϕdϕ, alphas, values, slopes, ia, ib, ic, phi_lim, display) - if display & SECANT2 > 0 - println("secant2: iA = ", iA, ", iB = ", iB, ", ic = ", ic) - end - a = alphas[iA] - b = alphas[iB] - doupdate = false - if iB == ic - # we updated b, make sure we also update a - c = secant(alphas, values, slopes, ib, iB) - elseif iA == ic - # we updated a, do it for b too - c = secant(alphas, values, slopes, ia, iA) - end - if (iA == ic || iB == ic) && a <= c <= b - if display & SECANT2 > 0 - println("secant2: second c = ", c) +function bracket(ϕdϕ::Function, + c::Tα, + ϕ_0::Tϕ, + dϕ_0::Real, + ϵₖ::Real, + ls::HagerZhang, + cache::LineSearchCache) where {Tα, Tϕ} + δ, σ, ρ = ls.delta, ls.sigma, ls.rho + δ, σ, ρ = Tα(δ), Tα(σ), Tα(ρ) + j, cⱼ = 0, c + while true + ϕ_j, dϕ_j = ϕdϕ(cⱼ) + pushcache!(cache, cⱼ, ϕ_j, dϕ_j) + if iszero(dϕ_j) || j > 0 || ls.mayterminate[] + satisfies_wolfe(ls, cⱼ, ϕ_j, dϕ_j, ϕ_0, dϕ_0, ϵₖ) && return zero(Tα), cⱼ, true end - # phi_c = phi(tmpc, c) # TODO: Replace - phi_c, dphi_c = ϕdϕ(c) - @assert isfinite(phi_c) && isfinite(dphi_c) - - push!(alphas, c) - push!(values, phi_c) - push!(slopes, dphi_c) - - ic = length(alphas) - # Check arguments here - if satisfies_wolfe(c, phi_c, dphi_c, phi_0, dphi_0, phi_lim, delta, sigma) - if display & SECANT2 > 0 - println("secant2: second c satisfied Wolfe conditions") + j += 1 + if dϕ_j >= zero(dϕ_j) # B1 + b = cⱼ + i = j + while true + i == 0 && @show j cache.values ϕ_0 ϵₖ + cache.values[i] < ϕ_0 + ϵₖ && break + i -= 1 end - return true, ic, ic + return cache.alphas[i], b, false end - iA, iB = update!(ϕdϕ, alphas, values, slopes, iA, iB, ic, phi_lim, display) - end - if display & SECANT2 > 0 - println("secant2 output: a = ", alphas[iA], ", b = ", alphas[iB]) + if ϕ_j > ϕ_0 + ϵₖ # B2 + return update3(ϕdϕ, zero(Tα), cⱼ, ϕ_0, dϕ_0, ϵₖ, ls, cache) + end + cⱼ = ρ * cⱼ end - return false, iA, iB end -# HZ, stages U0-U3 -# Given a third point, pick the best two that retain the bracket -# around the minimum (as defined by HZ, eq. 29) -# b will be the upper bound, and a the lower bound -function update!(ϕdϕ, - alphas, - values, - slopes, - ia::Integer, - ib::Integer, - ic::Integer, - phi_lim::Real, - display::Integer = 0) - a = alphas[ia] - b = alphas[ib] - T = eltype(slopes) - zeroT = convert(T, 0) - # Debugging (HZ, eq. 4.4): - @assert slopes[ia] < zeroT - @assert values[ia] <= phi_lim - @assert slopes[ib] >= zeroT - @assert b > a - c = alphas[ic] - phi_c = values[ic] - dphi_c = slopes[ic] - if display & UPDATE > 0 - println("update: ia = ", ia, - ", a = ", a, - ", ib = ", ib, - ", b = ", b, - ", c = ", c, - ", phi_c = ", phi_c, - ", dphi_c = ", dphi_c) - end - if c < a || c > b - return ia, ib #, 0, 0 # it's out of the bracketing interval - end - if dphi_c >= zeroT - return ia, ic #, 0, 0 # replace b with a closer point +# Step U3 +function update3(ϕdϕ, ā, b̄, ϕ_0, dϕ_0, ϵₖ, ls, cache) + θ = ls.theta + while true + d = (1 - θ) * ā + θ * b̄ + @assert ā < d < b̄ + ϕ_d, dϕ_d = ϕdϕ(d) + pushcache!(cache, d, ϕ_d, dϕ_d) + satisfies_wolfe(ls, d, ϕ_d, dϕ_d, ϕ_0, dϕ_0, ϵₖ) && return ā, d, true + dϕ_d >= zero(dϕ_d) && return ā, d, false + if ϕ_d <= ϕ_0 + ϵₖ + ā = d + else + b̄ = d + end end - # We know dphi_c < 0. However, phi may not be monotonic between a - # and c, so check that the value is also smaller than phi_0. (It's - # more dangerous to replace a than b, since we're leaving the - # secure environment of alpha=0; that's why we didn't check this - # above.) - if phi_c <= phi_lim - return ic, ib#, 0, 0 # replace a +end + +function update(ϕdϕ, a, b, c, phi_0, dphi_0, ϵₖ, ls, cache) + a < c < b || return a, b, false # U0 + # Custom modification: sometimes c duplicates a previous value. Since + # computing objectives and gradients can be expensive, it seems much better + # to try to look it up. + i = findfirst(==(c), cache.alphas) + if i !== nothing + ϕ_c, dϕ_c = cache.values[i], cache.slopes[i] + else + ϕ_c, dϕ_c = ϕdϕ(c) + pushcache!(cache, c, ϕ_c, dϕ_c) + satisfies_wolfe(ls, c, ϕ_c, dϕ_c, phi_0, dphi_0, ϵₖ) && return a, c, true end - # phi_c is bigger than phi_0, which implies that the minimum - # lies between a and c. Find it via bisection. - return bisect!(ϕdϕ, alphas, values, slopes, ia, ic, phi_lim, display) + dϕ_c >= zero(dϕ_c) && return a, c, false # U1 + ϕ_c <= phi_0 + ϵₖ && return c, b, false # U2 + return update3(ϕdϕ, a, c, phi_0, dphi_0, ϵₖ, ls, cache) # U3 end -# HZ, stage U3 (with theta=0.5) -function bisect!(ϕdϕ, - alphas::AbstractArray{T}, - values, - slopes, - ia::Integer, - ib::Integer, - phi_lim::Real, - display::Integer = 0) where T - gphi = convert(T, NaN) - a = alphas[ia] - b = alphas[ib] - # Debugging (HZ, conditions shown following U3) - zeroT = convert(T, 0) - @assert slopes[ia] < zeroT - @assert values[ia] <= phi_lim - @assert slopes[ib] < zeroT # otherwise we wouldn't be here - @assert values[ib] > phi_lim - @assert b > a - while b - a > eps(b) - if display & BISECT > 0 - println("bisect: a = ", a, ", b = ", b, ", b - a = ", b - a) - end - d = (a + b) / convert(T, 2) - phi_d, gphi = ϕdϕ(d) - @assert isfinite(phi_d) && isfinite(gphi) +function secant²(ϕdϕ, a::T, b::T, phi_0, dphi_0, ϵₖ, ls, cache::LineSearchCache{T}) where T + @assert a < b + c = secant(a, b, cache) # S1 + A, B, terminate = update(ϕdϕ, a, b, c, phi_0, dphi_0, ϵₖ, ls, cache) + terminate && return A, B, true + c != A && c != B && return A, B, false # S4, part b + c̄ = c == B ? secant(b, B, cache) : secant(a, A, cache) # S2 & S3 + return update(ϕdϕ, A, B, c̄, phi_0, dphi_0, ϵₖ, ls, cache) # S4, part a +end - push!(alphas, d) - push!(values, phi_d) - push!(slopes, gphi) +secant(a::T, b::T, cache::LineSearchCache{T}) where T = + secant(a, b, + cache.slopes[findfirst(==(a), cache.alphas)], + cache.slopes[findfirst(==(b), cache.alphas)]) - id = length(alphas) - if gphi >= zeroT - return ia, id # replace b, return - end - if phi_d <= phi_lim - a = d # replace a, but keep bisecting until dphi_b > 0 - ia = id - else - b = d - ib = id - end - end - return ia, ib +function secant(a::Real, b::Real, dϕ_a::Real, dϕ_b::Real) + dϕ_a == dϕ_b && return (a + b)/2 # custom modification + return (a * dϕ_b - b * dϕ_a) / (dϕ_b - dϕ_a) end diff --git a/src/types.jl b/src/types.jl index 98e041d..d16c538 100644 --- a/src/types.jl +++ b/src/types.jl @@ -38,3 +38,6 @@ Because `BackTracking` doesn't use derivatives except at `α=0`, only the initia Other methods may store all three. """ LineSearchCache{T}() where T = LineSearchCache{T}(T[], T[], T[]) + +Base.eltype(::Type{LineSearchCache{T}}) where T = T +Base.eltype(cache::LineSearchCache) = eltype(typeof(cache)) diff --git a/test/captured.jl b/test/captured.jl index 05b81fa..8066db5 100644 --- a/test/captured.jl +++ b/test/captured.jl @@ -21,15 +21,47 @@ end end end -# From PR#174 -@testset "PR#174" begin - tc = LineSearchTestCase( - [0.0, 1.0, 5.0, 3.541670844449739], - [3003.592409634743, 2962.0378569864743, 2891.4462095232184, 3000.9760725116876], - [-22332.321416890798, -20423.214551925797, 11718.185026267562, -22286.821227217057] - ) - fdf = OnceDifferentiable(tc) - hz = HagerZhang() - α, val = hz(fdf.f, fdf.fdf, 1.0, fdf.fdf(0.0)...) - @test_broken val <= minimum(tc) +wolfec1(ls::BackTracking) = ls.c_1 +wolfec1(ls::StrongWolfe) = ls.c_1 +wolfec1(ls::MoreThuente) = ls.f_tol +wolfec1(ls::HagerZhang{T}) where T = zero(T) # HZ uses Wolfe condition, but not necessarily based at zero + +function amongtypes(x, types) + t = typeof(x) + return any(t <: ty for ty in types) +end + +@testset "Captured cases" begin + lsalgs = (HagerZhang(), StrongWolfe(), MoreThuente(), + BackTracking(), BackTracking(; order=2) ) + # From PR#174 + @testset "PR#174" begin + tc = LineSearchTestCase( + [0.0, 1.0, 5.0, 3.541670844449739], + [3003.592409634743, 2962.0378569864743, 2891.4462095232184, 3000.9760725116876], + [-22332.321416890798, -20423.214551925797, 11718.185026267562, -22286.821227217057] + ) + fdf = OnceDifferentiable(tc) + for ls in lsalgs + α, val = ls(fdf.f, fdf.df, fdf.fdf, 1.0, fdf.fdf(0.0)...) + @test 0 < α <= 5 + @test val < fdf.f(0) + wolfec1(ls) * α * fdf.df(0) # Armijo (first Wolfe) condition + if amongtypes(ls, (StrongWolfe, MoreThuente, HagerZhang)) # these types do not just backtrack + @test val <= tc.values[2] + end + end + end + @testset "Issue#175" begin + tc = LineSearchTestCase( + [0.0, 0.2, 0.1, 0.055223623837026156], + [3.042968312396456, 3.1174112871667603, -3.5035848233450224, 0.5244246783151265], + [-832.4270136930788, -505.3362249257043, 674.9478303586366, 738.3388472427769] + ) + fdf = OnceDifferentiable(tc) + for ls in lsalgs + α, val = ls(fdf.f, fdf.df, fdf.fdf, 0.2, fdf.fdf(0.0)...) + @test 0 < α < 0.2 + @test val < fdf.f(0) + wolfec1(ls) * α * fdf.df(0) + end + end end diff --git a/test/issues.jl b/test/issues.jl index d0d01f4..d060c47 100644 --- a/test/issues.jl +++ b/test/issues.jl @@ -19,7 +19,7 @@ gv = similar(x) ϕ0 = fg!(gv, x) s = -1*gv dϕ0 = dot(gv, s) -println(ϕ0, ", ", dϕ0) +# println(ϕ0, ", ", dϕ0) # Univariate line search functions ϕ(α) = f(x .+ α.*s) @@ -36,3 +36,41 @@ end res = (StrongWolfe())(ϕ, dϕ, ϕdϕ, α0, ϕ0, dϕ0) @test res[2] > 0 @test res[2] == ϕ(res[1]) + +# Flatness check in HagerZhang +function makeϕdϕ(a) + @assert axes(a) == (1:2,) + A = a*a' + f(x) = x'*A*x/2 # Hessian has one positive and one zero eigenvalue (in exact arithmetic) + df(x) = A*x + x0 = [a[2], -a[1]] + d = -x0 / 2 + ϕ(α) = f(x0 + α*d) + dϕ(α) = dot(df(x0 + α*d), d) + ϕdϕ(α) = (ϕ(α), dϕ(α)) + return ϕ, dϕ, ϕdϕ +end + +@testset "Flatness" begin + cache = LineSearchCache{Float64}() + lsalgs = (HagerZhang(; cache=cache, epsilonk=Ref(1e-12)), + StrongWolfe(; cache=cache), + MoreThuente(; cache=cache), + BackTracking(; cache=cache), BackTracking(; order=2, cache=cache) + ) + + npass = zeros(Int, length(lsalgs)) + n, nmax = 0, 1000 + while n < nmax + ϕ, dϕ, ϕdϕ = makeϕdϕ(randn(2)) + ϕ0, dϕ0 = ϕdϕ(0) + dϕ0 < -eps(abs(ϕ0)) || continue # any "slope" is just roundoff error, but we want roundoff that looks like descent + n += 1 + for (i, ls) in enumerate(lsalgs) + res = ls(ϕ, dϕ, ϕdϕ, 1.0, ϕ(0.0), dϕ(0.0)) + npass[i] += length(cache.alphas) < 10 + end + end + @test_broken all(npass .== nmax) + @test all(npass .>= 0.99*nmax) +end diff --git a/test/runtests.jl b/test/runtests.jl index 4910d53..ba15c84 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,6 +15,7 @@ my_tests = [ "alphacalc.jl", "arbitrary_precision.jl", "examples.jl", + "issues.jl", "captured.jl" ]