4 changes: 3 additions & 1 deletion Project.toml
@@ -61,7 +61,9 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Aqua", "Distributions", "ExplicitImports", "ForwardDiff", "JET", "MathOptInterface", "Measurements", "OptimTestProblems", "Random", "RecursiveArrayTools", "StableRNGs", "ReverseDiff"]
test = ["Test", "Aqua", "Distributions", "ExplicitImports", "ForwardDiff", "JET", "MathOptInterface", "Measurements", "OptimTestProblems", "Random", "RecursiveArrayTools", "StableRNGs", "ReverseDiff", "StaticArrays", "BlockArrays"]
41 changes: 26 additions & 15 deletions src/multivariate/solvers/second_order/newton.jl
@@ -1,30 +1,36 @@
struct Newton{IL,L} <: SecondOrderOptimizer
struct Newton{IL,L,S} <: SecondOrderOptimizer
alphaguess!::IL
linesearch!::L
solve!::S # mutating solver with signature (d, state, method); writes state.s (and possibly state.F)
end


"""
# Newton
## Constructor
```julia
Newton(; alphaguess = LineSearches.InitialStatic(),
linesearch = LineSearches.HagerZhang())
linesearch = LineSearches.HagerZhang(),
solve = default_newton_solve!)
```

## Description
The `Newton` method implements Newton's method for optimizing a function. We use
a special factorization from the package `PositiveFactorizations.jl` to ensure
The `Newton` method implements Newton's method for optimizing a function.

The `solve` function is called as `solve(d, state, method)` and should mutate `state.s` in place so that `H*s = -g` (it may also set `state.F`).
It defaults to `default_newton_solve!`, which handles both dense and sparse Hessians.
If the Hessian is not an `AbstractSparseMatrix`, we use a special factorization from the package
`PositiveFactorizations.jl` to ensure that each search direction is a direction of descent.
See Nocedal and Wright (ch. 6, 1999) for a discussion of Newton's method in practice.

## References
- Nocedal, J. and S. J. Wright (1999), Numerical optimization. Springer Science 35.67-68: 7.
"""
function Newton(;
alphaguess = LineSearches.InitialStatic(), # Good default for Newton
alphaguess = LineSearches.InitialStatic(),
linesearch = LineSearches.HagerZhang(),
) # Good default for Newton
Newton(_alphaguess(alphaguess), linesearch)
solve = default_newton_solve!
)
Newton(_alphaguess(alphaguess), linesearch, solve)
end

Base.summary(io::IO, ::Newton) = print(io, "Newton's Method")
@@ -57,7 +63,8 @@ function initial_state(method::Newton, options, d, x0)
)
end

function update_state!(d, state::NewtonState, method::Newton)
# Default solver that handles common matrix types intelligently
function default_newton_solve!(d, state::NewtonState, method::Newton)
# Search direction is always the negative gradient divided by
# a matrix encoding the absolute values of the curvatures
# represented by H. It deviates from the usual "add a scaled
@@ -67,9 +74,9 @@ function update_state!(d, state::NewtonState, method::Newton)
if state.H_x isa AbstractSparseMatrix
state.s .= .-(state.H_x \ convert(Vector, state.g_x))
else
# Use PositiveFactorizations for robustness on dense matrices
state.F = cholesky!(Positive, state.H_x)
if state.g_x isa Array
# is this actually StridedArray?
if state.g_x isa StridedArray
ldiv!(state.s, state.F, state.g_x)
state.s .= .-state.s
else
@@ -79,12 +86,16 @@
copyto!(state.s, state.F \ gv)
end
end
# Determine the distance of movement along the search line
lssuccess = perform_linesearch!(state, method, d)
end

# Update current position # x = x + alpha * s
Base.summary(io::IO, ::Newton) = print(io, "Newton's Method")

function update_state!(d, state::NewtonState, method::Newton)
method.solve!(d, state, method) # should mutate state.s (and maybe state.F)

lssuccess = perform_linesearch!(state, method, d)
@. state.x = state.x + state.alpha * state.s
return !lssuccess # break on linesearch error
return !lssuccess
end

function trace!(tr, d, state::NewtonState, iteration::Integer, ::Newton, options::Options, curr_time = time())
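For reference, a minimal usage sketch of the new keyword (not part of the diff): it assumes the PR as merged, and the problem helpers (`quad_f`, `quad_g!`, `quad_h!`) and the solver name (`my_symmetric_solve!`) are illustrative only. The solver follows the same mutating `(d, state, method)` contract that the `wrap_solver` helper in the tests below adapts to.

```julia
using Optim, LinearAlgebra
import NLSolversBase

# A small strictly convex quadratic: f(x) = 0.5 x'Ax - b'x, with minimizer A \ b.
A = [4.0 1.0; 1.0 3.0]
b = [1.0, 2.0]
quad_f(x)     = 0.5 * dot(x, A * x) - dot(b, x)
quad_g!(g, x) = (g .= A * x .- b; nothing)
quad_h!(H, x) = (H .= A; nothing)

# Mutating solver with the (d, state, method) signature used by `update_state!`:
# it writes the Newton step into state.s so that H * s = -g.
function my_symmetric_solve!(d, state, method)
    H = NLSolversBase.hessian(d)
    g = NLSolversBase.gradient(d)
    state.s .= -(Symmetric(H) \ g)   # any linear solve of H * s = -g works here
    return nothing
end

res = optimize(quad_f, quad_g!, quad_h!, zeros(2), Newton(solve = my_symmetric_solve!))
```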
260 changes: 258 additions & 2 deletions test/multivariate/solvers/second_order/newton.jl
@@ -1,4 +1,5 @@
@testset "Newton" begin

function f_1(x::Vector)
(x[1] - 5.0)^4
end
@@ -10,6 +11,7 @@
function h!_1(storage::Matrix, x::Vector)
storage[1, 1] = 12.0 * (x[1] - 5.0)^2
end

initial_x = [0.0]

Optim.optimize(NonDifferentiable(f_1, initial_x), [0.0], Newton())
@@ -20,7 +22,7 @@
@test_throws ErrorException Optim.x_trace(results)
@test Optim.g_converged(results)
@test norm(Optim.minimizer(results) - [5.0]) < 0.01

eta = 0.9

function f_2(x::Vector)
@@ -56,8 +58,262 @@
)
@test norm(Optim.minimizer(res) - prob.solutions) < 1e-9
end

@testset "Optim problems" begin
run_optim_tests(Newton(); skip = ("Trigonometric",), show_name = debug_printing)
end

# Adapt a solver of the form (H, g) -> s to the mutating (d, state, method) interface
wrap_solver(Hg_solver) = (d, state, method) -> begin
H = NLSolversBase.hessian(d)
g = NLSolversBase.gradient(d)
state.s .= Hg_solver(H, g)
nothing
end

@testset "Custom Solvers" begin

# Custom solver using LU decomposition
custom_solve(H, g) = -(lu(H) \ g)

result = optimize(f_2, g!_2, h!_2, [10.0, 20.0], Newton(solve=wrap_solver(custom_solve)))
@test Optim.g_converged(result)
@test norm(Optim.minimizer(result) - [0.0, 0.0]) < 0.01

# Custom solver using QR decomposition
qr_solve(H, g) = -(qr(H) \ g)
result2 = optimize(f_2, g!_2, h!_2, [5.0, 5.0], Newton(solve=wrap_solver(qr_solve)))
@test Optim.g_converged(result2)
@test norm(Optim.minimizer(result2) - [0.0, 0.0]) < 0.01

# Simple solver
simple_solve(H, g) = -(H \ g)
result3 = optimize(f_2, g!_2, h!_2, [3.0, 4.0], Newton(solve=wrap_solver(simple_solve)))
@test Optim.g_converged(result3)
@test norm(Optim.minimizer(result3) - [0.0, 0.0]) < 0.01

# Solver with specialized matrix types -- more (e.g., BlockBanded) can be added later; for now, only StaticArrays is tested
using StaticArrays
static_solve(H, g) = -(SMatrix{2, 2}(H) \ SVector{2}(g))
result_static = optimize(f_2, g!_2, h!_2, [6.0, 7.0], Newton(solve=wrap_solver(static_solve)))
@test Optim.g_converged(result_static)
@test norm(Optim.minimizer(result_static) - [0.0, 0.0]) < 0.01
end

@testset "BlockArray Hessian with custom solver" begin
using BlockArrays

function f(x)
return sum((x .- 1.0).^2)
end

function g!(storage, x)
storage .= 2.0 .* (x .- 1.0)
end

function h!(H::BlockArray{<:Any,2}, x)
for i in 1:size(H.blocks, 1)
blk = H.blocks[i, i]
fill!(blk, 0.0)
for j in 1:min(size(blk)...)
blk[j,j] = 2.0
end
end
return nothing
end

# --- Setup ---
block_sizes = (2, 3)
n = sum(block_sizes)

x0 = zeros(n)
initial_f = f(x0)
initial_g = similar(x0)
g!(initial_g, x0)

H_data = zeros(n, n)
H_block = BlockArray(H_data, [2, 3], [2, 3])
h!(H_block, x0)

custom_solve(H::BlockArray, g) = -(Matrix(H) \ g)

td = TwiceDifferentiable(f, g!, h!, x0, initial_f, initial_g, H_block)
result = optimize(td, x0, Newton(solve=wrap_solver(custom_solve)))

@test Optim.g_converged(result)
@test typeof(NLSolversBase.hessian(td)) <: BlockArray
@test typeof(result.minimizer) == Vector{Float64}
@test norm(result.minimizer .- ones(n)) < 1e-6
end

@testset "Hessian Types" begin
using SparseArrays

# Test sparse solver
sparse_solve(H, g) = -(sparse(H) \ g)
result_sparse = optimize(f_2, g!_2, h!_2, [5.0, 5.0], Newton(solve=wrap_solver(sparse_solve)))
@test Optim.g_converged(result_sparse)
@test norm(Optim.minimizer(result_sparse) - [0.0, 0.0]) < 0.01

# Test default solver handles both dense and sparse correctly
using LinearAlgebra
result_default = optimize(f_2, g!_2, h!_2, [3.0, 4.0], Newton())
@test Optim.g_converged(result_default)
@test norm(Optim.minimizer(result_default) - [0.0, 0.0]) < 0.01
end

@testset "Block Tridiagonal System - Kalman Smoothing" begin
using LinearAlgebra, Random, Optim

# ----------------------------
# Utility: Block tridiagonal solver
# ----------------------------
function block_thomas_solve(H, g, block_size)
n_blocks = length(g) ÷ block_size
# Extract diagonal (B), sub-diagonal (A), and super-diagonal (C) blocks, plus the block RHS d
B = [Matrix(H[(i-1)*block_size+1:i*block_size, (i-1)*block_size+1:i*block_size]) for i in 1:n_blocks]
A = [Matrix(H[i*block_size+1:(i+1)*block_size, (i-1)*block_size+1:i*block_size]) for i in 1:n_blocks-1]
C = [Matrix(H[(i-1)*block_size+1:i*block_size, i*block_size+1:(i+1)*block_size]) for i in 1:n_blocks-1]
d = [g[(i-1)*block_size+1:i*block_size] for i in 1:n_blocks]
B_work = copy(B)
d_work = copy(d)
# Forward elimination (block Thomas algorithm)
for i in 2:n_blocks
L = A[i-1] / B_work[i-1]
B_work[i] .-= L * C[i-1]
d_work[i] .-= L * d_work[i-1]
end
# Back substitution
x = Vector{Vector{Float64}}(undef, n_blocks)
x[end] = B_work[end] \ d_work[end]
for i in n_blocks-1:-1:1
x[i] = B_work[i] \ (d_work[i] - C[i] * x[i+1])
end
return vcat(x...)
end

# ----------------------------
# Problem setup
# ----------------------------
function create_kalman_problem(T::Int, D::Int)
Random.seed!(1)
A = 0.9I + 0.1 * randn(D, D)
C = randn(2, D)
Q = I + 0.1 * randn(D, D); Q = Q'Q
R = I + 0.1 * randn(2, 2); R = R'R
x0 = randn(D)
P0 = Matrix{Float64}(I, D, D)
y = randn(2, T)
w = ones(T)
return (; A, C, Q, R, x0, P0, y, w)
end

# ----------------------------
# Negative log-likelihood
# ----------------------------
function nll(x_vec, p)
X = reshape(x_vec, size(p.A, 1), size(p.y, 2))
R_chol = cholesky(Symmetric(p.R)).U
Q_chol = cholesky(Symmetric(p.Q)).U
P0_chol = cholesky(Symmetric(p.P0)).U

ll = sum(abs2, P0_chol \ (X[:,1] - p.x0))
for t in 1:size(p.y, 2)
if t > 1
ll += sum(abs2, Q_chol \ (X[:,t] - p.A * X[:,t-1]))
end
ll += p.w[t] * sum(abs2, R_chol \ (p.y[:,t] - p.C * X[:,t]))
end
return 0.5 * ll
end

# ----------------------------
# Gradient
# ----------------------------
function g!(g, x_vec, p)
D, T = size(p.A, 1), size(p.y, 2)
X = reshape(x_vec, D, T)
R_chol = cholesky(Symmetric(p.R))
Q_chol = cholesky(Symmetric(p.Q))
P0_chol = cholesky(Symmetric(p.P0))
C_inv_R = (R_chol \ p.C)'
A_inv_Q = (Q_chol \ p.A)'

grad = zeros(D, T)

grad[:,1] .= A_inv_Q * (X[:,2] - p.A * X[:,1]) +
p.w[1] * C_inv_R * (p.y[:,1] - p.C * X[:,1]) -
(P0_chol \ (X[:,1] - p.x0))

for t in 2:T-1
grad[:,t] .= p.w[t] * C_inv_R * (p.y[:,t] - p.C * X[:,t]) -
Q_chol \ (X[:,t] - p.A * X[:,t-1]) +
A_inv_Q * (X[:,t+1] - p.A * X[:,t])
end

grad[:,T] .= p.w[T] * C_inv_R * (p.y[:,T] - p.C * X[:,T]) -
Q_chol \ (X[:,T] - p.A * X[:,T-1])

g .= vec(-grad)
return nothing
end

# ----------------------------
# Hessian
# ----------------------------
function h!(H, x_vec, p)
D, T = size(p.A, 1), size(p.y, 2)
inv_R = inv(p.R)
inv_Q = inv(p.Q)
inv_P0 = inv(p.P0)
yt_xt = p.C' * inv_R * p.C
xt_xt_1 = inv_Q
xt1_xt = p.A' * inv_Q * p.A
x0_term = inv_P0

diag = Vector{Matrix{Float64}}(undef, T)
sub = fill(-inv_Q * p.A, T-1)
sup = fill(-p.A' * inv_Q, T-1)

diag[1] = p.w[1] * yt_xt + xt1_xt + x0_term
for t in 2:T-1
diag[t] = p.w[t] * yt_xt + xt_xt_1 + xt1_xt
end
diag[T] = p.w[T] * yt_xt + xt_xt_1

H_mat = zeros(D*T, D*T)
for t in 1:T
idx = (t-1)*D + 1 : t*D
H_mat[idx, idx] = diag[t]
if t < T
H_mat[idx, idx .+ D] = sup[t]
H_mat[idx .+ D, idx] = sub[t]
end
end
mul!(H, -1.0, H_mat)
return nothing
end

# ----------------------------
# Main test
# ----------------------------
function run()
T, D = 50, 4
p = create_kalman_problem(T, D)
x0 = zeros(T * D)

f = x -> nll(x, p)
g = (storage, x) -> g!(storage, x, p)
h = (storage, x) -> h!(storage, x, p)

println("Standard Newton:")
res1 = optimize(f, g, h, copy(x0), Newton())
println(" Iterations: ", Optim.iterations(res1))

println("Block Thomas Newton:")
res2 = optimize(f, g, h, copy(x0), Newton(solve = wrap_solver((H, g) -> block_thomas_solve(H, g, D))), Optim.Options(show_trace=true))
println(" Iterations: ", Optim.iterations(res2))

@test Optim.g_converged(res1)
@test Optim.g_converged(res2)
@test norm(Optim.minimizer(res1) - Optim.minimizer(res2)) < 1e-8
end

run()
end
end

2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -10,7 +10,7 @@ using Random
import LineSearches
import NLSolversBase
import NLSolversBase: clear!
import LinearAlgebra: norm, diag, I, Diagonal, dot, eigen, issymmetric, mul!
import LinearAlgebra: norm, diag, I, Diagonal, dot, eigen, issymmetric, mul!, lu, qr
import SparseArrays: normalize!, spdiagm

import ForwardDiff