
Commit d7324eb

Add Adam and AdaMax (#1069)
* Add Adam and AdaMax
* Update adam.jl
* Add tests
1 parent 1a649e8 commit d7324eb

5 files changed: +239 −0 lines changed
src/Optim.jl

Lines changed: 4 additions & 0 deletions
@@ -92,6 +92,8 @@ export optimize, maximize, # main function
     ### Acceleration methods
     AcceleratedGradientDescent,
     MomentumGradientDescent,
+    Adam,
+    AdaMax,

     ### Nonlinear GMRES
     NGMRES,
@@ -148,6 +150,8 @@ include("multivariate/solvers/first_order/bfgs.jl")
 include("multivariate/solvers/first_order/l_bfgs.jl")

 ## Acceleration methods
+include("multivariate/solvers/first_order/adamax.jl")
+include("multivariate/solvers/first_order/adam.jl")
 include("multivariate/solvers/first_order/accelerated_gradient_descent.jl")
 include("multivariate/solvers/first_order/momentum_gradient_descent.jl")
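
With these exports in place, both solvers can be passed to `optimize` like any other first-order method. A minimal usage sketch (the Rosenbrock call mirrors the example quoted in the solver source comments below; the option values are illustrative, not recommendations):

```julia
using Optim

# Classic Rosenbrock test function, as quoted in the solver source comments.
rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2

# Adam and AdaMax take a small fixed step size, so allow plenty of iterations.
res_adam   = optimize(rosenbrock, zeros(2), Adam(),   Optim.Options(iterations = 10_000))
res_adamax = optimize(rosenbrock, zeros(2), AdaMax(), Optim.Options(iterations = 10_000))

Optim.minimizer(res_adam)   # ≈ [1.0, 1.0] if the run converges
```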

src/multivariate/solvers/first_order/adam.jl (new file)

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
"""
# Adam
## Constructor
```julia
Adam(; alpha=0.0001, beta_mean=0.9, beta_var=0.999, epsilon=1e-8)
```
## Description
Adam is a gradient-based optimizer that chooses its search direction by building up estimates of the first two moments of the gradient vector. This makes it suitable for problems with a stochastic objective and thus gradient. The method is introduced in [1], where the related AdaMax method is also introduced; see `?AdaMax` for more information on that method.

## References
[1] https://arxiv.org/abs/1412.6980
"""
struct Adam{T, Tm} <: FirstOrderOptimizer
    α::T
    β₁::T
    β₂::T
    ϵ::T
    manifold::Tm
end
Adam(; alpha = 0.0001, beta_mean = 0.9, beta_var = 0.999, epsilon = 1e-8) =
    Adam(alpha, beta_mean, beta_var, epsilon, Flat())
Base.summary(::Adam) = "Adam"
function default_options(method::Adam)
    (; allow_f_increases = true, iterations = 10_000)
end

mutable struct AdamState{Tx, T, Tz, Tm, Tu, Ti} <: AbstractOptimizerState
    x::Tx
    x_previous::Tx
    f_x_previous::T
    s::Tx
    z::Tz
    m::Tm
    u::Tu
    iter::Ti
end
function reset!(method, state::AdamState, obj, x)
    value_gradient!!(obj, x)
end
function initial_state(method::Adam, options, d, initial_x::AbstractArray{T}) where T
    initial_x = copy(initial_x)

    value_gradient!!(d, initial_x)
    α, β₁, β₂ = method.α, method.β₁, method.β₂

    z = copy(initial_x)
    m = copy(gradient(d))
    u = fill(zero(m[1]^2), length(m))
    a = 1 - β₁
    iter = 0

    AdamState(initial_x,          # Maintain current state in state.x
              copy(initial_x),    # Maintain previous state in state.x_previous
              real(T(NaN)),       # Store previous f in state.f_x_previous
              similar(initial_x), # Maintain current search direction in state.s
              z,
              m,
              u,
              iter)
end

function update_state!(d, state::AdamState{T}, method::Adam) where T
    state.iter = state.iter + 1
    value_gradient!(d, state.x)
    α, β₁, β₂, ϵ = method.α, method.β₁, method.β₂, method.ϵ
    a = 1 - β₁
    b = 1 - β₂

    m, u, z = state.m, state.u, state.z
    v = u
    m .= β₁ .* m .+ a .* gradient(d)
    v .= β₂ .* v .+ b .* gradient(d) .^ 2
    # Bias-corrected moments, written out inline in the update below:
    # m̂ = m ./ (1 - β₁^state.iter)
    # v̂ = v ./ (1 - β₂^state.iter)
    # @. z = z - α * m̂ / sqrt(v̂ + ϵ)
    @. z = z - α * m / (1 - β₁^state.iter) / sqrt(v / (1 - β₂^state.iter) + ϵ)

    # Note: not exactly the paper's update, because ϵ ends up inside the sqrt here.
    # Alternative formulation via a rescaled step size (also with ϵ inside the sqrt):
    # αₜ = α * sqrt(1 - β₂^state.iter) / (1 - β₁^state.iter)
    # z .= z .- αₜ .* m ./ sqrt.(v .+ ϵ)

    for _i in eachindex(z)
        # Since m and u start at 0, this can happen if the initial gradient is exactly 0, e.g.
        # rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
        # optimize(rosenbrock, zeros(2), Adam(), Optim.Options(iterations=10000))
        if isnan(z[_i])
            z[_i] = state.x[_i]
        end
    end
    state.x .= z
    # Update current position # x = x + alpha * s
    false # break on linesearch error
end

function trace!(tr, d, state, iteration, method::Adam, options, curr_time = time())
    common_trace!(tr, d, state, iteration, method, options, curr_time)
end
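
For reference, the step implemented in `update_state!` above is the bias-corrected scheme of [1], restated compactly; as the inline comment notes, this code keeps ϵ inside the square root, whereas the paper adds it outside:

```latex
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2, \\
\hat m_t &= \frac{m_t}{1-\beta_1^t}, \qquad
\hat v_t = \frac{v_t}{1-\beta_2^t}, \\
x_t &= x_{t-1} - \alpha\,\frac{\hat m_t}{\sqrt{\hat v_t + \epsilon}}
\qquad \text{(paper: } x_t = x_{t-1} - \alpha\,\hat m_t/(\sqrt{\hat v_t} + \epsilon)\text{).}
\end{aligned}
```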
src/multivariate/solvers/first_order/adamax.jl (new file)

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
"""
# AdaMax
## Constructor
```julia
AdaMax(; alpha=0.002, beta_mean=0.9, beta_var=0.999)
```
## Description
AdaMax is a gradient-based optimizer that chooses its search direction by building up estimates of the first two moments of the gradient vector. This makes it suitable for problems with a stochastic objective and thus gradient. The method is introduced in [1], where the related Adam method is also introduced; see `?Adam` for more information on that method.

## References
[1] https://arxiv.org/abs/1412.6980
"""
struct AdaMax{T,Tm} <: FirstOrderOptimizer
    α::T
    β₁::T
    β₂::T
    manifold::Tm
end
AdaMax(; alpha = 0.002, beta_mean = 0.9, beta_var = 0.999) =
    AdaMax(alpha, beta_mean, beta_var, Flat())
Base.summary(::AdaMax) = "AdaMax"
function default_options(method::AdaMax)
    (; allow_f_increases = true, iterations = 10_000)
end

mutable struct AdaMaxState{Tx, T, Tz, Tm, Tu, Ti} <: AbstractOptimizerState
    x::Tx
    x_previous::Tx
    f_x_previous::T
    s::Tx
    z::Tz
    m::Tm
    u::Tu
    iter::Ti
end
function reset!(method, state::AdaMaxState, obj, x)
    value_gradient!!(obj, x)
end
function initial_state(method::AdaMax, options, d, initial_x::AbstractArray{T}) where T
    initial_x = copy(initial_x)

    value_gradient!!(d, initial_x)
    α, β₁, β₂ = method.α, method.β₁, method.β₂

    z = copy(initial_x)
    m = copy(gradient(d))
    u = fill(zero(m[1]^2), length(m))
    a = 1 - β₁
    iter = 0

    AdaMaxState(initial_x,          # Maintain current state in state.x
                copy(initial_x),    # Maintain previous state in state.x_previous
                real(T(NaN)),       # Store previous f in state.f_x_previous
                similar(initial_x), # Maintain current search direction in state.s
                z,
                m,
                u,
                iter)
end

function update_state!(d, state::AdaMaxState{T}, method::AdaMax) where T
    state.iter = state.iter + 1
    value_gradient!(d, state.x)
    α, β₁, β₂ = method.α, method.β₁, method.β₂
    a = 1 - β₁
    m, u, z = state.m, state.u, state.z

    m .= β₁ .* m .+ a .* gradient(d)
    u .= max.(β₂ .* u, abs.(gradient(d)))
    z .= z .- (α ./ (1 - β₁^state.iter)) .* m ./ u
    for _i in eachindex(z)
        # Since m and u start at 0, this can happen if the initial gradient is exactly 0, e.g.
        # rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
        # optimize(rosenbrock, zeros(2), AdaMax(), Optim.Options(iterations=10000))
        if isnan(z[_i])
            z[_i] = state.x[_i]
        end
    end
    state.x .= z
    # Update current position # x = x + alpha * s
    false # break on linesearch error
end

function trace!(tr, d, state, iteration, method::AdaMax, options, curr_time = time())
    common_trace!(tr, d, state, iteration, method, options, curr_time)
end
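
The AdaMax step implemented above is the infinity-norm variant from the same paper: the second-moment estimate is replaced by an exponentially weighted maximum of absolute gradients, so only the first moment needs bias correction:

```latex
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \\
u_t &= \max(\beta_2 u_{t-1},\ |g_t|), \\
x_t &= x_{t-1} - \frac{\alpha}{1-\beta_1^t}\,\frac{m_t}{u_t}.
\end{aligned}
```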
test/multivariate/solvers/first_order/adam_adamax.jl (new file)

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
@testset "Adam" begin
    f(x) = x[1]^4
    function g!(storage, x)
        storage[1] = 4 * x[1]^3
        return
    end

    initial_x = [1.0]
    options = Optim.Options(show_trace = debug_printing, allow_f_increases = true, iterations = 100_000)
    results = Optim.optimize(f, g!, initial_x, Adam(), options)
    @test norm(Optim.minimum(results)) < 1e-6
    @test summary(results) == "Adam"

    # TODO: Check why the skipped problems fail
    skip = ("Large Polynomial", "Parabola", "Paraboloid Random Matrix",
            "Paraboloid Diagonal", "Penalty Function I", "Polynomial", "Powell",
            "Extended Powell", "Trigonometric", "Himmelblau", "Rosenbrock", "Extended Rosenbrock",
            "Quadratic Diagonal", "Beale", "Fletcher-Powell", "Exponential",
            )
    run_optim_tests(Adam();
                    skip = skip,
                    show_name = true)
end
@testset "AdaMax" begin
    f(x) = x[1]^4
    function g!(storage, x)
        storage[1] = 4 * x[1]^3
        return
    end

    initial_x = [1.0]
    options = Optim.Options(show_trace = debug_printing, allow_f_increases = true, iterations = 100_000)
    results = Optim.optimize(f, g!, initial_x, AdaMax(), options)
    @test norm(Optim.minimum(results)) < 1e-6
    @test summary(results) == "AdaMax"

    # TODO: Check why the skipped problems fail
    skip = ("Trigonometric", "Large Polynomial", "Parabola", "Paraboloid Random Matrix",
            "Paraboloid Diagonal", "Extended Rosenbrock", "Penalty Function I", "Beale",
            "Extended Powell", "Himmelblau", "Large Polynomial", "Polynomial", "Powell",
            "Exponential",
            )
    run_optim_tests(AdaMax();
                    skip = skip,
                    show_name = true,
                    iteration_exceptions = (("Trigonometric", 1_000_000,),))
end

test/runtests.jl

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ multivariate_tests = [
     "solvers/constrained/samin",
     ## first order
     "solvers/first_order/accelerated_gradient_descent",
+    "solvers/first_order/adam_adamax",
     "solvers/first_order/bfgs",
     "solvers/first_order/cg",
     "solvers/first_order/gradient_descent",
