Commit 31d8d96

Use zeros instead of fill in Adam (#1075)
* Use zeros instead of fill
* Update adamax.jl
* Apply suggestions from code review
* Fix it once and for all
1 parent b0ba898 commit 31d8d96

4 files changed (+21 lines, -43 lines)

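The change named in the commit title replaces `fill(zero(m[1]^2), length(m))` with `zero(m)` when initializing the second-moment accumulator in both solvers' `initial_state`. A minimal sketch of the two forms (the value of `m` here is made up; in the package it is a copy of the gradient):

```julia
# Illustrative only: `m` stands in for the gradient copy made in initial_state.
m = [0.5, -1.25, 3.0]

u_fill = fill(zero(m[1]^2), length(m))  # old form: indexes m[1] and always builds a Vector
u_zero = zero(m)                        # new form: zero array with m's shape and element type

@assert u_fill == u_zero == [0.0, 0.0, 0.0]
```

Both give an all-zero array here; `zero(m)` simply avoids indexing into `m` and preserves the array type of the gradient.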

docs/src/algo/adam_adamax.md

Lines changed: 2 additions & 1 deletion

````diff
@@ -14,7 +14,8 @@ where `alpha` is the step length or learning parameter. `beta_mean` and `beta_va
 ```julia
 AdaMax(; alpha=0.002,
          beta_mean=0.9,
-         beta_var=0.999)
+         beta_var=0.999,
+         epsilon=1e-8)
 ```
 where `alpha` is the step length or learning parameter. `beta_mean` and `beta_var` are exponential decay parameters for the first and second moments estimates. Setting these closer to 0 will cause past iterates to matter less for the current steps and setting them closer to 1 means emphasizing past iterates more.
````
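
With `epsilon` now part of the documented signature, the constructor accepts it directly; a minimal usage sketch (the objective, starting point, and iteration cap are illustrative, not from the commit):

```julia
using Optim

# Keyword values match the documented defaults; `epsilon` is the newly documented keyword.
method = AdaMax(alpha = 0.002, beta_mean = 0.9, beta_var = 0.999, epsilon = 1e-8)

f(x) = sum(abs2, x)   # illustrative smooth objective
optimize(f, fill(2.0, 3), method, Optim.Options(iterations = 10_000))
```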

src/multivariate/solvers/first_order/adam.jl

Lines changed: 5 additions & 17 deletions

````diff
@@ -17,19 +17,19 @@ struct Adam{T, Tm} <: FirstOrderOptimizer
     ϵ::T
     manifold::Tm
 end
+# could use epsilon = T->sqrt(eps(T)) and input the promoted type
 Adam(; alpha = 0.0001, beta_mean = 0.9, beta_var = 0.999, epsilon = 1e-8) =
     Adam(alpha, beta_mean, beta_var, epsilon, Flat())
 Base.summary(::Adam) = "Adam"
 function default_options(method::Adam)
     (; allow_f_increases = true, iterations=10_000)
 end
 
-mutable struct AdamState{Tx, T, Tz, Tm, Tu, Ti} <: AbstractOptimizerState
+mutable struct AdamState{Tx, T, Tm, Tu, Ti} <: AbstractOptimizerState
     x::Tx
     x_previous::Tx
     f_x_previous::T
     s::Tx
-    z::Tz
     m::Tm
     u::Tu
     iter::Ti
@@ -43,17 +43,15 @@ function initial_state(method::Adam, options, d, initial_x::AbstractArray{T}) wh
     value_gradient!!(d, initial_x)
     α, β₁, β₂ = method.α, method.β₁, method.β₂
 
-    z = copy(initial_x)
     m = copy(gradient(d))
-    u = fill(zero(m[1]^2), length(m))
+    u = zero(m)
     a = 1 - β₁
     iter = 0
 
     AdamState(initial_x, # Maintain current state in state.x
               copy(initial_x), # Maintain previous state in state.x_previous
              real(T(NaN)), # Store previous f in state.f_x_previous
              similar(initial_x), # Maintain current search direction in state.s
-             z,
              m,
              u,
              iter)
@@ -66,25 +64,15 @@ function update_state!(d, state::AdamState{T}, method::Adam) where T
     a = 1 - β₁
     b = 1 - β₂
 
-    m, u, z = state.m, state.u, state.z
+    m, u = state.m, state.u
     v = u
     m .= β₁ .* m .+ a .* gradient(d)
     v .= β₂ .* v .+ b .* gradient(d) .^ 2
     # m̂ = m./(1-β₁^state.iter)
     # v̂ = v./(1-β₂^state.iter)
     #@. z = z - α*m̂/(sqrt(v̂+ϵ))
     αₜ = α * sqrt(1 - β₂^state.iter) / (1 - β₁^state.iter)
-    @. z = z - αₜ * m / (sqrt(v) + ϵ)
-
-    for _i in eachindex(z)
-        # since m and u start at 0, this can happen if the initial gradient is exactly 0
-        # rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
-        # optimize(rosenbrock, zeros(2), Adam(), Optim.Options(iterations=10000))
-        if isnan(z[_i])
-            z[_i] = state.x[_i]
-        end
-    end
-    state.x .= z
+    @. state.x = state.x - αₜ * m / (sqrt(v) + ϵ)
     # Update current position # x = x + alpha * s
     false # break on linesearch error
 end
````
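
Since `m` and `v` start at zero, an element whose gradient is exactly zero contributes a step of `0 / (sqrt(0) + ϵ)`, which is finite for any positive `ϵ`; the commit therefore drops the per-element NaN guard together with the intermediate `z` array and writes the update straight into `state.x`. The Rosenbrock call quoted in the removed comments is the case that guard was written for; as a sketch (not part of the test suite), it can be rerun to exercise the simplified update path:

```julia
using Optim

# From the removed comments: at zeros(2) the second gradient component of
# Rosenbrock is exactly zero, the scenario the deleted NaN guard targeted.
rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
res = optimize(rosenbrock, zeros(2), Adam(), Optim.Options(iterations = 10_000))
```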

src/multivariate/solvers/first_order/adamax.jl

Lines changed: 12 additions & 23 deletions

````diff
@@ -1,9 +1,8 @@
 """
-AdaMax(; alpha=0.002, beta_mean=0.9, beta_var=0.999)
-# Adam
+# AdaMax
 ## Constructor
 ```julia
-AdaMax(; alpha=0.002, beta_mean=0.9, beta_var=0.999)
+AdaMax(; alpha=0.002, beta_mean=0.9, beta_var=0.999, epsilon=1e-8)
 ```
 ## Description
 AdaMax is a gradient based optimizer that choses its search direction by building up estimates of the first two moments of the gradient vector. This makes it suitable for problems with a stochastic objective and thus gradient. The method is introduced in [1] where the related Adam method is also introduced, see `?Adam` for more information on that method.
@@ -16,22 +15,22 @@ struct AdaMax{T,Tm} <: FirstOrderOptimizer
     α::T
     β₁::T
     β₂::T
+    ϵ::T
     manifold::Tm
 end
-AdaMax(; alpha = 0.002, beta_mean = 0.9, beta_var = 0.999) =
-    AdaMax(alpha, beta_mean, beta_var, Flat())
+AdaMax(; alpha = 0.002, beta_mean = 0.9, beta_var = 0.999, epsilon = sqrt(eps(Float64))) =
+    AdaMax(alpha, beta_mean, beta_var, epsilon, Flat())
 Base.summary(::AdaMax) = "AdaMax"
 function default_options(method::AdaMax)
     (; allow_f_increases = true, iterations=10_000)
 end
 
 
-mutable struct AdaMaxState{Tx, T, Tz, Tm, Tu, Ti} <: AbstractOptimizerState
+mutable struct AdaMaxState{Tx, T, Tm, Tu, Ti} <: AbstractOptimizerState
     x::Tx
     x_previous::Tx
     f_x_previous::T
     s::Tx
-    z::Tz
     m::Tm
     u::Tu
     iter::Ti
@@ -45,17 +44,15 @@ function initial_state(method::AdaMax, options, d, initial_x::AbstractArray{T})
     value_gradient!!(d, initial_x)
     α, β₁, β₂ = method.α, method.β₁, method.β₂
 
-    z = copy(initial_x)
     m = copy(gradient(d))
-    u = fill(zero(m[1]^2), length(m))
+    u = zero(m)
     a = 1 - β₁
     iter = 0
 
     AdaMaxState(initial_x, # Maintain current state in state.x
                 copy(initial_x), # Maintain previous state in state.x_previous
                real(T(NaN)), # Store previous f in state.f_x_previous
                similar(initial_x), # Maintain current search direction in state.s
-               z,
                m,
                u,
                iter)
@@ -64,22 +61,14 @@ end
 function update_state!(d, state::AdaMaxState{T}, method::AdaMax) where T
     state.iter = state.iter+1
     value_gradient!(d, state.x)
-    α, β₁, β₂ = method.α, method.β₁, method.β₂
+    α, β₁, β₂, ϵ = method.α, method.β₁, method.β₂, method.ϵ
     a = 1 - β₁
-    m, u, z = state.m, state.u, state.z
+    m, u = state.m, state.u
 
     m .= β₁ .* m .+ a .* gradient(d)
-    u .= max.(β₂ .* u, abs.(gradient(d)))
-    z .= z .- (α ./ (1 - β₁^state.iter)) .* m ./ u
-    for _i in eachindex(z)
-        # since m and u start at 0, this can happen if the initial gradient is exactly 0
-        # rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
-        # optimize(rosenbrock, zeros(2), AdaMax(), Optim.Options(iterations=10000))
-        if isnan(z[_i])
-            z[_i] = state.x[_i]
-        end
-    end
-    state.x .= z
+    u .= max.(ϵ, max.(β₂ .* u, abs.(gradient(d)))) # I know it's not there in the paper but if m and u start at 0 for some element... NaN occurs next
+
+    @. state.x = state.x - (α / (1 - β₁^state.iter)) * m / u
     # Update current position # x = x + alpha * s
     false # break on linesearch error
 end
````
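
For AdaMax the denominator is the infinity-norm accumulator `u` itself, so an element whose gradient stays exactly zero really does produce `0 / 0`; flooring `u` at `ϵ` (which the added comment admits is not in the paper) keeps the step finite and makes the per-element NaN loop unnecessary. A standalone sketch of the clamp, with made-up values rather than the solver's state:

```julia
# Hypothetical values, not the package's state objects.
β₂ = 0.999
ϵ  = sqrt(eps(Float64))   # the new default epsilon for AdaMax
u  = zeros(2)             # u starts at zero, as in initial_state
g  = [0.0, 1.0]           # gradient whose first component is exactly zero

u_old = max.(β₂ .* u, abs.(g))           # old accumulator: first entry stays 0, giving 0/0 later
u_new = max.(ϵ, max.(β₂ .* u, abs.(g)))  # new accumulator: floored at ϵ

@assert u_old[1] == 0.0   # the step m[1]/u_old[1] would be 0/0 = NaN
@assert u_new[1] == ϵ     # the step m[1]/u_new[1] is 0/ϵ = 0
```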

test/multivariate/solvers/first_order/adam_adamax.jl

Lines changed: 2 additions & 2 deletions

````diff
@@ -19,7 +19,7 @@
     )
     run_optim_tests(Adam();
                     skip = skip,
-                    show_name = true)
+                    show_name = debug_printing)
 end
 @testset "AdaMax" begin
     f(x) = x[1]^4
@@ -42,6 +42,6 @@ end
     )
     run_optim_tests(AdaMax();
                     skip = skip,
-                    show_name=true,
+                    show_name=debug_printing,
                     iteration_exceptions = (("Trigonometric", 1_000_000,),))
 end
````
