Merge pull request #29 from mcabbott/isnumeric

mcabbott · web-flow · commit 755a97adc74d · 2022-01-27T12:16:44.000-05:00
Optimise only at `isnumeric` leaves
diff --git a/src/interface.jl b/src/interface.jl
@@ -1,31 +1,37 @@
 patch(x, x̄) = x .- x̄
 
 function state(o, x)
-  if isleaf(x)
+  if isnumeric(x)
     return init(o, x)
+  elseif isleaf(x)
+    return nothing
   else
-    x, _ = functor(x)
-    return map(x -> state(o, x), x)
+    x′, _ = functor(x)
+    return map(xᵢ -> state(o, xᵢ), x′)
   end
 end
 
 function _update(o, st, x, x̄s...)
-  st, x̄ = apply(o, st, x, x̄s...)
-  return st, patch(x, x̄)
+  st′, x̄′ = apply(o, st, x, x̄s...)
+  return st′, patch(x, x̄′)
 end
 
 function update(o, state, x::T, x̄s...) where T
   if all(isnothing, x̄s)
     return state, x
-  elseif isleaf(x)
+  elseif isnumeric(x)
     return _update(o, state, x, x̄s...)
   else
-    x̄s = map(x̄ -> functor(typeof(x), x̄)[1], x̄s)
-    x, restructure = functor(typeof(x), x)
-    xstate = map((state, x, x̄s...) -> update(o, state, x, x̄s...), state, x, x̄s...)
-    return map(first, xstate), restructure(map(last, xstate))
+    x̄s′ = map(x̄ -> functor(typeof(x), x̄)[1], x̄s)
+    x′, re = functor(typeof(x), x)
+    xstate = map((stᵢ, xᵢ, x̄sᵢ...) -> update(o, stᵢ, xᵢ, x̄sᵢ...), state, x′, x̄s′...)
+    return map(first, xstate), re(map(last, xstate))
   end
 end
 
 # default all rules to first order calls
 apply(o, state, x, dx, dxs...) = apply(o, state, x, dx)
+
+isnumeric(x::AbstractArray{<:Number}) = isleaf(x)  # isleaf to allow for e.g. transposed shared weights
+isnumeric(x::AbstractArray{<:Bool}) = false  # convention of ChainRules is that Bool is non-differentiable
+isnumeric(x) = false
diff --git a/src/rules.jl b/src/rules.jl
@@ -108,9 +108,9 @@ init(o::RMSProp, x::AbstractArray) = zero(x)
 function apply(o::RMSProp, state, x, dx)
   η, ρ, ϵ, acc = o.eta, o.rho, o.epsilon, state
   @. acc = ρ * acc + (1 - ρ) * dx^2
-  dx = @. dx * (η / (sqrt(acc) + ϵ))
+  dx′ = @. dx * (η / (sqrt(acc) + ϵ))
   
-  return acc, dx
+  return acc, dx′
 end
 
 (o::RMSProp)(state, m, dm) = update(o, state, m, dm)
@@ -145,9 +145,9 @@ function apply(o::ADAM{T}, state, x, dx) where T
 
   @. mt = β[1] * mt + (one(T) - β[1]) * dx
   @. vt = β[2] * vt + (one(T) - β[2]) * dx ^ 2
-  dx = @. mt / (one(T) - βt[1]) / (sqrt(vt / (one(T) - βt[2])) + ϵ) * η
+  dx′ = @. mt / (one(T) - βt[1]) / (sqrt(vt / (one(T) - βt[2])) + ϵ) * η
 
-  return (mt, vt, βt .* β), dx
+  return (mt, vt, βt .* β), dx′
 end
 
 """
@@ -185,12 +185,12 @@ function apply(o::RADAM, state, x, dx)
   ρ = ρ∞ - 2*t * βt[2] / (1 - βt[2])
   if ρ > 4
     r = sqrt((ρ - 4) * (ρ - 2) * ρ∞/((ρ∞ - 4) * (ρ∞ - 2) * ρ))
-    dx = @. mt / (1 - βt[1]) / (sqrt(vt / (1 - βt[2])) + ϵ) * η * r
+    dx′ = @. mt / (1 - βt[1]) / (sqrt(vt / (1 - βt[2])) + ϵ) * η * r
   else
-    dx = @. mt / (1 - βt[1]) * η
+    dx′ = @. mt / (1 - βt[1]) * η
   end
 
-  return (mt, vt, βt .* β, t + 1), dx
+  return (mt, vt, βt .* β, t + 1), dx′
 end
 
 """
@@ -224,9 +224,9 @@ function apply(o::AdaMax, state, x, dx)
 
   @. mt = β[1] * mt + (1 - β[1]) * dx
   @. ut = max(β[2] * ut, abs(dx))
-  dx = @. (η/(1 - βt[1])) * mt/(ut + ϵ)
+  dx′ = @. (η/(1 - βt[1])) * mt/(ut + ϵ)
 
-  return (mt, ut, βt .* β), dx
+  return (mt, ut, βt .* β), dx′
 end
 
 """
@@ -263,9 +263,9 @@ function apply(o::OADAM, state, x, dx)
   @. vt = β[2] * vt + (1 - β[2]) * dx^2
   @. dx = -dx_
   @. dx_ = η * mt / (1 - βt[1]) / (sqrt(vt / (1 - βt[2])) + ϵ)
-  dx = @. dx + 2*dx_
+  dx′ = @. dx + 2*dx_
 
-  return (mt, vt, βt .* β, dx_), dx
+  return (mt, vt, βt .* β, dx_), dx′
 end
 
 """
@@ -296,9 +296,9 @@ function apply(o::ADAGrad, state, x, dx)
   acc = state
 
   @. acc += dx^2
-  dx = @. dx * η / (sqrt(acc) + ϵ)
+  dx′ = @. dx * η / (sqrt(acc) + ϵ)
 
-  return acc, dx
+  return acc, dx′
 end
 
 """
@@ -330,10 +330,10 @@ function apply(o::ADADelta, state, x, dx)
   @. acc = ρ * acc + (1 - ρ) * dx^2
   # DON'T remove epsilon from numerator
   # or even out of the square roots
-  dx = @. dx * sqrt(Δacc + ϵ) / sqrt(acc + ϵ)
-  @. Δacc = ρ * Δacc + (1 - ρ) * dx^2
+  dx′ = @. dx * sqrt(Δacc + ϵ) / sqrt(acc + ϵ)
+  @. Δacc = ρ * Δacc + (1 - ρ) * dx′^2  # accumulate the squared *update* dx′, not the raw gradient
   
-  return (acc, Δacc), dx
+  return (acc, Δacc), dx′
 end
 
 """
@@ -370,9 +370,9 @@ function apply(o::AMSGrad, state, x, dx)
   @. mt = β[1] * mt + (1 - β[1]) * dx
   @. vt = β[2] * vt + (1 - β[2]) * dx ^ 2
   @. v̂t = max(v̂t, vt)
-  dx = @. η * mt / (sqrt(v̂t) + ϵ)
+  dx′ = @. η * mt / (sqrt(v̂t) + ϵ)
 
-  return (mt, vt, v̂t), dx
+  return (mt, vt, v̂t), dx′
 end
 
 """
@@ -407,10 +407,10 @@ function apply(o::NADAM, state, x, dx)
 
   @. mt = β[1] * mt + (1 - β[1]) * dx
   @. vt = β[2] * vt + (1 - β[2]) * dx^2
-  dx = @. (β[1] * mt / (1 - β[1] * βt[1]) + (1 - β[1]) * dx / (1 - βt[1])) / 
+  dx′ = @. (β[1] * mt / (1 - β[1] * βt[1]) + (1 - β[1]) * dx / (1 - βt[1])) / 
           (sqrt(vt * β[2] / (1 - βt[2])) + ϵ) * η
 
-  return (mt, vt, βt .* β), dx
+  return (mt, vt, βt .* β), dx′
 end
 
 """
@@ -462,9 +462,9 @@ function apply(o::AdaBelief, state, x, dx)
 
   @. mt = β[1] * mt + (1 - β[1]) * dx
   @. st = β[2] * st + (1 - β[2]) * (dx - mt)^2
-  dx = @. η * mt / (sqrt(st) + ϵ)
+  dx′ = @. η * mt / (sqrt(st) + ϵ)
   
-  return (mt, st), dx
+  return (mt, st), dx′
 end
 
 """
@@ -485,9 +485,9 @@ init(o::WeightDecay, x::AbstractArray) = nothing
 (o::WeightDecay)(state, m, dm) = update(o, state, m, dm)
 
 function apply(o::WeightDecay, state, x, dx)
-  dx = @. dx + o.wd * x
+  dx′ = @. dx + o.wd * x
 
-  return state, dx
+  return state, dx′
 end
 
 """
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,15 +8,34 @@ using Statistics
   @testset for o in (Descent(), ADAM(), Momentum(), Nesterov(), RMSProp(),
                      ADAGrad(), AdaMax(), ADADelta(), AMSGrad(), NADAM(),
                      ADAMW(), RADAM(), OADAM(), AdaBelief())
-    w = (α = rand(3, 3), β = rand(3, 3))
+
+    # Original example
+    w = (α = 5rand(3, 3), β = rand(3, 3))
     st = Optimisers.state(o, w)
     loss(x, y) = mean((x.α .* x.β .- y.α .* y.β) .^ 2)
-    l = loss(w, w′)
+    @test loss(w, w′) > 1
     for i = 1:10^4
       gs = gradient(x -> loss(x, w′), w)
-      st, w = o(st, w, gs...)
+      st, w = Optimisers.update(o, st, w, gs...)
+    end
+    lw = loss(w, w′)
+    @test lw < 0.001
+
+    # Slightly harder variant
+    m = (α = randn(3), β = transpose(5rand(3,3)), γ = (rand(2), tanh))  # issue 28
+    st = Optimisers.state(o, m)
+    @test loss(m, w′) > 1
+    for i = 1:10^4
+      gs = gradient(x -> loss(x, w′), m)
+      st, m = o(st, m, gs...)
+    end
+    lm = loss(m, w′)
+    if lm < 0.1
+      @test lm < 0.1
+    else
+      @test_broken lm < 0.1  # the `broken` keyword of `@test` doesn't exist on Julia 1.6
     end
-    @test loss(w, w′) < 0.01
+
   end
 end