
Commit 132bdd8

mcabbott and darsnack authored
Wrap optimiser and state in a struct (#30)
* wrap optimiser and state in a struct
* rename state -> setup
* remove _update and use dispatch
* an idea
* fixup rebase
* rm opt(state, model, ...) methods
* move show, tweak things
* docs
* add a check that there are no tied weights
* a bug
* a test
* include strings
* add docstrings for all the basic functions
* wording

Co-authored-by: Kyle Daruwalla <[email protected]>
1 parent 7da717b commit 132bdd8

6 files changed: 208 additions and 111 deletions

docs/src/api.md

Lines changed: 21 additions & 0 deletions

@@ -1,3 +1,6 @@
+
+## Optimisation Rules
+
 ```@docs
 Optimisers.Descent
 Optimisers.Momentum
@@ -15,9 +18,27 @@ Optimisers.ADAMW
 Optimisers.AdaBelief
 ```
 
+In addition to the main course, you may wish to order some of these condiments:
+
 ```@docs
 Optimisers.ClipGrad
 Optimisers.ClipNorm
 Optimisers.WeightDecay
 Optimisers.OptimiserChain
 ```
+
+## Model Interface
+
+```@docs
+Optimisers.setup
+Optimisers.update
+Optimisers.update!
+```
+
+## Rule Definition
+
+```@docs
+Optimisers.apply!
+Optimisers.init
+Optimisers.@..
+```

docs/src/index.md

Lines changed: 24 additions & 16 deletions

@@ -2,48 +2,56 @@
 
 ## Define an Optimiser
 
+A new optimiser must overload two functions, `apply!` and `init`:
+
 ```julia
-# Define a container to hold any optimiser specific parameters (if any)
-struct Descent{T}
+# Define a container to hold any optimiser specific parameters (if any):
+struct DecayDescent{T}
   η::T
 end
 
-# Define an `apply!` rule with which to update the current params
-# using the gradients
-function Optimisers.apply!(o::Descent, state, m, m̄)
-  o.η .* m̄, state
+# Define an `apply!` rule which encodes how the gradients will be used to
+# update the parameters:
+function Optimisers.apply!(o::DecayDescent, state, x, x̄)
+  newx̄ = (o.η / state) .* x̄
+  nextstate = state + 1
+  return nextstate, newx̄
 end
 
-Optimisers.init(o, x::AbstractArray) = nothing
+# Define the function which sets up the initial state (if any):
+Optimisers.init(o::DecayDescent, x::AbstractArray) = 1
 ```
 
+The parameters will be immediately updated to `x .- newx̄`, while `nextstate` is
+carried to the next iteration.
+
 Notice that the state is handled separately from the optimiser itself. This
 is a key design principle and allows users to manage their own state explicitly.
 
 It of course also makes it easier to store the state.
 
 ## Usage
 
+To apply such an optimiser to a whole model, `setup` builds a tree containing any initial
+state for every trainable array. Then at each step, `update` uses this and the gradient
+to adjust the model:
+
 ```julia
 
 using Flux, Metalhead, Optimisers
 
 o = Optimisers.ADAM()  # define an ADAM optimiser with default settings
-st = Optimisers.state(o, m)  # initialize the optimiser before using it
+st = Optimisers.setup(o, m)  # initialize the optimiser before using it
 
-model = ResNet()  # define a model to train on
+model = ResNet18()  # define a model to train on
 ip = rand(Float32, 224, 224, 3, 1)  # dummy data
 
 m̄, _ = gradient(model, ip) do m, x  # calculate the gradients
-  sum(m(x))
+  sum(m(x))  # dummy loss function
 end
 
+st, mnew = Optimisers.update(st, m, m̄)
 
-st, mnew = Optimisers.update(o, st, m, m̄)
-
-# or
-
-st, mnew = o(m, m̄, st)
 ```
 
 Notice that a completely new instance of the model is returned. Internally, this
@@ -53,6 +61,6 @@ work with different forms of gradients, but most likely use case are the gradien
 returned by [Zygote.jl](https://fluxml.ai/Zygote.jl).
 
 There is also `Optimisers.update!` which similarly returns a new model and new state,
-but is free to mutate arrays within the old one, for efficiency.
+but is free to mutate arrays within the old one for efficiency.
 The method of `apply!` you write is likewise free to mutate arrays within its state;
 they are defensively copied when this rule is used with `update`.
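To see the `DecayDescent` rule above in action without Flux or Metalhead, here is a minimal, self-contained sketch. The rule definition is copied from the docs; the toy `NamedTuple` model, the fake gradient, and the loop are illustrative additions of mine, assuming the `setup`/`update` API introduced by this commit:

```julia
using Optimisers

# The DecayDescent rule from the docs above, repeated so this sketch stands alone:
struct DecayDescent{T}
  η::T
end

function Optimisers.apply!(o::DecayDescent, state, x, x̄)
  newx̄ = (o.η / state) .* x̄   # the effective step size shrinks as the counter grows
  nextstate = state + 1
  return nextstate, newx̄
end

Optimisers.init(o::DecayDescent, x::AbstractArray) = 1

# A toy "model": just a NamedTuple of arrays, which setup/update walk via Functors.
model = (w = Float32[1, 2, 3],)
grad  = (w = Float32[1, 1, 1],)   # a made-up gradient, standing in for Zygote's output

st = Optimisers.setup(DecayDescent(0.1f0), model)
for _ in 1:3
  global st, model
  st, model = Optimisers.update(st, model, grad)
  @show model.w   # steps scale by 0.1, then 0.05, then 0.0333...
end
```

Each call returns a fresh state tree and model, matching the `update` contract documented below.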

src/Optimisers.jl

Lines changed: 93 additions & 0 deletions

@@ -10,4 +10,97 @@ export Descent, ADAM, Momentum, Nesterov, RMSProp,
   ADAGrad, AdaMax, ADADelta, AMSGrad, NADAM, ADAMW, RADAM, OADAM, AdaBelief,
   WeightDecay, ClipGrad, ClipNorm, OptimiserChain
 
+"""
+    Optimisers.apply!(rule::RuleType, state, parameters, gradient) -> (state, gradient)
+
+This defines the action of any optimisation rule. It should return the modified gradient
+which will be subtracted from the parameters, and the updated state (if any) for use at
+the next iteration, as a tuple `(state, gradient)`.
+
+For efficiency it is free to mutate the old state, but only what is returned will be used.
+Ideally this should check `iswriteable(x)`, which the built-in rules do via [`@..`](@ref).
+
+The initial state is `init(rule::RuleType, parameters)`.
+
+# Example
+```jldoctest
+julia> Optimisers.init(Descent(0.1), [1,2,3]) === nothing
+true
+
+julia> Optimisers.apply!(Descent(0.1), nothing, [1,2,3], [4,5,6])
+(nothing, Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}}(*, ([4, 5, 6], 0.1)))
+```
+"""
+apply!
+
+"""
+    Optimisers.init(rule::RuleType, parameters) -> state
+
+Sets up the initial state for a given optimisation rule, and an array of parameters.
+This and [`apply!`](@ref) are the two functions which any new optimisation rule must define.
+
+# Examples
+```jldoctest
+julia> Optimisers.init(Descent(), [1,2,3])  # is `nothing`
+
+julia> Optimisers.init(Momentum(), [1.0, 2.0])
+2-element Vector{Float64}:
+ 0.0
+ 0.0
+```
+"""
+init
+
+"""
+    Optimisers.setup(rule, model) -> tree
+
+Initialises the given optimiser for every trainable parameter within the model.
+Returns a tree of the relevant states, which must be passed to [`update`](@ref)
+or [`update!`](@ref).
+
+# Example
+```jldoctest
+julia> Optimisers.setup(Descent(0.1f0), (x = rand(3), y = (true, false), z = tanh))
+(x = Leaf(Descent{Float32}(0.1), nothing), y = (nothing, nothing), z = nothing)
+```
+"""
+setup
+
+"""
+    Optimisers.update(tree, model, gradient) -> (tree, model)
+
+Uses the optimiser and the gradient to change the trainable parameters in the model.
+Returns the improved model, and the optimiser states needed for the next update.
+The initial tree of states comes from [`setup`](@ref).
+
+See also [`update!`](@ref), which will be faster for models of ordinary `Array`s or `CuArray`s.
+
+# Example
+```jldoctest
+julia> m = (x = Float32[1,2,3], y = tanh);
+
+julia> t = Optimisers.setup(Descent(0.1f0), m)
+(x = Leaf(Descent{Float32}(0.1), nothing), y = nothing)
+
+julia> g = (x = [1,1,1], y = nothing);  # fake gradient
+
+julia> Optimisers.update(t, m, g)
+((x = Leaf(Descent{Float32}(0.1), nothing), y = nothing), (x = Float32[0.9, 1.9, 2.9], y = tanh))
+```
+"""
+update
+
+"""
+    Optimisers.update!(tree, model, gradient) -> (tree, model)
+
+Uses the optimiser and the gradient to change the trainable parameters in the model.
+Returns the improved model, and the optimiser states needed for the next update.
+The initial tree of states comes from [`setup`](@ref).
+
+This is used in exactly the same manner as [`update`](@ref), but because it may mutate
+arrays within the old model (and the old state), it will be faster for models of ordinary
+`Array`s or `CuArray`s. However, you should not rely on the old model being fully updated.
+"""
+update!
+
 end # module
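To make the `update` / `update!` contrast in these docstrings concrete, here is a small sketch of my own (not part of the commit), reusing the docstrings' NamedTuple-as-model style; the expected numbers come from the `update` docstring example:

```julia
using Optimisers

m  = (x = Float32[1, 2, 3],)   # stand-in for a model
g  = (x = Float32[1, 1, 1],)   # stand-in for its gradient
st = Optimisers.setup(Descent(0.1f0), m)

# `update` works on defensive copies, so the original model is untouched:
st2, m2 = Optimisers.update(st, m, g)
@show m2.x   # Float32[0.9, 1.9, 2.9], as in the docstring example
@show m.x    # still Float32[1.0, 2.0, 3.0]

# `update!` may mutate arrays inside the old model and state for speed;
# per its docstring, only the returned values should be relied upon:
st3, m3 = Optimisers.update!(st, m, g)
@show m3.x   # Float32[0.9, 1.9, 2.9]
```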

src/interface.jl

Lines changed: 33 additions & 16 deletions

@@ -1,39 +1,48 @@
 
-function state(o, x)
+struct Leaf{R,S}
+  rule::R
+  state::S
+end
+
+function setup(rule, x; seen = Base.IdSet())
   if isnumeric(x)
-    return init(o, x)
+    x in seen && throw(ArgumentError("Optimisers.jl does not at present handle tied weights, sorry."))
+    isbits(x) || push!(seen, x)
+    return Leaf(rule, init(rule, x))
   elseif isleaf(x)
     return nothing
   else
     x′, _ = functor(x)
-    return map(xᵢ -> state(o, xᵢ), x′)
+    return map(xᵢ -> setup(rule, xᵢ; seen), x′)
   end
 end
 
-patch!(x, x̄) = iswriteable(x) ? (x .= x .- x̄) : (x .- x̄)
+subtract!(x, x̄) = iswriteable(x) ? (x .= x .- x̄) : (x .- x̄)
 
-function _update!(o, st, x, x̄s...)
-  st′, x̄′ = apply!(o, st, x, x̄s...)
-  return st′, patch!(x, x̄′)
+function update!(ℓ::Leaf, x, x̄s...)
+  if all(isnothing, x̄s)
+    return ℓ, x
+  else
+    s′, x̄′ = apply!(ℓ.rule, ℓ.state, x, x̄s...)
+    return Leaf(ℓ.rule, s′), subtract!(x, x̄′)
+  end
 end
 
-function update!(o, state, x, x̄s...)
+function update!(tree, x, x̄s...)
   if all(isnothing, x̄s)
-    return state, x
-  elseif isnumeric(x)
-    return _update!(o, state, x, x̄s...)
+    return tree, x
   else
     x̄s′ = map(x̄ -> functor(typeof(x), x̄)[1], x̄s)
    x′, re = functor(typeof(x), x)
-    xstate = map((stᵢ, xᵢ, x̄sᵢ...) -> update!(o, stᵢ, xᵢ, x̄sᵢ...), state, x′, x̄s′...)
-    return map(first, xstate), re(map(last, xstate))
+    xtree = map((stᵢ, xᵢ, x̄sᵢ...) -> update!(stᵢ, xᵢ, x̄sᵢ...), tree, x′, x̄s′...)
+    return map(first, xtree), re(map(last, xtree))
   end
 end
 
-function update(o, state, x, x̄s...)
-  state′ = fmap(copy, state; exclude = iswriteable)
+function update(tree, x, x̄s...)
+  t′ = fmap(copy, tree; exclude = iswriteable)
   x′ = fmap(copy, x; exclude = iswriteable)
-  update!(o, state′, x′, x̄s...)
+  update!(t′, x′, x̄s...)
 end
 
 # default all rules to first order calls
@@ -75,3 +84,11 @@ function lazy end
 Broadcast.broadcasted(::typeof(lazy), x) = Lazy(x)
 struct Lazy{T}; bc::T; end
 Broadcast.materialize(x::Lazy) = Broadcast.instantiate(x.bc)
+
+function Base.show(io::IO, ℓ::Leaf)  # show method is mostly to hide its long type!
+  ioc = IOContext(io, :compact => true)
+  print(ioc, "Leaf(", ℓ.rule, ", ")
+  show(ioc, ℓ.state)
+  print(io, ")")
+end
+
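The `seen = Base.IdSet()` bookkeeping in the new `setup` is what enforces the "no tied weights" check mentioned in the commit message: arrays are tracked by identity, and a second visit to the same array throws. A small sketch of that behaviour, using a toy model of my own and assuming this commit's `setup`:

```julia
using Optimisers

w = Float32[1, 2, 3]
tied = (a = w, b = w)          # the same array reachable from two places

try
  Optimisers.setup(Descent(0.1f0), tied)
catch err
  println(err)                 # ArgumentError: tied weights are not handled yet
end

untied = (a = w, b = copy(w))  # distinct arrays are fine
Optimisers.setup(Descent(0.1f0), untied)
```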
