Skip to content

Commit 444a6b9

Browse files
mcabbott and ToucheSir
authored
Add adjust! (#113)
* add mutating adjust * add tests * add to docs * Apply suggestions from code review Co-authored-by: Brian Chen <[email protected]> Co-authored-by: Brian Chen <[email protected]>
1 parent 36bebc5 commit 444a6b9

File tree

4 files changed

+94
-9
lines changed

4 files changed

+94
-9
lines changed

docs/src/api.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ Optimisers.OptimiserChain
3434
Optimisers.setup
3535
Optimisers.update
3636
Optimisers.update!
37+
Optimisers.adjust!
3738
Optimisers.adjust(::Any, ::Real)
3839
Optimisers.freeze!
3940
Optimisers.thaw!

docs/src/index.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,26 @@ Optimisers.thaw!(opt)
165165
opt.layers[3].bias # Leaf(Momentum(...), [0.0, 0.0])
166166
```
167167

168+
## Adjusting Hyperparameters
169+
170+
To change the learning rate during training, use [`adjust!`](@ref Optimisers.adjust!).
171+
This works much like `freeze!` by mutating the state tree, or part of it,
172+
without discarding the momenta. For the Flux model from just above:
173+
174+
```julia
175+
Optimisers.adjust!(opt, 0.03) # change η for the whole model...
176+
177+
Optimisers.adjust!(opt.layers[3], 0.04) # ... or just for one layer.
178+
```
179+
180+
To change other fields of the optimisation rule, it accepts keyword arguments:
181+
182+
```julia
183+
Momentum |> fieldnames # (:eta, :rho)
184+
185+
Optimisers.adjust!(opt, rho = 0.95) # change ρ for the whole model.
186+
```
187+
168188
## Tied Parameters
169189

170190
If the same array appears twice (or more) in the model, [Functors.jl](https://fluxml.ai/Functors.jl) should recognise this.
@@ -187,7 +207,7 @@ This identification relies on `===`, and will work for ordinary `Array`s and `Cu
187207
It will not at present work for `reshape`d arrays, nor for immutable arrays such as those
188208
from StaticArrays.jl.
189209

190-
210+
191211
## Obtaining a flat parameter vector
192212

193213
Instead of a nested tree-like structure, sometimes it is convenient to have all the

src/adjust.jl

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,15 @@ thaw!(::Union{Number, AbstractArray{<:Number}}) = throw(ArgumentError(
5656
###
5757

5858
"""
59-
Optimisers.adjust(tree, η) -> tree
59+
Optimisers.adjust!(tree, η)
6060
6161
Alters the state `tree = setup(rule, model)` to change the parameters of the
6262
optimisation rule, without destroying its stored state. Typically used mid-way
6363
through training.
6464
65+
Can be applied to part of a model, by acting only on the corresponding part
66+
of the state `tree`.
67+
6568
To change just the learning rate, provide a number `η::Real`.
6669
6770
# Example
@@ -76,11 +79,13 @@ julia> st, m = Optimisers.update(st, m, (vec = [16, 88], fun = nothing)); # wit
7679
julia> st
7780
(vec = Leaf(Nesterov{Float32}(0.001, 0.9), Float32[-0.016, -0.088]), fun = ())
7881
79-
julia> st = Optimisers.adjust(st, 0.123) # change learning rate, stored momentum untouched
82+
julia> Optimisers.adjust!(st, 0.123) # change learning rate, stored momentum untouched
83+
84+
julia> st
8085
(vec = Leaf(Nesterov{Float32}(0.123, 0.9), Float32[-0.016, -0.088]), fun = ())
8186
```
8287
83-
To change other parameters, `adjust` also accepts keyword arguments matching the field
88+
To change other parameters, `adjust!` also accepts keyword arguments matching the field
8489
names of the optimisation rule's type.
8590
8691
```
@@ -97,15 +102,30 @@ julia> Optimisers.adjust(st; beta = "no such field") # silently ignored!
97102
(vec = Leaf(Nesterov{Float32}(0.001, 0.9), Float32[-0.016, -0.088]), fun = nothing)
98103
```
99104
"""
100-
adjust(tree, eta::Real) = map(st -> adjust(st, eta), tree)
101-
adjust(tree; kw...) = map(st -> adjust(st; kw...), tree)
105+
adjust!(tree, eta::Real) = foreach(st -> adjust!(st, eta), tree)
106+
adjust!(tree; kw...) = foreach(st -> adjust!(st; kw...), tree)
102107

103-
adjust(::Nothing, ::Real) = nothing
104-
adjust(::Nothing; kw...) = nothing
108+
adjust!(ℓ::Leaf, eta::Real) = (ℓ.rule = adjust(ℓ.rule, eta); nothing)
109+
adjust!(ℓ::Leaf; kw...) = (ℓ.rule = adjust(ℓ.rule; kw...); nothing)
105110

106111
adjust(ℓ::Leaf, eta::Real) = Leaf(adjust(ℓ.rule, eta), ℓ.state, ℓ.frozen)
107112
adjust(ℓ::Leaf; kw...) = Leaf(adjust(ℓ.rule; kw...), ℓ.state, ℓ.frozen)
108113

114+
"""
115+
adjust(tree, η) -> tree
116+
117+
Like [`adjust!`](@ref Optimisers.adjust!), but returns a new tree instead of mutating the old one.
118+
"""
119+
function adjust(tree, eta::Real)
120+
t′ = fmap(copy, tree; exclude = maywrite) # same as used for update / update!
121+
adjust!(t′, eta)
122+
t′
123+
end
124+
function adjust(tree; kw...)
125+
t′ = fmap(copy, tree; exclude = maywrite)
126+
adjust!(t′; kw...)
127+
t′
128+
end
109129

110130
"""
111131
Optimisers.adjust(rule::RuleType, η::Real) -> rule

test/runtests.jl

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ end
177177
@test eltype(s6[2].state[2]) == Float32
178178
end
179179

180-
@testset "adjusyting parameters" begin
180+
@testset "adjusting parameters, out-of-place" begin
181181
# Simple momentum:
182182
m = (α = ([0.0], sin), γ = Float32[4,3,2])
183183
s = Optimisers.setup(Momentum(0.1, 0.9), m)
@@ -221,6 +221,50 @@ end
221221
@test sc2.γ.state[2][1] ≈ [0.1, 0.2, 0.2]
222222
end
223223

224+
@testset "adjusting parameters, in-place" begin
225+
# Simple momentum:
226+
m = (α = ([0.0], sin), γ = Float32[4,3,2])
227+
s = Optimisers.setup(Momentum(0.1, 0.9), m)
228+
s1, m1 = Optimisers.update(s, m, (α = nothing, γ = [1,10,100],))
229+
@test m.γ .- m1.γ ≈ [0.1, 1, 10]
230+
@test s1.γ.rule.eta == 0.1
231+
@test s1.γ.state ≈ [0.1, 1, 10]
232+
233+
Optimisers.adjust!(s1, 0.2)
234+
@test s1.γ.rule.eta == 0.2
235+
@test s1.γ.rule.rho == 0.9
236+
@test s1.γ.state ≈ [0.1, 1, 10]
237+
@test s1.α[1].rule.eta == 0.2
238+
239+
Optimisers.adjust!(s1; eta=0.3, rho=0.7)
240+
@test s1.γ.rule.eta == 0.3
241+
@test s1.γ.rule.rho == 0.7
242+
@test s1.γ.state ≈ [0.1, 1, 10]
243+
@test s1.α[1].rule.rho == 0.7
244+
245+
_, m3 = Optimisers.update(s1, m, (α = nothing, γ = [1,10,100],))
246+
@test !(m.γ .- m3.γ ≈ [1, 10, 100])
247+
248+
Optimisers.adjust!(s1, zeta = "this does nothing")
249+
@test s1.γ.rule.eta == 0.3
250+
251+
# OptimiserChain
252+
sc = Optimisers.setup(OptimiserChain(ClipGrad(2), Adam()), m)
253+
sc1, mc1 = Optimisers.update(sc, m, (α = nothing, γ = [1,10,100],))
254+
@test sc1.γ.rule.opts[2].eta == 0.001f0
255+
@test sc1.γ.state[2][1] ≈ [0.1, 0.2, 0.2]
256+
257+
Optimisers.adjust!(sc1, 0.2)
258+
@test sc1.γ.rule.opts[1].delta == 2 # unchanged
259+
@test sc1.γ.rule.opts[2].eta === 0.2f0
260+
@test sc1.γ.state[2][1] ≈ [0.1, 0.2, 0.2]
261+
262+
Optimisers.adjust!(sc1; delta = 2.5) # ClipGrad(2) does not store an Int, for this reason
263+
@test sc1.γ.rule.opts[1].delta == 2.5
264+
@test sc1.γ.rule.opts[2].eta === 0.2f0 # unchanged
265+
@test sc1.γ.state[2][1] ≈ [0.1, 0.2, 0.2]
266+
end
267+
224268
@testset "freeze/thaw" begin
225269
m = (x=[1.0, 2.0], y=([3.0, 4.0], sin));
226270
st = Optimisers.setup(Descent(0.1), m);

0 commit comments

Comments
 (0)