Skip to content

Commit 5b6b380

Browse files
authored
minimal trainable (#36)
1 parent 132bdd8 commit 5b6b380

File tree

3 files changed

+75
-7
lines changed

3 files changed

+75
-7
lines changed

docs/src/api.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ Optimisers.update
3535
Optimisers.update!
3636
```
3737

38+
Calling `Functors.@functor` on your model's layer types by default causes the
39+
optimiser to act on all suitable fields. To restrict this, define `trainable`:
40+
41+
```@docs
42+
Optimisers.trainable
43+
```
44+
3845
## Rule Definition
3946

4047
```@docs

src/interface.jl

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,14 @@ function setup(rule, x; seen = Base.IdSet())
1212
elseif isleaf(x)
1313
return nothing
1414
else
15-
x′, _ = functor(x)
16-
return map(xᵢ -> setup(rule, xᵢ; seen), x′)
15+
return map(xᵢ -> setup(rule, xᵢ; seen), _trainable(x))
1716
end
1817
end
1918

2019
subtract!(x, x̄) = iswriteable(x) ? (x .= x .- x̄) : (x .- x̄)
2120

21+
update!(::Nothing, x, x̄s...) = nothing, x
22+
2223
function update!(ℓ::Leaf, x, x̄s...)
2324
if all(isnothing, x̄s)
2425
return ℓ, x
@@ -55,6 +56,26 @@ isnumeric(x) = false
5556
iswriteable(::DenseArray{<:AbstractFloat}) = true # more elaborate versions are possible, wait until needed?
5657
iswriteable(_) = false
5758

59+
"""
60+
trainable(x::Layer) -> NamedTuple
61+
62+
This should be overloaded to make optimisers ignore some fields of
63+
every `Layer`, which would otherwise contain trainable parameters.
64+
(Elements such as functions and sizes are always ignored.)
65+
66+
The default is `Functors.children(x)`, usually a NamedTuple of all fields,
67+
and `trainable(x)` must contain a subset of these.
68+
"""
69+
trainable(x) = functor(x)[1]
70+
71+
_trainable(x) = _trainable(functor(x)[1], trainable(x))
72+
_trainable(ch::NamedTuple, tr::NamedTuple) = merge(map(_ -> nothing, ch), tr)
73+
_trainable(ch::Tuple, tr::Tuple) = tr
74+
function _trainable(ch::NamedTuple, tr::Tuple) # for old Flux-style no-names tuple
75+
@warn "trainable(x) should now return a NamedTuple with the field names, not a Tuple"
76+
map(c -> c in tr ? c : nothing, ch)
77+
end
78+
5879
"""
5980
@.. x = x + y
6081
@.. x + y / z

test/runtests.jl

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
1-
using Optimisers, Test
2-
using Zygote
3-
using Statistics, Random, LinearAlgebra
4-
Random.seed!(1)
1+
using Optimisers, Functors, Zygote
2+
using LinearAlgebra, Statistics, Test, Random
53
using Optimisers: @..
64

5+
Random.seed!(1)
6+
7+
struct Foo; x; y; end
8+
Functors.@functor Foo
9+
Optimisers.trainable(x::Foo) = (x.y, x.x)
10+
11+
struct TwoThirds a; b; c; end
12+
Functors.@functor TwoThirds (a, c)
13+
Optimisers.trainable(x::TwoThirds) = (a = x.a,)
14+
715
@testset verbose=true "Optimisers.jl" begin
816

917
@testset "very basics" begin
@@ -23,7 +31,7 @@ using Optimisers: @..
2331
@test m3[1] ≈ [1,2] .- 0.1 .* [25, 33]
2432
end
2533

26-
@testset "$(first(string(o), 42))" for o in (
34+
@testset "rule: $(first(string(o), 42))" for o in (
2735
Descent(), ADAM(), Momentum(), Nesterov(), RMSProp(),
2836
ADAGrad(), AdaMax(), ADADelta(), AMSGrad(), NADAM(),
2937
ADAMW(), RADAM(), OADAM(), AdaBelief()
@@ -99,6 +107,38 @@ using Optimisers: @..
99107
@test isnan(m3n.γ[3])
100108
end
101109

110+
@testset "trainable subset" begin
111+
# Foo has an old-style tuple trainable, both elements
112+
mf = Foo([1,2], (a = sin, b = [3,4], c = 5))
113+
sf = Optimisers.setup(Descent(0.1), mf)
114+
gf = (x = nothing, y = (a = nothing, b = [1,1], c = 1))
115+
_, mf2 = Optimisers.update(sf, mf, gf)
116+
@test mf2.x == [1,2]
117+
@test mf2.y == (a = sin, b = [2.9, 3.9], c = 5)
118+
119+
# TwoThirds has functor a,c only, and trainable a only
120+
mt = TwoThirds(Float32[1,2], Float32[3,4], Float32[5,6])
121+
mt10 = fmap(x -> 10x, mt)
122+
@test mt10.a == [10, 20]
123+
@test mt10.b == [3, 4]
124+
@test mt10.c == [50, 60]
125+
st = Optimisers.setup(Momentum(0.1, 0.9), mt)
126+
gt = gradient(m -> sum(abs2, m.a) + 100sum(abs2, m.b), mt)
127+
_, mtup = Optimisers.update(st, mt, gt...)
128+
@test mtup.a ≈ [0.8, 1.6]
129+
@test mtup.b == [3, 4]
130+
@test mtup.c == [5, 6]
131+
132+
# Various kinds of missing branches together:
133+
m = Foo(
134+
TwoThirds(Foo(1.0, Float32[2,3,4]), 5.0, Float32[6,7]),
135+
TwoThirds((p = Float32[1,2,3],), sin, (q = 4.0, r = cos,)),
136+
)
137+
s = Optimisers.setup(Momentum(0.1, 0.9), m)
138+
g = gradient(m -> sum(abs2, m.x.a.y) + m.x.b^2 + log(m.y.c.q), m)
139+
@test Optimisers.update!(s, m, g...)[2] isa Foo
140+
end
141+
102142
@testset "broadcasting macro" begin
103143
x = [1.0, 2.0]; y = [3,4]; z = [5,6]
104144
@test (@.. x + y * z) isa Broadcast.Broadcasted

0 commit comments

Comments
 (0)