Commit e6d8160

Rule for gradient accumulation (#137)
* implement AccumGrad
* gradient accumulation
* new interface
* more tests
* remove NoUpdate
* fix
* test for subtract! Zero
* fix
* don't test AccumGrad with other rules
* another variant
* Update src/rules.jl

Co-authored-by: Michael Abbott <[email protected]>

* less docs

---------

Co-authored-by: Michael Abbott <[email protected]>
1 parent 14949f1 commit e6d8160

File tree: 7 files changed (+100, -5 lines)
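For orientation, here is a minimal sketch of how the new rule is meant to be used, based on the docstring added to src/rules.jl below. The model, the loop over random gradients, and the `Adam(1e-3)` settings are illustrative placeholders, not part of the commit:

```julia
using Optimisers

# Accumulate 4 micro-batch gradients, then take one Adam step on their mean.
model = (w = rand(Float32, 10),)                  # any Functors-compatible model
rule  = OptimiserChain(AccumGrad(4), Adam(1e-3))
state = Optimisers.setup(rule, model)

for step in 1:8
    grad = (w = rand(Float32, 10),)               # stand-in for a real gradient
    # The parameters move only on steps 4 and 8; the other calls just add
    # grad/4 into AccumGrad's state and leave `model` unchanged.
    state, model = Optimisers.update(state, model, grad)
end
```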

src/Optimisers.jl

Lines changed: 2 additions & 1 deletion

@@ -14,7 +14,8 @@ export destructure
 include("rules.jl")
 export Descent, Adam, Momentum, Nesterov, Rprop, RMSProp,
        AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW, RAdam, OAdam, AdaBelief,
-       WeightDecay, ClipGrad, ClipNorm, OptimiserChain, Lion
+       WeightDecay, ClipGrad, ClipNorm, OptimiserChain, Lion,
+       AccumGrad

 ###
 ### one-array functions

src/adjust.jl

Lines changed: 0 additions & 1 deletion

@@ -144,4 +144,3 @@ function _adjust(r::T, nt::NamedTuple) where T <: AbstractRule
   end
   T(vals...)  # relies on having the default constructor
 end
-

src/backup.jl

Whitespace-only changes.

src/interface.jl

Lines changed: 4 additions & 1 deletion

@@ -1,7 +1,8 @@
-
 using ChainRulesCore: canonicalize, backing, Tangent, AbstractZero, ZeroTangent
+
 base(dx::Tangent) = backing(canonicalize(dx))
 base(dx) = dx
+
 const Zero = Union{Nothing, AbstractZero}  # Union{Zygote, Diffractor}

 abstract type AbstractRule end

@@ -96,6 +97,7 @@ function _update!(ℓ::Leaf, x; grads, params)
 end

 subtract!(x, x̄) = maywrite(x) ? (x .= x .- x̄) : eltype(x).(x .- x̄)
+subtract!(x, x̄::Zero) = x

 _grads!(dict::IdDict, ℓ::Leaf, x, ::Zero...) = nothing
 function _grads!(dict::IdDict, ℓ::Leaf, x, x̄s...)

@@ -222,3 +224,4 @@ Broadcast.materialize(x::Lazy) = Broadcast.instantiate(x.bc)

 onevalue(λ::T, x::AbstractArray{T}) where T = map(_ -> λ, x)
 onevalue(λ, x::AbstractArray{T}) where T = onevalue(convert(float(T), λ), x)
+
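The new `subtract!(x, x̄::Zero) = x` method is what lets a rule signal "no update this step" by returning `nothing` (or a `ZeroTangent`) as the transformed gradient: the leaf then leaves the parameters untouched instead of trying to broadcast `x .- nothing`. A small sketch of the dispatch, with illustrative values:

```julia
using Optimisers, ChainRulesCore

x = [1.0, 2.0, 3.0]

# Ordinary gradient: in-place broadcasted subtraction, as before.
Optimisers.subtract!(x, [0.1, 0.1, 0.1])       # x is now [0.9, 1.9, 2.9]

# Zero-like gradients (`nothing` or `ZeroTangent()`) fall into the `Zero`
# union, and the new method returns `x` unchanged.
Optimisers.subtract!(x, nothing) === x          # true
Optimisers.subtract!(x, ZeroTangent()) === x    # true
```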

src/rules.jl

Lines changed: 64 additions & 2 deletions

@@ -631,6 +631,7 @@ so `update!` will subtract the full gradient from the parameters.
 This is equivalent to `Descent(1)`.

 # Example
+
 ```jldoctest
 julia> o = OptimiserChain(ClipGrad(1.0), Descent(0.1));


@@ -654,8 +655,12 @@ init(o::OptimiserChain, x::AbstractArray) = map(opt -> init(opt, x), o.opts)

 function apply!(o::OptimiserChain, states, x, dx, dxs...)
   foldl(tuple.(o.opts, states); init = ((), dx)) do (states′, dx′), (opt, state)
-    state′, dx′ = apply!(opt, state, x, dx′, dxs...)
-    return (states′..., state′), dx′
+    if dx′ isa Zero
+      return (states′..., state), dx′
+    else
+      state′, dx′ = apply!(opt, state, x, dx′, dxs...)
+      return (states′..., state′), dx′
+    end
   end
 end

@@ -667,3 +672,60 @@ end

 adjust(ℓ::OptimiserChain, eta::Real) = OptimiserChain(map(opt -> adjust(opt, eta), ℓ.opts)...)
 adjust(ℓ::OptimiserChain; kw...) = OptimiserChain(map(opt -> adjust(opt; kw...), ℓ.opts)...)
+
+
+"""
+    AccumGrad(n::Int)
+
+A rule constructed `OptimiserChain(AccumGrad(n), Rule())` will accumulate for `n` steps,
+before applying `Rule` to the mean of these `n` gradients.
+
+This is useful for training with effective batch sizes too large for the available memory.
+Instead of computing the gradient for batch size `b` at once, compute it for size `b/n` and
+accumulate `n` such gradients.
+
+# Example
+```jldoctest
+julia> m = (x=[1f0], y=[2f0]);
+
+julia> r = OptimiserChain(AccumGrad(2), WeightDecay(0.01), Descent(0.1));
+
+julia> s = Optimisers.setup(r, m);
+
+julia> Optimisers.update!(s, m, (x=[33], y=[0]));
+
+julia> m  # model not yet changed
+(x = Float32[1.0], y = Float32[2.0])
+
+julia> Optimisers.update!(s, m, (x=[0], y=[444]));
+
+julia> m  # n=2 gradients applied at once
+(x = Float32[-0.651], y = Float32[-20.202])
+```
+"""
+struct AccumGrad <: AbstractRule
+  n::Int
+
+  function AccumGrad(n::Int)
+    n > 0 || throw(ArgumentError("AccumGrad must accumulate at least one gradient"))
+    return new(n)
+  end
+end
+
+function init(o::AccumGrad, x)
+  return (zero(x), 1)
+end
+
+function apply!(o::AccumGrad, state, x, dx)
+  accum_dx, counter = state
+  if counter == 1
+    @.. accum_dx = dx / o.n
+  else
+    @.. accum_dx = accum_dx + dx / o.n
+  end
+  if counter == o.n
+    return (accum_dx, 1), accum_dx
+  else
+    return (accum_dx, counter + 1), nothing
+  end
+end
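To make the doctest numbers in the new docstring concrete: with `AccumGrad(2)` the mean gradient for `x` over the two `update!` calls is `(33 + 0)/2 = 16.5`, `WeightDecay(0.01)` then adds `0.01 * x`, and `Descent(0.1)` scales by the learning rate, so `x` moves from `1.0` to `1.0 - 0.1 * (16.5 + 0.01 * 1.0) = -0.651`. A hand-check of the same arithmetic (illustration only, not part of the commit):

```julia
# Reproduce the doctest result (x = -0.651, y = -20.202) by hand.
n, γ, η = 2, 0.01, 0.1      # AccumGrad(2), WeightDecay(0.01), Descent(0.1)
x, y = 1.0, 2.0

gx = (33 + 0) / n           # mean gradient for x over the two update! calls
gy = (0 + 444) / n          # mean gradient for y

Δx = η * (gx + γ * x)       # WeightDecay adds γ*x to the gradient, Descent scales by η
Δy = η * (gy + γ * y)

(x - Δx, y - Δy)            # ≈ (-0.651, -20.202), matching the docstring
```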

test/rules.jl

Lines changed: 23 additions & 0 deletions

@@ -244,3 +244,26 @@ VERSION < v"1.9-" && @testset "using Yota" begin
   @test loss(w, w′) < 0.001
 end
 end
+
+@testset "AccumGrad" begin
+  x0 = rand(5)
+  x = copy(x0)
+  lr = 0.01
+  tree = Optimisers.setup(OptimiserChain(AccumGrad(3), Descent(lr)), x)
+
+  g1 = rand(5)
+  tree, x1 = Optimisers.update(tree, x, g1)
+  @test x1 ≈ x
+  @test x1 ≈ x0
+  g2 = rand(5)
+  tree, x2 = Optimisers.update(tree, x1, g2)
+  @test x2 ≈ x
+  @test x2 ≈ x0
+  g3 = rand(5)
+  tree, x3 = Optimisers.update(tree, x2, g3)
+  @test x3 ≈ x0 .- lr .* (g1 .+ g2 .+ g3) ./ 3
+  g4 = rand(5)
+
+  tree, x4 = Optimisers.update(tree, x3, g4)
+  @test x4 ≈ x3
+end

test/runtests.jl

Lines changed: 7 additions & 0 deletions

@@ -499,6 +499,13 @@ y2z(x) = x
   end
 end  # 2nd-order

+@testset "subtract! handles Zero" begin
+  x = rand(3)
+  y = Optimisers.subtract!(x, ChainRulesCore.ZeroTangent())
+  @test y === x
+  y = Optimisers.subtract!(x, nothing)
+  @test y === x
+end
 end
 @testset verbose=true "Destructure" begin
   include("destructure.jl")
