Skip to content

Commit a7b1f11

Browse files
hstrey (Helmut Strey), avik-pal
authored
feat: support for ForwardDiff training (#1273)
* added extension for ForwardDiff * moved compute_gradients_imp ForwardDiff dispatch to /helpers/training/jl * removed LuxForwardDiffExt from Project.toml * Update src/helpers/training.jl * Update src/helpers/training.jl * added test for ForwardDiff training * removed () * created new testitem for ForwardDiff and added ForwardDiff Limitation to docstring * added test condition at the end of ForwardDiff test, and reduced function calls * feat: use caching to reduce memory allocations * Apply suggestions from code review --------- Co-authored-by: Helmut Strey <Helmut.Strey@stonybrook.edu> Co-authored-by: Avik Pal <avik.pal.2017@gmail.com> Co-authored-by: Avik Pal <avikpal@mit.edu>
1 parent d5dfb0c commit a7b1f11

File tree

5 files changed

+150
-2
lines changed

5 files changed

+150
-2
lines changed

Project.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Lux"
22
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
33
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
4-
version = "1.10.1"
4+
version = "1.11.0"
55

66
[deps]
77
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -11,6 +11,7 @@ ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
1111
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
1212
Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
1313
ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
14+
DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
1415
DispatchDoctor = "8d63f2c5-f18a-4cf2-ba9d-b3f60fc568c8"
1516
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
1617
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
@@ -82,6 +83,7 @@ ChainRulesCore = "1.25"
8283
Compat = "4.16"
8384
ComponentArrays = "0.15.22"
8485
ConcreteStructs = "0.2.3"
86+
DiffResults = "1.1"
8587
DispatchDoctor = "0.4.12"
8688
Enzyme = "0.13.35"
8789
EnzymeCore = "0.8.8"

src/Lux.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ include("extended_ops.jl")
8282
# Training Helpers
8383
include("helpers/optimizers.jl")
8484
include("helpers/training.jl")
85+
include("helpers/forwarddiff_training.jl")
8586

8687
# Experimental
8788
include("contrib/contrib.jl")
@@ -155,7 +156,8 @@ export Training
155156

156157
export jacobian_vector_product, vector_jacobian_product
157158
export batched_jacobian
158-
export AutoEnzyme, AutoForwardDiff, AutoReverseDiff, AutoTracker, AutoZygote
159+
# NOTE(review): the rewritten export list contained `AutoForwardDiff` twice; a name
# only needs to be exported once, so the duplicate is dropped.
export AutoEnzyme, AutoForwardDiff, AutoReverseDiff, AutoTracker, AutoZygote
159161

160162
export BinaryCrossEntropyLoss,
161163
BinaryFocalLoss,
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
using ADTypes: AutoForwardDiff
2+
using DiffResults: DiffResults
3+
using ForwardDiff: ForwardDiff
4+
using Setfield: @set!
5+
using Static: True, False
6+
7+
"""
    Training.compute_gradients_impl(ad::AutoForwardDiff, obj_fn, data, ts)

First-call dispatch for ForwardDiff-based gradient computation. Wraps `obj_fn` so the
updated states/stats can be recovered after differentiation, computes value and gradient
in one pass via `DiffResults`, and stores the wrapper plus the result buffer in the
training-state cache so subsequent calls with the same objective allocate nothing.

Returns `(gradient, loss, stats, updated_trainstate)`.
"""
function Training.compute_gradients_impl(
    ad::AutoForwardDiff, obj_fn::F, data, ts::Training.TrainState
) where {F}
    # `@assert` is for internal invariants and may be disabled at higher optimization
    # levels; user-input validation needs an explicit throw.
    ts.parameters isa AbstractArray || throw(ArgumentError(
        "AutoForwardDiff only supports AbstractArray parameters, not " *
        "$(typeof(ts.parameters)). To convert the parameter structure to an array " *
        "use `ComponentArray(ps)`.",
    ))

    # Wrap the objective so states/stats updated inside it are captured in the refs.
    obj_fn_wrap, st_wrap, stats_wrap = Training.wrap_objective_function(
        obj_fn, ts.model, ts.parameters, ts.states, data, True()
    )

    # DiffResults stores value and gradient together, avoiding a second evaluation.
    gradient_result = DiffResults.GradientResult(ts.parameters)
    ForwardDiff.gradient!(
        gradient_result, ps -> obj_fn_wrap(ts.model, ps, ts.states, data), ts.parameters
    )

    # Cache the wrapped objective and the result buffer for the fast path taken when
    # the same objective function is used on the next call.
    cache = Training.TrainingBackendCache(
        ad, False(), gradient_result, (; obj_fn=obj_fn_wrap, st_wrap, stats_wrap)
    )
    @set! ts.cache = cache
    @set! ts.objective_function = obj_fn
    @set! ts.states = st_wrap[]
    return (
        DiffResults.gradient(gradient_result),
        DiffResults.value(gradient_result),
        stats_wrap[],
        ts,
    )
end
37+
38+
# Cache layout produced by the first AutoForwardDiff `compute_gradients_impl` call.
# Matching against it selects the allocation-free fast path below.
const FORWARDDIFF_CACHE_TYPE = Training.TrainingBackendCache{
    <:AutoForwardDiff,False,PS,<:NamedTuple{(:obj_fn, :st_wrap, :stats_wrap)}
} where {PS}

# Fast path: the objective function is unchanged since the first call, so the cached
# wrapped objective and the preallocated DiffResults buffer can be reused directly.
function Training.compute_gradients_impl(
    ::AutoForwardDiff, obj_fn::F, data, ts::Training.TrainState{<:FORWARDDIFF_CACHE_TYPE,F}
) where {F}
    res = ts.cache.dparameters
    wrapped_obj = ts.cache.extras.obj_fn

    ForwardDiff.gradient!(
        res, ps -> wrapped_obj(ts.model, ps, ts.states, data), ts.parameters
    )

    @set! ts.objective_function = obj_fn
    @set! ts.states = ts.cache.extras.st_wrap[]

    return (
        DiffResults.gradient(res),
        DiffResults.value(res),
        ts.cache.extras.stats_wrap[],
        ts,
    )
end
63+
64+
# Fallback path: an AutoForwardDiff cache exists but the objective function differs
# from the cached one. The gradient buffer is still reused, but the objective wrapper
# is rebuilt (and deliberately not cached) on every call.
function Training.compute_gradients_impl(
    ::AutoForwardDiff,
    obj_fn::F,
    data,
    ts::Training.TrainState{<:Training.TrainingBackendCache{<:AutoForwardDiff,False}},
) where {F}
    @warn "Detected calls to `compute_gradients(::AutoForwardDiff, ...)` with objective \
           function that is changing across function calls. This can lead to the \
           generation of slow code" maxlog = 1

    res = ts.cache.dparameters

    # Same procedure as the initial dispatch, except the wrapper is not stored.
    wrapped_obj, st_ref, stats_ref = Training.wrap_objective_function(
        obj_fn, ts.model, ts.parameters, ts.states, data, False()
    )

    ForwardDiff.gradient!(
        res, ps -> wrapped_obj(ts.model, ps, ts.states, data), ts.parameters
    )

    @set! ts.states = st_ref[]
    return (DiffResults.gradient(res), DiffResults.value(res), stats_ref[], ts)
end

src/helpers/training.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ Compute the gradients of the objective function wrt parameters stored in `ts`.
160160
| `AutoReverseDiff(; compile)` | `ReverseDiff.jl` |
161161
| `AutoTracker` | `Tracker.jl` |
162162
| `AutoEnzyme` | `Enzyme.jl` |
163+
| `AutoForwardDiff` | `ForwardDiff.jl` |
163164
164165
## Arguments
165166
@@ -185,6 +186,8 @@ A 4-Tuple containing:
185186
- `AutoReverseDiff(; compile=true)` is not supported for Lux models with non-empty state
186187
`st`. Additionally the returned stats must be empty (`NamedTuple()`). We catch these
187188
issues in most cases and throw an error.
189+
- `AutoForwardDiff` only works with parameters that are `AbstractArray`s
  (e.g. `ps = ComponentVector(ps)`)
188191
189192
!!! danger "Aliased Gradients"
190193

test/helpers/training_tests.jl

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,56 @@ end
139139
end
140140
end
141141

142+
@testitem "Training API ForwardDiff" setup = [SharedTestSetup] tags = [:misc] begin
    using ADTypes, Optimisers, ComponentArrays

    loss_fn = MSELoss()

    rng = StableRNG(12345)

    # Small synthetic regression problem; targets rescaled to [0, 1].
    x_data = randn(rng, Float32, 4, 32)
    y_data = evalpoly.(x_data, ((1, 2, 3),)) .- evalpoly.(x_data, ((5, 2),))
    y_data = (y_data .- minimum(y_data)) ./ (maximum(y_data) - minimum(y_data))
    batches = [(x_data[:, idx], y_data[:, idx]) for idx in Iterators.partition(1:32, 8)]

    model = Chain(
        Dense(4, 32, tanh), BatchNorm(32), Dense(32, 32, tanh), BatchNorm(32), Dense(32, 4)
    )

    optimizer = Adam(0.001f0)

    # AutoForwardDiff requires flat array parameters, hence the ComponentVector.
    ps, st = Lux.setup(rng, model)
    train_state = Training.TrainState(model, ComponentVector(ps), st, optimizer)

    initial_loss = first(
        loss_fn(model, train_state.parameters, Lux.testmode(train_state.states), batches[1])
    )

    # Exercise compute_gradients + apply_gradients! (hits the cached fast path).
    for _ in 1:100, (x, y) in batches
        grads, _, _, train_state = allow_unstable() do
            Training.compute_gradients(AutoForwardDiff(), loss_fn, (x, y), train_state)
        end
        train_state = Training.apply_gradients!(train_state, grads)
    end

    # Exercise the in-place single-step API.
    for _ in 1:100, (x, y) in batches
        grads, _, _, train_state = allow_unstable() do
            Training.single_train_step!(AutoForwardDiff(), loss_fn, (x, y), train_state)
        end
    end

    # Exercise the out-of-place single-step API.
    for _ in 1:100, (x, y) in batches
        grads, _, _, train_state = allow_unstable() do
            Training.single_train_step(AutoForwardDiff(), loss_fn, (x, y), train_state)
        end
    end

    final_loss = first(
        loss_fn(model, train_state.parameters, train_state.states, batches[1])
    )

    # Training should have reduced the loss substantially.
    @test final_loss * 50 < initial_loss
end
191+
142192
@testitem "Enzyme: Invalidate Cache on State Update" setup = [SharedTestSetup] tags = [
143193
:misc
144194
] skip = :(using LuxTestUtils; !LuxTestUtils.ENZYME_TESTING_ENABLED) begin

0 commit comments

Comments
 (0)