perf: avoid double function call in ReverseDiff value_and_gradient (#729)

gdalle · web-flow · commit 3417ff540f0d · 2025-02-16T17:38:22.000+01:00
* perf: avoid double function call in ReverseDiff `value_and_gradient`

* Fixes

* Fix hessian

* Separate completely

* Replace DR.gradient(result) with grad
diff --git a/DifferentiationInterface/Project.toml b/DifferentiationInterface/Project.toml
@@ -1,7 +1,7 @@
 name = "DifferentiationInterface"
 uuid = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 authors = ["Guillaume Dalle", "Adrian Hill"]
-version = "0.6.40"
+version = "0.6.41"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
diff --git a/DifferentiationInterface/ext/DifferentiationInterfaceReverseDiffExt/onearg.jl b/DifferentiationInterface/ext/DifferentiationInterfaceReverseDiffExt/onearg.jl
@@ -87,23 +87,26 @@ end
 function DI.value_and_gradient!(
     f, grad, prep::ReverseDiffGradientPrep, ::AutoReverseDiff{compile}, x
 ) where {compile}
-    y = f(x)  # TODO: ReverseDiff#251
-    result = DiffResult(y, (grad,))
+    result = MutableDiffResult(zero(eltype(x)), (grad,))  # ReverseDiff#251
     if compile
         result = gradient!(result, prep.tape, x)
     else
         result = gradient!(result, f, x, prep.config)
     end
-    y = DR.value(result)
-    grad === DR.gradient(result) || copyto!(grad, DR.gradient(result))
-    return y, grad
+    return DR.value(result), grad  # ReverseDiff#269
 end
 
 function DI.value_and_gradient(
-    f, prep::ReverseDiffGradientPrep, backend::AutoReverseDiff, x
-)
-    grad = similar(x)
-    return DI.value_and_gradient!(f, grad, prep, backend, x)
+    f, prep::ReverseDiffGradientPrep, backend::AutoReverseDiff{compile}, x
+) where {compile}
+    # GradientResult tries to mutate an SArray
+    result = MutableDiffResult(zero(eltype(x)), (similar(x),))
+    if compile
+        result = gradient!(result, prep.tape, x)
+    else
+        result = gradient!(result, f, x, prep.config)
+    end
+    return DR.value(result), DR.gradient(result)
 end
 
 function DI.gradient!(
@@ -144,23 +147,19 @@ function DI.value_and_gradient!(
     contexts::Vararg{DI.Context,C},
 ) where {C}
     fc = DI.with_contexts(f, contexts...)
-    y = fc(x)  # TODO: ReverseDiff#251
-    result = DiffResult(y, (grad,))
+    result = MutableDiffResult(zero(eltype(x)), (grad,))  # ReverseDiff#251
     result = gradient!(result, fc, x, prep.config)
-    y = DR.value(result)
-    grad === DR.gradient(result) || copyto!(grad, DR.gradient(result))
-    return y, grad
+    return DR.value(result), grad  # ReverseDiff#269
 end
 
 function DI.value_and_gradient(
-    f,
-    prep::ReverseDiffGradientPrep,
-    backend::AutoReverseDiff,
-    x,
-    contexts::Vararg{DI.Context,C},
+    f, prep::ReverseDiffGradientPrep, ::AutoReverseDiff, x, contexts::Vararg{DI.Context,C}
 ) where {C}
-    grad = similar(x)
-    return DI.value_and_gradient!(f, grad, prep, backend, x, contexts...)
+    fc = DI.with_contexts(f, contexts...)
+    # GradientResult tries to mutate an SArray
+    result = MutableDiffResult(zero(eltype(x)), (similar(x),))
+    result = gradient!(result, fc, x, prep.config)
+    return DR.value(result), DR.gradient(result)
 end
 
 function DI.gradient!(
@@ -310,31 +309,23 @@ end
 
 ### Without contexts
 
-@kwdef struct ReverseDiffHessianPrep{GC,HC,GT,HT} <: DI.HessianPrep
-    gradient_config::GC
+@kwdef struct ReverseDiffHessianPrep{G<:ReverseDiffGradientPrep,HC,HT} <: DI.HessianPrep
+    gradient_prep::G
     hessian_config::HC
-    gradient_tape::GT
     hessian_tape::HT
 end
 
-function DI.prepare_hessian(f, ::AutoReverseDiff{compile}, x) where {compile}
+function DI.prepare_hessian(f, backend::AutoReverseDiff{compile}, x) where {compile}
+    gradient_prep = DI.prepare_gradient(f, backend, x)
     if compile
-        gradient_tape = ReverseDiff.compile(GradientTape(f, x))
         hessian_tape = ReverseDiff.compile(HessianTape(f, x))
         return ReverseDiffHessianPrep(;
-            gradient_config=nothing,
-            hessian_config=nothing,
-            gradient_tape=gradient_tape,
-            hessian_tape=hessian_tape,
+            gradient_prep, hessian_config=nothing, hessian_tape=hessian_tape
         )
     else
-        gradient_config = GradientConfig(x)
         hessian_config = HessianConfig(x)
         return ReverseDiffHessianPrep(;
-            gradient_config=gradient_config,
-            hessian_config=hessian_config,
-            gradient_tape=nothing,
-            hessian_tape=nothing,
+            gradient_prep, hessian_config=hessian_config, hessian_tape=nothing
         )
     end
 end
@@ -360,47 +351,32 @@ function DI.hessian(
 end
 
 function DI.value_gradient_and_hessian!(
-    f, grad, hess, prep::ReverseDiffHessianPrep, ::AutoReverseDiff{compile}, x
+    f, grad, hess, prep::ReverseDiffHessianPrep, backend::AutoReverseDiff{compile}, x
 ) where {compile}
-    y = f(x)  # TODO: ReverseDiff#251
-    result = DiffResult(y, (grad, hess))
-    if compile
-        result = hessian!(result, prep.hessian_tape, x)
-        grad = gradient!(grad, prep.gradient_tape, x) # TODO: ReverseDiff#251
-    else
-        result = hessian!(result, f, x)  # TODO: add prep.hessian_config
-        grad = gradient!(grad, f, x, prep.gradient_config) # TODO: ReverseDiff#251
-    end
-    # grad === DR.gradient(result) || copyto!(grad, DR.gradient(result))
-    hess === DR.hessian(result) || copyto!(hess, DR.hessian(result))
+    y = f(x)
+    DI.gradient!(f, grad, prep.gradient_prep, backend, x)
+    DI.hessian!(f, hess, prep, backend, x)
     return y, grad, hess
 end
 
 function DI.value_gradient_and_hessian(
-    f, prep::ReverseDiffHessianPrep, ::AutoReverseDiff{compile}, x
+    f, prep::ReverseDiffHessianPrep, backend::AutoReverseDiff{compile}, x
 ) where {compile}
-    y = f(x)  # TODO: remove once ReverseDiff#251 is fixed
-    result = DiffResult(y, (similar(x), similar(x, length(x), length(x))))
-    if compile
-        result = hessian!(result, prep.hessian_tape, x)
-    else
-        result = hessian!(result, f, x)  # todo: add prep.hessian_config
-    end
-    return (y, DR.gradient(result), DR.hessian(result))
+    y = f(x)
+    grad = DI.gradient(f, prep.gradient_prep, backend, x)
+    hess = DI.hessian(f, prep, backend, x)
+    return y, grad, hess
 end
 
 ### With contexts
 
 function DI.prepare_hessian(
-    f, ::AutoReverseDiff, x, contexts::Vararg{DI.Context,C}
+    f, backend::AutoReverseDiff, x, contexts::Vararg{DI.Context,C}
 ) where {C}
-    gradient_config = GradientConfig(x)
+    gradient_prep = DI.prepare_gradient(f, backend, x, contexts...)
     hessian_config = HessianConfig(x)
     return ReverseDiffHessianPrep(;
-        gradient_config=gradient_config,
-        hessian_config=hessian_config,
-        gradient_tape=nothing,
-        hessian_tape=nothing,
+        gradient_prep, hessian_config=hessian_config, hessian_tape=nothing
     )
 end
 
@@ -428,27 +404,25 @@ function DI.value_gradient_and_hessian!(
     grad,
     hess,
     prep::ReverseDiffHessianPrep,
-    ::AutoReverseDiff,
+    backend::AutoReverseDiff,
     x,
     contexts::Vararg{DI.Context,C},
 ) where {C}
-    fc = DI.with_contexts(f, contexts...)
-    y = fc(x)  # TODO: ReverseDiff#251
-    result = DiffResult(y, (grad, hess))
-    result = hessian!(result, fc, x)  # TODO: add prep.hessian_config
-    y = DR.value(result)
-    # grad === DR.gradient(result) || copyto!(grad, DR.gradient(result))
-    grad = gradient!(grad, fc, x, prep.gradient_config)  # TODO: ReverseDiff#251
-    hess === DR.hessian(result) || copyto!(hess, DR.hessian(result))
+    y = f(x, map(DI.unwrap, contexts)...)
+    DI.gradient!(f, grad, prep.gradient_prep, backend, x, contexts...)
+    DI.hessian!(f, hess, prep, backend, x, contexts...)
     return y, grad, hess
 end
 
 function DI.value_gradient_and_hessian(
-    f, prep::ReverseDiffHessianPrep, ::AutoReverseDiff, x, contexts::Vararg{DI.Context,C}
+    f,
+    prep::ReverseDiffHessianPrep,
+    backend::AutoReverseDiff,
+    x,
+    contexts::Vararg{DI.Context,C},
 ) where {C}
-    fc = DI.with_contexts(f, contexts...)
-    y = fc(x)  # TODO: ReverseDiff#251
-    result = HessianResult(x)
-    result = hessian!(result, fc, x)  # TODO: add prep.hessian_config
-    return (DR.value(result), DR.gradient(result), DR.hessian(result))
+    y = f(x, map(DI.unwrap, contexts)...)
+    grad = DI.gradient(f, prep.gradient_prep, backend, x, contexts...)
+    hess = DI.hessian(f, prep, backend, x, contexts...)
+    return y, grad, hess
 end
diff --git a/DifferentiationInterface/test/Back/ReverseDiff/test.jl b/DifferentiationInterface/test/Back/ReverseDiff/test.jl
@@ -29,7 +29,9 @@ test_differentiation(
     logging=LOGGING,
 );
 
-test_differentiation(backends, static_scenarios(); logging=LOGGING);
+test_differentiation(
+    backends, static_scenarios(; include_constantified=true); logging=LOGGING
+);
 
 ## Sparse