diff --git a/Project.toml b/Project.toml index ab0bfb2..e8e7aab 100644 --- a/Project.toml +++ b/Project.toml @@ -1,16 +1,16 @@ name = "ProximalAlgorithms" uuid = "140ffc9f-1907-541a-a177-7475e0a401e9" -version = "0.5.5" +version = "0.6.0" [deps] +AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProximalCore = "dc4f5ac2-75d1-4f31-931e-60435d74994b" -Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] +AbstractDifferentiation = "0.6" LinearAlgebra = "1.2" Printf = "1.2" ProximalCore = "0.1" -Zygote = "0.6" julia = "1.2" diff --git a/README.md b/README.md index 15129be..525ed7b 100644 --- a/README.md +++ b/README.md @@ -11,14 +11,21 @@ A Julia package for non-smooth optimization algorithms. This package provides algorithms for the minimization of objective functions that include non-smooth terms, such as constraints or non-differentiable penalties. Implemented algorithms include: -* (Fast) Proximal gradient methods -* Douglas-Rachford splitting -* Three-term splitting -* Primal-dual splitting algorithms -* Newton-type methods - -This package works well in combination with [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) (>= 0.15), -which contains a wide range of functions that can be used to express cost terms. +- (Fast) Proximal gradient methods +- Douglas-Rachford splitting +- Three-term splitting +- Primal-dual splitting algorithms +- Newton-type methods + +Check out [this section](https://juliafirstorder.github.io/ProximalAlgorithms.jl/stable/guide/implemented_algorithms/) for an overview of the available algorithms. + +Algorithms rely on: +- [AbstractDifferentiation.jl](https://github.com/JuliaDiff/AbstractDifferentiation.jl) for automatic differentiation +(but you can easily bring your own gradients) +- the [ProximalCore API](https://github.com/JuliaFirstOrder/ProximalCore.jl) for proximal mappings, projections, etc, +to handle non-differentiable terms +(see for example [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) +for an extensive collection of functions). 
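How the two pieces fit together in practice, as a minimal sketch (the problem data and regularization weight here are made up for illustration; the solver call mirrors the tests updated later in this diff):

```julia
using LinearAlgebra
using Zygote
using AbstractDifferentiation: ZygoteBackend
using ProximalOperators: NormL1
using ProximalAlgorithms

A = [1.0 -2.0 3.0; 2.0 -1.0 0.0]
b = [1.0, 2.0]

# smooth term: least-squares cost, differentiated through the Zygote backend
f = ProximalAlgorithms.AutoDifferentiable(x -> norm(A * x - b)^2 / 2, ZygoteBackend())
# non-smooth term: L1 penalty, whose proximal mapping comes from ProximalOperators
g = NormL1(0.1)

solver = ProximalAlgorithms.FastForwardBackward(tol = 1e-5)
solution, iterations = solver(x0 = zeros(3), f = f, g = g)
```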
## Documentation diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index a63736d..74a3dff 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -8,6 +8,22 @@ using FileIO const SUITE = BenchmarkGroup() +function ProximalAlgorithms.value_and_gradient_closure(f::ProximalOperators.LeastSquaresDirect, x) + res = f.A*x - f.b + norm(res)^2, () -> f.A'*res +end + +struct SquaredDistance{Tb} + b::Tb +end + +(f::SquaredDistance)(x) = norm(x - f.b)^2 + +function ProximalAlgorithms.value_and_gradient_closure(f::SquaredDistance, x) + diff = x - f.b + norm(diff)^2, () -> diff +end + for (benchmark_name, file_name) in [ ("Lasso tiny", joinpath(@__DIR__, "data", "lasso_tiny.jld2")), ("Lasso small", joinpath(@__DIR__, "data", "lasso_small.jld2")), @@ -42,21 +58,21 @@ for (benchmark_name, file_name) in [ SUITE[k]["ZeroFPR"] = @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup=begin solver = ProximalAlgorithms.ZeroFPR(tol=1e-6) x0 = zeros($T, size($A, 2)) - f = Translate(SqrNormL2(), -$b) + f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["PANOC"] = @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup=begin solver = ProximalAlgorithms.PANOC(tol=1e-6) x0 = zeros($T, size($A, 2)) - f = Translate(SqrNormL2(), -$b) + f = SquaredDistance($b) g = NormL1($lam) end SUITE[k]["PANOCplus"] = @benchmarkable solver(x0=x0, f=f, A=$A, g=g) setup=begin solver = ProximalAlgorithms.PANOCplus(tol=1e-6) x0 = zeros($T, size($A, 2)) - f = Translate(SqrNormL2(), -$b) + f = SquaredDistance($b) g = NormL1($lam) end diff --git a/docs/Project.toml b/docs/Project.toml index 708dd2b..f38368c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,4 +1,5 @@ [deps] +AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" @@ -7,6 +8,7 @@ Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" ProximalAlgorithms = "140ffc9f-1907-541a-a177-7475e0a401e9" ProximalCore = "dc4f5ac2-75d1-4f31-931e-60435d74994b" ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] Documenter = "1" diff --git a/docs/src/examples/sparse_linear_regression.jl b/docs/src/examples/sparse_linear_regression.jl index 9664e3b..3552a6d 100644 --- a/docs/src/examples/sparse_linear_regression.jl +++ b/docs/src/examples/sparse_linear_regression.jl @@ -51,10 +51,13 @@ end mean_squared_error(label, output) = mean((output .- label) .^ 2) / 2 +using Zygote +using AbstractDifferentiation: ZygoteBackend using ProximalAlgorithms -training_loss = ProximalAlgorithms.ZygoteFunction( - wb -> mean_squared_error(training_label, standardized_linear_model(wb, training_input)) +training_loss = ProximalAlgorithms.AutoDifferentiable( + wb -> mean_squared_error(training_label, standardized_linear_model(wb, training_input)), + ZygoteBackend() ) # As regularization we will use the L1 norm, implemented in [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl): diff --git a/docs/src/guide/custom_objectives.jl b/docs/src/guide/custom_objectives.jl index 56fc3c7..5189ddb 100644 --- a/docs/src/guide/custom_objectives.jl +++ b/docs/src/guide/custom_objectives.jl @@ -12,29 +12,32 @@ # # Defining the proximal mapping for a custom function type requires adding a method for [`ProximalCore.prox!`](@ref). 
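For instance, a hand-rolled indicator of the Euclidean unit ball might look as follows (a sketch with hypothetical names, in the spirit of the constraint used later in this guide; `prox!` writes the result into `y` and returns the function value at that point):

```julia
using LinearAlgebra
using ProximalCore

# indicator of the Euclidean unit ball: zero inside the ball, +Inf outside
struct IndUnitBall end

(::IndUnitBall)(x) = norm(x) <= 1 ? zero(real(eltype(x))) : real(eltype(x))(Inf)

function ProximalCore.prox!(y, ::IndUnitBall, x, gamma)
    # the prox of an indicator is the projection; gamma plays no role here
    y .= norm(x) > 1 ? x ./ norm(x) : x
    # return the value of the function at the projected point (always zero)
    return zero(real(eltype(x)))
end
```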
# -# To compute gradients, ProximalAlgorithms provides a fallback definition for [`ProximalCore.gradient!`](@ref), -# relying on [Zygote](https://github.com/FluxML/Zygote.jl) to use automatic differentiation. -# Therefore, you can provide any (differentiable) Julia function wherever gradients need to be taken, -# and everything will work out of the box. +# To compute gradients, algorithms use [`ProximalAlgorithms.value_and_gradient_closure`](@ref): +# this relies on [AbstractDifferentiation](https://github.com/JuliaDiff/AbstractDifferentiation.jl), for automatic differentiation +# with any of its supported backends, when functions are wrapped in [`ProximalAlgorithms.AutoDifferentiable`](@ref), +# as the examples below show. # -# If however one would like to provide their own gradient implementation (e.g. for efficiency reasons), -# they can simply implement a method for [`ProximalCore.gradient!`](@ref). +# If however you would like to provide your own gradient implementation (e.g. for efficiency reasons), +# you can simply implement a method for [`ProximalAlgorithms.value_and_gradient_closure`](@ref) on your own function type. # # ```@docs # ProximalCore.prox # ProximalCore.prox! -# ProximalCore.gradient -# ProximalCore.gradient! +# ProximalAlgorithms.value_and_gradient_closure +# ProximalAlgorithms.AutoDifferentiable # ``` # # ## Example: constrained Rosenbrock # # Let's try to minimize the celebrated Rosenbrock function, but constrained to the unit norm ball. The cost function is +using Zygote +using AbstractDifferentiation: ZygoteBackend using ProximalAlgorithms -rosenbrock2D = ProximalAlgorithms.ZygoteFunction( - x -> 100 * (x[2] - x[1]^2)^2 + (1 - x[1])^2 +rosenbrock2D = ProximalAlgorithms.AutoDifferentiable( + x -> 100 * (x[2] - x[1]^2)^2 + (1 - x[1])^2, + ZygoteBackend() ) # To enforce the constraint, we define the indicator of the unit ball, together with its proximal mapping: @@ -82,17 +85,23 @@ scatter!([solution[1]], [solution[2]], color=:red, markershape=:star5, label="co mutable struct Counting{T} f::T + eval_count::Int gradient_count::Int prox_count::Int end -Counting(f::T) where T = Counting{T}(f, 0, 0) +Counting(f::T) where T = Counting{T}(f, 0, 0, 0) -# Now we only need to intercept any call to `gradient!` and `prox!` and increase counters there: +# Now we only need to intercept any call to `value_and_gradient_closure` and `prox!` and increase counters there: -function ProximalCore.gradient!(y, f::Counting, x) - f.gradient_count += 1 - return ProximalCore.gradient!(y, f.f, x) +function ProximalAlgorithms.value_and_gradient_closure(f::Counting, x) + f.eval_count += 1 + fx, pb = ProximalAlgorithms.value_and_gradient_closure(f.f, x) + function counting_pullback() + f.gradient_count += 1 + return pb() + end + return fx, counting_pullback end function ProximalCore.prox!(y, f::Counting, x, gamma) @@ -109,5 +118,6 @@ solution, iterations = panoc(x0=-ones(2), f=f, g=g) # and check how many operations where actually performed: -println(f.gradient_count) -println(g.prox_count) +println("function evals: $(f.eval_count)") +println("gradient evals: $(f.gradient_count)") +println(" prox evals: $(g.prox_count)") diff --git a/docs/src/guide/getting_started.jl b/docs/src/guide/getting_started.jl index 228df29..1333bf4 100644 --- a/docs/src/guide/getting_started.jl +++ b/docs/src/guide/getting_started.jl @@ -20,7 +20,7 @@ # The literature on proximal operators and algorithms is vast: for an overview, one can refer to [Parikh2014](@cite), [Beck2017](@cite). 
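Bringing your own gradient, as described in the custom-objectives guide above, comes down to a single method definition. A sketch (the `MyQuadratic` type is hypothetical, analogous to the `Quadratic` helper defined in `test/runtests.jl` further down):

```julia
using LinearAlgebra
using ProximalAlgorithms

# a smooth cost with a hand-written gradient, bypassing automatic differentiation
struct MyQuadratic{M,V}
    Q::M   # assumed symmetric
    q::V
end

(f::MyQuadratic)(x) = dot(x, f.Q * x) / 2 + dot(f.q, x)

function ProximalAlgorithms.value_and_gradient_closure(f::MyQuadratic, x)
    Qx = f.Q * x
    # the value is computed eagerly, the gradient only if the closure gets called
    return dot(x, Qx) / 2 + dot(f.q, x), () -> Qx .+ f.q
end
```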
# # To evaluate these first-order primitives, in ProximalAlgorithms: -# * ``\nabla f_i`` falls back to using automatic differentiation (as provided by [Zygote](https://github.com/FluxML/Zygote.jl)). +# * ``\nabla f_i`` falls back to using automatic differentiation (as provided by [AbstractDifferentiation](https://github.com/JuliaDiff/AbstractDifferentiation.jl) and all of its backends). # * ``\operatorname{prox}_{f_i}`` relies on the intereface of [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) (>= 0.15). # Both of the above can be implemented for custom function types, as [documented here](@ref custom_terms). # @@ -51,11 +51,14 @@ # which we will solve using the fast proximal gradient method (also known as fast forward-backward splitting): using LinearAlgebra +using Zygote +using AbstractDifferentiation: ZygoteBackend using ProximalOperators using ProximalAlgorithms -quadratic_cost = ProximalAlgorithms.ZygoteFunction( - x -> dot([3.4 1.2; 1.2 4.5] * x, x) / 2 + dot([-2.3, 9.9], x) +quadratic_cost = ProximalAlgorithms.AutoDifferentiable( + x -> dot([3.4 1.2; 1.2 4.5] * x, x) / 2 + dot([-2.3, 9.9], x), + ZygoteBackend() ) box_indicator = ProximalOperators.IndBox(0, 1) @@ -69,8 +72,10 @@ ffb = ProximalAlgorithms.FastForwardBackward(maxit=1000, tol=1e-5, verbose=true) solution, iterations = ffb(x0=ones(2), f=quadratic_cost, g=box_indicator) # We can verify the correctness of the solution by checking that the negative gradient is orthogonal to the constraints, pointing outwards: +# for this, we just evaluate the closure `cl` returned as second output of [`value_and_gradient_closure`](@ref). --ProximalAlgorithms.gradient(quadratic_cost, solution)[1] +v, cl = ProximalAlgorithms.value_and_gradient_closure(quadratic_cost, solution) +-cl() # Or by plotting the solution against the cost function and constraint: diff --git a/docs/src/index.md b/docs/src/index.md index 87b95ef..b1a119f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -5,16 +5,21 @@ A Julia package for non-smooth optimization algorithms. [Link to GitHub reposito This package provides algorithms for the minimization of objective functions that include non-smooth terms, such as constraints or non-differentiable penalties. Implemented algorithms include: -* (Fast) Proximal gradient methods -* Douglas-Rachford splitting -* Three-term splitting -* Primal-dual splitting algorithms -* Newton-type methods +- (Fast) Proximal gradient methods +- Douglas-Rachford splitting +- Three-term splitting +- Primal-dual splitting algorithms +- Newton-type methods Check out [this section](@ref problems_algorithms) for an overview of the available algorithms. -This package works well in combination with [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) (>= 0.15), -which contains a wide range of functions that can be used to express cost terms. +Algorithms rely on: +- [AbstractDifferentiation.jl](https://github.com/JuliaDiff/AbstractDifferentiation.jl) for automatic differentiation +(but you can easily bring your own gradients) +- the [ProximalCore API](https://github.com/JuliaFirstOrder/ProximalCore.jl) for proximal mappings, projections, etc, +to handle non-differentiable terms +(see for example [ProximalOperators](https://github.com/JuliaFirstOrder/ProximalOperators.jl) +for an extensive collection of functions). !!! note @@ -23,20 +28,11 @@ which contains a wide range of functions that can be used to express cost terms. 
## Installation -Install the latest stable release with - ```julia julia> ] pkg> add ProximalAlgorithms ``` -To install the development version instead (`master` branch), do - -```julia -julia> ] -pkg> add ProximalAlgorithms#master -``` - ## Citing If you use any of the algorithms from ProximalAlgorithms in your research, you are kindly asked to cite the relevant bibliography. @@ -45,3 +41,4 @@ Please check [this section of the manual](@ref problems_algorithms) for algorith ## Contributing Contributions are welcome in the form of [issue notifications](https://github.com/JuliaFirstOrder/ProximalAlgorithms.jl/issues) or [pull requests](https://github.com/JuliaFirstOrder/ProximalAlgorithms.jl/pulls). When contributing new algorithms, we highly recommend looking at already implemented ones to get inspiration on how to structure the code. + diff --git a/justfile b/justfile new file mode 100644 index 0000000..5dc42fa --- /dev/null +++ b/justfile @@ -0,0 +1,19 @@ +julia: + julia --project=. + +instantiate: + julia --project=. -e 'using Pkg; Pkg.instantiate()' + +test: + julia --project=. -e 'using Pkg; Pkg.test()' + +format: + julia --project=. -e 'using JuliaFormatter: format; format(".")' + +docs: + julia --project=./docs docs/make.jl + +benchmark: + julia --project=benchmark -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + julia --project=benchmark benchmark/runbenchmarks.jl + diff --git a/src/ProximalAlgorithms.jl b/src/ProximalAlgorithms.jl index 743c0ea..2494da6 100644 --- a/src/ProximalAlgorithms.jl +++ b/src/ProximalAlgorithms.jl @@ -1,14 +1,48 @@ module ProximalAlgorithms +using AbstractDifferentiation using ProximalCore -using ProximalCore: prox, prox!, gradient, gradient! +using ProximalCore: prox, prox! const RealOrComplex{R} = Union{R,Complex{R}} const Maybe{T} = Union{T,Nothing} +""" + AutoDifferentiable(f, backend) + +Callable struct wrapping function `f` to be auto-differentiated using `backend`. + +When called, it evaluates the same as `f`, while [`ProximalAlgorithms.value_and_gradient_closure`](@ref) +is implemented using `backend` for automatic differentiation. +The backend can be any from [AbstractDifferentiation](https://github.com/JuliaDiff/AbstractDifferentiation.jl). +""" +struct AutoDifferentiable{F, B} + f::F + backend::B +end + +(f::AutoDifferentiable)(x) = f.f(x) + +""" + value_and_gradient_closure(f, x) + +Return a tuple containing the value of `f` at `x`, and a closure `cl`. + +Function `cl`, once called, yields the gradient of `f` at `x`. 
+""" +value_and_gradient_closure + +function value_and_gradient_closure(f::AutoDifferentiable, x) + fx, pb = AbstractDifferentiation.value_and_pullback_function(f.backend, f.f, x) + return fx, () -> pb(one(fx))[1] +end + +function value_and_gradient_closure(f::ProximalCore.Zero, x) + f(x), () -> zero(x) +end + # various utilities -include("utilities/ad.jl") include("utilities/fb_tools.jl") include("utilities/iteration_tools.jl") diff --git a/src/algorithms/davis_yin.jl b/src/algorithms/davis_yin.jl index 93ad084..01b362b 100644 --- a/src/algorithms/davis_yin.jl +++ b/src/algorithms/davis_yin.jl @@ -55,7 +55,8 @@ end function Base.iterate(iter::DavisYinIteration) z = copy(iter.x0) xg, = prox(iter.g, z, iter.gamma) - grad_f_xg, = gradient(iter.f, xg) + f_xg, cl = value_and_gradient_closure(iter.f, xg) + grad_f_xg = cl() z_half = 2 .* xg .- z .- iter.gamma .* grad_f_xg xh, = prox(iter.h, z_half, iter.gamma) res = xh - xg @@ -66,7 +67,8 @@ end function Base.iterate(iter::DavisYinIteration, state::DavisYinState) prox!(state.xg, iter.g, state.z, iter.gamma) - gradient!(state.grad_f_xg, iter.f, state.xg) + f_xg, cl = value_and_gradient_closure(iter.f, state.xg) + state.grad_f_xg .= cl() state.z_half .= 2 .* state.xg .- state.z .- iter.gamma .* state.grad_f_xg prox!(state.xh, iter.h, state.z_half, iter.gamma) state.res .= state.xh .- state.xg diff --git a/src/algorithms/fast_forward_backward.jl b/src/algorithms/fast_forward_backward.jl index b797478..916511c 100644 --- a/src/algorithms/fast_forward_backward.jl +++ b/src/algorithms/fast_forward_backward.jl @@ -68,7 +68,8 @@ end function Base.iterate(iter::FastForwardBackwardIteration) x = copy(iter.x0) - grad_f_x, f_x = gradient(iter.f, x) + f_x, cl = value_and_gradient_closure(iter.f, x) + grad_f_x = cl() gamma = iter.gamma === nothing ? 1 / lower_bound_smoothness_constant(iter.f, I, x, grad_f_x) : iter.gamma y = x - gamma .* grad_f_x z, g_z = prox(iter.g, y, gamma) @@ -103,7 +104,8 @@ function Base.iterate(iter::FastForwardBackwardIteration{R}, state::FastForwardB state.x .= state.z .+ beta .* (state.z .- state.z_prev) state.z_prev, state.z = state.z, state.z_prev - state.f_x = gradient!(state.grad_f_x, iter.f, state.x) + state.f_x, cl = value_and_gradient_closure(iter.f, state.x) + state.grad_f_x .= cl() state.y .= state.x .- state.gamma .* state.grad_f_x state.g_z = prox!(state.z, iter.g, state.y, state.gamma) state.res .= state.x .- state.z diff --git a/src/algorithms/forward_backward.jl b/src/algorithms/forward_backward.jl index 6d7f43b..15b6040 100644 --- a/src/algorithms/forward_backward.jl +++ b/src/algorithms/forward_backward.jl @@ -59,7 +59,8 @@ end function Base.iterate(iter::ForwardBackwardIteration) x = copy(iter.x0) - grad_f_x, f_x = gradient(iter.f, x) + f_x, cl = value_and_gradient_closure(iter.f, x) + grad_f_x = cl() gamma = iter.gamma === nothing ? 
1 / lower_bound_smoothness_constant(iter.f, I, x, grad_f_x) : iter.gamma y = x - gamma .* grad_f_x z, g_z = prox(iter.g, y, gamma) @@ -81,7 +82,8 @@ function Base.iterate(iter::ForwardBackwardIteration{R}, state::ForwardBackwardS state.grad_f_x, state.grad_f_z = state.grad_f_z, state.grad_f_x else state.x, state.z = state.z, state.x - state.f_x = gradient!(state.grad_f_x, iter.f, state.x) + state.f_x, cl = value_and_gradient_closure(iter.f, state.x) + state.grad_f_x .= cl() end state.y .= state.x .- state.gamma .* state.grad_f_x diff --git a/src/algorithms/li_lin.jl b/src/algorithms/li_lin.jl index cb572a8..fc7e367 100644 --- a/src/algorithms/li_lin.jl +++ b/src/algorithms/li_lin.jl @@ -62,7 +62,8 @@ end function Base.iterate(iter::LiLinIteration{R}) where {R} y = copy(iter.x0) - grad_f_y, f_y = gradient(iter.f, y) + f_y, cl = value_and_gradient_closure(iter.f, y) + grad_f_y = cl() # TODO: initialize gamma if not provided # TODO: authors suggest Barzilai-Borwein rule? @@ -102,7 +103,8 @@ function Base.iterate( else # TODO: re-use available space in state? # TODO: backtrack gamma at x - grad_f_x, f_x = gradient(iter.f, x) + f_x, cl = value_and_gradient_closure(iter.f, x) + grad_f_x = cl() x_forward = state.x - state.gamma .* grad_f_x v, g_v = prox(iter.g, x_forward, state.gamma) Fv = iter.f(v) + g_v @@ -121,7 +123,8 @@ function Base.iterate( Fx = Fv end - state.f_y = gradient!(state.grad_f_y, iter.f, state.y) + state.f_y, cl = value_and_gradient_closure(iter.f, state.y) + state.grad_f_y .= cl() state.y_forward .= state.y .- state.gamma .* state.grad_f_y state.g_z = prox!(state.z, iter.g, state.y_forward, state.gamma) diff --git a/src/algorithms/panoc.jl b/src/algorithms/panoc.jl index 5f38dcb..72414ff 100644 --- a/src/algorithms/panoc.jl +++ b/src/algorithms/panoc.jl @@ -86,7 +86,8 @@ f_model(iter::PANOCIteration, state::PANOCState) = f_model(state.f_Ax, state.At_ function Base.iterate(iter::PANOCIteration{R}) where R x = copy(iter.x0) Ax = iter.A * x - grad_f_Ax, f_Ax = gradient(iter.f, Ax) + f_Ax, cl = value_and_gradient_closure(iter.f, Ax) + grad_f_Ax = cl() gamma = iter.gamma === nothing ? 
iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : iter.gamma At_grad_f_Ax = iter.A' * grad_f_Ax y = x - gamma .* At_grad_f_Ax @@ -152,7 +153,8 @@ function Base.iterate(iter::PANOCIteration{R, Tx, Tf}, state::PANOCState) where state.x_d .= state.x .+ state.d state.Ax_d .= state.Ax .+ state.Ad - state.f_Ax_d = gradient!(state.grad_f_Ax_d, iter.f, state.Ax_d) + state.f_Ax_d, cl = value_and_gradient_closure(iter.f, state.Ax_d) + state.grad_f_Ax_d .= cl() mul!(state.At_grad_f_Ax_d, adjoint(iter.A), state.grad_f_Ax_d) copyto!(state.x, state.x_d) @@ -189,7 +191,8 @@ function Base.iterate(iter::PANOCIteration{R, Tx, Tf}, state::PANOCState) where # along a line using interpolation and linear combinations # this allows saving operations if isinf(f_Az) - f_Az = gradient!(state.grad_f_Az, iter.f, state.Az) + f_Az, cl = value_and_gradient_closure(iter.f, state.Az) + state.grad_f_Az .= cl() end if isinf(c) mul!(state.At_grad_f_Az, iter.A', state.grad_f_Az) @@ -203,7 +206,8 @@ function Base.iterate(iter::PANOCIteration{R, Tx, Tf}, state::PANOCState) where else # otherwise, in the general case where f is only smooth, we compute # one gradient and matvec per backtracking step - state.f_Ax = gradient!(state.grad_f_Ax, iter.f, state.Ax) + state.f_Ax, cl = value_and_gradient_closure(iter.f, state.Ax) + state.grad_f_Ax .= cl() mul!(state.At_grad_f_Ax, adjoint(iter.A), state.grad_f_Ax) end diff --git a/src/algorithms/panocplus.jl b/src/algorithms/panocplus.jl index cc9e994..2b33e4d 100644 --- a/src/algorithms/panocplus.jl +++ b/src/algorithms/panocplus.jl @@ -79,7 +79,8 @@ f_model(iter::PANOCplusIteration, state::PANOCplusState) = f_model(state.f_Ax, s function Base.iterate(iter::PANOCplusIteration{R}) where {R} x = copy(iter.x0) Ax = iter.A * x - grad_f_Ax, f_Ax = gradient(iter.f, Ax) + f_Ax, cl = value_and_gradient_closure(iter.f, Ax) + grad_f_Ax = cl() gamma = iter.gamma === nothing ? 
iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : iter.gamma At_grad_f_Ax = iter.A' * grad_f_Ax y = x - gamma .* At_grad_f_Ax @@ -97,7 +98,8 @@ function Base.iterate(iter::PANOCplusIteration{R}) where {R} ) else mul!(state.Az, iter.A, state.z) - gradient!(state.grad_f_Az, iter.f, state.Az) + f_Az, cl = value_and_gradient_closure(iter.f, state.Az) + state.grad_f_Az = cl() end mul!(state.At_grad_f_Az, adjoint(iter.A), state.grad_f_Az) return state, state @@ -152,7 +154,8 @@ function Base.iterate(iter::PANOCplusIteration{R}, state::PANOCplusState) where end mul!(state.Ax, iter.A, state.x) - state.f_Ax = gradient!(state.grad_f_Ax, iter.f, state.Ax) + state.f_Ax, cl = value_and_gradient_closure(iter.f, state.Ax) + state.grad_f_Ax .= cl() mul!(state.At_grad_f_Ax, adjoint(iter.A), state.grad_f_Ax) state.y .= state.x .- state.gamma .* state.At_grad_f_Ax @@ -162,7 +165,8 @@ function Base.iterate(iter::PANOCplusIteration{R}, state::PANOCplusState) where f_Az_upp = f_model(iter, state) mul!(state.Az, iter.A, state.z) - f_Az = gradient!(state.grad_f_Az, iter.f, state.Az) + f_Az, cl = value_and_gradient_closure(iter.f, state.Az) + state.grad_f_Az .= cl() if (iter.gamma === nothing || iter.adaptive == true) tol = 10 * eps(R) * (1 + abs(f_Az)) if f_Az > f_Az_upp + tol && state.gamma >= iter.minimum_gamma diff --git a/src/algorithms/primal_dual.jl b/src/algorithms/primal_dual.jl index 1ac83ba..d9e9074 100644 --- a/src/algorithms/primal_dual.jl +++ b/src/algorithms/primal_dual.jl @@ -167,7 +167,8 @@ end function Base.iterate(iter::AFBAIteration, state::AFBAState = AFBAState(x=copy(iter.x0), y=copy(iter.y0))) # perform xbar-update step - gradient!(state.gradf, iter.f, state.x) + f_x, cl = value_and_gradient_closure(iter.f, state.x) + state.gradf .= cl() mul!(state.temp_x, iter.L', state.y) state.temp_x .+= state.gradf state.temp_x .*= -iter.gamma[1] @@ -175,7 +176,8 @@ function Base.iterate(iter::AFBAIteration, state::AFBAState = AFBAState(x=copy(i prox!(state.xbar, iter.g, state.temp_x, iter.gamma[1]) # perform ybar-update step - gradient!(state.gradl, convex_conjugate(iter.l), state.y) + lc_y, cl = value_and_gradient_closure(convex_conjugate(iter.l), state.y) + state.gradl .= cl() state.temp_x .= iter.theta .* state.xbar .+ (1 - iter.theta) .* state.x mul!(state.temp_y, iter.L, state.temp_x) state.temp_y .-= state.gradl diff --git a/src/algorithms/sfista.jl b/src/algorithms/sfista.jl index 954309d..c66e1a8 100644 --- a/src/algorithms/sfista.jl +++ b/src/algorithms/sfista.jl @@ -71,7 +71,8 @@ function Base.iterate( state.a = (state.τ + sqrt(state.τ ^ 2 + 4 * state.τ * state.APrev)) / 2 state.A = state.APrev + state.a state.xt .= (state.APrev / state.A) .* state.yPrev + (state.a / state.A) .* state.xPrev - gradient!(state.gradf_xt, iter.f, state.xt) + f_xt, cl = value_and_gradient_closure(iter.f, state.xt) + state.gradf_xt .= cl() λ2 = state.λ / (1 + state.λ * iter.mf) # FISTA acceleration steps. prox!(state.y, iter.g, state.xt - λ2 * state.gradf_xt, λ2) @@ -93,7 +94,8 @@ function check_sc(state::SFISTAState, iter::SFISTAIteration, tol, termination_ty else # Classic (approximate) first-order stationary point [4]. The main inclusion is: r ∈ ∇f(y) + ∂h(y). 
λ2 = state.λ / (1 + state.λ * iter.mf) - gradf_y, = gradient(iter.f, state.y) + f_y, cl = value_and_gradient_closure(iter.f, state.y) + gradf_y = cl() r = gradf_y - state.gradf_xt + (state.xt - state.y) / λ2 res = norm(r) end diff --git a/src/algorithms/zerofpr.jl b/src/algorithms/zerofpr.jl index fee5ea5..6008cc4 100644 --- a/src/algorithms/zerofpr.jl +++ b/src/algorithms/zerofpr.jl @@ -84,7 +84,8 @@ f_model(iter::ZeroFPRIteration, state::ZeroFPRState) = f_model(state.f_Ax, state function Base.iterate(iter::ZeroFPRIteration{R}) where R x = copy(iter.x0) Ax = iter.A * x - grad_f_Ax, f_Ax = gradient(iter.f, Ax) + f_Ax, cl = value_and_gradient_closure(iter.f, Ax) + grad_f_Ax = cl() gamma = iter.gamma === nothing ? iter.alpha / lower_bound_smoothness_constant(iter.f, iter.A, x, grad_f_Ax) : iter.gamma At_grad_f_Ax = iter.A' * grad_f_Ax y = x - gamma .* At_grad_f_Ax @@ -130,7 +131,9 @@ function Base.iterate(iter::ZeroFPRIteration{R}, state::ZeroFPRState) where R f_Axbar_upp, f_Axbar else mul!(state.Axbar, iter.A, state.xbar) - f_model(iter, state), gradient!(state.grad_f_Axbar, iter.f, state.Axbar) + f_Axbar, cl = value_and_gradient_closure(iter.f, state.Axbar) + state.grad_f_Axbar .= cl() + f_model(iter, state), f_Axbar end # compute FBE @@ -164,7 +167,8 @@ function Base.iterate(iter::ZeroFPRIteration{R}, state::ZeroFPRState) where R state.x .= state.xbar_prev .+ state.tau .* state.d state.Ax .= state.Axbar .+ state.tau .* state.Ad # TODO: can precompute most of next line in case f is quadratic - state.f_Ax = gradient!(state.grad_f_Ax, iter.f, state.Ax) + state.f_Ax, cl = value_and_gradient_closure(iter.f, state.Ax) + state.grad_f_Ax .= cl() mul!(state.At_grad_f_Ax, iter.A', state.grad_f_Ax) state.y .= state.x .- state.gamma .* state.At_grad_f_Ax state.g_xbar = prox!(state.xbar, iter.g, state.y, state.gamma) diff --git a/src/utilities/ad.jl b/src/utilities/ad.jl deleted file mode 100644 index f1e43b7..0000000 --- a/src/utilities/ad.jl +++ /dev/null @@ -1,14 +0,0 @@ -using Zygote: pullback -using ProximalCore - -struct ZygoteFunction{F} - f::F -end - -(f::ZygoteFunction)(x) = f.f(x) - -function ProximalCore.gradient!(grad, f::ZygoteFunction, x) - fx, pb = pullback(f.f, x) - grad .= pb(one(fx))[1] - return fx -end diff --git a/src/utilities/fb_tools.jl b/src/utilities/fb_tools.jl index 34d6980..e2acff4 100644 --- a/src/utilities/fb_tools.jl +++ b/src/utilities/fb_tools.jl @@ -7,29 +7,29 @@ end function lower_bound_smoothness_constant(f, A, x, grad_f_Ax) R = real(eltype(x)) xeps = x .+ 1 - grad_f_Axeps, _ = gradient(f, A * xeps) + f_Axeps, cl = value_and_gradient_closure(f, A * xeps) + grad_f_Axeps = cl() return norm(A' * (grad_f_Axeps - grad_f_Ax)) / R(sqrt(length(x))) end function lower_bound_smoothness_constant(f, A, x) + R = real(eltype(x)) Ax = A * x - grad_f_Ax, _ = gradient(f, Ax) + f_Ax, cl = value_and_gradient_closure(f, Ax) + grad_f_Ax = cl() return lower_bound_smoothness_constant(f, A, x, grad_f_Ax) end _mul!(y, L, x) = mul!(y, L, x) _mul!(y, ::Nothing, x) = return -_gradient!(y, f, x) = gradient!(y, f, x) -_gradient!(::Nothing, f, x) = f(x) - function backtrack_stepsize!( gamma::R, f, A, g, x, f_Ax::R, At_grad_f_Ax, y, z, g_z::R, res, Az, grad_f_Az=nothing; alpha = 1, minimum_gamma = 1e-7 ) where R f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) - f_Az = _gradient!(grad_f_Az, f, Az) + f_Az, cl = value_and_gradient_closure(f, Az) tol = 10 * eps(R) * (1 + abs(f_Az)) while f_Az > f_Az_upp + tol && gamma >= minimum_gamma gamma /= 2 @@ -38,9 +38,12 @@ function 
backtrack_stepsize!( res .= x .- z f_Az_upp = f_model(f_Ax, At_grad_f_Ax, res, alpha / gamma) _mul!(Az, A, z) - f_Az = _gradient!(grad_f_Az, f, Az) + f_Az, cl = value_and_gradient_closure(f, Az) tol = 10 * eps(R) * (1 + abs(f_Az)) end + if grad_f_Az !== nothing + grad_f_Az .= cl() + end if gamma < minimum_gamma @warn "stepsize `gamma` became too small ($(gamma))" end @@ -51,7 +54,8 @@ function backtrack_stepsize!( gamma, f, A, g, x; alpha = 1, minimum_gamma = 1e-7 ) Ax = A * x - grad_f_Ax, f_Ax = gradient(f, Ax) + f_Ax, cl = value_and_gradient_closure(f, Ax) + grad_f_Ax = cl() At_grad_f_Ax = A' * grad_f_Ax y = x - gamma .* At_grad_f_Ax z, g_z = prox(g, y, gamma) diff --git a/test/Project.toml b/test/Project.toml index ea59c93..7241417 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,10 +1,14 @@ [deps] +AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d" AbstractOperators = "d9c5613a-d543-52d8-9afd-8f241a8c3f1c" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProximalCore = "dc4f5ac2-75d1-4f31-931e-60435d74994b" ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" +ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/test/definitions/arraypartition.jl b/test/definitions/arraypartition.jl deleted file mode 100644 index 87dcb78..0000000 --- a/test/definitions/arraypartition.jl +++ /dev/null @@ -1,20 +0,0 @@ -import ProximalCore -import RecursiveArrayTools - -@inline function ProximalCore.prox(h, x::RecursiveArrayTools.ArrayPartition, gamma...) - # unwrap - y, fy = ProximalCore.prox(h, x.x, gamma...) - # wrap - return RecursiveArrayTools.ArrayPartition(y), fy -end - -@inline function ProximalCore.gradient(h, x::RecursiveArrayTools.ArrayPartition) - # unwrap - grad, fx = ProximalCore.gradient(h, x.x) - # wrap - return RecursiveArrayTools.ArrayPartition(grad), fx -end - -@inline ProximalCore.prox!(y::RecursiveArrayTools.ArrayPartition, h, x::RecursiveArrayTools.ArrayPartition, gamma...) = ProximalCore.prox!(y.x, h, x.x, gamma...) 
- -@inline ProximalCore.gradient!(y::RecursiveArrayTools.ArrayPartition, h, x::RecursiveArrayTools.ArrayPartition) = ProximalCore.gradient!(y.x, h, x.x) diff --git a/test/definitions/compose.jl b/test/definitions/compose.jl deleted file mode 100644 index 8e38c2a..0000000 --- a/test/definitions/compose.jl +++ /dev/null @@ -1,29 +0,0 @@ -using RecursiveArrayTools: ArrayPartition -using ProximalCore - -struct ComposeAffine - f - A - b -end - -function (g::ComposeAffine)(x) - res = g.A * x .+ g.b - return g.f(res) -end - -function compose_affine_gradient!(y, g::ComposeAffine, x) - res = g.A * x .+ g.b - gradres, v = gradient(g.f, res) - mul!(y, adjoint(g.A), gradres) - return v -end - -ProximalCore.gradient!(y, g::ComposeAffine, x) = compose_affine_gradient!(y, g, x) -ProximalCore.gradient!(y::ArrayPartition, g::ComposeAffine, x::ArrayPartition) = compose_affine_gradient!(y, g, x) - -function ProximalCore.gradient(h::ComposeAffine, x::ArrayPartition) - grad_fx = similar(x) - fx = ProximalCore.gradient!(grad_fx, h, x) - return grad_fx, fx -end diff --git a/test/problems/test_elasticnet.jl b/test/problems/test_elasticnet.jl index 72e52b8..ffa376a 100644 --- a/test/problems/test_elasticnet.jl +++ b/test/problems/test_elasticnet.jl @@ -1,8 +1,10 @@ -@testset "Elastic net ($T)" for T in [Float32, Float64, ComplexF32, ComplexF64] - using ProximalOperators - using ProximalAlgorithms - using LinearAlgebra +using LinearAlgebra +using ProximalOperators: NormL1, SqrNormL2, ElasticNet, Translate +using ProximalAlgorithms +using Zygote +using AbstractDifferentiation: ZygoteBackend +@testset "Elastic net ($T)" for T in [Float32, Float64, ComplexF32, ComplexF64] A = T[ 1.0 -2.0 3.0 -4.0 5.0 2.0 -1.0 0.0 -1.0 3.0 @@ -19,7 +21,7 @@ reg1 = NormL1(R(1)) reg2 = SqrNormL2(R(1)) loss = Translate(SqrNormL2(R(1)), -b) - cost = LeastSquares(A, b) + cost = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2) / 2, ZygoteBackend()) L = opnorm(A)^2 @@ -52,7 +54,7 @@ afba_test_params = [ (2, 0, 130), - (1, 1, 1890), + (1, 1, 2000), (0, 1, 320), (0, 0, 194), (1, 0, 130), @@ -69,7 +71,7 @@ solver = ProximalAlgorithms.AFBA(theta = theta, mu = mu, tol = R(1e-6)) (x_afba, y_afba), it_afba = - solver(x0 = x0, y0 = y0, f = reg2, g = reg1, h = loss, L = A, beta_f = 1) + solver(x0 = x0, y0 = y0, f = ProximalAlgorithms.AutoDifferentiable(reg2, ZygoteBackend()), g = reg1, h = loss, L = A, beta_f = 1) @test eltype(x_afba) == T @test eltype(y_afba) == T @test norm(x_afba - x_star, Inf) <= 1e-4 @@ -86,7 +88,7 @@ solver = ProximalAlgorithms.AFBA(theta = theta, mu = mu, tol = R(1e-6)) (x_afba, y_afba), it_afba = - solver(x0 = x0, y0 = y0, f = reg2, g = reg1, h = loss, L = A, beta_f = 1) + solver(x0 = x0, y0 = y0, f = ProximalAlgorithms.AutoDifferentiable(reg2, ZygoteBackend()), g = reg1, h = loss, L = A, beta_f = 1) @test eltype(x_afba) == T @test eltype(y_afba) == T @test norm(x_afba - x_star, Inf) <= 1e-4 diff --git a/test/problems/test_equivalence.jl b/test/problems/test_equivalence.jl index 004d618..de8f970 100644 --- a/test/problems/test_equivalence.jl +++ b/test/problems/test_equivalence.jl @@ -1,7 +1,8 @@ using LinearAlgebra using Test - -using ProximalOperators +using Zygote +using AbstractDifferentiation: ZygoteBackend +using ProximalOperators: LeastSquares, NormL1 using ProximalAlgorithms: DouglasRachfordIteration, DRLSIteration, ForwardBackwardIteration, PANOCIteration, @@ -51,7 +52,7 @@ end lam = R(0.1) * norm(A' * b, Inf) - f = LeastSquares(A, b) + f = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2) / 2, 
ZygoteBackend()) g = NormL1(lam) x0 = zeros(R, n) @@ -79,7 +80,7 @@ end lam = R(0.1) * norm(A' * b, Inf) - f = LeastSquares(A, b) + f = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2) / 2, ZygoteBackend()) g = NormL1(lam) x0 = zeros(R, n) diff --git a/test/problems/test_lasso_small.jl b/test/problems/test_lasso_small.jl index ba0d69d..2f1c387 100644 --- a/test/problems/test_lasso_small.jl +++ b/test/problems/test_lasso_small.jl @@ -1,7 +1,9 @@ using LinearAlgebra using Test -using ProximalOperators +using Zygote +using AbstractDifferentiation: ZygoteBackend +using ProximalOperators: NormL1, LeastSquares using ProximalAlgorithms using ProximalAlgorithms: LBFGS, Broyden, AndersonAcceleration, @@ -23,8 +25,10 @@ using ProximalAlgorithms: lam = R(0.1) * norm(A' * b, Inf) @test typeof(lam) == R - f = Translate(SqrNormL2(R(1)), -b) - f2 = LeastSquares(A, b) + f_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2)/2, ZygoteBackend()) + fA_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2)/2, ZygoteBackend()) + f_prox = Translate(SqrNormL2(R(1)), -b) + fA_prox = LeastSquares(A, b) g = NormL1(lam) Lf = opnorm(A)^2 @@ -37,7 +41,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.ForwardBackward(tol = TOL) - x, it = @inferred solver(x0 = x0, f = f2, g = g, Lf = Lf) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 150 @@ -48,7 +52,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true) - x, it = @inferred solver(x0 = x0, f = f2, g = g) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 300 @@ -59,7 +63,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) - x, it = @inferred solver(x0 = x0, f = f2, g = g, Lf = Lf) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 100 @@ -71,7 +75,7 @@ using ProximalAlgorithms: x0_backup = copy(x0) solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) - x, it = @inferred solver(x0 = x0, f = f2, g = g) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 200 @@ -82,7 +86,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) - x, it = @inferred solver(x0 = x0, f = f2, g = g, Lf = Lf, extrapolation_sequence=FixedNesterovSequence(real(T))) + x, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf, extrapolation_sequence=FixedNesterovSequence(real(T))) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 100 @@ -93,7 +97,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.ZeroFPR(tol = TOL) - x, it = @inferred solver(x0 = x0, f = f, A = A, g = g, Lf = Lf) + x, it = @inferred solver(x0 = x0, f = f_autodiff, A = A, g = g, Lf = Lf) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 20 @@ -104,7 +108,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL) - x, it = @inferred solver(x0 = x0, f = f, A = A, g = g) + x, it = @inferred solver(x0 = x0, f = 
f_autodiff, A = A, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 20 @@ -116,7 +120,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.PANOC(tol = TOL) - x, it = @inferred solver(x0 = x0, f = f, A = A, g = g, Lf = Lf) + x, it = @inferred solver(x0 = x0, f = f_autodiff, A = A, g = g, Lf = Lf) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 20 @@ -128,7 +132,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL) - x, it = @inferred solver(x0 = x0, f = f, A = A, g = g) + x, it = @inferred solver(x0 = x0, f = f_autodiff, A = A, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 20 @@ -139,7 +143,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.PANOCplus(tol = TOL) - x, it = @inferred solver(x0 = x0, f = f, A = A, g = g, Lf = Lf) + x, it = @inferred solver(x0 = x0, f = f_autodiff, A = A, g = g, Lf = Lf) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 20 @@ -150,7 +154,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL) - x, it = @inferred solver(x0 = x0, f = f, A = A, g = g) + x, it = @inferred solver(x0 = x0, f = f_autodiff, A = A, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 20 @@ -161,7 +165,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.DouglasRachford(gamma = R(10) / opnorm(A)^2, tol = TOL) - y, it = @inferred solver(x0 = x0, f = f2, g = g) + y, it = @inferred solver(x0 = x0, f = fA_prox, g = g) @test eltype(y) == T @test norm(y - x_star, Inf) <= TOL @test it < 30 @@ -178,7 +182,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.DRLS(tol = 10 * TOL, directions=acc) - z, it = @inferred solver(x0 = x0, f = f2, g = g, Lf = Lf) + z, it = @inferred solver(x0 = x0, f = fA_prox, g = g, Lf = Lf) @test eltype(z) == T @test norm(z - x_star, Inf) <= 10 * TOL @test it < maxit @@ -189,7 +193,7 @@ using ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.AFBA(theta = 1, mu = 1, tol = R(1e-6)) - (x_afba, y_afba), it_afba = @inferred solver(x0 = x0, y0 = zeros(T, n), f = f2, g = g, beta_f = opnorm(A)^2) + (x_afba, y_afba), it_afba = @inferred solver(x0 = x0, y0 = zeros(T, n), f = fA_autodiff, g = g, beta_f = opnorm(A)^2) @test eltype(x_afba) == T @test eltype(y_afba) == T @test norm(x_afba - x_star, Inf) <= 1e-4 @@ -197,7 +201,7 @@ using ProximalAlgorithms: @test x0 == x0_backup solver = ProximalAlgorithms.AFBA(theta = 1, mu = 1, tol = R(1e-6)) - (x_afba, y_afba), it_afba = @inferred solver(x0 = x0, y0 = zeros(T, n), f = f2, h = g, beta_f = opnorm(A)^2) + (x_afba, y_afba), it_afba = @inferred solver(x0 = x0, y0 = zeros(T, n), f = fA_autodiff, h = g, beta_f = opnorm(A)^2) @test eltype(x_afba) == T @test eltype(y_afba) == T @test norm(x_afba - x_star, Inf) <= 1e-4 @@ -205,7 +209,7 @@ using ProximalAlgorithms: @test x0 == x0_backup solver = ProximalAlgorithms.AFBA(theta = 1, mu = 1, tol = R(1e-6)) - (x_afba, y_afba), it_afba = @inferred solver(x0 = x0, y0 = zeros(T, m), h = f, L = A, g = g) + (x_afba, y_afba), it_afba = @inferred solver(x0 = x0, y0 = zeros(T, m), h = f_prox, L = A, g = g) @test eltype(x_afba) == T @test eltype(y_afba) == T @test norm(x_afba - x_star, Inf) <= 1e-4 @@ -217,7 +221,7 @@ using 
ProximalAlgorithms: x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.SFISTA(tol = 10 * TOL) - y, it = @inferred solver(x0 = x0, f = f2, g = g, Lf = Lf) + y, it = @inferred solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf) @test eltype(y) == T @test norm(y - x_star, Inf) <= 10 * TOL @test it < 100 diff --git a/test/problems/test_lasso_small_h_split.jl b/test/problems/test_lasso_small_h_split.jl deleted file mode 100644 index 54d0fcf..0000000 --- a/test/problems/test_lasso_small_h_split.jl +++ /dev/null @@ -1,171 +0,0 @@ -@testset "Lasso small (h. split, $T)" for T in [Float32, Float64, ComplexF32, ComplexF64] - using ProximalOperators - using ProximalAlgorithms - using LinearAlgebra - using AbstractOperators: MatrixOp - using RecursiveArrayTools: ArrayPartition - - A1 = T[ - 1.0 -2.0 3.0 - 2.0 -1.0 0.0 - -1.0 0.0 4.0 - -1.0 -1.0 -1.0 - ] - A2 = T[ - -4.0 5.0 - -1.0 3.0 - -3.0 2.0 - 1.0 3.0 - ] - A = hcat(MatrixOp(A1), MatrixOp(A2)) - b = T[1.0, 2.0, 3.0, 4.0] - - m, n1 = size(A1) - _, n2 = size(A2) - n = n1 + n2 - - R = real(T) - - lam = R(0.1) * norm(A' * b, Inf) - @test typeof(lam) == R - - f = Translate(SqrNormL2(R(1)), -b) - f2 = ComposeAffine(SqrNormL2(R(1)), A, -b) - g = SeparableSum(NormL1(lam), NormL1(lam)) - - Lf = opnorm([A1 A2])^2 - - x_star = ArrayPartition( - T[-3.877278911564627e-01, 0, 0], - T[2.174149659863943e-02, 6.168435374149660e-01], - ) - - TOL = R(1e-4) - - @testset "ForwardBackward" begin - - ## Nonfast/Nonadaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ForwardBackward(tol = TOL) - x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 150 - @test x0 == x0_backup - - # Nonfast/Adaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = f2, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 300 - @test x0 == x0_backup - - # Fast/Nonadaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) - x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 100 - @test x0 == x0_backup - - # Fast/Adaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = - ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = f2, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 200 - @test x0 == x0_backup - end - - @testset "ZeroFPR" begin - - # ZeroFPR/Nonadaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ZeroFPR(tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - # ZeroFPR/Adaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - end - - @testset "PANOC" begin - - # PANOC/Nonadaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOC(tol = TOL) - x, it = solver(x0 
= x0, f = f, A = A, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - ## PANOC/Adaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - end - - @testset "PANOCplus" begin - - # PANOCplus/Nonadaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOCplus(tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - ## PANOCplus/Adaptive - - x0 = ArrayPartition(zeros(T, n1), zeros(T, n2)) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - end - -end diff --git a/test/problems/test_lasso_small_strongly_convex.jl b/test/problems/test_lasso_small_strongly_convex.jl index 7dd959b..d2e7e13 100644 --- a/test/problems/test_lasso_small_strongly_convex.jl +++ b/test/problems/test_lasso_small_strongly_convex.jl @@ -1,7 +1,9 @@ using LinearAlgebra using Test -using ProximalOperators +using Zygote +using AbstractDifferentiation: ZygoteBackend +using ProximalOperators: NormL1, LeastSquares using ProximalAlgorithms @testset "Lasso small (strongly convex, $T)" for T in [Float32, Float64] @@ -29,7 +31,8 @@ using ProximalAlgorithms A = Q * D * Q' b = A * x_star + lam * inv(A') * sign.(x_star) - f = LeastSquares(A, b) + fA_prox = LeastSquares(A, b) + fA_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2)/2, ZygoteBackend()) g = NormL1(lam) TOL = T(1e-4) @@ -39,7 +42,7 @@ using ProximalAlgorithms @testset "SFISTA" begin solver = ProximalAlgorithms.SFISTA(tol = TOL) - y, it = solver(x0=x0, f=f, g=g, Lf=Lf, mf=mf) + y, it = solver(x0=x0, f=fA_autodiff, g=g, Lf=Lf, mf=mf) @test eltype(y) == T @test norm(y - x_star) <= TOL @test it < 40 @@ -48,7 +51,7 @@ using ProximalAlgorithms @testset "ForwardBackward" begin solver = ProximalAlgorithms.ForwardBackward(tol = TOL) - y, it = solver(x0=x0, f=f, g=g, Lf=Lf) + y, it = solver(x0=x0, f=fA_autodiff, g=g, Lf=Lf) @test eltype(y) == T @test norm(y - x_star, Inf) <= TOL @test it < 110 @@ -57,7 +60,7 @@ using ProximalAlgorithms @testset "FastForwardBackward" begin solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) - y, it = solver(x0=x0, f=f, g=g, Lf=Lf, mf=mf) + y, it = solver(x0=x0, f=fA_autodiff, g=g, Lf=Lf, mf=mf) @test eltype(y) == T @test norm(y - x_star, Inf) <= TOL @test it < 35 @@ -66,7 +69,7 @@ using ProximalAlgorithms @testset "FastForwardBackward (custom extrapolation)" begin solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) - y, it = solver(x0=x0, f=f, g=g, gamma = 1/Lf, mf=mf, extrapolation_sequence=ProximalAlgorithms.ConstantNesterovSequence(mf, 1/Lf)) + y, it = solver(x0=x0, f=fA_autodiff, g=g, gamma = 1/Lf, mf=mf, extrapolation_sequence=ProximalAlgorithms.ConstantNesterovSequence(mf, 1/Lf)) @test eltype(y) == T @test norm(y - x_star, Inf) <= TOL @test it < 35 @@ -75,7 +78,7 @@ using ProximalAlgorithms @testset "DRLS" begin solver = ProximalAlgorithms.DRLS(tol = TOL) - v, it = solver(x0=x0, f=f, g=g, mf=mf) + v, it = solver(x0=x0, f=fA_prox, g=g, mf=mf) @test 
eltype(v) == T @test norm(v - x_star, Inf) <= TOL @test it < 14 @@ -84,7 +87,7 @@ using ProximalAlgorithms @testset "PANOC" begin solver = ProximalAlgorithms.PANOC(tol = TOL) - y, it = solver(x0=x0, f=f, g=g, Lf=Lf) + y, it = solver(x0=x0, f=fA_autodiff, g=g, Lf=Lf) @test eltype(y) == T @test norm(y - x_star, Inf) <= TOL @test it < 45 @@ -93,7 +96,7 @@ using ProximalAlgorithms @testset "PANOCplus" begin solver = ProximalAlgorithms.PANOCplus(tol = TOL) - y, it = solver(x0=x0, f=f, g=g, Lf=Lf) + y, it = solver(x0=x0, f=fA_autodiff, g=g, Lf=Lf) @test eltype(y) == T @test norm(y - x_star, Inf) <= TOL @test it < 45 diff --git a/test/problems/test_lasso_small_v_split.jl b/test/problems/test_lasso_small_v_split.jl deleted file mode 100644 index 855df8b..0000000 --- a/test/problems/test_lasso_small_v_split.jl +++ /dev/null @@ -1,164 +0,0 @@ -@testset "Lasso small (v. split, $T)" for T in [Float32, Float64, ComplexF32, ComplexF64] - using ProximalOperators - using ProximalAlgorithms - using LinearAlgebra - using AbstractOperators: MatrixOp - - A1 = T[ - 1.0 -2.0 3.0 -4.0 5.0 - 2.0 -1.0 0.0 -1.0 3.0 - ] - A2 = T[ - -1.0 0.0 4.0 -3.0 2.0 - -1.0 -1.0 -1.0 1.0 3.0 - ] - A = vcat(MatrixOp(A1), MatrixOp(A2)) - b1 = T[1.0, 2.0] - b2 = T[3.0, 4.0] - - m1, n = size(A1) - m2, _ = size(A2) - m = m1 + m2 - - R = real(T) - - lam = R(0.1) * norm([A1; A2]' * [b1; b2], Inf) - @test typeof(lam) == R - - f = SeparableSum(Translate(SqrNormL2(R(1)), -b1), Translate(SqrNormL2(R(1)), -b2)) - f2 = Sum(LeastSquares(A1, b1), LeastSquares(A2, b2)) - g = NormL1(lam) - - Lf = opnorm([A1; A2])^2 - - x_star = T[-3.877278911564627e-01, 0, 0, 2.174149659863943e-02, 6.168435374149660e-01] - - TOL = R(1e-4) - - @testset "ForwardBackward" begin - - ## Nonfast/Nonadaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ForwardBackward(tol = TOL) - x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 150 - @test x0 == x0_backup - - # Nonfast/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = f2, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 300 - @test x0 == x0_backup - - # Fast/Nonadaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.FastForwardBackward(tol = TOL) - x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 100 - @test x0 == x0_backup - - # Fast/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = - ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = f2, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 200 - @test x0 == x0_backup - end - - @testset "ZeroFPR" begin - - # ZeroFPR/Nonadaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ZeroFPR(tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = opnorm([A1; A2])^2) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - # ZeroFPR/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - end - - @testset "PANOC" begin - - # PANOC/Nonadaptive - - x0 = zeros(T, n) - 
x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOC(tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = opnorm([A1; A2])^2) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - ## PANOC/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - end - - @testset "PANOCplus" begin - - # PANOCplus/Nonadaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOCplus(tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = opnorm([A1; A2])^2) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - ## PANOCplus/Adaptive - - x0 = zeros(T, n) - x0_backup = copy(x0) - solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) - @test eltype(x) == T - @test norm(x - x_star, Inf) <= TOL - @test it < 20 - @test x0 == x0_backup - - end - -end diff --git a/test/problems/test_linear_programs.jl b/test/problems/test_linear_programs.jl index 51e69c6..30f82e5 100644 --- a/test/problems/test_linear_programs.jl +++ b/test/problems/test_linear_programs.jl @@ -1,4 +1,6 @@ -using ProximalOperators +using Zygote +using AbstractDifferentiation: ZygoteBackend +using ProximalOperators: Linear, IndNonnegative, IndPoint, IndAffine, SlicedSeparableSum using ProximalAlgorithms using LinearAlgebra @@ -68,7 +70,7 @@ end @testset "AFBA" begin - f = Linear(c) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), ZygoteBackend()) g = IndNonnegative() h = IndPoint(b) @@ -94,7 +96,7 @@ end @testset "VuCondat" begin - f = Linear(c) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), ZygoteBackend()) g = IndNonnegative() h = IndPoint(b) @@ -143,7 +145,7 @@ end @testset "DavisYin" begin - f = Linear(c) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(c, x), ZygoteBackend()) g = IndNonnegative() h = IndAffine(A, b) diff --git a/test/problems/test_nonconvex_qp.jl b/test/problems/test_nonconvex_qp.jl index dd3d2e5..6f1dfca 100644 --- a/test/problems/test_nonconvex_qp.jl +++ b/test/problems/test_nonconvex_qp.jl @@ -1,5 +1,7 @@ +using Zygote +using AbstractDifferentiation: ZygoteBackend using ProximalAlgorithms -using ProximalOperators +using ProximalOperators: IndBox using LinearAlgebra using Random using Test @@ -10,7 +12,7 @@ using Test low = T(-1.0) upp = T(+1.0) - f = Quadratic(Q, q) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(Q * x, x) / 2 + dot(q, x), ZygoteBackend()) g = IndBox(low, upp) n = 2 @@ -76,7 +78,7 @@ end low = T(-1.0) upp = T(+1.0) - f = Quadratic(Q, q) + f = ProximalAlgorithms.AutoDifferentiable(x -> dot(Q * x, x) / 2 + dot(q, x), ZygoteBackend()) g = IndBox(low, upp) Lip = maximum(abs.(eigenvalues)) diff --git a/test/problems/test_sparse_logistic_small.jl b/test/problems/test_sparse_logistic_small.jl index 47a26d0..135fee5 100644 --- a/test/problems/test_sparse_logistic_small.jl +++ b/test/problems/test_sparse_logistic_small.jl @@ -1,8 +1,10 @@ -@testset "Sparse logistic small ($T)" for T in [Float32, Float64] - using ProximalOperators - using ProximalAlgorithms - using LinearAlgebra +using Zygote +using AbstractDifferentiation: ZygoteBackend +using ProximalOperators: NormL1 +using ProximalAlgorithms +using LinearAlgebra +@testset "Sparse logistic small ($T)" for T in [Float32, 
Float64] A = T[ 1.0 -2.0 3.0 -4.0 5.0 2.0 -1.0 0.0 -1.0 3.0 @@ -15,8 +17,13 @@ R = real(T) - f = Translate(LogisticLoss(ones(R, m), R(1)), -b) - f2 = ComposeAffine(LogisticLoss(ones(R, m), R(1)), A, -b) + function logistic_loss(logits) + u = 1 .+ exp.(-logits) # labels are assumed all one + return sum(log.(u)) + end + + f_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> logistic_loss(x - b), ZygoteBackend()) + fA_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> logistic_loss(A * x - b), ZygoteBackend()) lam = R(0.1) g = NormL1(lam) @@ -29,7 +36,7 @@ x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.ForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = f2, g = g) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= 1e-4 @test it < 1100 @@ -40,7 +47,7 @@ x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, adaptive = true) - x, it = solver(x0 = x0, f = f2, g = g) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= 1e-4 @test it < 500 @@ -51,7 +58,7 @@ x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) + x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= 1e-4 @test it < 25 @@ -62,7 +69,7 @@ x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) + x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= 1e-4 @test it < 50 @@ -73,7 +80,7 @@ x0 = zeros(T, n) x0_backup = copy(x0) solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL) - x, it = solver(x0 = x0, f = f, A = A, g = g) + x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= 1e-4 @test it < 50 diff --git a/test/problems/test_verbose.jl b/test/problems/test_verbose.jl index ff942c1..573d7fe 100644 --- a/test/problems/test_verbose.jl +++ b/test/problems/test_verbose.jl @@ -1,8 +1,10 @@ -@testset "Verbose" for T in [Float64] - using ProximalOperators - using ProximalAlgorithms - using LinearAlgebra +using Zygote +using AbstractDifferentiation: ZygoteBackend +using ProximalOperators: LeastSquares, NormL1 +using ProximalAlgorithms +using LinearAlgebra +@testset "Verbose" for T in [Float64] A = T[ 1.0 -2.0 3.0 -4.0 5.0 2.0 -1.0 0.0 -1.0 3.0 @@ -18,8 +20,9 @@ lam = R(0.1) * norm(A' * b, Inf) @test typeof(lam) == R - f = Translate(SqrNormL2(R(1)), -b) - f2 = LeastSquares(A, b) + f_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2)/2, ZygoteBackend()) + fA_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2)/2, ZygoteBackend()) + fA_prox = LeastSquares(A, b) g = NormL1(lam) Lf = opnorm(A)^2 @@ -34,7 +37,7 @@ x0 = zeros(T, n) solver = ProximalAlgorithms.ForwardBackward(tol = TOL, verbose = true) - x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf) + x, it = solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 150 @@ -47,7 +50,7 @@ adaptive = true, verbose = true, ) - x, it = solver(x0 = x0, f = f2, g = g) + x, it = solver(x0 = x0, f = fA_autodiff, g = g) @test eltype(x) == T @test norm(x - x_star, Inf) <= TOL @test it < 300 @@ -57,7 +60,7 @@ x0 = zeros(T, n) solver = ProximalAlgorithms.FastForwardBackward(tol = 
diff --git a/test/problems/test_verbose.jl b/test/problems/test_verbose.jl
index ff942c1..573d7fe 100644
--- a/test/problems/test_verbose.jl
+++ b/test/problems/test_verbose.jl
@@ -1,8 +1,10 @@
-@testset "Verbose" for T in [Float64]
-    using ProximalOperators
-    using ProximalAlgorithms
-    using LinearAlgebra
+using Zygote
+using AbstractDifferentiation: ZygoteBackend
+using ProximalOperators: LeastSquares, NormL1
+using ProximalAlgorithms
+using LinearAlgebra
+@testset "Verbose" for T in [Float64]

     A = T[
         1.0 -2.0 3.0 -4.0 5.0
         2.0 -1.0 0.0 -1.0 3.0
@@ -18,8 +20,9 @@
     lam = R(0.1) * norm(A' * b, Inf)
     @test typeof(lam) == R

-    f = Translate(SqrNormL2(R(1)), -b)
-    f2 = LeastSquares(A, b)
+    f_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(x - b)^2)/2, ZygoteBackend())
+    fA_autodiff = ProximalAlgorithms.AutoDifferentiable(x -> (norm(A*x - b)^2)/2, ZygoteBackend())
+    fA_prox = LeastSquares(A, b)
     g = NormL1(lam)

     Lf = opnorm(A)^2
@@ -34,7 +37,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.ForwardBackward(tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf)
+    x, it = solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 150
@@ -47,7 +50,7 @@
         adaptive = true,
         verbose = true,
     )
-    x, it = solver(x0 = x0, f = f2, g = g)
+    x, it = solver(x0 = x0, f = fA_autodiff, g = g)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 300
@@ -57,7 +60,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.FastForwardBackward(tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f2, g = g, Lf = Lf)
+    x, it = solver(x0 = x0, f = fA_autodiff, g = g, Lf = Lf)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 100
@@ -70,7 +73,7 @@
         adaptive = true,
         verbose = true,
     )
-    x, it = solver(x0 = x0, f = f2, g = g)
+    x, it = solver(x0 = x0, f = fA_autodiff, g = g)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 200
@@ -82,7 +85,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.ZeroFPR(tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = Lf)
+    x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g, Lf = Lf)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 20
@@ -91,7 +94,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.ZeroFPR(adaptive = true, tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f, A = A, g = g)
+    x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 20
@@ -104,7 +107,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.PANOC(tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = Lf)
+    x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g, Lf = Lf)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 20
@@ -113,7 +116,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.PANOC(adaptive = true, tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f, A = A, g = g)
+    x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 20
@@ -126,7 +129,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.PANOCplus(tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f, A = A, g = g, Lf = Lf)
+    x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g, Lf = Lf)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 20
@@ -135,7 +138,7 @@
     x0 = zeros(T, n)
     solver = ProximalAlgorithms.PANOCplus(adaptive = true, tol = TOL, verbose = true)
-    x, it = solver(x0 = x0, f = f, A = A, g = g)
+    x, it = solver(x0 = x0, f = f_autodiff, A = A, g = g)
     @test eltype(x) == T
     @test norm(x - x_star, Inf) <= TOL
     @test it < 20
@@ -152,7 +155,7 @@
         tol = TOL,
         verbose = true,
     )
-    y, it = solver(x0 = x0, f = f2, g = g)
+    y, it = solver(x0 = x0, f = fA_prox, g = g)
     @test eltype(y) == T
     @test norm(y - x_star, Inf) <= TOL
     @test it < 30
diff --git a/test/runtests.jl b/test/runtests.jl
index 6532278..8380741 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,14 +1,24 @@
 using Test
 using Aqua
+using AbstractDifferentiation
 using ProximalAlgorithms

+struct Quadratic{M, V}
+    Q::M
+    q::V
+end
+
+(f::Quadratic)(x) = dot(x, f.Q * x) / 2 + dot(f.q, x)
+
+function ProximalAlgorithms.value_and_gradient_closure(f::Quadratic, x)
+    grad = f.Q * x + f.q
+    return (dot(grad, x) + dot(f.q, x)) / 2, () -> grad
+end
+
 @testset "Aqua" begin
     Aqua.test_all(ProximalAlgorithms; ambiguities=false)
 end

-include("definitions/arraypartition.jl")
-include("definitions/compose.jl")
-
 include("utilities/test_ad.jl")
 include("utilities/test_iteration_tools.jl")
 include("utilities/test_fb_tools.jl")
@@ -22,8 +32,6 @@ include("problems/test_equivalence.jl")
 include("problems/test_elasticnet.jl")
 include("problems/test_lasso_small.jl")
 include("problems/test_lasso_small_strongly_convex.jl")
-include("problems/test_lasso_small_v_split.jl")
-include("problems/test_lasso_small_h_split.jl")
 include("problems/test_linear_programs.jl")
 include("problems/test_sparse_logistic_small.jl")
 include("problems/test_nonconvex_qp.jl")
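runtests.jl now defines a small `Quadratic` helper with a hand-written `value_and_gradient_closure` method, which is the "bring your own gradients" route: any callable with such a method can serve as the smooth term, with no AD backend involved. A sketch of the same idea for a hypothetical least-squares loss (the type name and data below are made up for illustration):

```julia
using LinearAlgebra
using ProximalAlgorithms
using ProximalOperators: NormL1

# hypothetical smooth term with a hand-written gradient, no AD involved
struct LeastSquaresLoss{M,V}
    A::M
    b::V
end

(f::LeastSquaresLoss)(x) = norm(f.A * x - f.b)^2 / 2

function ProximalAlgorithms.value_and_gradient_closure(f::LeastSquaresLoss, x)
    res = f.A * x - f.b
    return norm(res)^2 / 2, () -> f.A' * res
end

A = [1.0 2.0; 3.0 4.0; 5.0 6.0]  # toy data, not from the test suite
b = [1.0, 2.0, 3.0]
f = LeastSquaresLoss(A, b)
g = NormL1(0.1)

solver = ProximalAlgorithms.FastForwardBackward(tol = 1e-6)
x, it = solver(x0 = zeros(2), f = f, g = g, Lf = opnorm(A)^2)
```

Note that terms accessed only through their proximal mapping, such as `LeastSquares(A, b)` kept as `fA_prox` in test_verbose.jl, still go through the ProximalCore prox path and need no gradient at all.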
diff --git a/test/utilities/test_ad.jl b/test/utilities/test_ad.jl
index a4fa27f..25c15b3 100644
--- a/test/utilities/test_ad.jl
+++ b/test/utilities/test_ad.jl
@@ -2,8 +2,18 @@ using Test
 using LinearAlgebra
 using ProximalOperators: NormL1
 using ProximalAlgorithms
+using Zygote
+using ReverseDiff
+using AbstractDifferentiation: ZygoteBackend, ReverseDiffBackend
+
+@testset "Autodiff backend ($B on $T)" for (T, B) in Iterators.product(
+    [Float32, Float64, ComplexF32, ComplexF64],
+    [ZygoteBackend, ReverseDiffBackend],
+)
+    if T <: Complex && B == ReverseDiffBackend
+        continue
+    end
-@testset "Autodiff ($T)" for T in [Float32, Float64, ComplexF32, ComplexF64]
     R = real(T)
     A = T[
         1.0 -2.0 3.0 -4.0 5.0
@@ -12,30 +22,23 @@ using ProximalAlgorithms
         -1.0 -1.0 -1.0 1.0 3.0
     ]
     b = T[1.0, 2.0, 3.0, 4.0]
-    f = ProximalAlgorithms.ZygoteFunction(
-        x -> R(1/2) * norm(A * x - b, 2)^2
-    )
+    f = ProximalAlgorithms.AutoDifferentiable(x -> R(1/2) * norm(A * x - b, 2)^2, B())
     Lf = opnorm(A)^2
     m, n = size(A)

-    @testset "Gradient" begin
-        x = randn(T, n)
-        gradfx, fx = ProximalAlgorithms.gradient(f, x)
-        @test eltype(gradfx) == T
-        @test typeof(fx) == R
-        @test gradfx ≈ A' * (A * x - b)
-    end
+    x0 = zeros(T, n)

-    @testset "Algorithms" begin
-        lam = R(0.1) * norm(A' * b, Inf)
-        @test typeof(lam) == R
-        g = NormL1(lam)
-        x_star = T[-3.877278911564627e-01, 0, 0, 2.174149659863943e-02, 6.168435374149660e-01]
-        TOL = R(1e-4)
-        solver = ProximalAlgorithms.FastForwardBackward(tol = TOL)
-        x, it = solver(x0 = zeros(T, n), f = f, g = g, Lf = Lf)
-        @test eltype(x) == T
-        @test norm(x - x_star, Inf) <= TOL
-        @test it < 100
-    end
+    f_x0, cl = ProximalAlgorithms.value_and_gradient_closure(f, x0)
+    grad_f_x0 = @inferred cl()
+
+    lam = R(0.1) * norm(A' * b, Inf)
+    @test typeof(lam) == R
+    g = NormL1(lam)
+    x_star = T[-3.877278911564627e-01, 0, 0, 2.174149659863943e-02, 6.168435374149660e-01]
+    TOL = R(1e-4)
+    solver = ProximalAlgorithms.FastForwardBackward(tol = TOL)
+    x, it = solver(x0 = x0, f = f, g = g, Lf = Lf)
+    @test eltype(x) == T
+    @test norm(x - x_star, Inf) <= TOL
+    @test it < 100
 end
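The reworked test_ad.jl sweeps over AD backends instead of hard-coding Zygote; switching backends only changes the second argument of `AutoDifferentiable`. A sketch of that switch (the smooth term `h` below is a toy function, not taken from the tests):

```julia
using Zygote, ReverseDiff
using AbstractDifferentiation: ZygoteBackend, ReverseDiffBackend
using ProximalAlgorithms

h(x) = sum(abs2, x) / 2  # toy smooth term; its gradient at x is x itself

for backend in (ZygoteBackend(), ReverseDiffBackend())
    f = ProximalAlgorithms.AutoDifferentiable(h, backend)
    fx, cl = ProximalAlgorithms.value_and_gradient_closure(f, ones(3))
    @assert fx ≈ 1.5 && cl() ≈ ones(3)
end
```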
diff --git a/test/utilities/test_fb_tools.jl b/test/utilities/test_fb_tools.jl
index 75a3d50..8205e08 100644
--- a/test/utilities/test_fb_tools.jl
+++ b/test/utilities/test_fb_tools.jl
@@ -1,39 +1,41 @@
 using Test
 using LinearAlgebra
-using ProximalAlgorithms: lower_bound_smoothness_constant, backtrack_stepsize!
-using ProximalOperators: Quadratic, Zero
+using ProximalCore: Zero
+using ProximalAlgorithms
+using AbstractDifferentiation

 @testset "Lipschitz constant estimation" for R in [Float32, Float64]

-sv = R[0.01, 1.0, 1.0, 1.0, 100.0]
-n = length(sv)
-U, _ = qr(randn(n, n))
-Q = U * Diagonal(sv) * U'
-q = randn(n)
+    sv = R[0.01, 1.0, 1.0, 1.0, 100.0]
+    n = length(sv)
+    U, _ = qr(randn(R, n, n))
+    Q = U * Diagonal(sv) * U'
+    q = randn(R, n)

-f = Quadratic(Q, q)
-Lf = maximum(sv)
-g = Zero()
+    f = Quadratic(Q, q)
+    Lf = maximum(sv)
+    g = Zero()

-for _ in 1:100
-    x = randn(n)
-    Lest = @inferred lower_bound_smoothness_constant(f, I, x)
-    @test Lest <= Lf
-end
-
-x = randn(n)
-Lest = @inferred lower_bound_smoothness_constant(f, I, x)
-alpha = R(0.5)
-gamma_init = 10 / Lest
-gamma = gamma_init
+    for _ in 1:100
+        x = randn(R, n)
+        Lest = @inferred ProximalAlgorithms.lower_bound_smoothness_constant(f, I, x)
+        @test typeof(Lest) == R
+        @test Lest <= Lf
+    end

-for _ in 1:100
     x = randn(n)
-    new_gamma, = @inferred backtrack_stepsize!(gamma, f, I, g, x, alpha=alpha)
-    @test new_gamma <= gamma
-    gamma = new_gamma
-end
-
-@test gamma < gamma_init
+    Lest = @inferred ProximalAlgorithms.lower_bound_smoothness_constant(f, I, x)
+    alpha = R(0.5)
+    gamma_init = 10 / Lest
+    gamma = gamma_init
+
+    for _ in 1:100
+        x = randn(n)
+        new_gamma, = @inferred ProximalAlgorithms.backtrack_stepsize!(gamma, f, I, g, x, alpha=alpha)
+        @test new_gamma <= gamma
+        gamma = new_gamma
+    end
+
+    @test gamma < gamma_init
 end
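The same objects also drive the step-size machinery exercised in test_fb_tools.jl: `lower_bound_smoothness_constant` probes the gradient to bound the Lipschitz constant of the smooth term, and `backtrack_stepsize!` shrinks a trial step until a local descent condition holds, as in the loop above. A sketch with an AD-wrapped least-squares term (toy data, not from the tests):

```julia
using LinearAlgebra
using Zygote
using AbstractDifferentiation: ZygoteBackend
using ProximalCore: Zero
using ProximalAlgorithms

A = [1.0 2.0; 3.0 4.0; 5.0 6.0]  # toy data, not from the test suite
b = [1.0, 2.0, 3.0]
f = ProximalAlgorithms.AutoDifferentiable(x -> norm(A * x - b)^2 / 2, ZygoteBackend())

x = randn(2)
# lower bound on the Lipschitz constant of the gradient (the true value is opnorm(A)^2)
Lest = ProximalAlgorithms.lower_bound_smoothness_constant(f, I, x)
# one backtracking pass, starting from a deliberately large stepsize
gamma, = ProximalAlgorithms.backtrack_stepsize!(10 / Lest, f, I, Zero(), x, alpha = 0.5)
```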