diff --git a/ext/DFTKAMDGPUExt.jl b/ext/DFTKAMDGPUExt.jl
index 26c27f4ab0..9a8da50d56 100644
--- a/ext/DFTKAMDGPUExt.jl
+++ b/ext/DFTKAMDGPUExt.jl
@@ -4,6 +4,8 @@ using PrecompileTools
 using LinearAlgebra
 import DFTK: GPU, precompilation_workflow
 using DFTK
+import ForwardDiff
+import ForwardDiff: Dual
 
 DFTK.synchronize_device(::GPU{<:AMDGPU.ROCArray}) = AMDGPU.synchronize()
 
@@ -39,5 +41,4 @@ if AMDGPU.functional()
         end
     end
 end
-
 end
diff --git a/src/DispatchFunctional.jl b/src/DispatchFunctional.jl
index 20373c191b..51b7d4dea2 100644
--- a/src/DispatchFunctional.jl
+++ b/src/DispatchFunctional.jl
@@ -26,7 +26,7 @@ function DftFunctionals.has_energy(func::LibxcFunctional)
     0 in Libxc.supported_derivatives(Libxc.Functional(func.identifier))
 end
 
-function libxc_unfold_spin(data::Matrix, n_spin::Int)
+function libxc_unfold_spin(data::AbstractMatrix, n_spin::Int)
     n_p = size(data, 2)
     if n_spin == 1
         data  # Only one spin component
diff --git a/src/gpu/gpu_arrays.jl b/src/gpu/gpu_arrays.jl
index 648bc1609f..f776ae36e7 100644
--- a/src/gpu/gpu_arrays.jl
+++ b/src/gpu/gpu_arrays.jl
@@ -13,13 +13,13 @@ function LinearAlgebra.norm(A::Hermitian{T, <:AbstractGPUArray}) where {T}
     sqrt(2upper_triangle - diago)
 end
 
-
+# Make sure that there is a CPU fallback for AbstractGPUArrays (e.g. for Duals)
 for fun in (:potential_terms, :kernel_terms)
     @eval function DftFunctionals.$fun(fun::DispatchFunctional, ρ::AT,
-                                       args...) where {AT <: AbstractGPUArray{Float64}}
+                                       args...) where {AT <: AbstractGPUArray}
         # Fallback implementation for the GPU: Transfer to the CPU and run computation there
         cpuify(::Nothing) = nothing
         cpuify(x::AbstractArray) = Array(x)
         $fun(fun, Array(ρ), cpuify.(args)...)
     end
-end
+end
\ No newline at end of file
diff --git a/src/response/chi0.jl b/src/response/chi0.jl
index d13eaceeda..de06831fc8 100644
--- a/src/response/chi0.jl
+++ b/src/response/chi0.jl
@@ -409,6 +409,7 @@ to the Hamiltonian change `δH` represented by the matrix-vector products `δHψ
     # We then use the extra information we have from these additional bands,
     # non-necessarily converged, to split the Sternheimer_solver with a Schur
     # complement.
+    occupation = [to_cpu(oc) for oc in occupation]
     (mask_occ, mask_extra) = occupied_empty_masks(occupation, occupation_threshold)
 
     ψ_occ   = [ψ[ik][:, maskk] for (ik, maskk) in enumerate(mask_occ)]
@@ -561,6 +562,7 @@ function construct_bandtol(Bandtol::Type, basis::PlaneWaveBasis, ψ, occupation:
     Ω  = basis.model.unit_cell_volume
     Ng = prod(basis.fft_size)
     Nk = length(basis.kpoints)
+    occupation = [to_cpu(oc) for oc in occupation]
     mask_occ = occupied_empty_masks(occupation, occupation_threshold).mask_occ
 
     # Including k-points the expression (3.11) in 2505.02319 becomes
diff --git a/src/terms/local_nonlinearity.jl b/src/terms/local_nonlinearity.jl
index 94f80ae59b..71020bfc19 100644
--- a/src/terms/local_nonlinearity.jl
+++ b/src/terms/local_nonlinearity.jl
@@ -9,11 +9,13 @@ struct TermLocalNonlinearity{TF} <: TermNonlinear
 end
 (L::LocalNonlinearity)(::AbstractBasis) = TermLocalNonlinearity(L.f)
 
+# FD on the GPU, when T<:Dual causes all sorts of troubles, at least on AMD. TODO: also on NVIDIA?
+# TODO: only transfer to CPU when T <: Dual ?, or only if ROCArray?
 function ene_ops(term::TermLocalNonlinearity, basis::PlaneWaveBasis{T}, ψ, occupation;
                  ρ, kwargs...) where {T}
     fp(ρ) = ForwardDiff.derivative(term.f, ρ)
     E = sum(fρ -> convert_dual(T, fρ), term.f.(ρ)) * basis.dvol
-    potential = convert_dual.(T, fp.(ρ))
+    potential = to_device(basis.architecture, convert_dual.(T, fp.(to_cpu(ρ))))
 
     # In the case of collinear spin, the potential is spin-dependent
     ops = [RealSpaceMultiplication(basis, kpt, potential[:, :, :, kpt.spin])
@@ -22,16 +24,16 @@ function ene_ops(term::TermLocalNonlinearity, basis::PlaneWaveBasis{T}, ψ, occu
 end
 
 
-function compute_kernel(term::TermLocalNonlinearity, ::AbstractBasis{T}; ρ, kwargs...) where {T}
+function compute_kernel(term::TermLocalNonlinearity, basis::AbstractBasis{T}; ρ, kwargs...) where {T}
     fp(ρ) = ForwardDiff.derivative(term.f, ρ)
     fpp(ρ) = ForwardDiff.derivative(fp, ρ)
-    Diagonal(vec(convert_dual.(T, fpp.(ρ))))
+    Diagonal(to_device(basis.architecture, vec(convert_dual.(T, fpp.(to_cpu(ρ))))))
 end
 
-function apply_kernel(term::TermLocalNonlinearity, ::AbstractBasis{T},
+function apply_kernel(term::TermLocalNonlinearity, basis::AbstractBasis{T},
                       δρ::AbstractArray{Tδρ}; ρ, kwargs...) where {T, Tδρ}
     S = promote_type(T, Tδρ)
     fp(ρ) = ForwardDiff.derivative(term.f, ρ)
     fpp(ρ) = ForwardDiff.derivative(fp, ρ)
-    convert_dual.(S, fpp.(ρ) .* δρ)
+    to_device(basis.architecture, convert_dual.(S, fpp.(to_cpu(ρ)) .* to_cpu(δρ)))
 end
diff --git a/src/terms/xc.jl b/src/terms/xc.jl
index b9ead3cdd3..39dca7611f 100644
--- a/src/terms/xc.jl
+++ b/src/terms/xc.jl
@@ -42,7 +42,7 @@ function (xc::Xc)(basis::PlaneWaveBasis{T}) where {T}
         # Strip duals from functional parameters if needed
         params = parameters(fun)
         if !isempty(params)
-            newparams = convert_dual.(T, params)
+            newparams = map(p -> convert_dual(T, p), params)
             fun = change_parameters(fun, newparams; keep_identifier=true)
         end
         fun
@@ -427,7 +427,7 @@ function apply_kernel(term::TermXc, basis::PlaneWaveBasis{T}, δρ::AbstractArra
 
     # If the XC functional is not supported for an architecture, terms is on the CPU
     terms = kernel_terms(term.functionals, density)
-    δV = zeros(Tδρ, size(ρ)...)  # [ix, iy, iz, iσ]
+    δV = zeros_like(δρ, Tδρ, size(ρ)...)  # [ix, iy, iz, iσ]
 
     Vρρ = to_device(basis.architecture, reshape(terms.Vρρ, n_spin, n_spin, basis.fft_size...))
     @views for s = 1:n_spin, t = 1:n_spin  # LDA term
@@ -529,11 +529,19 @@ _matify(data::AbstractArray) = reshape(data, size(data, 1), :)
 
 for fun in (:potential_terms, :kernel_terms)
     @eval begin
-        function DftFunctionals.$fun(xc::Functional, density::LibxcDensities)
+        function DftFunctionals.$fun(xc::DispatchFunctional, density::LibxcDensities)
             $fun(xc, _matify(density.ρ_real), _matify(density.σ_real),
                      _matify(density.τ_real), _matify(density.Δρ_real))
         end
 
+        # Ensure functionals from DftFunctionals are sent to the CPU, until DftFunctionals.jl is refactored
+        function DftFunctionals.$fun(fun::DftFunctionals.Functional, density::LibxcDensities)
+            maticpuify(::Nothing) = nothing
+            maticpuify(x::AbstractArray) = reshape(Array(x), size(x, 1), :)
+            DftFunctionals.$fun(fun, maticpuify(density.ρ_real), maticpuify(density.σ_real),
+                                 maticpuify(density.τ_real), maticpuify(density.Δρ_real))
+        end
+
         function DftFunctionals.$fun(xcs::Vector{Functional}, density::LibxcDensities)
             isempty(xcs) && return NamedTuple()
             result = $fun(xcs[1], density)
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index c9295085f9..e03635bdce 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -31,9 +31,9 @@ end
 function LinearAlgebra.mul!(y::AbstractArray{<:Union{Complex{<:Dual}}},
                             p::AbstractFFTs.Plan,
                             x::AbstractArray{<:Union{Complex{<:Dual}}})
-    copyto!(y, p*x)
+    copyto!(y, _mul(p, x))
 end
-function Base.:*(p::AbstractFFTs.Plan, x::AbstractArray{<:Complex{<:Dual{Tg}}}) where {Tg}
+function _mul(p::AbstractFFTs.Plan, x::AbstractArray{<:Complex{<:Dual{Tg}}}) where {Tg}
     # TODO do we want x::AbstractArray{<:Dual{T}} too?
     xtil = p * ForwardDiff.value.(x)
     dxtils = ntuple(ForwardDiff.npartials(eltype(x))) do n
@@ -46,6 +46,8 @@ function Base.:*(p::AbstractFFTs.Plan, x::AbstractArray{<:Complex{<:Dual{Tg}}})
         )
     end
 end
+Base.:*(p::AbstractFFTs.Plan, x::AbstractArray{<:Complex{<:Dual}}) = _mul(p, x)
+Base.:*(p::DummyInplace, x::AbstractArray{<:Union{Complex{<:Dual}}}) = copyto!(x, _mul(p.fft, x))
 
 function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Dual}
     opFFT  = AbstractFFTs.plan_fft(tmp)
@@ -219,10 +221,9 @@ function construct_value(basis::PlaneWaveBasis{T}) where {T <: Dual}
 end
 
 
-@timing "self_consistent_field ForwardDiff" function self_consistent_field(
-        basis_dual::PlaneWaveBasis{T};
-        response=ResponseOptions(),
-        kwargs...) where {T <: Dual}
+@timing "self_consistent_field ForwardDiff" function self_consistent_field(basis_dual::PlaneWaveBasis{<:Dual{Tg,V,N}};
+                               response=ResponseOptions(),
+                               kwargs...) where {Tg,V,N}
     # Note: No guarantees on this interface yet.
 
     # Primal pass
@@ -241,7 +242,7 @@ end
     end
     # Implicit differentiation
     response.verbose && println("Solving response problem")
-    δresults = ntuple(ForwardDiff.npartials(T)) do α
+    δresults = ntuple(N) do α
         δHextψ = [ForwardDiff.partials.(δHextψk, α) for δHextψk in Hψ_dual]
         δtemperature = ForwardDiff.partials(basis_dual.model.temperature, α)
         solve_ΩplusK_split(scfres, δHextψ; δtemperature,
@@ -249,20 +250,21 @@ end
     end
 
     # Convert and combine
-    DT = Dual{ForwardDiff.tagtype(T)}
     ψ = map(scfres.ψ, getfield.(δresults, :δψ)...) do ψk, δψk...
         map(ψk, δψk...) do ψnk, δψnk...
-            Complex(DT(real(ψnk), real.(δψnk)),
-                    DT(imag(ψnk), imag.(δψnk)))
+            Complex(Dual{Tg}(real(ψnk), real.(δψnk)),
+                    Dual{Tg}(imag(ψnk), imag.(δψnk)))
         end
     end
     eigenvalues = map(scfres.eigenvalues, getfield.(δresults, :δeigenvalues)...) do εk, δεk...
-        map((εnk, δεnk...) -> DT(εnk, δεnk), εk, δεk...)
+        map((εnk, δεnk...) -> Dual{Tg}(εnk, δεnk), εk, δεk...)
     end
     occupation = map(scfres.occupation, getfield.(δresults, :δoccupation)...) do occk, δocck...
-        map((occnk, δoccnk...) -> DT(occnk, δoccnk), occk, δocck...)
+        occk_cpu = to_cpu(occk)
+        to_device(basis_dual.architecture,
+                  map((occnk, δoccnk...) -> Dual{Tg}(occnk, δoccnk), occk_cpu, δocck...))
     end
-    εF = DT(scfres.εF, getfield.(δresults, :δεF)...)
+    εF = Dual{Tg}(scfres.εF, getfield.(δresults, :δεF)...)
 
     # For strain, basis_dual contributes an explicit lattice contribution which
     # is not contained in δresults, so we need to recompute ρ here
diff --git a/test/forwarddiff.jl b/test/forwarddiff.jl
index 6b0293f678..c78d2a6f5a 100644
--- a/test/forwarddiff.jl
+++ b/test/forwarddiff.jl
@@ -1,492 +1,66 @@
-# Wrappers around ForwardDiff to fix tags and reduce compilation time.
-# Warning: fixing types without care can lead to perturbation confusion,
-# this should only be used within the testing framework. Risk of perturbation
-# confusion arises when nested derivatives of different functions are taken
-# with a fixed tag. Only use these wrappers at the top-level call.
-@testmodule ForwardDiffWrappers begin
-using ForwardDiff
-export tagged_derivative, tagged_gradient, tagged_jacobian
-
-struct DerivativeTag end
-function tagged_derivative(f, x::T; custom_tag=DerivativeTag) where T
-    # explicit call to ForwardDiff.Tag() to trigger ForwardDiff.tagcount
-    TagType = typeof(ForwardDiff.Tag(custom_tag(), T))
-    x_dual = ForwardDiff.Dual{TagType, T, 1}(x, ForwardDiff.Partials((one(T),)))
-
-    res = ForwardDiff.extract_derivative(TagType, f(x_dual))
-    return res
-end
-
-struct GradientTag end
-GradientConfig(f, x, ::Type{Tag}) where {Tag} =
-    ForwardDiff.GradientConfig(f, x, ForwardDiff.Chunk(x), Tag())
-function tagged_gradient(f, x::AbstractArray{T}; custom_tag=GradientTag) where T
-    # explicit call to ForwardDiff.Tag() to trigger ForwardDiff.tagcount
-    TagType = typeof(ForwardDiff.Tag(custom_tag(), T))
-
-    cfg = GradientConfig(f, x, TagType)
-    ForwardDiff.gradient(f, x, cfg, Val{false}())
-end
-
-struct JacobianTag end
-JacobianConfig(f, x, ::Type{Tag}) where {Tag} =
-    ForwardDiff.JacobianConfig(f, x, ForwardDiff.Chunk(x), Tag())
-function tagged_jacobian(f, x::AbstractArray{T}; custom_tag=JacobianTag) where T
-    # explicit call to ForwardDiff.Tag() to trigger ForwardDiff.tagcount
-    TagType = typeof(ForwardDiff.Tag(custom_tag(), T))
-
-    cfg = JacobianConfig(f, x, TagType)
-    ForwardDiff.jacobian(f, x, cfg, Val{false}())
-end
-end
-
+### ForwardDiff tests on the CPU
 @testitem "Force derivatives using ForwardDiff" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffWrappers] begin
+    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    silicon = TestCases.silicon
-
-    function compute_force(ε1, ε2; metal=false, tol=1e-10, atoms=silicon.atoms)
-        T = promote_type(typeof(ε1), typeof(ε2))
-        pos = [[1.01, 1.02, 1.03] / 8, -ones(3) / 8 + ε1 * [1., 0, 0] + ε2 * [0, 1., 0]]
-        if metal
-            # Silicon reduced HF is metallic
-            model = model_DFT(Matrix{T}(silicon.lattice), atoms, pos;
-                              functionals=[], temperature=1e-3)
-        else
-            model = model_DFT(Matrix{T}(silicon.lattice), atoms, pos;
-                              functionals=LDA())
-        end
-        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2])
-
-        response     = ResponseOptions(; verbose=true)
-        is_converged = DFTK.ScfConvergenceForce(tol)
-        scfres = self_consistent_field(basis; is_converged, response)
-        compute_forces_cart(scfres)
-    end
-
-    F = compute_force(0.0, 0.0)
-    derivative_ε1_fd = let ε1 = 1e-5
-        (compute_force(ε1, 0.0) - F) / ε1
-    end
-    derivative_ε1 = tagged_derivative(ε1 -> compute_force(ε1, 0.0), 0.0)
-    @test norm(derivative_ε1 - derivative_ε1_fd) < 1e-4
-
-    derivative_ε2_fd = let ε2 = 1e-5
-        (compute_force(0.0, ε2) - F) / ε2
-    end
-    derivative_ε2 = tagged_derivative(ε2 -> compute_force(0.0, ε2), 0.0)
-    @test norm(derivative_ε2 - derivative_ε2_fd) < 1e-4
-
-    @testset "Multiple partials" begin
-        grad = tagged_gradient(v -> compute_force(v...)[1][1], [0.0, 0.0])
-        @test abs(grad[1] - derivative_ε1[1][1]) < 1e-4
-        @test abs(grad[2] - derivative_ε2[1][1]) < 1e-4
-
-        jac = tagged_jacobian(v -> compute_force(v...)[1], [0.0, 0.0])
-        @test norm(grad - jac[1, :]) < 1e-9
-    end
-
-    @testset "Derivative for metals" begin
-        metal = true
-        derivative_ε1_fd = let ε1 = 1e-5
-            (compute_force(ε1, 0.0; metal) - compute_force(-ε1, 0.0; metal)) / 2ε1
-        end
-        derivative_ε1 = tagged_derivative(ε1 -> compute_force(ε1, 0.0; metal), 0.0)
-        @test norm(derivative_ε1 - derivative_ε1_fd) < 1e-4
-    end
-
-    @testset "Using PspUpf" begin
-        Si = ElementPsp(:Si; psp=load_psp(silicon.psp_upf))
-        atoms = [Si, Si]
-
-        derivative_ε1_fd = let ε1 = 1e-5
-            (compute_force(ε1, 0.0; atoms) - compute_force(-ε1, 0.0; atoms)) / 2ε1
-        end
-        derivative_ε1 = tagged_derivative(ε1 -> compute_force(ε1, 0.0; atoms), 0.0)
-        @test norm(derivative_ε1 - derivative_ε1_fd) < 1e-4
-    end
+    ForwardDiffCases.test_FD_force_derivatives(DFTK.CPU())
 end
 
-@testitem "Anisotropic strain sensitivity using ForwardDiff" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffWrappers] begin
+@testitem "Strain sensitivity using ForwardDiff" #=
+    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    using ComponentArrays
-    using PseudoPotentialData
-    aluminium = TestCases.aluminium
-    Ecut = 5
-    kgrid = [2, 2, 2]
-    model = model_DFT(aluminium.lattice, aluminium.atoms, aluminium.positions;
-                      functionals=LDA(), temperature=1e-2, smearing=Smearing.Gaussian(),
-                      kinetic_blowup=BlowupCHV())
-    basis = PlaneWaveBasis(model; Ecut, kgrid)
-    nbandsalg = FixedBands(; n_bands_converge=10)
-
-    function compute_properties(lattice)
-        model_strained = Model(model; lattice)
-        basis = PlaneWaveBasis(model_strained; Ecut, kgrid)
-        scfres = self_consistent_field(basis; tol=1e-11, nbandsalg)
-        ComponentArray(
-           eigenvalues=stack([ev[1:10] for ev in scfres.eigenvalues]),
-           ρ=scfres.ρ,
-           energies=collect(values(scfres.energies)),
-           εF=scfres.εF,
-           occupation=reduce(vcat, scfres.occupation),
-        )
-    end
-
-    strain_isotropic(ε) = (1 + ε) * model.lattice
-    function strain_anisotropic(ε)
-        DFTK.voigt_strain_to_full([ε, 0., 0., 0., 0., 0.]) * model.lattice
-    end
-
-    @testset "$strain_fn" for strain_fn in (strain_isotropic, strain_anisotropic)
-        f(ε) = compute_properties(strain_fn(ε))
-        dx = tagged_derivative(f, 0.)
-
-        h = 1e-4
-        x1 = f(-h)
-        x2 = f(+h)
-        dx_findiff = (x2 - x1) / 2h
-        @test norm(dx.ρ - dx_findiff.ρ) * sqrt(basis.dvol) < 1e-6
-        @test maximum(abs, dx.eigenvalues - dx_findiff.eigenvalues) < 1e-6
-        @test maximum(abs, dx.energies - dx_findiff.energies) < 3e-5
-        @test dx.εF - dx_findiff.εF < 1e-6
-        @test maximum(abs, dx.occupation - dx_findiff.occupation) < 1e-4
-    end
+    ForwardDiffCases.test_FD_strain_sensitivity(DFTK.CPU())
 end
 
 @testitem "scfres PSP sensitivity using ForwardDiff" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffWrappers] begin
+    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    using ComponentArrays
-    using PseudoPotentialData
-    aluminium = TestCases.aluminium
-
-    function compute_band_energies(ε::T) where {T}
-        psp  = load_psp(PseudoFamily("cp2k.nc.sr.lda.v0_1.semicore.gth"), :Al)
-        rloc = convert(T, psp.rloc)
-
-        pspmod = PspHgh(psp.Zion, rloc,
-                        psp.cloc, psp.rp .+ [0, ε], psp.h;
-                        psp.identifier, psp.description)
-        atoms = fill(ElementPsp(aluminium.atnum; psp=pspmod), length(aluminium.positions))
-        model = model_DFT(Matrix{T}(aluminium.lattice), atoms, aluminium.positions;
-                          functionals=LDA(), temperature=1e-2, smearing=Smearing.Gaussian())
-        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2])
-
-        is_converged = DFTK.ScfConvergenceDensity(1e-10)
-        scfres = self_consistent_field(basis; is_converged, mixing=KerkerMixing(),
-                                       nbandsalg=FixedBands(; n_bands_converge=10),
-                                       damping=0.6, response=ResponseOptions(; verbose=true))
-
-        ComponentArray(
-           eigenvalues=stack([ev[1:10] for ev in scfres.eigenvalues]),
-           ρ=scfres.ρ,
-           energies=collect(values(scfres.energies)),
-           εF=scfres.εF,
-           occupation=reduce(vcat, scfres.occupation),
-        )
-    end
-
-    derivative_ε = let ε = 1e-4
-        (compute_band_energies(ε) - compute_band_energies(-ε)) / 2ε
-    end
-    derivative_fd = tagged_derivative(compute_band_energies, 0.0)
-    @test norm(derivative_fd - derivative_ε) < 5e-4
+    ForwardDiffCases.test_FD_psp_sensitivity(DFTK.CPU())
 end
 
 @testitem "Functional force sensitivity using ForwardDiff" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffWrappers] begin
-    using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    using ComponentArrays
-    using DftFunctionals
-    silicon = TestCases.silicon
-
-    function compute_force(ε1::T) where {T}
-        pos = [[1.01, 1.02, 1.03] / 8, -ones(3) / 8]
-        pbec = DftFunctional(:gga_c_pbe)
-        pbex = DftFunctional(:gga_x_pbe)
-        pbex = change_parameters(pbex, parameters(pbex) + ComponentArray(κ=0, μ=ε1))
-
-        model = model_DFT(Matrix{T}(silicon.lattice), silicon.atoms, pos;
-                          functionals=[pbex, pbec])
-        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2])
-
-        is_converged = DFTK.ScfConvergenceDensity(1e-10)
-        scfres = self_consistent_field(basis; is_converged,
-                                       response=ResponseOptions(; verbose=true))
-        compute_forces_cart(scfres)
-    end
-
-    derivative_ε = let ε = 1e-5
-        (compute_force(ε) - compute_force(-ε)) / 2ε
-    end
-    derivative_fd = tagged_derivative(compute_force, 0.0)
-    @test norm(derivative_ε - derivative_fd) < 1e-4
-end
-
-@testitem "Derivative of complex function" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[ForwardDiffWrappers] begin
-    using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    using SpecialFunctions
-    using FiniteDifferences
-
-    α = randn(ComplexF64)
-    erfcα = x -> erfc(α * x)
-
-    x0  = randn()
-    fd1 = tagged_derivative(erfcα, x0)
-    fd2 = FiniteDifferences.central_fdm(5, 1)(erfcα, x0)
-    @test norm(fd1 - fd2) < 1e-8
-end
-
-@testitem "Higher derivatives of Fermi-Dirac occupation" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[ForwardDiffWrappers] begin
+    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-
-    smearing = Smearing.FermiDirac()
-    f(x) = Smearing.occupation(smearing, x)
-
-    function compute_nth_derivative(n, f, x)
-        (n == 0) && return f(x)
-        ForwardDiff.derivative(x -> compute_nth_derivative(n - 1, f, x), x)
-    end
-
-    @testset "Avoid NaN from exp-overflow for large x" begin
-        T = Float64
-        x = log(floatmax(T)) / 2 + 1
-        for n in 0:8
-            @testset "Derivative order $n" begin
-                y = compute_nth_derivative(n, f, x)
-                @test isfinite(y)
-                @test abs(y) ≤ eps(T)
-            end
-        end
-    end
+    ForwardDiffCases.test_FD_force_sensitivity(DFTK.CPU())
 end
 
 @testitem "LocalNonlinearity sensitivity using ForwardDiff" #=
-    =#     tags=[:dont_test_mpi, :minimal] setup=[ForwardDiffWrappers] begin
+    =#    tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-
-    function compute_force(ε::T) where {T}
-        # solve the 1D Gross-Pitaevskii equation with ElementGaussian potential
-        lattice = 10.0 .* [[1 0 0.]; [0 0 0]; [0 0 0]]
-        positions = [[0.2, 0, 0], [0.8, 0, 0]]
-        gauss = ElementGaussian(1.0, 0.5)
-        atoms = [gauss, gauss]
-        n_electrons = 1
-        terms = [Kinetic(), AtomicLocal(), LocalNonlinearity(ρ -> (1.0 + ε) * ρ^2)]
-        model = Model(Matrix{T}(lattice), atoms, positions;
-                      n_electrons, terms, spin_polarization=:spinless)
-        basis = PlaneWaveBasis(model; Ecut=500, kgrid=(1, 1, 1))
-        ρ = zeros(Float64, basis.fft_size..., 1)
-        is_converged = DFTK.ScfConvergenceDensity(1e-10)
-        scfres = self_consistent_field(basis; ρ, is_converged,
-                                       response=ResponseOptions(; verbose=true))
-        compute_forces_cart(scfres)
-    end
-    derivative_ε = let ε = 1e-5
-        (compute_force(ε) - compute_force(-ε)) / 2ε
-    end
-    derivative_fd = tagged_derivative(compute_force, 0.0)
-    @test norm(derivative_ε - derivative_fd) < 1e-4
+    ForwardDiffCases.test_FD_local_nonlinearity_sensitivity(DFTK.CPU())
 end
 
 @testitem "Symmetries broken by perturbation are filtered out" #=
-    =#    tags=[:dont_test_mpi] setup=[ForwardDiffWrappers] begin
+    =# tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-
-    lattice = [2. 0. 0.; 0. 1. 0.; 0. 0. 1.]
-    positions = [[0., 0., 0.], [0.5, 0., 0.]]
-    gauss = ElementGaussian(1.0, 0.5)
-    atoms = [gauss, gauss]
-    atom_groups = [findall(Ref(pot) .== atoms) for pot in Set(atoms)]
-
-    # Select some "interesting" subset of the symmetries
-    # Rotation in the yz plane by 90 degrees
-    rotyz = SymOp([1 0 0; 0 0 1; 0 -1 0], [0., 0., 0.])
-    mirroryz = rotyz * rotyz
-    # Mirror y
-    mirrory = SymOp([1 0 0; 0 -1 0; 0 0 1], [0., 0., 0.])
-    # Translation by 0.5 in the x direction
-    transx = SymOp(diagm([1, 1, 1]), [0.5, 0., 0.])
-
-    # Generate the full group
-    symmetries_full = DFTK.complete_symop_group([rotyz, mirrory, transx])
-    @test length(symmetries_full) == 16
-
-    DFTK._check_symmetries(symmetries_full, lattice, atom_groups, positions)
-
-    function check_symmetries(filtered_symmetries, expected_symmetries)
-        expected_symmetries = DFTK.complete_symop_group(expected_symmetries)
-
-        @test length(filtered_symmetries) == length(expected_symmetries)
-        for s in expected_symmetries
-            @test DFTK.is_approx_in(s, filtered_symmetries)
-        end
-        for s in filtered_symmetries
-            @test DFTK.is_approx_in(s, expected_symmetries)
-        end
-    end
-
-    # Instantiate Dual to test with perturbations
-    # We need to call the `Tag` constructor to trigger the `ForwardDiff.tagcount`
-    ε = ForwardDiff.Dual{typeof(ForwardDiff.Tag(Val(:mytag), Float64))}(0.0, 1.0)
-
-    @testset "Atom movement" begin
-        # Moving the second atom should break the transx symmetry, but not the others
-        positions_modified = [[0., 0., 0.], [0.5 + ε, 0., 0.]]
-        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice, atoms, positions_modified, symmetries_full)
-
-        @test length(symmetries_filtered) == 8
-        check_symmetries(symmetries_filtered, [rotyz, mirrory])
-    end
-
-    @testset "Lattice strain" begin
-        # Straining the lattice along z should break the rotyz symmetry, but not the others
-        # In particular it should not break mirroryz which is normally generated by rotyz.
-        lattice_modified = diagm([2., 1., 1. + ε])
-        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice_modified, atoms, positions, symmetries_full)
-
-        @test length(symmetries_filtered) == 8
-        check_symmetries(symmetries_filtered, [mirrory, transx, mirroryz])
-    end
-
-    @testset "Atom movement + lattice strain" begin
-        # Only the mirrory and mirroryz symmetries should be left
-        positions_modified = [[0., 0., 0.], [0.5 + ε, 0., 0.]]
-        lattice_modified = diagm([2., 1., 1. + ε])
-        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice_modified, atoms, positions_modified, symmetries_full)
-
-        @test length(symmetries_filtered) == 4
-        check_symmetries(symmetries_filtered, [mirrory, mirroryz])
-    end
-
-    @testset "Isotropic lattice scaling" begin
-        # Isotropic scaling should not break any symmetry
-        lattice_modified = (1 + ε) * lattice
-        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice_modified, atoms, positions, symmetries_full)
-
-        @test length(symmetries_filtered) == length(symmetries_full)
-        check_symmetries(symmetries_filtered, symmetries_full)
-    end
+    ForwardDiffCases.test_FD_filter_broken_symmetries(DFTK.CPU())
 end
 
 @testitem "Symmetry-breaking perturbation using ForwardDiff" #=
-    =#    tags=[:dont_test_mpi] setup=[TestCases, ForwardDiffWrappers] begin
+    =# tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    aluminium = TestCases.aluminium
-
-    @testset for perturbation in (:lattice, :positions)
-        function run_scf(ε)
-            lattice = if perturbation == :lattice
-                v = ε * [0., 0., 0., 0., 0., 1.]
-                DFTK.voigt_strain_to_full(v) * aluminium.lattice
-            else
-                # Lattice has to be a dual for position perturbations to work
-                aluminium.lattice * one(typeof(ε))
-            end
-            pos = if perturbation == :lattice
-                aluminium.positions
-            else
-                map(enumerate(aluminium.positions)) do (i, x)
-                    i == 1 ? x + ε * [1., 0, 0] : x
-                end
-            end
-
-            model = model_DFT(lattice, aluminium.atoms, pos;
-                              functionals=LDA(), temperature=1e-2,
-                              smearing=Smearing.Gaussian())
-            basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2])
-            self_consistent_field(basis; tol=1e-10)
-        end
-
-        δρ = tagged_derivative(ε -> run_scf(ε).ρ, 0.)
-
-        h = 1e-5
-        scfres1 = run_scf(-h)
-        scfres2 = run_scf(+h)
-        δρ_finitediff = (scfres2.ρ - scfres1.ρ) / 2h
-
-        rtol = 1e-4
-        @test norm(δρ - δρ_finitediff, 1) < rtol * norm(δρ, 1)
-    end
+    ForwardDiffCases.test_FD_symmetry_breaking_perturbation(DFTK.CPU())
 end
 
 @testitem "Test scfres dual has the same params as scfres primal" #=
-    =#    tags=[:dont_test_mpi] setup=[TestCases, ForwardDiffWrappers] begin
+    =# tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    using PseudoPotentialData
-    silicon = TestCases.silicon
-
-    # Make silicon primal model
-    model = model_DFT(silicon.lattice, silicon.atoms, silicon.positions;
-                      functionals=LDA(), temperature=1e-3, smearing=Smearing.Gaussian())
-    
-    # Make silicon dual model
-    # We need to call the `Tag` constructor to trigger the `ForwardDiff.tagcount`
-    T = typeof(ForwardDiff.Tag(Val(:mytag), Float64))
-    x_dual = ForwardDiff.Dual{T}(1.0, 1.0)
-    model_dual = Model(model; lattice=x_dual * model.lattice)
-
-    # Construct the primal and dual basis
-    basis = PlaneWaveBasis(model; Ecut=5, kgrid=(1,1,1))
-    basis_dual = PlaneWaveBasis(model_dual; Ecut=5, kgrid=(1,1,1))
-
-    # Compute scfres with primal and dual basis
-    scfres = self_consistent_field(basis; tol=1e-5)
-    scfres_dual = self_consistent_field(basis_dual; tol=1e-5)
-    
-    # Check that scfres_dual has the same parameters as scfres
-    @test isempty(setdiff(keys(scfres), keys(scfres_dual)))
+    ForwardDiffCases.test_FD_scfres_parameter_consistency(DFTK.CPU())
 end
 
-
 @testitem "ForwardDiff wrt temperature" #=
-    =#    tags=[:dont_test_mpi, :minimal] setup=[ForwardDiffWrappers] begin
+    =# tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
     using DFTK
-    using ForwardDiff
-    using LinearAlgebra
-    using PseudoPotentialData
+    ForwardDiffCases.test_FD_wrt_temperature(DFTK.CPU())
+end
 
-    a = 10.26  # Silicon lattice constant in Bohr
-    lattice = a / 2 * [[0 1 1.];
-                       [1 0 1.];
-                       [1 1 0.]]
-    Si = ElementPsp(:Si, PseudoFamily("dojo.nc.sr.lda.v0_4_1.standard.upf"))
-    atoms     = [Si, Si]
-    positions = [ones(3)/8, -ones(3)/8]
+### Generic ForwardDiff tests (independent of DFTK architecture)
+@testitem "Derivative of complex function" #=
+    =# tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
+    ForwardDiffCases.test_FD_complex_function_derivative()
+end
 
-    function get(T)
-        model = model_DFT(lattice, atoms, positions; functionals=LDA(), temperature=T)
-        basis = PlaneWaveBasis(model; Ecut=10, kgrid=[1, 1, 1])
-        scfres = self_consistent_field(basis, tol=1e-12)
-        scfres.energies.Entropy
-    end
-    T0 = .01
-    derivative_ε = let ε = 1e-5
-        (get(T0+ε) - get(T0-ε)) / 2ε
-    end
-    derivative_fd = tagged_derivative(get, T0)
-    @test norm(derivative_ε - derivative_fd) < 1e-4
+@testitem "Higher derivatives of Fermi-Dirac occupation" #=
+    =# tags=[:dont_test_mpi, :minimal] setup=[TestCases, ForwardDiffCases] begin
+    ForwardDiffCases.test_FD_fermi_dirac_higher_derivatives()
 end
+
diff --git a/test/forwarddiff_cases.jl b/test/forwarddiff_cases.jl
new file mode 100644
index 0000000000..9d98562b12
--- /dev/null
+++ b/test/forwarddiff_cases.jl
@@ -0,0 +1,434 @@
+@testmodule ForwardDiffCases begin
+using DFTK
+using Test
+using ForwardDiff
+using LinearAlgebra
+using ComponentArrays
+using PseudoPotentialData
+using DftFunctionals
+using FiniteDifferences
+using SpecialFunctions
+using ..TestCases: silicon, aluminium
+
+# Wrappers around ForwardDiff to fix tags and reduce compilation time.
+# Warning: fixing types without care can lead to perturbation confusion,
+# this should only be used within the testing framework. Risk of perturbation
+# confusion arises when nested derivatives of different functions are taken
+# with a fixed tag. Only use these wrappers at the top-level call.
+struct DerivativeTag end
+function tagged_derivative(f, x::T; custom_tag=DerivativeTag) where T
+    # explicit call to ForwardDiff.Tag() to trigger ForwardDiff.tagcount
+    TagType = typeof(ForwardDiff.Tag(custom_tag(), T))
+    x_dual = ForwardDiff.Dual{TagType, T, 1}(x, ForwardDiff.Partials((one(T),)))
+
+    res = ForwardDiff.extract_derivative(TagType, f(x_dual))
+    return res
+end
+
+struct GradientTag end
+GradientConfig(f, x, ::Type{Tag}) where {Tag} =
+    ForwardDiff.GradientConfig(f, x, ForwardDiff.Chunk(x), Tag())
+function tagged_gradient(f, x::AbstractArray{T}; custom_tag=GradientTag) where T
+    # explicit call to ForwardDiff.Tag() to trigger ForwardDiff.tagcount
+    TagType = typeof(ForwardDiff.Tag(custom_tag(), T))
+
+    cfg = GradientConfig(f, x, TagType)
+    ForwardDiff.gradient(f, x, cfg, Val{false}())
+end
+
+struct JacobianTag end
+JacobianConfig(f, x, ::Type{Tag}) where {Tag} =
+    ForwardDiff.JacobianConfig(f, x, ForwardDiff.Chunk(x), Tag())
+function tagged_jacobian(f, x::AbstractArray{T}; custom_tag=JacobianTag) where T
+    # explicit call to ForwardDiff.Tag() to trigger ForwardDiff.tagcount
+    TagType = typeof(ForwardDiff.Tag(custom_tag(), T))
+
+    cfg = JacobianConfig(f, x, TagType)
+    ForwardDiff.jacobian(f, x, cfg, Val{false}())
+end
+
+function test_FD_force_derivatives(architecture)
+    function compute_force(ε1, ε2; metal=false, tol=1e-10, atoms=silicon.atoms)
+        T = promote_type(typeof(ε1), typeof(ε2))
+        pos = [[1.01, 1.02, 1.03] / 8, -ones(3) / 8 + ε1 * [1., 0, 0] + ε2 * [0, 1., 0]]
+        if metal
+            # Silicon reduced HF is metallic
+            model = model_DFT(Matrix{T}(silicon.lattice), atoms, pos;
+                              functionals=[], temperature=1e-3)
+        else
+            model = model_DFT(Matrix{T}(silicon.lattice), atoms, pos;
+                              functionals=LDA())
+        end
+        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2], architecture)
+
+        response     = ResponseOptions(; verbose=true)
+        is_converged = DFTK.ScfConvergenceForce(tol)
+        scfres = self_consistent_field(basis; is_converged, response)
+        compute_forces_cart(scfres)
+    end
+
+    F = compute_force(0.0, 0.0)
+    derivative_ε1_fd = let ε1 = 1e-5
+        (compute_force(ε1, 0.0) - F) / ε1
+    end
+    derivative_ε1 = tagged_derivative(ε1 -> compute_force(ε1, 0.0), 0.0)
+    @test norm(derivative_ε1 - derivative_ε1_fd) < 1e-4
+
+    derivative_ε2_fd = let ε2 = 1e-5
+        (compute_force(0.0, ε2) - F) / ε2
+    end
+    derivative_ε2 = tagged_derivative(ε2 -> compute_force(0.0, ε2), 0.0)
+    @test norm(derivative_ε2 - derivative_ε2_fd) < 1e-4
+
+    @testset "Multiple partials" begin
+        grad = tagged_gradient(v -> compute_force(v...)[1][1], [0.0, 0.0])
+        @test abs(grad[1] - derivative_ε1[1][1]) < 1e-4
+        @test abs(grad[2] - derivative_ε2[1][1]) < 1e-4
+
+        jac = tagged_jacobian(v -> compute_force(v...)[1], [0.0, 0.0])
+        @test norm(grad - jac[1, :]) < 1e-9
+    end
+
+    @testset "Derivative for metals" begin
+        metal = true
+        derivative_ε1_fd = let ε1 = 1e-5
+            (compute_force(ε1, 0.0; metal) - compute_force(-ε1, 0.0; metal)) / 2ε1
+        end
+        derivative_ε1 = tagged_derivative(ε1 -> compute_force(ε1, 0.0; metal), 0.0)
+        @test norm(derivative_ε1 - derivative_ε1_fd) < 1e-4
+    end
+
+    @testset "Using PspUpf" begin
+        Si = ElementPsp(:Si; psp=load_psp(silicon.psp_upf))
+        atoms = [Si, Si]
+
+        derivative_ε1_fd = let ε1 = 1e-5
+            (compute_force(ε1, 0.0; atoms) - compute_force(-ε1, 0.0; atoms)) / 2ε1
+        end
+        derivative_ε1 = tagged_derivative(ε1 -> compute_force(ε1, 0.0; atoms), 0.0)
+        @test norm(derivative_ε1 - derivative_ε1_fd) < 1e-4
+    end
+
+end
+
+function test_FD_strain_sensitivity(architecture)
+    Ecut = 5
+    kgrid = [2, 2, 2]
+    model = model_DFT(aluminium.lattice, aluminium.atoms, aluminium.positions;
+                      functionals=LDA(), temperature=1e-2, smearing=Smearing.Gaussian(),
+                      kinetic_blowup=BlowupCHV())
+    basis = PlaneWaveBasis(model; Ecut, kgrid, architecture)
+    nbandsalg = FixedBands(; n_bands_converge=10)
+
+    function compute_properties(lattice)
+        model_strained = Model(model; lattice)
+        basis = PlaneWaveBasis(model_strained; Ecut, kgrid, architecture)
+        scfres = self_consistent_field(basis; tol=1e-11, nbandsalg)
+        ComponentArray(
+           eigenvalues=stack([ev[1:10] for ev in DFTK.to_cpu(scfres.eigenvalues)]),
+           ρ=DFTK.to_cpu(scfres.ρ),
+           energies=collect(values(scfres.energies)),
+           εF=scfres.εF,
+           occupation=reduce(vcat, DFTK.to_cpu(scfres.occupation)),
+        )
+    end
+
+    strain_isotropic(ε) = (1 + ε) * model.lattice
+    function strain_anisotropic(ε)
+        DFTK.voigt_strain_to_full([ε, 0., 0., 0., 0., 0.]) * model.lattice
+    end
+
+    @testset "$strain_fn" for strain_fn in (strain_isotropic, strain_anisotropic)
+        f(ε) = compute_properties(strain_fn(ε))
+        dx = tagged_derivative(f, 0.)
+
+        h = 1e-4
+        x1 = f(-h)
+        x2 = f(+h)
+        dx_findiff = (x2 - x1) / 2h
+        @test norm(dx.ρ - dx_findiff.ρ) * sqrt(basis.dvol) < 1e-6
+        @test maximum(abs, dx.eigenvalues - dx_findiff.eigenvalues) < 1e-6
+        @test maximum(abs, dx.energies - dx_findiff.energies) < 3e-5
+        @test dx.εF - dx_findiff.εF < 1e-6
+        @test maximum(abs, dx.occupation - dx_findiff.occupation) < 1e-4
+    end 
+end
+
+function test_FD_psp_sensitivity(architecture)
+    function compute_band_energies(ε::T) where {T}
+        psp  = load_psp(PseudoFamily("cp2k.nc.sr.lda.v0_1.semicore.gth"), :Al)
+        rloc = convert(T, psp.rloc)
+
+        pspmod = PspHgh(psp.Zion, rloc,
+                        psp.cloc, psp.rp .+ [0, ε], psp.h;
+                        psp.identifier, psp.description)
+        atoms = fill(ElementPsp(aluminium.atnum; psp=pspmod), length(aluminium.positions))
+        model = model_DFT(Matrix{T}(aluminium.lattice), atoms, aluminium.positions;
+                          functionals=LDA(), temperature=1e-2, smearing=Smearing.Gaussian())
+        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2], architecture)
+
+        is_converged = DFTK.ScfConvergenceDensity(1e-10)
+        scfres = self_consistent_field(basis; is_converged, mixing=KerkerMixing(),
+                                       nbandsalg=FixedBands(; n_bands_converge=10),
+                                       damping=0.6, response=ResponseOptions(; verbose=true))
+
+        ComponentArray(
+           eigenvalues=stack([ev[1:10] for ev in DFTK.to_cpu(scfres.eigenvalues)]),
+           ρ=DFTK.to_cpu(scfres.ρ),
+           energies=collect(values(scfres.energies)),
+           εF=scfres.εF,
+           occupation=reduce(vcat, DFTK.to_cpu(scfres.occupation)),
+        )
+    end
+
+    derivative_ε = let ε = 1e-4
+        (compute_band_energies(ε) - compute_band_energies(-ε)) / 2ε
+    end
+    derivative_fd = tagged_derivative(compute_band_energies, 0.0)
+    @test norm(derivative_fd - derivative_ε) < 5e-4
+end
+
+function test_FD_force_sensitivity(architecture)
+    function compute_force(ε1::T) where {T}
+        pos = [[1.01, 1.02, 1.03] / 8, -ones(3) / 8]
+        pbec = DftFunctional(:gga_c_pbe)
+        pbex = DftFunctional(:gga_x_pbe)
+        pbex = change_parameters(pbex, parameters(pbex) + ComponentArray(κ=0, μ=ε1))
+
+        model = model_DFT(Matrix{T}(silicon.lattice), silicon.atoms, pos;
+                          functionals=[pbex, pbec])
+        basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2], architecture)
+
+        is_converged = DFTK.ScfConvergenceDensity(1e-10)
+        scfres = self_consistent_field(basis; is_converged,
+                                       response=ResponseOptions(; verbose=true))
+        compute_forces_cart(scfres)
+    end
+
+    derivative_ε = let ε = 1e-5
+        (compute_force(ε) - compute_force(-ε)) / 2ε
+    end
+    derivative_fd = tagged_derivative(compute_force, 0.0)
+    @test norm(derivative_ε - derivative_fd) < 1e-4
+end
+
+function test_FD_local_nonlinearity_sensitivity(architecture)
+    function compute_force(ε::T) where {T}
+        # solve the 1D Gross-Pitaevskii equation with ElementGaussian potential
+        lattice = 10.0 .* [[1 0 0.]; [0 0 0]; [0 0 0]]
+        positions = [[0.2, 0, 0], [0.8, 0, 0]]
+        gauss = ElementGaussian(1.0, 0.5)
+        atoms = [gauss, gauss]
+        n_electrons = 1
+        terms = [Kinetic(), AtomicLocal(), LocalNonlinearity(ρ -> (1.0 + ε) * ρ^2)]
+        model = Model(Matrix{T}(lattice), atoms, positions;
+                      n_electrons, terms, spin_polarization=:spinless)
+        basis = PlaneWaveBasis(model; Ecut=500, kgrid=(1, 1, 1), architecture)
+        ρ = DFTK.to_device(architecture, zeros(Float64, basis.fft_size..., 1))
+        is_converged = DFTK.ScfConvergenceDensity(1e-10)
+        scfres = self_consistent_field(basis; ρ, is_converged,
+                                       response=ResponseOptions(; verbose=true))
+        compute_forces_cart(scfres)
+    end
+    derivative_ε = let ε = 1e-5
+        (compute_force(ε) - compute_force(-ε)) / 2ε
+    end
+    derivative_fd = tagged_derivative(compute_force, 0.0)
+    @test norm(derivative_ε - derivative_fd) < 1e-4
+end
+
+function test_FD_filter_broken_symmetries(architecture)
+    lattice = [2. 0. 0.; 0. 1. 0.; 0. 0. 1.]
+    positions = [[0., 0., 0.], [0.5, 0., 0.]]
+    gauss = ElementGaussian(1.0, 0.5)
+    atoms = [gauss, gauss]
+    atom_groups = [findall(Ref(pot) .== atoms) for pot in Set(atoms)]
+
+    # Select some "interesting" subset of the symmetries
+    # Rotation in the yz plane by 90 degrees
+    rotyz = SymOp([1 0 0; 0 0 1; 0 -1 0], [0., 0., 0.])
+    mirroryz = rotyz * rotyz
+    # Mirror y
+    mirrory = SymOp([1 0 0; 0 -1 0; 0 0 1], [0., 0., 0.])
+    # Translation by 0.5 in the x direction
+    transx = SymOp(diagm([1, 1, 1]), [0.5, 0., 0.])
+
+    # Generate the full group
+    symmetries_full = DFTK.complete_symop_group([rotyz, mirrory, transx])
+    @test length(symmetries_full) == 16
+
+    DFTK._check_symmetries(symmetries_full, lattice, atom_groups, positions)
+
+    function check_symmetries(filtered_symmetries, expected_symmetries)
+        expected_symmetries = DFTK.complete_symop_group(expected_symmetries)
+
+        @test length(filtered_symmetries) == length(expected_symmetries)
+        for s in expected_symmetries
+            @test DFTK.is_approx_in(s, filtered_symmetries)
+        end
+        for s in filtered_symmetries
+            @test DFTK.is_approx_in(s, expected_symmetries)
+        end
+    end
+
+    # Instantiate Dual to test with perturbations
+    # We need to call the `Tag` constructor to trigger the `ForwardDiff.tagcount`
+    ε = ForwardDiff.Dual{typeof(ForwardDiff.Tag(Val(:mytag), Float64))}(0.0, 1.0)
+
+    @testset "Atom movement" begin
+        # Moving the second atom should break the transx symmetry, but not the others
+        positions_modified = [[0., 0., 0.], [0.5 + ε, 0., 0.]]
+        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice, atoms, positions_modified, symmetries_full)
+
+        @test length(symmetries_filtered) == 8
+        check_symmetries(symmetries_filtered, [rotyz, mirrory])
+    end
+
+    @testset "Lattice strain" begin
+        # Straining the lattice along z should break the rotyz symmetry, but not the others
+        # In particular it should not break mirroryz which is normally generated by rotyz.
+        lattice_modified = diagm([2., 1., 1. + ε])
+        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice_modified, atoms, positions, symmetries_full)
+
+        @test length(symmetries_filtered) == 8
+        check_symmetries(symmetries_filtered, [mirrory, transx, mirroryz])
+    end
+
+    @testset "Atom movement + lattice strain" begin
+        # Only the mirrory and mirroryz symmetries should be left
+        positions_modified = [[0., 0., 0.], [0.5 + ε, 0., 0.]]
+        lattice_modified = diagm([2., 1., 1. + ε])
+        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice_modified, atoms, positions_modified, symmetries_full)
+
+        @test length(symmetries_filtered) == 4
+        check_symmetries(symmetries_filtered, [mirrory, mirroryz])
+    end
+
+    @testset "Isotropic lattice scaling" begin
+        # Isotropic scaling should not break any symmetry
+        lattice_modified = (1 + ε) * lattice
+        symmetries_filtered = DFTK.remove_dual_broken_symmetries(lattice_modified, atoms, positions, symmetries_full)
+
+        @test length(symmetries_filtered) == length(symmetries_full)
+        check_symmetries(symmetries_filtered, symmetries_full)
+    end
+end
+
+function test_FD_symmetry_breaking_perturbation(architecture)
+    @testset for perturbation in (:lattice, :positions)
+        function run_scf(ε)
+            lattice = if perturbation == :lattice
+                v = ε * [0., 0., 0., 0., 0., 1.]
+                DFTK.voigt_strain_to_full(v) * aluminium.lattice
+            else
+                # Lattice has to be a dual for position perturbations to work
+                aluminium.lattice * one(typeof(ε))
+            end
+            pos = if perturbation == :lattice
+                aluminium.positions
+            else
+                map(enumerate(aluminium.positions)) do (i, x)
+                    i == 1 ? x + ε * [1., 0, 0] : x
+                end
+            end
+
+            model = model_DFT(lattice, aluminium.atoms, pos;
+                              functionals=LDA(), temperature=1e-2,
+                              smearing=Smearing.Gaussian())
+            basis = PlaneWaveBasis(model; Ecut=5, kgrid=[2, 2, 2], architecture)
+            self_consistent_field(basis; tol=1e-10)
+        end
+
+        δρ = tagged_derivative(ε -> run_scf(ε).ρ, 0.)
+
+        h = 1e-5
+        scfres1 = run_scf(-h)
+        scfres2 = run_scf(+h)
+        δρ_finitediff = (scfres2.ρ - scfres1.ρ) / 2h
+
+        rtol = 1e-4
+        @test norm(δρ - δρ_finitediff, 1) < rtol * norm(δρ, 1)
+    end
+end
+
+function test_FD_scfres_parameter_consistency(architecture)
+    # Make silicon primal model
+    model = model_DFT(silicon.lattice, silicon.atoms, silicon.positions;
+                      functionals=LDA(), temperature=1e-3, smearing=Smearing.Gaussian())
+    
+    # Make silicon dual model
+    # We need to call the `Tag` constructor to trigger the `ForwardDiff.tagcount`
+    T = typeof(ForwardDiff.Tag(Val(:mytag), Float64))
+    x_dual = ForwardDiff.Dual{T}(1.0, 1.0)
+    model_dual = Model(model; lattice=x_dual * model.lattice)
+
+    # Construct the primal and dual basis
+    basis = PlaneWaveBasis(model; Ecut=5, kgrid=(1,1,1), architecture)
+    basis_dual = PlaneWaveBasis(model_dual; Ecut=5, kgrid=(1,1,1), architecture)
+
+    # Compute scfres with primal and dual basis
+    scfres = self_consistent_field(basis; tol=1e-5)
+    scfres_dual = self_consistent_field(basis_dual; tol=1e-5)
+    
+    # Check that scfres_dual has the same parameters as scfres
+    @test isempty(setdiff(keys(scfres), keys(scfres_dual)))
+end
+
+function test_FD_wrt_temperature(architecture)
+    a = 10.26  # Silicon lattice constant in Bohr
+    lattice = a / 2 * [[0 1 1.];
+                       [1 0 1.];
+                       [1 1 0.]]
+    Si = ElementPsp(:Si, PseudoFamily("dojo.nc.sr.lda.v0_4_1.standard.upf"))
+    atoms     = [Si, Si]
+    positions = [ones(3)/8, -ones(3)/8]
+
+    function get(T)
+        model = model_DFT(lattice, atoms, positions; functionals=LDA(), temperature=T)
+        basis = PlaneWaveBasis(model; Ecut=10, kgrid=[1, 1, 1], architecture)
+        scfres = self_consistent_field(basis, tol=1e-12)
+        scfres.energies.Entropy
+    end
+    T0 = .01
+    derivative_ε = let ε = 1e-5
+        (get(T0+ε) - get(T0-ε)) / 2ε
+    end
+    derivative_fd = tagged_derivative(get, T0)
+    @test norm(derivative_ε - derivative_fd) < 1e-4
+end
+
+### Generic ForwardDiff tests (independent of DFTK architecture)
+function test_FD_complex_function_derivative()
+    α = randn(ComplexF64)
+    erfcα = x -> erfc(α * x)
+
+    x0  = randn()
+    fd1 = tagged_derivative(erfcα, x0)
+    fd2 = FiniteDifferences.central_fdm(5, 1)(erfcα, x0)
+    @test norm(fd1 - fd2) < 1e-8
+end
+
+function test_FD_fermi_dirac_higher_derivatives()
+    smearing = Smearing.FermiDirac()
+    f(x) = Smearing.occupation(smearing, x)
+
+    function compute_nth_derivative(n, f, x)
+        (n == 0) && return f(x)
+        ForwardDiff.derivative(x -> compute_nth_derivative(n - 1, f, x), x)
+    end
+
+    @testset "Avoid NaN from exp-overflow for large x" begin
+        T = Float64
+        x = log(floatmax(T)) / 2 + 1
+        for n in 0:8
+            @testset "Derivative order $n" begin
+                y = compute_nth_derivative(n, f, x)
+                @test isfinite(y)
+                @test abs(y) ≤ eps(T)
+            end
+        end
+    end
+end
+
+
+end
\ No newline at end of file
diff --git a/test/forwarddiff_gpu.jl b/test/forwarddiff_gpu.jl
new file mode 100644
index 0000000000..7d78f7582a
--- /dev/null
+++ b/test/forwarddiff_gpu.jl
@@ -0,0 +1,165 @@
+### ForwardDiff tests on the GPU, closely following forwarddiff.jl tests
+
+### CUDA tests:
+@testitem "Force derivatives using ForwardDiff (CUDA)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_force_derivatives(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "Strain sensitivity using ForwardDiff (CUDA)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_strain_sensitivity(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "scfres PSP sensitivity using ForwardDiff (CUDA)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_psp_sensitivity(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "Functional force sensitivity using ForwardDiff (CUDA)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_force_sensitivity(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "LocalNonlinearity sensitivity using ForwardDiff (CUDA)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_local_nonlinearity_sensitivity(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "Symmetries broken by perturbation are filtered out (CUDA)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_filter_broken_symmetries(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "Symmetry-breaking perturbation using ForwardDiff (CUDA)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_symmetry_breaking_perturbation(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "Test scfres dual has the same params as scfres primal (CUDA)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_scfres_parameter_consistency(DFTK.GPU(CuArray))
+    end
+end
+
+@testitem "ForwardDiff wrt temperature (CUDA)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using CUDA
+    if CUDA.has_cuda() && CUDA.has_cuda_gpu()
+        ForwardDiffCases.test_FD_wrt_temperature(DFTK.GPU(CuArray))
+    end
+end
+
+### AMDGPU tests:
+@testitem "Force derivatives using ForwardDiff (AMDGPU)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU 
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_force_derivatives(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "Strain sensitivity using ForwardDiff (AMDGPU)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_strain_sensitivity(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "scfres PSP sensitivity using ForwardDiff (AMDGPU)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_psp_sensitivity(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "Functional force sensitivity using ForwardDiff (AMDGPU)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_force_sensitivity(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "LocalNonlinearity sensitivity using ForwardDiff (AMDGPU)" #=
+    =#    tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_local_nonlinearity_sensitivity(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "Symmetries broken by perturbation are filtered out (AMDGPU)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_filter_broken_symmetries(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "Symmetry-breaking perturbation using ForwardDiff (AMDGPU)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_symmetry_breaking_perturbation(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "Test scfres dual has the same params as scfres primal (AMDGPU)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_scfres_parameter_consistency(DFTK.GPU(ROCArray))
+    end
+end
+
+@testitem "ForwardDiff wrt temperature (AMDGPU)" #=
+    =# tags=[:gpu] setup=[TestCases, ForwardDiffCases] begin
+    using DFTK
+    using AMDGPU
+    if AMDGPU.has_rocm_gpu()
+        ForwardDiffCases.test_FD_wrt_temperature(DFTK.GPU(ROCArray))
+    end
+end
\ No newline at end of file