diff --git a/src/architecture.jl b/src/architecture.jl
index cf48da95ed..765fd2748e 100644
--- a/src/architecture.jl
+++ b/src/architecture.jl
@@ -30,3 +30,9 @@ Synchronize data and finish all operations on the execution stream of the device
 This needs to be called explicitly before a task finishes (e.g. in an `@spawn` block).
 """
 synchronize_device(::AbstractArchitecture) = nothing
+
+
+"""
+Return `GPU{typeof(x)}()` if `x` is an `AbstractGPUArray`, otherwise `CPU()`.
+"""
+get_architecture(x::AbstractArray) = x isa AbstractGPUArray ? GPU{typeof(x)}() : CPU()
\ No newline at end of file
diff --git a/src/common/spherical_bessels.jl b/src/common/spherical_bessels.jl
index a5b0c82828..c73961c0bd 100644
--- a/src/common/spherical_bessels.jl
+++ b/src/common/spherical_bessels.jl
@@ -18,5 +18,5 @@ with `SpecialFunctions.sphericalbesselj`. Specialized for integer ``0 ≤ l ≤
     l == 3 && return (sin(x) * (15 - 6x^2) + cos(x) * (x^3 - 15x)) / x^4
     l == 4 && return (sin(x) * (105 - 45x^2 + x^4) + cos(x) * (10x^3 - 105x)) / x^5
     l == 5 && return (sin(x) * (945 - 420x^2 + 15x^4) + cos(x) * (-945x + 105x^3 - x^5)) / x^6
-    error("The case l = $l is not implemented")
+    throw(BoundsError()) # specific l not implemented
 end
diff --git a/src/elements.jl b/src/elements.jl
index a5af36de71..31d39b7e6f 100644
--- a/src/elements.jl
+++ b/src/elements.jl
@@ -153,9 +153,16 @@ AtomsBase.species(el::ElementPsp) = el.species
 charge_ionic(el::ElementPsp)      = charge_ionic(el.psp)
 has_core_density(el::ElementPsp)  = has_core_density(el.psp)
 
 function local_potential_fourier(el::ElementPsp, p::T) where {T <: Real}
     p == 0 && return zero(T)  # Compensating charge background
     eval_psp_local_fourier(el.psp, p)
 end
+
+# Batched variant: evaluates the local potential at every momentum norm in `ps`.
+# Unlike the scalar method, `p == 0` is not special-cased here: the psp-specific array
+# method of `eval_psp_local_fourier` is expected to return zero for those entries.
+function local_potential_fourier(el::ElementPsp, ps::AbstractArray{T}) where {T <: Real}
+    eval_psp_local_fourier(el.psp, ps)
+end
 local_potential_real(el::ElementPsp, r::Real) = eval_psp_local_real(el.psp, r)
 
diff --git a/src/pseudo/NormConservingPsp.jl b/src/pseudo/NormConservingPsp.jl
index 15a5ab5486..a3c534b50a 100644
--- a/src/pseudo/NormConservingPsp.jl
+++ b/src/pseudo/NormConservingPsp.jl
@@ -82,8 +82,8 @@ V_{\rm loc}(p) &= ∫_{ℝ^3} (V_{\rm loc}(r) - C(r)) e^{-ip·r} dr + F[C(r)] \\
 \end{aligned}
 ```
 """
-eval_psp_local_fourier(psp::NormConservingPsp, p::AbstractVector) =
-    eval_psp_local_fourier(psp, norm(p))
+# NOTE(review): the generic method taking a vector `p` (evaluated at `norm(p)`) is disabled
+# here, so the docstring above no longer precedes a definition. Restore or relocate it.
 
 @doc raw"""
     eval_psp_energy_correction([T=Float64,] psp, n_electrons)
diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl
index 34f8436237..8eacb6805b 100644
--- a/src/pseudo/PspUpf.jl
+++ b/src/pseudo/PspUpf.jl
@@ -197,19 +197,32 @@ end
 
 eval_psp_local_real(psp::PspUpf, r::T) where {T<:Real} = psp.vloc_interp(r)
 
-function eval_psp_local_fourier(psp::PspUpf, p::T)::T where {T<:Real}
+function eval_psp_local_fourier(psp::PspUpf, ps::AbstractArray{T}) where {T<:Real}
     # QE style C(r) = -Zerf(r)/r Coulomb tail correction used to ensure
     # exponential decay of `f` so that the Hankel transform is accurate.
     # H[Vloc(r)] = H[Vloc(r) - C(r)] + H[C(r)],
     # where H[-Zerf(r)/r] = -Z/p^2 exp(-p^2 /4)
     # ABINIT uses a more 'pure' Coulomb term with the same asymptotic behavior
     # C(r) = -Z/r; H[-Z/r] = -Z/p^2
-    rgrid = @view psp.rgrid[1:psp.ircut]
-    vloc  = @view psp.vloc[1:psp.ircut]
-    I = simpson(rgrid) do i, r
-         r * (r * vloc[i] - -psp.Zion * erf(r)) * sphericalbesselj_fast(0, p * r)
+    x = @view psp.rgrid[1:3]
+    uniform_grid = (x[2] - x[1]) ≈ (x[3] - x[2])  # judged from the first 3 points only
+
+    arch = get_architecture(ps)
+    rgrid = to_device(arch, @view psp.rgrid[1:psp.ircut])
+    vloc  = to_device(arch, @view psp.vloc[1:psp.ircut])
+    Zion = psp.Zion  # local copy so the closure below does not capture `psp`
+    map(ps) do p
+        # GPU compilation error if branching done in generic simpson()
+        method = uniform_grid ? simpson_uniform : simpson_nonuniform
+        if p == 0
+            zero(T)
+        else
+            I = method(rgrid) do i, r
+                r * (r * vloc[i] - -Zion * erf(r)) * sphericalbesselj_fast(0, p * r)
+            end
+            convert(T, 4T(π) * (I + -Zion / p^2 * exp(-p^2 / T(4))))
+        end
     end
-    4T(π) * (I + -psp.Zion / p^2 * exp(-p^2 / T(4)))
 end
 
 function eval_psp_density_valence_real(psp::PspUpf, r::T) where {T<:Real}
diff --git a/src/terms/local.jl b/src/terms/local.jl
index ebcce36ea7..fb35a843cd 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -81,17 +81,18 @@ function atomic_local_form_factors(basis::PlaneWaveBasis{T}; q=zero(Vec3{T})) wh
         p = norm(G)
         iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1)
     end
+    iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu)
 
-    form_factors_cpu = zeros(T, length(norm_indices), length(basis.model.atom_groups))
-    for(p, ifnorm) in norm_indices
-        for (igroup, group) in enumerate(basis.model.atom_groups)
-            element = basis.model.atoms[first(group)]
-            form_factors_cpu[ifnorm, igroup] = local_potential_fourier(element, p)
-        end
+    ni_pairs = collect(pairs(norm_indices))
+    ps = to_device(basis.architecture, [p for (p, idx) in ni_pairs])
+    indices = to_device(basis.architecture, [idx for (p, idx) in ni_pairs])
+
+    form_factors = similar(ps, length(norm_indices), length(basis.model.atom_groups))
+    for (igroup, group) in enumerate(basis.model.atom_groups)
+        element = basis.model.atoms[first(group)]
+        @inbounds form_factors[indices, igroup] .= local_potential_fourier(element, ps)
     end
 
-    form_factors = to_device(basis.architecture, form_factors_cpu)
-    iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu)
     (; form_factors, iG2ifnorm)
 end