diff --git a/src/architecture.jl b/src/architecture.jl
index cf48da95ed..765fd2748e 100644
--- a/src/architecture.jl
+++ b/src/architecture.jl
@@ -30,3 +30,12 @@ Synchronize data and finish all operations on the execution stream of the device
 This needs to be called explicitly before a task finishes (e.g. in an `@spawn` block).
 """
 synchronize_device(::AbstractArchitecture) = nothing
+
+
+"""
+    get_architecture(x::AbstractArray)
+
+Return the architecture of the given array: `GPU{typeof(x)}()` if `x` is an
+`AbstractGPUArray`, `CPU()` otherwise.
+"""
+get_architecture(x::AbstractArray) = x isa AbstractGPUArray ? GPU{typeof(x)}() : CPU()
diff --git a/src/common/spherical_bessels.jl b/src/common/spherical_bessels.jl
index a5b0c82828..c73961c0bd 100644
--- a/src/common/spherical_bessels.jl
+++ b/src/common/spherical_bessels.jl
@@ -18,5 +18,5 @@ with `SpecialFunctions.sphericalbesselj`. Specialized for integer ``0 ≤ l ≤
     l == 3 && return (sin(x) * (15 - 6x^2) + cos(x) * (x^3 - 15x)) / x^4
     l == 4 && return (sin(x) * (105 - 45x^2 + x^4) + cos(x) * (10x^3 - 105x)) / x^5
     l == 5 && return (sin(x) * (945 - 420x^2 + 15x^4) + cos(x) * (-945x + 105x^3 - x^5)) / x^6
-    error("The case l = $l is not implemented")
+    throw(BoundsError()) # specific l not implemented
 end
diff --git a/src/elements.jl b/src/elements.jl
index a5af36de71..31d39b7e6f 100644
--- a/src/elements.jl
+++ b/src/elements.jl
@@ -153,9 +153,11 @@ AtomsBase.species(el::ElementPsp) = el.species
 charge_ionic(el::ElementPsp) = charge_ionic(el.psp)
 has_core_density(el::ElementPsp) = has_core_density(el.psp)
 
-function local_potential_fourier(el::ElementPsp, p::T) where {T <: Real}
-    p == 0 && return zero(T) # Compensating charge background
-    eval_psp_local_fourier(el.psp, p)
+# Evaluate the local pseudopotential in Fourier space for all norms in `ps` at once.
+# NOTE(review): `p == 0` is no longer special-cased here; the `PspUpf` method returns
+# zero for it, confirm the other `eval_psp_local_fourier` methods do as well.
+function local_potential_fourier(el::ElementPsp, ps::AbstractArray{T}) where {T <: Real}
+    eval_psp_local_fourier(el.psp, ps)
 end
 
 local_potential_real(el::ElementPsp, r::Real) = eval_psp_local_real(el.psp, r)
diff --git a/src/pseudo/NormConservingPsp.jl b/src/pseudo/NormConservingPsp.jl
index 15a5ab5486..a3c534b50a 100644
--- a/src/pseudo/NormConservingPsp.jl
+++ b/src/pseudo/NormConservingPsp.jl
@@ -82,8 +82,7 @@ V_{\rm loc}(p) &= ∫_{ℝ^3} (V_{\rm loc}(r) - C(r)) e^{-ip·r} dr + F[C(r)] \\
 \end{aligned}
 ```
 """
-eval_psp_local_fourier(psp::NormConservingPsp, p::AbstractVector) =
-    eval_psp_local_fourier(psp, norm(p))
+function eval_psp_local_fourier end  # keeps the docstring above attached to the function
 
 @doc raw"""
     eval_psp_energy_correction([T=Float64,] psp, n_electrons)
diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl
index 34f8436237..8eacb6805b 100644
--- a/src/pseudo/PspUpf.jl
+++ b/src/pseudo/PspUpf.jl
@@ -197,19 +197,34 @@ end
 
 eval_psp_local_real(psp::PspUpf, r::T) where {T<:Real} = psp.vloc_interp(r)
 
-function eval_psp_local_fourier(psp::PspUpf, p::T)::T where {T<:Real}
+function eval_psp_local_fourier(psp::PspUpf, ps::AbstractArray{T}) where {T<:Real}
     # QE style C(r) = -Zerf(r)/r Coulomb tail correction used to ensure
     # exponential decay of `f` so that the Hankel transform is accurate.
     # H[Vloc(r)] = H[Vloc(r) - C(r)] + H[C(r)],
     # where H[-Zerf(r)/r] = -Z/p^2 exp(-p^2 /4)
     # ABINIT uses a more 'pure' Coulomb term with the same asymptotic behavior
     # C(r) = -Z/r; H[-Z/r] = -Z/p^2
-    rgrid = @view psp.rgrid[1:psp.ircut]
-    vloc = @view psp.vloc[1:psp.ircut]
-    I = simpson(rgrid) do i, r
-        r * (r * vloc[i] - -psp.Zion * erf(r)) * sphericalbesselj_fast(0, p * r)
+    # Use the first three radial points to decide whether the grid is uniformly spaced.
+    x = @view psp.rgrid[1:3]
+    uniform_grid = (x[2] - x[1]) ≈ (x[3] - x[2])
+
+    # Transfer the truncated radial grid and potential to the architecture of `ps`.
+    arch = get_architecture(ps)
+    rgrid = to_device(arch, @view psp.rgrid[1:psp.ircut])
+    vloc = to_device(arch, @view psp.vloc[1:psp.ircut])
+    Zion = psp.Zion
+    map(ps) do p
+        method = uniform_grid ? simpson_uniform : simpson_nonuniform
+        if p == 0
+            zero(T)  # Compensating charge background
+        else
+            # GPU compilation error if branching done in generic simpson()
+            I = method(rgrid) do i, r
+                r * (r * vloc[i] - -Zion * erf(r)) * sphericalbesselj_fast(0, p * r)
+            end
+            4T(π) * (I + -Zion / p^2 * exp(-p^2 / T(4)))
+        end
+    end
-    end
-    4T(π) * (I + -psp.Zion / p^2 * exp(-p^2 / T(4)))
 end
 
 function eval_psp_density_valence_real(psp::PspUpf, r::T) where {T<:Real}
diff --git a/src/terms/local.jl b/src/terms/local.jl
index ebcce36ea7..fb35a843cd 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -81,17 +81,18 @@ function atomic_local_form_factors(basis::PlaneWaveBasis{T}; q=zero(Vec3{T})) wh
         p = norm(G)
         iG2ifnorm_cpu[iG] = get!(norm_indices, p, length(norm_indices) + 1)
     end
+    iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu)
 
-    form_factors_cpu = zeros(T, length(norm_indices), length(basis.model.atom_groups))
-    for(p, ifnorm) in norm_indices
-        for (igroup, group) in enumerate(basis.model.atom_groups)
-            element = basis.model.atoms[first(group)]
-            form_factors_cpu[ifnorm, igroup] = local_potential_fourier(element, p)
-        end
+    ni_pairs = collect(pairs(norm_indices))
+    ps = to_device(basis.architecture, [p for (p, idx) in ni_pairs])
+    indices = to_device(basis.architecture, [idx for (p, idx) in ni_pairs])
+
+    form_factors = similar(ps, length(norm_indices), length(basis.model.atom_groups))
+    for (igroup, group) in enumerate(basis.model.atom_groups)
+        element = basis.model.atoms[first(group)]
+        @inbounds form_factors[indices, igroup] .= local_potential_fourier(element, ps)
     end
 
-    form_factors = to_device(basis.architecture, form_factors_cpu)
-    iG2ifnorm = to_device(basis.architecture, iG2ifnorm_cpu)
     (; form_factors, iG2ifnorm)
 end
 