diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml index 2252d866de..75c71319c4 100644 --- a/.github/workflows/documentation.yaml +++ b/.github/workflows/documentation.yaml @@ -6,6 +6,11 @@ on: tags: - 'v*' pull_request: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: docs: diff --git a/Project.toml b/Project.toml index f48c45836d..156f5a17b2 100644 --- a/Project.toml +++ b/Project.toml @@ -35,6 +35,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" PseudoPotentialIO = "cb339c56-07fa-4cb2-923a-142469552264" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Requires = "ae029012-a4dd-5104-9daa-d747884805df" Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" diff --git a/examples/gpu.jl b/examples/gpu.jl new file mode 100644 index 0000000000..a005591c30 --- /dev/null +++ b/examples/gpu.jl @@ -0,0 +1,19 @@ +using DFTK +using CUDA + +a = 10.26 # Silicon lattice constant in Bohr +lattice = a / 2 * [[0 1 1.]; + [1 0 1.]; + [1 1 0.]] +Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4")) +atoms = [Si, Si] +positions = [ones(3)/8, -ones(3)/8] +model = model_DFT(lattice, atoms, positions, []; temperature=1e-3) + +# If available, use CUDA to store DFT quantities and perform main computations +architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU() + +basis = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), architecture) +scfres = self_consistent_field(basis; tol=1e-3, + solver=scf_damping_solver(), + mixing=KerkerMixing()) diff --git a/src/DFTK.jl b/src/DFTK.jl index 0b1408dc99..2124ac1cb2 100644 --- a/src/DFTK.jl +++ b/src/DFTK.jl @@ -13,6 +13,9 @@ using spglib_jll using Unitful using UnitfulAtomic using ForwardDiff +using AbstractFFTs +using GPUArraysCore +using Random using ChainRulesCore export Vec3 @@ -31,6 +34,7 @@ include("common/mpi.jl") include("common/threading.jl") include("common/printing.jl") include("common/cis2pi.jl") +include("architecture.jl") include("common/zeros_like.jl") include("common/norm.jl") diff --git a/src/Model.jl b/src/Model.jl index c203d5f95c..9aa9c4cc66 100644 --- a/src/Model.jl +++ b/src/Model.jl @@ -279,13 +279,25 @@ Examples of covectors are forces. Reciprocal vectors are a special case: they are covectors, but conventionally have an additional factor of 2π in their definition, so they transform rather with 2π times the inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red. + +For each of the function there is a one-argument version (returning a function to do the +transformation) and a two-argument version applying the transformation to a passed vector. =# -vector_red_to_cart(model::Model, rred) = model.lattice * rred -vector_cart_to_red(model::Model, rcart) = model.inv_lattice * rcart -covector_red_to_cart(model::Model, fred) = model.inv_lattice' * fred -covector_cart_to_red(model::Model, fcart) = model.lattice' * fcart -recip_vector_red_to_cart(model::Model, qred) = model.recip_lattice * qred -recip_vector_cart_to_red(model::Model, qcart) = model.inv_recip_lattice * qcart +@inline _gen_matmul(mat) = vec -> mat * vec + +vector_red_to_cart(model::Model) = _gen_matmul(model.lattice) +vector_cart_to_red(model::Model) = _gen_matmul(model.inv_lattice) +covector_red_to_cart(model::Model) = _gen_matmul(model.inv_lattice') +covector_cart_to_red(model::Model) = _gen_matmul(model.lattice') +recip_vector_red_to_cart(model::Model) = _gen_matmul(model.recip_lattice) +recip_vector_cart_to_red(model::Model) = _gen_matmul(model.inv_recip_lattice) + +vector_red_to_cart(model::Model, vec) = vector_red_to_cart(model)(vec) +vector_cart_to_red(model::Model, vec) = vector_cart_to_red(model)(vec) +covector_red_to_cart(model::Model, vec) = covector_red_to_cart(model)(vec) +covector_cart_to_red(model::Model, vec) = covector_cart_to_red(model)(vec) +recip_vector_red_to_cart(model::Model, vec) = recip_vector_red_to_cart(model)(vec) +recip_vector_cart_to_red(model::Model, vec) = recip_vector_cart_to_red(model)(vec) #= Transformations on vectors and covectors are matrices and comatrices. @@ -300,7 +312,14 @@ s_cart = L s_red = L A_red r_red = L A_red L⁻¹ r_cart, thus A_cart = L A_red Examples of matrices are the symmetries in real space (W) Examples of comatrices are the symmetries in reciprocal space (S) =# -matrix_red_to_cart(model::Model, Ared) = model.lattice * Ared * model.inv_lattice -matrix_cart_to_red(model::Model, Acart) = model.inv_lattice * Acart * model.lattice -comatrix_red_to_cart(model::Model, Bred) = model.inv_lattice' * Bred * model.lattice' -comatrix_cart_to_red(model::Model, Bcart) = model.lattice' * Bcart * model.inv_lattice' +@inline _gen_matmatmul(M, Minv) = mat -> M * mat * Minv + +matrix_red_to_cart(model::Model) = _gen_matmatmul(model.lattice, model.inv_lattice) +matrix_cart_to_red(model::Model) = _gen_matmatmul(model.inv_lattice, model.lattice) +comatrix_red_to_cart(model::Model) = _gen_matmatmul(model.inv_lattice', model.lattice') +comatrix_cart_to_red(model::Model) = _gen_matmatmul(model.lattice', model.inv_lattice') + +matrix_red_to_cart(model::Model, Ared) = matrix_red_to_cart(model)(Ared) +matrix_cart_to_red(model::Model, Acart) = matrix_cart_to_red(model)(Acart) +comatrix_red_to_cart(model::Model, Bred) = comatrix_red_to_cart(model)(Bred) +comatrix_cart_to_red(model::Model, Bcart) = comatrix_cart_to_red(model)(Bcart) diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl index 034d27e914..41d443ca78 100644 --- a/src/PlaneWaveBasis.jl +++ b/src/PlaneWaveBasis.jl @@ -15,16 +15,16 @@ Discretization information for ``k``-point-dependent quantities such as orbitals More generally, a ``k``-point is a block of the Hamiltonian; eg collinear spin is treated by doubling the number of kpoints. """ -struct Kpoint{T <: Real} +struct Kpoint{T <: Real, GT <: AbstractVector{Vec3{Int}}} spin::Int # Spin component can be 1 or 2 as index into what is - # returned by the `spin_components` function + # # returned by the `spin_components` function coordinate::Vec3{T} # Fractional coordinate of k-point mapping::Vector{Int} # Index of G_vectors[i] on the FFT grid: - # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i] + # # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i] mapping_inv::Dict{Int, Int} # Inverse of `mapping`: - # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]] - G_vectors::Vector{Vec3{Int}} # Wave vectors in integer coordinates: - # ({G, 1/2 |k+G|^2 ≤ Ecut}) + # # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]] + G_vectors::GT # Wave vectors in integer coordinates (vector of Vec3{Int}) + # # ({G, 1/2 |k+G|^2 ≤ Ecut}) end @doc raw""" @@ -39,7 +39,13 @@ Normalization conventions: `ifft` and `fft` convert between these representations. """ -struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real} +struct PlaneWaveBasis{T, + VT <: Real, + T_G_vectors <: AbstractArray{Vec3{Int}, 3}, + T_r_vectors <: AbstractArray{Vec3{VT}, 3}, + T_kpt_G_vecs <: AbstractVector{Vec3{Int}} + } <: AbstractBasis{T} + # T is the default type to express data, VT the corresponding bare value type (i.e. not dual) model::Model{T, VT} @@ -67,13 +73,13 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real} ifft_normalization::T # ifft = ifft_normalization * BFFT # "cubic" basis in reciprocal and real space, on which potentials and densities are stored - G_vectors::Array{Vec3{Int}, 3} - r_vectors::Array{Vec3{VT }, 3} + G_vectors::T_G_vectors + r_vectors::T_r_vectors ## MPI-local information of the kpoints this processor treats # Irreducible kpoints. In the case of collinear spin, # this lists all the spin up, then all the spin down - kpoints::Vector{Kpoint{T}} + kpoints::Vector{Kpoint{T, T_kpt_G_vecs}} # BZ integration weights, summing up to model.n_spin_components kweights::Vector{T} @@ -93,6 +99,9 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real} krange_allprocs::Vector{Vector{Int}} # indices of kpoints treated by the # respective rank in comm_kpts + ## Information on the hardware and device used for computations. + architecture::AbstractArchitecture + ## Symmetry operations that leave the discretized model (k and r grids) invariant. # Subset of model.symmetries. symmetries::Vector{SymOp{VT}} @@ -113,7 +122,8 @@ Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis) Base.eltype(::PlaneWaveBasis{T}) where {T} = T @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut; - variational=true) where {T} + variational=true, + architecture::AbstractArchitecture) where {T} kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components] for k in kcoords k = Vec3{T}(k) # rationals are sloooow @@ -124,23 +134,24 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T sizehint!(mapping, n_guess) sizehint!(Gvecs_k, n_guess) for (i, G) in enumerate(G_vectors(fft_size)) - if !variational || sum(abs2, model.recip_lattice * (G + k)) / 2 ≤ Ecut + if !variational || norm2(model.recip_lattice * (G + k)) / 2 ≤ Ecut push!(mapping, i) push!(Gvecs_k, G) end end + Gvecs_k = to_device(architecture, Gvecs_k) + mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping)) for iσ = 1:model.n_spin_components push!(kpoints_per_spin[iσ], Kpoint(iσ, k, mapping, mapping_inv, Gvecs_k)) end end - vcat(kpoints_per_spin...) # put all spin up first, then all spin down end function build_kpoints(basis::PlaneWaveBasis, kcoords) build_kpoints(basis.model, basis.fft_size, kcoords, basis.Ecut; - variational=basis.variational) + variational=basis.variational, basis.architecture) end # Lowest-level constructor, should not be called directly. @@ -148,10 +159,11 @@ end # and are stored in PlaneWaveBasis for easy reconstruction. function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational, kcoords, kweights, kgrid, kshift, - symmetries_respect_rgrid, comm_kpts) where {T <: Real} + symmetries_respect_rgrid, comm_kpts, + architecture::AbstractArchitecture) where {T <: Real} # Validate fft_size if variational - max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2 + max_E = norm2(model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2 Ecut > max_E && @warn( "For a variational method, Ecut should be less than the maximal kinetic " * "energy the grid supports ($max_E)" @@ -191,7 +203,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational, kweights_global = kweights # Setup FFT plans - (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(T, fft_size) + Gs = to_device(architecture, G_vectors(fft_size)) + (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans!(similar(Gs, Complex{T}, fft_size)) # Normalization constants # fft = fft_normalization * FFT @@ -235,7 +248,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational, "Non-variational calculations are experimental. " * "Not all features of DFTK may be supported or work as intended." ) - kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut; variational) + kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut; + variational, architecture) # kpoints is now possibly twice the size of krange. Make things consistent if model.n_spin_components == 2 krange_thisproc = vcat(krange_thisproc, n_kpt .+ krange_thisproc) @@ -245,21 +259,27 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational, @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components @assert length(kpoints) == length(kweights_thisproc) + if architecture isa GPU && Threads.nthreads() > 1 + error("Can't mix multi-threading and GPU computations yet.") + end + VT = value_type(T) dvol = model.unit_cell_volume ./ prod(fft_size) - r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3] + r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) + for i = 1:N1, j = 1:N2, k = 1:N3] + r_vectors = to_device(architecture, r_vectors) terms = Vector{Any}(undef, length(model.term_types)) # Dummy terms array, filled below - basis = PlaneWaveBasis{T,value_type(T)}( + basis = PlaneWaveBasis{T, value_type(T), typeof(Gs), typeof(r_vectors), + typeof(kpoints[1].G_vectors)}( model, fft_size, dvol, Ecut, variational, opFFT, ipFFT, opBFFT, ipBFFT, fft_normalization, ifft_normalization, - G_vectors(fft_size), r_vectors, + Gs, r_vectors, kpoints, kweights_thisproc, kgrid, kshift, kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs, - symmetries, symmetries_respect_rgrid, terms) - + architecture, symmetries, symmetries_respect_rgrid, terms) # Instantiate the terms with the basis for (it, t) in enumerate(model.term_types) term_name = string(nameof(typeof(t))) @@ -277,7 +297,7 @@ end variational=true, fft_size=nothing, kgrid=nothing, kshift=nothing, symmetries_respect_rgrid=isnothing(fft_size), - comm_kpts=MPI.COMM_WORLD) where {T <: Real} + comm_kpts=MPI.COMM_WORLD, architecture=CPU()) where {T <: Real} if isnothing(fft_size) @assert variational if symmetries_respect_rgrid @@ -295,7 +315,7 @@ end fft_size = compute_fft_size(model, Ecut, kcoords; factors) end PlaneWaveBasis(model, Ecut, fft_size, variational, kcoords, kweights, - kgrid, kshift, symmetries_respect_rgrid, comm_kpts) + kgrid, kshift, symmetries_respect_rgrid, comm_kpts, architecture) end @doc raw""" @@ -305,7 +325,8 @@ number of points in each dimension and `kshift` the shift (0 or 1/2 in each dire If not specified a grid is generated using `kgrid_from_minimal_spacing` with a minimal spacing of `2π * 0.022` per Bohr. """ -function PlaneWaveBasis(model::Model; Ecut, +function PlaneWaveBasis(model::Model; + Ecut, kgrid=kgrid_from_minimal_spacing(model, 2π * 0.022), kshift=zeros(3), kwargs...) @@ -322,11 +343,11 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints PlaneWaveBasis(basis.model, basis.Ecut, basis.fft_size, basis.variational, kcoords, kweights, kgrid, kshift, - basis.symmetries_respect_rgrid, basis.comm_kpts) + basis.symmetries_respect_rgrid, basis.comm_kpts, basis.architecture) end """ - G_vectors(fft_size::Tuple) + G_vectors([architecture=AbstractArchitecture], fft_size::Tuple) The wave vectors `G` in reduced (integer) coordinates for a cubic basis set of given sizes. @@ -339,6 +360,7 @@ function G_vectors(fft_size::Union{Tuple,AbstractVector}) axes = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3] [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]] end + function G_vectors_generator(fft_size::Union{Tuple,AbstractVector}) # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and # accumulate_over_symmetries!, which are 100-fold slower with G_vector(fft_size). @@ -348,6 +370,7 @@ function G_vectors_generator(fft_size::Union{Tuple,AbstractVector}) (Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]) end + @doc raw""" G_vectors(basis::PlaneWaveBasis) G_vectors(basis::PlaneWaveBasis, kpt::Kpoint) @@ -359,13 +382,16 @@ G_vectors(basis::PlaneWaveBasis) = basis.G_vectors G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors + @doc raw""" G_vectors_cart(basis::PlaneWaveBasis) G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint) The list of ``G`` vectors of a given `basis` or `kpt`, in cartesian coordinates. """ -G_vectors_cart(basis::PlaneWaveBasis) = recip_vector_red_to_cart.(basis.model, G_vectors(basis)) +function G_vectors_cart(basis::PlaneWaveBasis) + map(recip_vector_red_to_cart(basis.model), G_vectors(basis)) +end function G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint) recip_vector_red_to_cart.(basis.model, G_vectors(basis, kpt)) end @@ -376,7 +402,8 @@ end The list of ``G + k`` vectors, in reduced coordinates. """ function Gplusk_vectors(basis::PlaneWaveBasis, kpt::Kpoint) - map(G -> G + kpt.coordinate, G_vectors(basis, kpt)) + coordinate = kpt.coordinate # Accelerator: avoid closure on kpt (not isbits) + map(G -> G + coordinate, G_vectors(basis, kpt)) end @doc raw""" @@ -385,7 +412,7 @@ end The list of ``G + k`` vectors, in cartesian coordinates. """ function Gplusk_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint) - recip_vector_red_to_cart.(basis.model, Gplusk_vectors(basis, kpt)) + map(recip_vector_red_to_cart(basis.model), Gplusk_vectors(basis, kpt)) end @doc raw""" @@ -400,7 +427,7 @@ r_vectors(basis::PlaneWaveBasis) = basis.r_vectors The list of ``r`` vectors, in cartesian coordinates. """ -r_vectors_cart(basis::PlaneWaveBasis) = vector_red_to_cart.(basis.model, r_vectors(basis)) +r_vectors_cart(basis::PlaneWaveBasis) = map(vector_red_to_cart(basis.model), r_vectors(basis)) """ @@ -408,10 +435,10 @@ Return the index tuple `I` such that `G_vectors(basis)[I] == G` or the index `i` such that `G_vectors(basis, kpoint)[i] == G`. Returns nothing if outside the range of valid wave vectors. """ -@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{T}) where {T <: Integer} +@inline function index_G_vectors(fft_size::Tuple, G::AbstractVector{<:Integer}) # the inline declaration encourages the compiler to hoist these (G-independent) precomputations - start = .- cld.(basis.fft_size .- 1, 2) - stop = fld.(basis.fft_size .- 1, 2) + start = .- cld.(fft_size .- 1, 2) + stop = fld.(fft_size .- 1, 2) lengths = stop .- start .+ 1 # FFTs store wavevectors as [0 1 2 3 -2 -1] (example for N=5) @@ -426,6 +453,10 @@ Returns nothing if outside the range of valid wave vectors. end end +function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{<:Integer}) + index_G_vectors(basis.fft_size, G) +end + function index_G_vectors(basis::PlaneWaveBasis, kpoint::Kpoint, G::AbstractVector{T}) where {T <: Integer} fft_size = basis.fft_size @@ -485,7 +516,7 @@ function gather_kpts(basis::PlaneWaveBasis) basis.kshift, basis.symmetries_respect_rgrid, comm_kpts=MPI.COMM_SELF, - ) + architecture=basis.architecture) end end diff --git a/src/architecture.jl b/src/architecture.jl new file mode 100644 index 0000000000..499bd7ec45 --- /dev/null +++ b/src/architecture.jl @@ -0,0 +1,26 @@ +""" +Abstract supertype for architectures supported by DFTK. +""" +abstract type AbstractArchitecture end + +struct CPU <: AbstractArchitecture end + +struct GPU{ArrayType <: AbstractArray} <: AbstractArchitecture end + +""" +Construct a particular GPU architecture by passing the ArrayType +""" +GPU(::Type{T}) where {T <: AbstractArray} = GPU{T}() + +""" +Transfer an array from a device (typically a GPU) to the CPU. +""" +to_cpu(x::AbstractArray) = Array(x) +to_cpu(x::Array) = x + +""" +Transfer an array to a particular device (typically a GPU) +""" +to_device(::CPU, x) = to_cpu(x) +to_device(::GPU{ArrayType}, x::AbstractArray) where {ArrayType} = ArrayType(x) +to_device(::GPU{ArrayType}, x::ArrayType) where {ArrayType} = x diff --git a/src/common/norm.jl b/src/common/norm.jl index 4f96ad6146..3bccf28523 100644 --- a/src/common/norm.jl +++ b/src/common/norm.jl @@ -7,3 +7,8 @@ differentiation, we can extend analytically f to accept complex inputs, then dif inputs, and therefore we have to redefine it. """ norm_cplx(x) = sqrt(sum(xx -> xx * xx, x)) + +""" +Square of the ℓ²-norm. +""" +norm2(G) = sum(abs2, G) diff --git a/src/common/ortho.jl b/src/common/ortho.jl index 8e682bcc43..ae3e0b3b76 100644 --- a/src/common/ortho.jl +++ b/src/common/ortho.jl @@ -1,2 +1,2 @@ # Orthonormalize -@timing ortho_qr(φk) = Matrix(qr(φk).Q) +@timing ortho_qr(φk::ArrayType) where {ArrayType <: AbstractArray} = convert(ArrayType, qr(φk).Q) diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl index ebcdfad05d..7de53ea4d0 100644 --- a/src/common/zeros_like.jl +++ b/src/common/zeros_like.jl @@ -1,8 +1,11 @@ -# Create an array of same type as X filled with zeros, minimizing the number -# of allocations. +""" +Create an array of same "array type" as X filled with zeros, minimizing the number +of allocations. This unifies CPU and GPU code, as the output will always be on the +same device as the input. +""" function zeros_like(X::AbstractArray, T::Type=eltype(X), dims::Integer...=size(X)...) Z = similar(X, T, dims...) - Z .= 0 + Z .= zero(T) Z end zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...) diff --git a/src/densities.jl b/src/densities.jl index 69e3f631ea..b65c98e111 100644 --- a/src/densities.jl +++ b/src/densities.jl @@ -23,6 +23,8 @@ using an optional `occupation_threshold`. By default all occupation numbers are @views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation; occupation_threshold=zero(T)) where {T} S = promote_type(T, real(eltype(ψ[1]))) + # occupation should be on the CPU as we are going to be doing scalar indexing. + occupation = [to_cpu(oc) for oc in occupation] # we split the total iteration range (ik, n) in chunks, and parallelize over them mask_occ = map(occk -> findall(isless.(occupation_threshold, occk)), occupation) @@ -30,17 +32,17 @@ using an optional `occupation_threshold`. By default all occupation numbers are chunk_length = cld(length(ik_n), Threads.nthreads()) # chunk-local variables - ρ_chunklocal = Array{S,4}[zeros(S, basis.fft_size..., basis.model.n_spin_components) + ρ_chunklocal = [zeros_like(basis.G_vectors, S, basis.fft_size..., basis.model.n_spin_components) for _ = 1:Threads.nthreads()] - ψnk_real_chunklocal = Array{complex(S),3}[zeros(complex(S), basis.fft_size) - for _ = 1:Threads.nthreads()] + ψnk_real_chunklocal = [zeros_like(basis.G_vectors, complex(S), basis.fft_size...) + for _ = 1:Threads.nthreads()] @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length)) Threads.@spawn for (ik, n) in chunk # spawn a task per chunk ψnk_real = ψnk_real_chunklocal[ichunk] ρ_loc = ρ_chunklocal[ichunk] - kpt = basis.kpoints[ik] + ifft!(ψnk_real, basis, kpt, ψ[ik][:, n]) ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real) end diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl index f08f500543..3de27ac29a 100644 --- a/src/eigen/diag.jl +++ b/src/eigen/diag.jl @@ -58,7 +58,9 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint:: end # Transform results into a nicer datastructure - (λ=[real.(res.λ) for res in results], + # TODO It feels inconsistent to put λ onto the CPU here but none of the other objects. + # Better have this handled by the caller of diagonalize_all_kblocks. + (λ=[to_cpu(real.(res.λ)) for res in results], # Always get onto the CPU X=[res.X for res in results], residual_norms=[res.residual_norms for res in results], iterations=[res.iterations for res in results], diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl index 80e18c326d..5df5f6b91f 100644 --- a/src/eigen/lobpcg_hyper_impl.jl +++ b/src/eigen/lobpcg_hyper_impl.jl @@ -183,7 +183,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M))) end invR = inv(R) @assert all(!isnan, invR) - rmul!(X, invR) # we do not use X/R because we use invR next + rmul!(X, invR) # we do not use X/R because we use invR next # We would like growth_factor *= opnorm(inv(R)) but it's too # expensive, so we use an upper bound which is sharp enough to @@ -196,7 +196,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M))) growth_factor *= norminvR # condR = 1/LAPACK.trcon!('I', 'U', 'N', Array(R)) - condR = normest(R)*norminvR # in practice this seems to be an OK estimate + condR = normest(R)*norminvR # in practice this seems to be an OK estimate vprintln("Ortho(X) success? $success ", eps(real(T))*condR^2, " < $tol") @@ -267,7 +267,7 @@ end niter > 10 && error("Ortho(X,Y) is failing badly, this should never happen") niter += 1 end - vprintln("ortho choleskys: ", ninners) # get how many Choleskys are performed + vprintln("ortho choleskys: ", ninners) # get how many Choleskys are performed # @assert (norm(BY'X)) < tol # @assert (norm(X'X-I)) < tol @@ -345,7 +345,7 @@ end full_BX = BX while true - if niter > 0 # first iteration is just to compute the residuals (no X update) + if niter > 0 # first iteration is just to compute the residuals (no X update) ### Perform the Rayleigh-Ritz mul!(AR, A, R) n_matvec += size(R, 2) @@ -413,7 +413,7 @@ end return final_retval(full_X, full_AX, resid_history, niter, n_matvec) end newly_locked = nlocked - prev_nlocked - active = newly_locked+1:size(X,2) # newly active vectors + active = newly_locked+1:size(X,2) # newly active vectors if niter > 0 ### compute P = Y*cP only for the newly active vectors @@ -484,7 +484,7 @@ end # Orthogonalize R wrt all X, newly active P if niter > 0 Z = LazyHcat(full_X, P) - BZ = LazyHcat(full_BX, BP) # data shared with (full_X, P) in non-general case + BZ = LazyHcat(full_BX, BP) # data shared with (full_X, P) in non-general case else Z = full_X BZ = full_BX diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl index e5ab0d71fa..4eb8ef96be 100644 --- a/src/eigen/preconditioners.jl +++ b/src/eigen/preconditioners.jl @@ -26,9 +26,9 @@ PreconditionerNone(basis, kpt) = I mutable struct PreconditionerTPA{T <: Real} basis::PlaneWaveBasis kpt::Kpoint - kin::Vector{T} # kinetic energy of every G + kin::AbstractVector{T} # kinetic energy of every G mean_kin::Union{Nothing, Vector{T}} # mean kinetic energy of every band - default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift + default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift end function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=1) where {T} @@ -38,10 +38,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift= # TODO Annoying that one has to recompute the kinetic energies here. Perhaps # it's better to pass a HamiltonianBlock directly and read the computed values. kinetic_term = only(kinetic_term) - scaling = kinetic_term.scaling_factor - blowup = kinetic_term.blowup # blowup for energy cut-off smearing - kin = Vector{T}([scaling * sum(abs2, q)/2 * blowup(norm(q), basis.Ecut) - for q in Gplusk_vectors_cart(basis, kpt)]) + kin = kinetic_energy(kinetic_term, basis.Ecut, Gplusk_vectors_cart(basis, kpt)) PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift) end diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl index 8dfbb05cac..7afc385874 100644 --- a/src/external/jld2io.jl +++ b/src/external/jld2io.jl @@ -1,4 +1,3 @@ - function ScfSaveCheckpoints(filename="dftk_scf_checkpoint.jld2"; keep=false, overwrite=false) # TODO Save only every 30 minutes or so function callback(info) @@ -84,10 +83,16 @@ struct PlaneWaveBasisSerialisation{T <: Real} kshift::Union{Nothing,Vec3{T}} symmetries_respect_rgrid::Bool fft_size::Tuple{Int, Int, Int} + architecture::AbstractArchitecture +end +function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}}) where {T,GT,RT,KGT} + # The GT, GT, KGT are uniquely determined by the architecture, + # which is stored in the basis. + PlaneWaveBasisSerialisation{T} end -JLD2.writeas(::Type{PlaneWaveBasis{T,T}}) where {T} = PlaneWaveBasisSerialisation{T} -function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, basis::PlaneWaveBasis{T,T}) where {T} +function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, + basis::PlaneWaveBasis{T,T}) where {T} PlaneWaveBasisSerialisation{T}( basis.model, basis.Ecut, @@ -98,12 +103,17 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, basis::PlaneWaveBa basis.kshift, basis.symmetries_respect_rgrid, basis.fft_size, + basis.architecture ) end -function Base.convert(::Type{PlaneWaveBasis{T,T}}, serial::PlaneWaveBasisSerialisation{T}) where {T} - PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, - serial.kweights; serial.fft_size, - serial.kgrid, serial.kshift, serial.symmetries_respect_rgrid, - serial.variational) +function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}}, + serial::PlaneWaveBasisSerialisation{T}) where {T,GT,RT,KGT} + PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, serial.kweights; + serial.fft_size, + serial.kgrid, + serial.kshift, + serial.symmetries_respect_rgrid, + serial.variational, + architecture=serial.architecture) end diff --git a/src/fft.jl b/src/fft.jl index 1f775a1747..81155b2ddd 100644 --- a/src/fft.jl +++ b/src/fft.jl @@ -158,7 +158,8 @@ represented in the spherical basis sets, `supersampling` should be at least `2`. If `factors` is not empty, ensure that the resulting fft_size contains all the factors """ function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing; - ensure_smallprimes=true, algorithm=:fast, factors=1, kwargs...) where {T} + ensure_smallprimes=true, algorithm=:fast, factors=1, + kwargs...) where {T} if algorithm == :fast Glims = compute_Glims_fast(model.lattice, Ecut; kwargs...) elseif algorithm == :precise @@ -171,7 +172,7 @@ function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing; # fft_size needs to be final at k-point construction time Glims_temp = compute_Glims_fast(model.lattice, Ecut; kwargs...) fft_size_temp = Tuple{Int, Int, Int}(2 .* Glims_temp .+ 1) - kpoints_temp = build_kpoints(model, fft_size_temp, kcoords, Ecut) + kpoints_temp = build_kpoints(model, fft_size_temp, kcoords, Ecut; architecture=CPU()) Glims = compute_Glims_precise(model.lattice, Ecut, kpoints_temp; kwargs...) else @@ -257,23 +258,30 @@ function compute_Glims_fast(lattice::AbstractMatrix{T}, Ecut; supersampling=2, t Glims end -# For Float32 there are issues with aligned FFTW plans, so we -# fall back to unaligned FFTW plans (which are generally discouraged). -_fftw_flags(::Type{Float32}) = FFTW.MEASURE | FFTW.UNALIGNED -_fftw_flags(::Type{Float64}) = FFTW.MEASURE - """ Plan a FFT of type `T` and size `fft_size`, spending some time on finding an optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned. """ -function build_fft_plans(T::Union{Type{Float32}, Type{Float64}}, fft_size) - tmp = Array{Complex{T}}(undef, fft_size...) - ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T)) - opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T)) - # backward by inverting and stripping off normalizations +function build_fft_plans!(tmp::Array{Complex{Float64}}) + ipFFT = FFTW.plan_fft!(tmp; flags=FFTW.MEASURE) + opFFT = FFTW.plan_fft(tmp; flags=FFTW.MEASURE) + # backwards-FFT by inverting and stripping off normalizations + ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p +end +function build_fft_plans!(tmp::Array{Complex{Float32}}) + # For Float32 there are issues with aligned FFTW plans, so we + # fall back to unaligned FFTW plans (which are generally discouraged). + ipFFT = FFTW.plan_fft!(tmp; flags=FFTW.MEASURE | FFTW.UNALIGNED) + opFFT = FFTW.plan_fft(tmp; flags=FFTW.MEASURE | FFTW.UNALIGNED) + # backwards-FFT by inverting and stripping off normalizations + ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p +end +function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float32,Float64}} + ipFFT = AbstractFFTs.plan_fft!(tmp) + opFFT = AbstractFFTs.plan_fft(tmp) + # backwards-FFT by inverting and stripping off normalizations ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p end - # TODO Some grid sizes are broken in the generic FFT implementation # in FourierTransforms, for more details see workarounds/fft_generic.jl diff --git a/src/guess_density.jl b/src/guess_density.jl index 051ea93420..301bb21e8e 100644 --- a/src/guess_density.jl +++ b/src/guess_density.jl @@ -66,7 +66,7 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti @warn("Returning zero spin density guess, because no initial magnetization has " * "been specified in any of the given elements / atoms. Your SCF will likely " * "not converge to a spin-broken solution.") - return zeros(T, basis.fft_size) + return zeros_like(basis.G_vectors, T, basis.fft_size...) end @assert length(magmoms) == length(atoms) == length(positions) @@ -93,26 +93,35 @@ which follow the functional form and are placed at `position` (in fractional coordinates). """ function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T} - ρ = zeros(complex(T), basis.fft_size) + recip_lattice = basis.model.recip_lattice + fft_size = basis.fft_size + ρ = zeros_like(G_vectors(basis), complex(T), fft_size...) + isempty(gaussians) && return irfft(basis, ρ) + # This copy is required so that `gaussians` is isbits and can be transferred to the GPU + # TODO See if there is a better option here ... this feels non-ideal for larger systems + gaussians = SVector{size(gaussians)[1]}(gaussians) + # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx, # where f(x) is a weighted gaussian # # is formed from a superposition of atomic densities, each scaled by a prefactor - for (iG, G) in enumerate(G_vectors(basis)) - # Ensure that we only set G-vectors that have a -G counterpart - if isnothing(index_G_vectors(basis, -G)) - ρ[iG] = zero(complex(T)) - continue + # Ensure that we only set G-vectors that have a -G counterpart to ensure ρ is real. + function build_ρ(G) + if isnothing(index_G_vectors(fft_size, -G)) + return zero(complex(T)) end - - Gsq = sum(abs2, basis.model.recip_lattice * G) + Gsq = norm2(recip_lattice * G) + res = zero(complex(T)) for (coeff, decay_length, r) in gaussians form_factor::T = exp(-Gsq * T(decay_length)^2) - ρ[iG] += T(coeff) * form_factor * cis2pi(-dot(G, r)) + res += T(coeff) * form_factor * cis2pi(-dot(G, r)) end + res end + ρ = map(build_ρ, basis.G_vectors) + irfft(basis, ρ / sqrt(basis.model.unit_cell_volume)) end diff --git a/src/interpolation.jl b/src/interpolation.jl index da5b08d939..8ed26d3777 100644 --- a/src/interpolation.jl +++ b/src/interpolation.jl @@ -94,6 +94,7 @@ function interpolate_kpoint(data_in::AbstractVecOrMat, n_bands = size(data_in, 2) n_Gk_out = length(G_vectors(basis_out, kpoint_out)) data_out = similar(data_in, n_Gk_out, n_bands) .= 0 + # TODO: use a map, or this will not be GPU compatible (scalar indexing) for iin in 1:size(data_in, 1) idx_fft = kpoint_in.mapping[iin] idx_fft in keys(kpoint_out.mapping_inv) || continue diff --git a/src/occupation.jl b/src/occupation.jl index f57488f776..0e2793ed3f 100644 --- a/src/occupation.jl +++ b/src/occupation.jl @@ -29,8 +29,10 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF; inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature filled_occ = filled_occupation(basis.model) - [filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature) - for εk in eigenvalues] + map(eigenvalues) do εk + occ = filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature) + to_device(basis.architecture, occ) + end end function compute_fermi_level(basis::PlaneWaveBasis{T}, eigenvalues; temperature) where {T} diff --git a/src/orbitals.jl b/src/orbitals.jl index 73b8c525e4..45596f4f02 100644 --- a/src/orbitals.jl +++ b/src/orbitals.jl @@ -1,3 +1,5 @@ +using Random # Used to have a generic API for CPU and GPU computations alike: see random_orbitals + # Returns the occupied orbitals, the occupation array and optionally the eigenvalues without # virtual states (or states with small occupation level for metals). # threshold is a parameter to distinguish between states we want to keep and the @@ -52,5 +54,12 @@ end unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ)) function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T} - ortho_qr(randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)) + @static if VERSION < v"1.7" # TaskLocalRNG not yet available. + orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany) + orbitals = to_device(basis.architecture, orbitals) + else + orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany) + randn!(TaskLocalRNG(), orbitals) # use the RNG on the device if we're using a GPU + end + ortho_qr(orbitals) end diff --git a/src/postprocess/stresses.jl b/src/postprocess/stresses.jl index e4837202e6..2e00dad7ae 100644 --- a/src/postprocess/stresses.jl +++ b/src/postprocess/stresses.jl @@ -19,7 +19,7 @@ Compute the stresses (= 1/Vol dE/d(M*lattice), taken at M=I) of an obtained SCF basis.Ecut, basis.fft_size, basis.variational, basis.kcoords_global, basis.kweights_global, basis.kgrid, basis.kshift, basis.symmetries_respect_rgrid, - basis.comm_kpts) + basis.comm_kpts, basis.architecture) ρ = compute_density(new_basis, scfres.ψ, scfres.occupation) energies = energy_hamiltonian(new_basis, scfres.ψ, scfres.occupation; ρ, scfres.eigenvalues, scfres.εF).energies diff --git a/src/scf/chi0models.jl b/src/scf/chi0models.jl index df77bda5d3..1c478053fe 100644 --- a/src/scf/chi0models.jl +++ b/src/scf/chi0models.jl @@ -55,7 +55,7 @@ function (χ0::DielectricModel)(basis; kwargs...) C0 = 1 - εr iszero(C0) && return nothing # Will yield no contribution - Gsq = [sum(abs2, G) for G in G_vectors_cart(basis)] + Gsq = norm2.(G_vectors_cart(basis)) apply_sqrtL = identity if χ0.localization != identity sqrtL = sqrt.(χ0.localization.(r_vectors(basis))) diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl index 1a85360195..82507016f3 100644 --- a/src/scf/mixing.jl +++ b/src/scf/mixing.jl @@ -50,7 +50,7 @@ end @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis, δF; kwargs...) T = eltype(δF) - G² = [sum(abs2, G) for G in G_vectors_cart(basis)] + G² = norm2.(G_vectors_cart(basis)) kTF = T.(mixing.kTF) ΔDOS_Ω = T.(mixing.ΔDOS_Ω) @@ -72,7 +72,6 @@ end δF_fourier = fft(basis, δF) δFtot_fourier = total_density(δF_fourier) δFspin_fourier = spin_density(δF_fourier) - δρtot_fourier = δFtot_fourier .* G² ./ (kTF.^2 .+ G²) enforce_real!(basis, δρtot_fourier) δρtot = irfft(basis, δρtot_fourier) @@ -137,7 +136,7 @@ end εr > 1 / sqrt(eps(T)) && return mix_density(KerkerMixing(; kTF), basis, δF) C0 = 1 - εr - Gsq = [sum(abs2, G) for G in G_vectors_cart(basis)] + Gsq = map(G -> norm2(G), G_vectors_cart(basis)) δF_fourier = fft(basis, δF) δρ = @. δF_fourier * (kTF^2 - C0 * Gsq) / (εr * kTF^2 - C0 * Gsq) δρ = irfft(basis, δρ) diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl index 2ebedfad7e..a74c6aff61 100644 --- a/src/scf/nbands_algorithm.jl +++ b/src/scf/nbands_algorithm.jl @@ -68,6 +68,8 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector, # TODO Could return different bands per k-Points # Determine number of bands to be actually converged + # Bring occupation on the CPU, or findlast will fail + occupation = [to_cpu(occk) for occk in occupation] n_bands_occ = maximum(occupation) do occk something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1) end diff --git a/src/scf/potential_mixing.jl b/src/scf/potential_mixing.jl index 29eb2b85de..548333dd5b 100644 --- a/src/scf/potential_mixing.jl +++ b/src/scf/potential_mixing.jl @@ -61,6 +61,7 @@ function (anderson::AndersonAcceleration)(xₙ, αₙ, Pfxₙ) xₙ₊₁ = vec(xₙ) .+ αₙ .* vec(Pfxₙ) βs = -(Mfac \ vec(Pfxₙ)) + βs = to_cpu(βs) # GPU computation only : get βs back on the CPU so we can iterate through it for (iβ, β) in enumerate(βs) xₙ₊₁ .+= β .* (xs[iβ] .- vec(xₙ) .+ αₙ .* (Pfxs[iβ] .- vec(Pfxₙ))) end diff --git a/src/supercell.jl b/src/supercell.jl index 4fb68ef511..cd78d7d606 100644 --- a/src/supercell.jl +++ b/src/supercell.jl @@ -43,7 +43,7 @@ function cell_to_supercell(basis::PlaneWaveBasis) ones(3), # kgrid = Γ point only basis.kshift, # kshift symmetries_respect_rgrid, - basis.comm_kpts) + basis.comm_kpts, basis.architecture) end @doc raw""" diff --git a/src/symmetry.jl b/src/symmetry.jl index 724759b782..51e1228bc2 100644 --- a/src/symmetry.jl +++ b/src/symmetry.jl @@ -187,7 +187,7 @@ end end # Low-pass filters ρ (in Fourier) so that symmetry operations acting on it stay in the grid -function lowpass_for_symmetry!(ρ, basis; symmetries=basis.symmetries) +function lowpass_for_symmetry!(ρ::AbstractArray, basis; symmetries=basis.symmetries) for symop in symmetries isone(symop) && continue for (ig, G) in enumerate(G_vectors_generator(basis.fft_size)) @@ -269,7 +269,7 @@ function unfold_bz(basis::PlaneWaveBasis) basis.Ecut, basis.fft_size, basis.variational, kcoords, [1/length(kcoords) for _ in kcoords], basis.kgrid, basis.kshift, - basis.symmetries_respect_rgrid, basis.comm_kpts) + basis.symmetries_respect_rgrid, basis.comm_kpts, basis.architecture) end end diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl index 572465268f..8184f03b69 100644 --- a/src/terms/Hamiltonian.jl +++ b/src/terms/Hamiltonian.jl @@ -51,7 +51,7 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch end end function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T} - (ψ_reals=[zeros(complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], ) + (ψ_reals=[zeros_like(basis.G_vectors, complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], ) end Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ) @@ -89,9 +89,8 @@ Base.:*(H::Hamiltonian, ψ) = mul!(deepcopy(ψ), H, ψ) T = eltype(H.basis) n_bands = size(ψ, 2) Hψ_fourier = similar(Hψ[:, 1]) - ψ_real = zeros(complex(T), H.basis.fft_size...) - Hψ_real = zeros(complex(T), H.basis.fft_size...) - + ψ_real = similar(ψ, complex(T), H.basis.fft_size...) + Hψ_real = similar(Hψ, complex(T), H.basis.fft_size...) # take ψi, IFFT it to ψ_real, apply each term to Hψ_fourier and Hψ_real, and add it to Hψ for iband = 1:n_bands Hψ_real .= 0 diff --git a/src/terms/anyonic.jl b/src/terms/anyonic.jl index 54b07df3be..2c55ea4d58 100644 --- a/src/terms/anyonic.jl +++ b/src/terms/anyonic.jl @@ -49,7 +49,7 @@ function make_div_free(basis::PlaneWaveBasis{T}, A) where {T} # project on divergence free fields, ie in Fourier # project A(G) on the orthogonal of G if iG != 1 - out[1][iG], out[2][iG] = vec - (G'vec) * G / sum(abs2, G) + out[1][iG], out[2][iG] = vec - (G'vec) * G / norm2(G) else out[1][iG], out[2][iG] = vec end @@ -114,7 +114,7 @@ function ene_ops(term::TermAnyonic, basis::PlaneWaveBasis{T}, ψ, occupation; ρ_fourier = fft(basis, ρ[:, :, :, 1]) ρref_fourier = fft(basis, term.ρref) # TODO optimize for (iG, G) in enumerate(G_vectors_cart(basis)) - G2 = sum(abs2, G) + G2 = norm2(G) if G2 != 0 A1[iG] = +2T(π) * G[2] / G2 * (ρ_fourier[iG] - ρref_fourier[iG]) * im A2[iG] = -2T(π) * G[1] / G2 * (ρ_fourier[iG] - ρref_fourier[iG]) * im @@ -142,7 +142,7 @@ function ene_ops(term::TermAnyonic, basis::PlaneWaveBasis{T}, ψ, occupation; eff_pot_fourier = zeros(complex(T), basis.fft_size) for (iG, Gred) in enumerate(G_vectors(basis)) G = basis.model.recip_lattice * Gred - G2 = sum(abs2, G) + G2 = norm2(G) if G2 != 0 eff_pot_fourier[iG] += -4T(π)*β * im * G[2] / G2 * eff_current_fourier[1][iG] eff_pot_fourier[iG] += +4T(π)*β * im * G[1] / G2 * eff_current_fourier[2][iG] diff --git a/src/terms/ewald.jl b/src/terms/ewald.jl index ddd85bb172..79dc949bba 100644 --- a/src/terms/ewald.jl +++ b/src/terms/ewald.jl @@ -91,7 +91,7 @@ function energy_ewald(lattice::AbstractArray{T}, charges, positions; for G1 in -Glims[1]:Glims[1], G2 in -Glims[2]:Glims[2], G3 in -Glims[3]:Glims[3] G = Vec3(G1, G2, G3) iszero(G) && continue - Gsq = sum(abs2, recip_lattice * G) + Gsq = norm2(recip_lattice * G) cos_strucfac = sum(Z * cos2pi(dot(r, G)) for (r, Z) in zip(positions, charges)) sin_strucfac = sum(Z * sin2pi(dot(r, G)) for (r, Z) in zip(positions, charges)) sum_strucfac = cos_strucfac^2 + sin_strucfac^2 diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl index 06f99d7e47..81372ce5f1 100644 --- a/src/terms/hartree.jl +++ b/src/terms/hartree.jl @@ -29,9 +29,12 @@ end function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T} # Solving the Poisson equation ΔV = -4π ρ in Fourier space # is multiplying elementwise by 4π / |G|^2. - poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)] - poisson_green_coeffs[1] = 0 # Compensating charge background => Zero DC + + poisson_green_coeffs = 4T(π) ./ norm2.(G_vectors_cart(basis)) + GPUArraysCore.@allowscalar poisson_green_coeffs[1] = 0 # Compensating charge background => Zero DC + enforce_real!(basis, poisson_green_coeffs) # Symmetrize Fourier coeffs to have real iFFT + poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs) TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs) end diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl index 39f456c818..6ceb2a7f0d 100644 --- a/src/terms/kinetic.jl +++ b/src/terms/kinetic.jl @@ -19,11 +19,21 @@ struct TermKinetic <: Term kinetic_energies::Vector{<:AbstractVector} end function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, blowup) where {T} - kinetic_energies = [[T(scaling_factor) * sum(abs2, Gk)/2 * blowup(norm(Gk), basis.Ecut) - for Gk in Gplusk_vectors_cart(basis, kpt)] for kpt in basis.kpoints] + kinetic_energies = [kinetic_energy(blowup, scaling_factor, basis.Ecut, + Gplusk_vectors_cart(basis, kpt)) + for kpt in basis.kpoints] TermKinetic(T(scaling_factor), kinetic_energies) end +function kinetic_energy(blowup, scaling_factor, Ecut, q::AbstractArray{Vec3{T}}) where {T} + map(q) do qk + T(scaling_factor) * norm2(qk) / 2 * blowup(norm(qk), Ecut) + end +end +function kinetic_energy(kin::Kinetic, Ecut, q) + kinetic_energy(kin.blowup, kin.scaling_factor, Ecut, q) +end + @timing "ene_ops: kinetic" function ene_ops(term::TermKinetic, basis::PlaneWaveBasis{T}, ψ, occupation; kwargs...) where {T} ops = [FourierMultiplication(basis, kpoint, term.kinetic_energies[ik]) @@ -31,6 +41,7 @@ end if isnothing(ψ) || isnothing(occupation) return (; E=T(Inf), ops) end + occupation = [to_cpu(occk) for occk in occupation] E = zero(T) for (ik, ψk) in enumerate(ψ) diff --git a/src/terms/local.jl b/src/terms/local.jl index 57603414ad..f7aa4dc3ab 100644 --- a/src/terms/local.jl +++ b/src/terms/local.jl @@ -74,7 +74,10 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T} # positions, this involves a form factor (`local_potential_fourier`) # and a structure factor e^{-i G·r} model = basis.model - G_cart = G_vectors_cart(basis) + G_cart = to_cpu(G_vectors_cart(basis)) + # TODO Bring G_cart on the CPU for compatibility with the pseudopotentials which + # are not isbits ... might be able to solve this by restructuring the loop + # Pre-compute the form factors at unique values of |G| to speed up # the potential Fourier transform (by a lot). Using a hash map gives O(1) @@ -90,7 +93,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T} end end - pot_fourier = map(enumerate(G_vectors(basis))) do (iG, G) + Gs = to_cpu(G_vectors(basis)) # TODO Again for GPU compatibility + pot_fourier = map(enumerate(Gs)) do (iG, G) q = norm(G_cart[iG]) pot = sum(enumerate(model.atom_groups)) do (igroup, group) structure_factor = sum(r -> cis2pi(-dot(G, r)), @view model.positions[group]) @@ -98,9 +102,10 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T} end pot / sqrt(model.unit_cell_volume) end + enforce_real!(basis, pot_fourier) # Symmetrize Fourier coeffs to have real iFFT + pot_real = irfft(basis, to_device(basis.architecture, pot_fourier)) - pot_real = irfft(basis, pot_fourier) TermAtomicLocal(pot_real) end diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl index 4b23e20580..e9451afdf1 100644 --- a/src/terms/nonlocal.jl +++ b/src/terms/nonlocal.jl @@ -16,7 +16,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T} ops = map(basis.kpoints) do kpt P = build_projection_vectors_(basis, kpt, psps, psp_positions) D = build_projection_coefficients_(T, psps, psp_positions) - NonlocalOperator(basis, kpt, P, D) + NonlocalOperator(basis, kpt, P, to_device(basis.architecture, D)) end TermAtomicNonlocal(ops) end @@ -63,8 +63,8 @@ end C = build_projection_coefficients_(T, element.psp) for (ik, kpt) in enumerate(basis.kpoints) # we compute the forces from the irreductible BZ; they are symmetrized later - qs_cart = Gplusk_vectors_cart(basis, kpt) qs = Gplusk_vectors(basis, kpt) + qs_cart = to_cpu(Gplusk_vectors_cart(basis, kpt)) form_factors = build_form_factors(element.psp, qs_cart) for idx in group r = model.positions[idx] @@ -149,6 +149,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint, n_proj = count_n_proj(psps, psp_positions) n_G = length(G_vectors(basis, kpt)) proj_vectors = zeros(Complex{T}, n_G, n_proj) + qs = to_cpu(Gplusk_vectors(basis, kpt)) # Compute the columns of proj_vectors = 1/√Ω pihat(k+G) # Since the pi are translates of each others, pihat(k+G) decouples as @@ -157,12 +158,13 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint, offset = 0 # offset into proj_vectors for (psp, positions) in zip(psps, psp_positions) # Compute position-independent form factors - form_factors = build_form_factors(psp, Gplusk_vectors_cart(basis, kpt)) + qs_cart = to_cpu(Gplusk_vectors_cart(basis, kpt)) + form_factors = build_form_factors(psp, qs_cart) # Combine with structure factors for r in positions # k+G in this formula can also be G, this only changes an unimportant phase factor - structure_factors = map(q -> cis2pi(-dot(q, r)), Gplusk_vectors(basis, kpt)) + structure_factors = map(q -> cis2pi(-dot(q, r)), qs) @views for iproj = 1:count_n_proj(psp) proj_vectors[:, offset+iproj] .= ( structure_factors .* form_factors[:, iproj] ./ sqrt(unit_cell_volume) @@ -172,13 +174,15 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint, end end @assert offset == n_proj - proj_vectors + + # Offload potential values to a device (like a GPU) + to_device(basis.architecture, proj_vectors) end """ Build form factors (Fourier transforms of projectors) for an atom centered at 0. """ -function build_form_factors(psp, qs) +function build_form_factors(psp, qs::Array) T = real(eltype(first(qs))) # Pre-compute the radial parts of the non-local projectors at unique |q| to speed up diff --git a/src/terms/xc.jl b/src/terms/xc.jl index bccbf9ba2f..2babe2ed9b 100644 --- a/src/terms/xc.jl +++ b/src/terms/xc.jl @@ -97,7 +97,7 @@ end end if haskey(terms, :Vl) && any(x -> abs(x) > term.potential_threshold, terms.Vl) @warn "Meta-GGAs with a Δρ term have not yet been thoroughly tested." maxlog=1 - mG² = [-sum(abs2, G) for G in G_vectors_cart(basis)] + mG² = .-norm2.(G_vectors_cart(basis)) Vl = reshape(terms.Vl, n_spin, basis.fft_size...) Vl_fourier = fft(basis, Vl[s, :, :, :]) # TODO: forcing real-valued ifft; should be enforced at creation of array @@ -255,7 +255,7 @@ function LibxcDensities(basis, max_derivative::Integer, ρ, τ) # Compute Δρ if max_derivative > 1 Δρ_real = similar(ρ_real, n_spin, basis.fft_size...) - mG² = [-sum(abs2, G) for G in G_vectors_cart(basis)] + mG² = .-norm2.(G_vectors_cart(basis)) for σ = 1:n_spin # TODO: forcing real-valued ifft; should be enforced at creation of array Δρ_real[σ, :, :, :] .= irfft(basis, mG² .* @view ρ_fourier[σ, :, :, :]; diff --git a/src/workarounds/fft_generic.jl b/src/workarounds/fft_generic.jl index 371ec2b061..a9b8c4ff0d 100644 --- a/src/workarounds/fft_generic.jl +++ b/src/workarounds/fft_generic.jl @@ -23,15 +23,13 @@ end default_primes(::Any) = (2, ) # Generic fallback function, Float32 and Float64 specialization in fft.jl -function build_fft_plans(T, fft_size) - tmp = Array{Complex{T}}(undef, fft_size...) - +function build_fft_plans!(tmp::AbstractArray{<:Complex}) # Note: FourierTransforms has no support for in-place FFTs at the moment # ... also it's extension to multi-dimensional arrays is broken and # the algo only works for some cases - @assert all(ispow2, fft_size) + @assert all(ispow2, size(tmp)) - # opFFT = FourierTransforms.plan_fft(tmp) # TODO When multidim works + # opFFT = AbstractFFTs.plan_fft(tmp) # TODO When multidim works # opBFFT = inv(opFFT).p opFFT = generic_plan_fft(tmp) # Fallback for now opBFFT = generic_plan_bfft(tmp) diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl index f17cf75ac4..22c50c6114 100644 --- a/src/workarounds/forwarddiff_rules.jl +++ b/src/workarounds/forwarddiff_rules.jl @@ -105,16 +105,11 @@ end next_working_fft_size(::Type{<:ForwardDiff.Dual}, size::Int) = size -_fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED - -function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size) - tmp = Array{complex(T)}(undef, fft_size...) # TODO think about other Array types - opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T)) - opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T)) - +function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:ForwardDiff.Dual} + opFFT = AbstractFFTs.plan_fft(tmp) + opBFFT = AbstractFFTs.plan_bfft(tmp) ipFFT = DummyInplace{typeof(opFFT)}(opFFT) ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT) - # backward by inverting and stripping off normalizations ipFFT, opFFT, ipBFFT, opBFFT end diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl index 5c9fc9a478..55272d34c3 100644 --- a/src/workarounds/gpu_arrays.jl +++ b/src/workarounds/gpu_arrays.jl @@ -3,3 +3,12 @@ using GPUArraysCore # https://github.com/JuliaGPU/CUDA.jl/issues/1565 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y) + +function lowpass_for_symmetry!(ρ::AT, basis; + symmetries=basis.symmetries) where {AT <: AbstractGPUArray} + all(isone, symmetries) && return ρ + # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly + # thing for cases where ρ sits on a device (e.g. GPU) + ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries) + to_device(basis.architecture, ρ_CPU) +end