diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml
index 2252d866de..75c71319c4 100644
--- a/.github/workflows/documentation.yaml
+++ b/.github/workflows/documentation.yaml
@@ -6,6 +6,11 @@ on:
     tags:
       - 'v*'
   pull_request:
+concurrency:
+  # Skip intermediate builds: always.
+  # Cancel intermediate builds: only if it is a pull request build.
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 
 jobs:
   docs:
diff --git a/Project.toml b/Project.toml
index f48c45836d..156f5a17b2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -35,6 +35,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 PseudoPotentialIO = "cb339c56-07fa-4cb2-923a-142469552264"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
diff --git a/examples/gpu.jl b/examples/gpu.jl
new file mode 100644
index 0000000000..a005591c30
--- /dev/null
+++ b/examples/gpu.jl
@@ -0,0 +1,19 @@
+using DFTK
+using CUDA
+
+a = 10.26  # Silicon lattice constant in Bohr
+lattice = a / 2 * [[0 1 1.];
+                   [1 0 1.];
+                   [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms     = [Si, Si]
+positions = [ones(3)/8, -ones(3)/8]
+model = model_DFT(lattice, atoms, positions, []; temperature=1e-3)
+
+# If available, use CUDA to store DFT quantities and perform main computations
+architecture = has_cuda() ? DFTK.GPU(CuArray) : DFTK.CPU()
+
+basis  = PlaneWaveBasis(model; Ecut=30, kgrid=(1, 1, 1), architecture)
+scfres = self_consistent_field(basis; tol=1e-3,
+                               solver=scf_damping_solver(),
+                               mixing=KerkerMixing())
diff --git a/src/DFTK.jl b/src/DFTK.jl
index 0b1408dc99..2124ac1cb2 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,6 +13,9 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
+using AbstractFFTs
+using GPUArraysCore
+using Random
 using ChainRulesCore
 
 export Vec3
@@ -31,6 +34,7 @@ include("common/mpi.jl")
 include("common/threading.jl")
 include("common/printing.jl")
 include("common/cis2pi.jl")
+include("architecture.jl")
 include("common/zeros_like.jl")
 include("common/norm.jl")
 
diff --git a/src/Model.jl b/src/Model.jl
index c203d5f95c..9aa9c4cc66 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -279,13 +279,25 @@ Examples of covectors are forces.
 Reciprocal vectors are a special case: they are covectors, but conventionally have an
 additional factor of 2π in their definition, so they transform rather with 2π times the
 inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red.
+
+For each of the function there is a one-argument version (returning a function to do the
+transformation) and a two-argument version applying the transformation to a passed vector.
 =#
-vector_red_to_cart(model::Model, rred)        = model.lattice * rred
-vector_cart_to_red(model::Model, rcart)       = model.inv_lattice * rcart
-covector_red_to_cart(model::Model, fred)      = model.inv_lattice' * fred
-covector_cart_to_red(model::Model, fcart)     = model.lattice' * fcart
-recip_vector_red_to_cart(model::Model, qred)  = model.recip_lattice * qred
-recip_vector_cart_to_red(model::Model, qcart) = model.inv_recip_lattice * qcart
+@inline _gen_matmul(mat) = vec -> mat * vec
+
+vector_red_to_cart(model::Model)       = _gen_matmul(model.lattice)
+vector_cart_to_red(model::Model)       = _gen_matmul(model.inv_lattice)
+covector_red_to_cart(model::Model)     = _gen_matmul(model.inv_lattice')
+covector_cart_to_red(model::Model)     = _gen_matmul(model.lattice')
+recip_vector_red_to_cart(model::Model) = _gen_matmul(model.recip_lattice)
+recip_vector_cart_to_red(model::Model) = _gen_matmul(model.inv_recip_lattice)
+
+vector_red_to_cart(model::Model, vec)       = vector_red_to_cart(model)(vec)
+vector_cart_to_red(model::Model, vec)       = vector_cart_to_red(model)(vec)
+covector_red_to_cart(model::Model, vec)     = covector_red_to_cart(model)(vec)
+covector_cart_to_red(model::Model, vec)     = covector_cart_to_red(model)(vec)
+recip_vector_red_to_cart(model::Model, vec) = recip_vector_red_to_cart(model)(vec)
+recip_vector_cart_to_red(model::Model, vec) = recip_vector_cart_to_red(model)(vec)
 
 #=
 Transformations on vectors and covectors are matrices and comatrices.
@@ -300,7 +312,14 @@ s_cart = L s_red = L A_red r_red = L A_red L⁻¹ r_cart, thus A_cart = L A_red
 Examples of matrices are the symmetries in real space (W)
 Examples of comatrices are the symmetries in reciprocal space (S)
 =#
-matrix_red_to_cart(model::Model, Ared)    = model.lattice * Ared * model.inv_lattice
-matrix_cart_to_red(model::Model, Acart)   = model.inv_lattice * Acart * model.lattice
-comatrix_red_to_cart(model::Model, Bred)  = model.inv_lattice' * Bred * model.lattice'
-comatrix_cart_to_red(model::Model, Bcart) = model.lattice' * Bcart * model.inv_lattice'
+@inline _gen_matmatmul(M, Minv) = mat -> M * mat * Minv
+
+matrix_red_to_cart(model::Model)   = _gen_matmatmul(model.lattice,      model.inv_lattice)
+matrix_cart_to_red(model::Model)   = _gen_matmatmul(model.inv_lattice,  model.lattice)
+comatrix_red_to_cart(model::Model) = _gen_matmatmul(model.inv_lattice', model.lattice')
+comatrix_cart_to_red(model::Model) = _gen_matmatmul(model.lattice',     model.inv_lattice')
+
+matrix_red_to_cart(model::Model, Ared)    = matrix_red_to_cart(model)(Ared)
+matrix_cart_to_red(model::Model, Acart)   = matrix_cart_to_red(model)(Acart)
+comatrix_red_to_cart(model::Model, Bred)  = comatrix_red_to_cart(model)(Bred)
+comatrix_cart_to_red(model::Model, Bcart) = comatrix_cart_to_red(model)(Bcart)
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 034d27e914..41d443ca78 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -15,16 +15,16 @@ Discretization information for ``k``-point-dependent quantities such as orbitals
 More generally, a ``k``-point is a block of the Hamiltonian;
 eg collinear spin is treated by doubling the number of kpoints.
 """
-struct Kpoint{T <: Real}
+struct Kpoint{T <: Real, GT <: AbstractVector{Vec3{Int}}}
     spin::Int                     # Spin component can be 1 or 2 as index into what is
-                                  # returned by the `spin_components` function
+    #                             # returned by the `spin_components` function
     coordinate::Vec3{T}           # Fractional coordinate of k-point
     mapping::Vector{Int}          # Index of G_vectors[i] on the FFT grid:
-                                  # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i]
+    #                             # G_vectors(basis)[kpt.mapping[i]] == G_vectors(basis, kpt)[i]
     mapping_inv::Dict{Int, Int}   # Inverse of `mapping`:
-                                  # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]]
-    G_vectors::Vector{Vec3{Int}}  # Wave vectors in integer coordinates:
-                                  # ({G, 1/2 |k+G|^2 ≤ Ecut})
+    #                             # G_vectors(basis)[i] == G_vectors(basis, kpt)[mapping_inv[i]]
+    G_vectors::GT                 # Wave vectors in integer coordinates (vector of Vec3{Int})
+    #                             # ({G, 1/2 |k+G|^2 ≤ Ecut})
 end
 
 @doc raw"""
@@ -39,7 +39,13 @@ Normalization conventions:
 
 `ifft` and `fft` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
+struct PlaneWaveBasis{T,
+                      VT <: Real,
+                      T_G_vectors  <: AbstractArray{Vec3{Int}, 3},
+                      T_r_vectors  <: AbstractArray{Vec3{VT},  3},
+                      T_kpt_G_vecs <: AbstractVector{Vec3{Int}}
+                     } <: AbstractBasis{T}
+
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
 
@@ -67,13 +73,13 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
     ifft_normalization::T  # ifft = ifft_normalization * BFFT
 
     # "cubic" basis in reciprocal and real space, on which potentials and densities are stored
-    G_vectors::Array{Vec3{Int}, 3}
-    r_vectors::Array{Vec3{VT }, 3}
+    G_vectors::T_G_vectors
+    r_vectors::T_r_vectors
 
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
     # this lists all the spin up, then all the spin down
-    kpoints::Vector{Kpoint{T}}
+    kpoints::Vector{Kpoint{T, T_kpt_G_vecs}}
     # BZ integration weights, summing up to model.n_spin_components
     kweights::Vector{T}
 
@@ -93,6 +99,9 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
     krange_allprocs::Vector{Vector{Int}}  # indices of kpoints treated by the
     #                                       respective rank in comm_kpts
 
+    ## Information on the hardware and device used for computations.
+    architecture::AbstractArchitecture
+
     ## Symmetry operations that leave the discretized model (k and r grids) invariant.
     # Subset of model.symmetries.
     symmetries::Vector{SymOp{VT}}
@@ -113,7 +122,8 @@ Base.Broadcast.broadcastable(basis::PlaneWaveBasis) = Ref(basis)
 Base.eltype(::PlaneWaveBasis{T}) where {T} = T
 
 @timing function build_kpoints(model::Model{T}, fft_size, kcoords, Ecut;
-                               variational=true) where {T}
+                               variational=true,
+                               architecture::AbstractArchitecture) where {T}
     kpoints_per_spin = [Kpoint[] for _ in 1:model.n_spin_components]
     for k in kcoords
         k = Vec3{T}(k)  # rationals are sloooow
@@ -124,23 +134,24 @@ Base.eltype(::PlaneWaveBasis{T}) where {T} = T
         sizehint!(mapping, n_guess)
         sizehint!(Gvecs_k, n_guess)
         for (i, G) in enumerate(G_vectors(fft_size))
-            if !variational || sum(abs2, model.recip_lattice * (G + k)) / 2 ≤ Ecut
+            if !variational || norm2(model.recip_lattice * (G + k)) / 2 ≤ Ecut
                 push!(mapping, i)
                 push!(Gvecs_k, G)
             end
         end
+        Gvecs_k = to_device(architecture, Gvecs_k)
+
         mapping_inv = Dict(ifull => iball for (iball, ifull) in enumerate(mapping))
         for iσ = 1:model.n_spin_components
             push!(kpoints_per_spin[iσ],
                   Kpoint(iσ, k, mapping, mapping_inv, Gvecs_k))
         end
     end
-
     vcat(kpoints_per_spin...)  # put all spin up first, then all spin down
 end
 function build_kpoints(basis::PlaneWaveBasis, kcoords)
     build_kpoints(basis.model, basis.fft_size, kcoords, basis.Ecut;
-                  variational=basis.variational)
+                  variational=basis.variational, basis.architecture)
 end
 
 # Lowest-level constructor, should not be called directly.
@@ -148,10 +159,11 @@ end
 # and are stored in PlaneWaveBasis for easy reconstruction.
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
-                        symmetries_respect_rgrid, comm_kpts) where {T <: Real}
+                        symmetries_respect_rgrid, comm_kpts,
+                        architecture::AbstractArchitecture) where {T <: Real}
     # Validate fft_size
     if variational
-        max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
+        max_E = norm2(model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
         Ecut > max_E && @warn(
             "For a variational method, Ecut should be less than the maximal kinetic " *
             "energy the grid supports ($max_E)"
@@ -191,7 +203,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(T, fft_size)
+    Gs = to_device(architecture, G_vectors(fft_size))
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans!(similar(Gs, Complex{T}, fft_size))
 
     # Normalization constants
     # fft = fft_normalization * FFT
@@ -235,7 +248,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
         "Non-variational calculations are experimental. " *
         "Not all features of DFTK may be supported or work as intended."
     )
-    kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut; variational)
+    kpoints = build_kpoints(model, fft_size, kcoords_global[krange_thisproc], Ecut;
+                            variational, architecture)
     # kpoints is now possibly twice the size of krange. Make things consistent
     if model.n_spin_components == 2
         krange_thisproc   = vcat(krange_thisproc, n_kpt .+ krange_thisproc)
@@ -245,21 +259,27 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components
     @assert length(kpoints) == length(kweights_thisproc)
 
+    if architecture isa GPU && Threads.nthreads() > 1
+        error("Can't mix multi-threading and GPU computations yet.")
+    end
+
     VT = value_type(T)
     dvol  = model.unit_cell_volume ./ prod(fft_size)
-    r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
+    r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3)
+                 for i = 1:N1, j = 1:N2, k = 1:N3]
+    r_vectors = to_device(architecture, r_vectors)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T)}(
+    basis = PlaneWaveBasis{T, value_type(T), typeof(Gs), typeof(r_vectors),
+                           typeof(kpoints[1].G_vectors)}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
         fft_normalization, ifft_normalization,
-        G_vectors(fft_size), r_vectors,
+        Gs, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
-        symmetries, symmetries_respect_rgrid, terms)
-
+        architecture, symmetries, symmetries_respect_rgrid, terms)
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
         term_name = string(nameof(typeof(t)))
@@ -277,7 +297,7 @@ end
                                 variational=true, fft_size=nothing,
                                 kgrid=nothing, kshift=nothing,
                                 symmetries_respect_rgrid=isnothing(fft_size),
-                                comm_kpts=MPI.COMM_WORLD) where {T <: Real}
+                                comm_kpts=MPI.COMM_WORLD, architecture=CPU()) where {T <: Real}
     if isnothing(fft_size)
         @assert variational
         if symmetries_respect_rgrid
@@ -295,7 +315,7 @@ end
         fft_size = compute_fft_size(model, Ecut, kcoords; factors)
     end
     PlaneWaveBasis(model, Ecut, fft_size, variational, kcoords, kweights,
-                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts)
+                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts, architecture)
 end
 
 @doc raw"""
@@ -305,7 +325,8 @@ number of points in each dimension and `kshift` the shift (0 or 1/2 in each dire
 If not specified a grid is generated using `kgrid_from_minimal_spacing` with
 a minimal spacing of `2π * 0.022` per Bohr.
 """
-function PlaneWaveBasis(model::Model; Ecut,
+function PlaneWaveBasis(model::Model;
+                        Ecut,
                         kgrid=kgrid_from_minimal_spacing(model, 2π * 0.022),
                         kshift=zeros(3),
                         kwargs...)
@@ -322,11 +343,11 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts)
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, basis.architecture)
 end
 
 """
-    G_vectors(fft_size::Tuple)
+    G_vectors([architecture=AbstractArchitecture], fft_size::Tuple)
 
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set
 of given sizes.
@@ -339,6 +360,7 @@ function G_vectors(fft_size::Union{Tuple,AbstractVector})
     axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
     [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
 end
+
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
     # accumulate_over_symmetries!, which are 100-fold slower with G_vector(fft_size).
@@ -348,6 +370,7 @@ function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     (Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3])
 end
 
+
 @doc raw"""
     G_vectors(basis::PlaneWaveBasis)
     G_vectors(basis::PlaneWaveBasis, kpt::Kpoint)
@@ -359,13 +382,16 @@ G_vectors(basis::PlaneWaveBasis) = basis.G_vectors
 G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors
 
 
+
 @doc raw"""
     G_vectors_cart(basis::PlaneWaveBasis)
     G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
 
 The list of ``G`` vectors of a given `basis` or `kpt`, in cartesian coordinates.
 """
-G_vectors_cart(basis::PlaneWaveBasis) = recip_vector_red_to_cart.(basis.model, G_vectors(basis))
+function G_vectors_cart(basis::PlaneWaveBasis)
+    map(recip_vector_red_to_cart(basis.model), G_vectors(basis))
+end
 function G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
     recip_vector_red_to_cart.(basis.model, G_vectors(basis, kpt))
 end
@@ -376,7 +402,8 @@ end
 The list of ``G + k`` vectors, in reduced coordinates.
 """
 function Gplusk_vectors(basis::PlaneWaveBasis, kpt::Kpoint)
-    map(G -> G + kpt.coordinate, G_vectors(basis, kpt))
+    coordinate = kpt.coordinate  # Accelerator: avoid closure on kpt (not isbits)
+    map(G -> G + coordinate, G_vectors(basis, kpt))
 end
 
 @doc raw"""
@@ -385,7 +412,7 @@ end
 The list of ``G + k`` vectors, in cartesian coordinates.
 """
 function Gplusk_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
-    recip_vector_red_to_cart.(basis.model, Gplusk_vectors(basis, kpt))
+    map(recip_vector_red_to_cart(basis.model), Gplusk_vectors(basis, kpt))
 end
 
 @doc raw"""
@@ -400,7 +427,7 @@ r_vectors(basis::PlaneWaveBasis) = basis.r_vectors
 
 The list of ``r`` vectors, in cartesian coordinates.
 """
-r_vectors_cart(basis::PlaneWaveBasis) = vector_red_to_cart.(basis.model, r_vectors(basis))
+r_vectors_cart(basis::PlaneWaveBasis) = map(vector_red_to_cart(basis.model), r_vectors(basis))
 
 
 """
@@ -408,10 +435,10 @@ Return the index tuple `I` such that `G_vectors(basis)[I] == G`
 or the index `i` such that `G_vectors(basis, kpoint)[i] == G`.
 Returns nothing if outside the range of valid wave vectors.
 """
-@inline function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{T}) where {T <: Integer}
+@inline function index_G_vectors(fft_size::Tuple, G::AbstractVector{<:Integer})
     # the inline declaration encourages the compiler to hoist these (G-independent) precomputations
-    start = .- cld.(basis.fft_size .- 1, 2)
-    stop  = fld.(basis.fft_size .- 1, 2)
+    start = .- cld.(fft_size .- 1, 2)
+    stop  = fld.(fft_size .- 1, 2)
     lengths = stop .- start .+ 1
 
     # FFTs store wavevectors as [0 1 2 3 -2 -1] (example for N=5)
@@ -426,6 +453,10 @@ Returns nothing if outside the range of valid wave vectors.
     end
 end
 
+function index_G_vectors(basis::PlaneWaveBasis, G::AbstractVector{<:Integer})
+    index_G_vectors(basis.fft_size, G)
+end
+
 function index_G_vectors(basis::PlaneWaveBasis, kpoint::Kpoint,
                          G::AbstractVector{T}) where {T <: Integer}
     fft_size = basis.fft_size
@@ -485,7 +516,7 @@ function gather_kpts(basis::PlaneWaveBasis)
                        basis.kshift,
                        basis.symmetries_respect_rgrid,
                        comm_kpts=MPI.COMM_SELF,
-                      )
+                       architecture=basis.architecture)
     end
 end
 
diff --git a/src/architecture.jl b/src/architecture.jl
new file mode 100644
index 0000000000..499bd7ec45
--- /dev/null
+++ b/src/architecture.jl
@@ -0,0 +1,26 @@
+"""
+Abstract supertype for architectures supported by DFTK.
+"""
+abstract type AbstractArchitecture end
+
+struct CPU <: AbstractArchitecture end
+
+struct GPU{ArrayType <: AbstractArray} <: AbstractArchitecture end
+
+"""
+Construct a particular GPU architecture by passing the ArrayType
+"""
+GPU(::Type{T}) where {T <: AbstractArray} = GPU{T}()
+
+"""
+Transfer an array from a device (typically a GPU) to the CPU.
+"""
+to_cpu(x::AbstractArray) = Array(x)
+to_cpu(x::Array) = x
+
+"""
+Transfer an array to a particular device (typically a GPU)
+"""
+to_device(::CPU, x) = to_cpu(x)
+to_device(::GPU{ArrayType}, x::AbstractArray) where {ArrayType} = ArrayType(x)
+to_device(::GPU{ArrayType}, x::ArrayType)     where {ArrayType} = x
diff --git a/src/common/norm.jl b/src/common/norm.jl
index 4f96ad6146..3bccf28523 100644
--- a/src/common/norm.jl
+++ b/src/common/norm.jl
@@ -7,3 +7,8 @@ differentiation, we can extend analytically f to accept complex inputs, then dif
 inputs, and therefore we have to redefine it.
 """
 norm_cplx(x) = sqrt(sum(xx -> xx * xx, x))
+
+"""
+Square of the ℓ²-norm.
+"""
+norm2(G) = sum(abs2, G)
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index 8e682bcc43..ae3e0b3b76 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,2 @@
 # Orthonormalize
-@timing ortho_qr(φk) = Matrix(qr(φk).Q)
+@timing ortho_qr(φk::ArrayType) where {ArrayType <: AbstractArray} = convert(ArrayType, qr(φk).Q)
diff --git a/src/common/zeros_like.jl b/src/common/zeros_like.jl
index ebcdfad05d..7de53ea4d0 100644
--- a/src/common/zeros_like.jl
+++ b/src/common/zeros_like.jl
@@ -1,8 +1,11 @@
-# Create an array of same type as X filled with zeros, minimizing the number
-# of allocations.
+"""
+Create an array of same "array type" as X filled with zeros, minimizing the number
+of allocations. This unifies CPU and GPU code, as the output will always be on the
+same device as the input.
+"""
 function zeros_like(X::AbstractArray, T::Type=eltype(X), dims::Integer...=size(X)...)
     Z = similar(X, T, dims...)
-    Z .= 0
+    Z .= zero(T)
     Z
 end
 zeros_like(X::AbstractArray, dims::Integer...) = zeros_like(X, eltype(X), dims...)
diff --git a/src/densities.jl b/src/densities.jl
index 69e3f631ea..b65c98e111 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -23,6 +23,8 @@ using an optional `occupation_threshold`. By default all occupation numbers are
 @views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation;
                                         occupation_threshold=zero(T)) where {T}
     S = promote_type(T, real(eltype(ψ[1])))
+    # occupation should be on the CPU as we are going to be doing scalar indexing.
+    occupation = [to_cpu(oc) for oc in occupation]
 
     # we split the total iteration range (ik, n) in chunks, and parallelize over them
     mask_occ = map(occk -> findall(isless.(occupation_threshold, occk)), occupation)
@@ -30,17 +32,17 @@ using an optional `occupation_threshold`. By default all occupation numbers are
     chunk_length = cld(length(ik_n), Threads.nthreads())
 
     # chunk-local variables
-    ρ_chunklocal = Array{S,4}[zeros(S, basis.fft_size..., basis.model.n_spin_components)
+    ρ_chunklocal = [zeros_like(basis.G_vectors, S, basis.fft_size..., basis.model.n_spin_components)
                                for _ = 1:Threads.nthreads()]
-    ψnk_real_chunklocal = Array{complex(S),3}[zeros(complex(S), basis.fft_size)
-                                               for _ = 1:Threads.nthreads()]
+    ψnk_real_chunklocal = [zeros_like(basis.G_vectors, complex(S), basis.fft_size...)
+                                for _ = 1:Threads.nthreads()]
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
             ψnk_real = ψnk_real_chunklocal[ichunk]
             ρ_loc = ρ_chunklocal[ichunk]
-
             kpt = basis.kpoints[ik]
+
             ifft!(ψnk_real, basis, kpt, ψ[ik][:, n])
             ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
         end
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index f08f500543..3de27ac29a 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -58,7 +58,9 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    (λ=[real.(res.λ) for res in results],
+    # TODO It feels inconsistent to put λ onto the CPU here but none of the other objects.
+    #      Better have this handled by the caller of diagonalize_all_kblocks.
+    (λ=[to_cpu(real.(res.λ)) for res in results],  # Always get onto the CPU
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 80e18c326d..5df5f6b91f 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -183,7 +183,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
         end
         invR = inv(R)
         @assert all(!isnan, invR)
-        rmul!(X, invR) # we do not use X/R because we use invR next
+        rmul!(X, invR)  # we do not use X/R because we use invR next
 
         # We would like growth_factor *= opnorm(inv(R)) but it's too
         # expensive, so we use an upper bound which is sharp enough to
@@ -196,7 +196,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
         growth_factor *= norminvR
 
         # condR = 1/LAPACK.trcon!('I', 'U', 'N', Array(R))
-        condR = normest(R)*norminvR # in practice this seems to be an OK estimate
+        condR = normest(R)*norminvR  # in practice this seems to be an OK estimate
 
         vprintln("Ortho(X) success? $success ", eps(real(T))*condR^2, " < $tol")
 
@@ -267,7 +267,7 @@ end
         niter > 10 && error("Ortho(X,Y) is failing badly, this should never happen")
         niter += 1
     end
-    vprintln("ortho choleskys: ", ninners) # get how many Choleskys are performed
+    vprintln("ortho choleskys: ", ninners)  # get how many Choleskys are performed
 
     # @assert (norm(BY'X)) < tol
     # @assert (norm(X'X-I)) < tol
@@ -345,7 +345,7 @@ end
     full_BX = BX
 
     while true
-        if niter > 0 # first iteration is just to compute the residuals (no X update)
+        if niter > 0  # first iteration is just to compute the residuals (no X update)
             ###  Perform the Rayleigh-Ritz
             mul!(AR, A, R)
             n_matvec += size(R, 2)
@@ -413,7 +413,7 @@ end
             return final_retval(full_X, full_AX, resid_history, niter, n_matvec)
         end
         newly_locked = nlocked - prev_nlocked
-        active = newly_locked+1:size(X,2) # newly active vectors
+        active = newly_locked+1:size(X,2)  # newly active vectors
 
         if niter > 0
             ### compute P = Y*cP only for the newly active vectors
@@ -484,7 +484,7 @@ end
         # Orthogonalize R wrt all X, newly active P
         if niter > 0
             Z  = LazyHcat(full_X, P)
-            BZ = LazyHcat(full_BX, BP) # data shared with (full_X, P) in non-general case
+            BZ = LazyHcat(full_BX, BP)  # data shared with (full_X, P) in non-general case
         else
             Z  = full_X
             BZ = full_BX
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index e5ab0d71fa..4eb8ef96be 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -26,9 +26,9 @@ PreconditionerNone(basis, kpt) = I
 mutable struct PreconditionerTPA{T <: Real}
     basis::PlaneWaveBasis
     kpt::Kpoint
-    kin::Vector{T}  # kinetic energy of every G
+    kin::AbstractVector{T}  # kinetic energy of every G
     mean_kin::Union{Nothing, Vector{T}}  # mean kinetic energy of every band
-    default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift
+    default_shift::T  # if mean_kin is not set by `precondprep!`, this will be used for the shift
 end
 
 function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=1) where {T}
@@ -38,10 +38,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     # TODO Annoying that one has to recompute the kinetic energies here. Perhaps
     #      it's better to pass a HamiltonianBlock directly and read the computed values.
     kinetic_term = only(kinetic_term)
-    scaling = kinetic_term.scaling_factor
-    blowup = kinetic_term.blowup     # blowup for energy cut-off smearing
-    kin = Vector{T}([scaling * sum(abs2, q)/2 * blowup(norm(q), basis.Ecut)
-                     for q in Gplusk_vectors_cart(basis, kpt)])
+    kin = kinetic_energy(kinetic_term, basis.Ecut, Gplusk_vectors_cart(basis, kpt))
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
 
diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
index 8dfbb05cac..7afc385874 100644
--- a/src/external/jld2io.jl
+++ b/src/external/jld2io.jl
@@ -1,4 +1,3 @@
-
 function ScfSaveCheckpoints(filename="dftk_scf_checkpoint.jld2"; keep=false, overwrite=false)
     # TODO Save only every 30 minutes or so
     function callback(info)
@@ -84,10 +83,16 @@ struct PlaneWaveBasisSerialisation{T <: Real}
     kshift::Union{Nothing,Vec3{T}}
     symmetries_respect_rgrid::Bool
     fft_size::Tuple{Int, Int, Int}
+    architecture::AbstractArchitecture
+end
+function JLD2.writeas(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}}) where {T,GT,RT,KGT}
+    # The GT, GT, KGT are uniquely determined by the architecture,
+    # which is stored in the basis.
+    PlaneWaveBasisSerialisation{T}
 end
-JLD2.writeas(::Type{PlaneWaveBasis{T,T}}) where {T} = PlaneWaveBasisSerialisation{T}
 
-function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, basis::PlaneWaveBasis{T,T}) where {T}
+function Base.convert(::Type{PlaneWaveBasisSerialisation{T}},
+                      basis::PlaneWaveBasis{T,T}) where {T}
     PlaneWaveBasisSerialisation{T}(
         basis.model,
         basis.Ecut,
@@ -98,12 +103,17 @@ function Base.convert(::Type{PlaneWaveBasisSerialisation{T}}, basis::PlaneWaveBa
         basis.kshift,
         basis.symmetries_respect_rgrid,
         basis.fft_size,
+        basis.architecture
     )
 end
 
-function Base.convert(::Type{PlaneWaveBasis{T,T}}, serial::PlaneWaveBasisSerialisation{T}) where {T}
-    PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords,
-                   serial.kweights; serial.fft_size,
-                   serial.kgrid, serial.kshift, serial.symmetries_respect_rgrid,
-                   serial.variational)
+function Base.convert(::Type{PlaneWaveBasis{T,T,GT,RT,KGT}},
+                      serial::PlaneWaveBasisSerialisation{T}) where {T,GT,RT,KGT}
+    PlaneWaveBasis(serial.model, serial.Ecut, serial.kcoords, serial.kweights;
+                   serial.fft_size,
+                   serial.kgrid,
+                   serial.kshift,
+                   serial.symmetries_respect_rgrid,
+                   serial.variational,
+                   architecture=serial.architecture)
 end
diff --git a/src/fft.jl b/src/fft.jl
index 1f775a1747..81155b2ddd 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -158,7 +158,8 @@ represented in the spherical basis sets, `supersampling` should be at least `2`.
 If `factors` is not empty, ensure that the resulting fft_size contains all the factors
 """
 function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing;
-                          ensure_smallprimes=true, algorithm=:fast, factors=1, kwargs...) where {T}
+                          ensure_smallprimes=true, algorithm=:fast, factors=1,
+                          kwargs...) where {T}
     if algorithm == :fast
         Glims = compute_Glims_fast(model.lattice, Ecut; kwargs...)
     elseif algorithm == :precise
@@ -171,7 +172,7 @@ function compute_fft_size(model::Model{T}, Ecut, kcoords=nothing;
         # fft_size needs to be final at k-point construction time
         Glims_temp    = compute_Glims_fast(model.lattice, Ecut; kwargs...)
         fft_size_temp = Tuple{Int, Int, Int}(2 .* Glims_temp .+ 1)
-        kpoints_temp  = build_kpoints(model, fft_size_temp, kcoords, Ecut)
+        kpoints_temp  = build_kpoints(model, fft_size_temp, kcoords, Ecut; architecture=CPU())
 
         Glims = compute_Glims_precise(model.lattice, Ecut, kpoints_temp; kwargs...)
     else
@@ -257,23 +258,30 @@ function compute_Glims_fast(lattice::AbstractMatrix{T}, Ecut; supersampling=2, t
     Glims
 end
 
-# For Float32 there are issues with aligned FFTW plans, so we
-# fall back to unaligned FFTW plans (which are generally discouraged).
-_fftw_flags(::Type{Float32}) = FFTW.MEASURE | FFTW.UNALIGNED
-_fftw_flags(::Type{Float64}) = FFTW.MEASURE
-
 """
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an
 optimal algorithm. (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-function build_fft_plans(T::Union{Type{Float32}, Type{Float64}}, fft_size)
-    tmp = Array{Complex{T}}(undef, fft_size...)
-    ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T))
-    opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
-    # backward by inverting and stripping off normalizations
+function build_fft_plans!(tmp::Array{Complex{Float64}})
+    ipFFT = FFTW.plan_fft!(tmp; flags=FFTW.MEASURE)
+    opFFT = FFTW.plan_fft(tmp;  flags=FFTW.MEASURE)
+    # backwards-FFT by inverting and stripping off normalizations
+    ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
+end
+function build_fft_plans!(tmp::Array{Complex{Float32}})
+    # For Float32 there are issues with aligned FFTW plans, so we
+    # fall back to unaligned FFTW plans (which are generally discouraged).
+    ipFFT = FFTW.plan_fft!(tmp; flags=FFTW.MEASURE | FFTW.UNALIGNED)
+    opFFT = FFTW.plan_fft(tmp;  flags=FFTW.MEASURE | FFTW.UNALIGNED)
+    # backwards-FFT by inverting and stripping off normalizations
+    ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
+end
+function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:Union{Float32,Float64}}
+    ipFFT = AbstractFFTs.plan_fft!(tmp)
+    opFFT = AbstractFFTs.plan_fft(tmp)
+    # backwards-FFT by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
-
 
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl
diff --git a/src/guess_density.jl b/src/guess_density.jl
index 051ea93420..301bb21e8e 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -66,7 +66,7 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti
         @warn("Returning zero spin density guess, because no initial magnetization has " *
               "been specified in any of the given elements / atoms. Your SCF will likely " *
               "not converge to a spin-broken solution.")
-        return zeros(T, basis.fft_size)
+        return zeros_like(basis.G_vectors, T, basis.fft_size...)
     end
 
     @assert length(magmoms) == length(atoms) == length(positions)
@@ -93,26 +93,35 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
-    ρ = zeros(complex(T), basis.fft_size)
+    recip_lattice = basis.model.recip_lattice
+    fft_size      = basis.fft_size
+    ρ = zeros_like(G_vectors(basis), complex(T), fft_size...)
+
     isempty(gaussians) && return irfft(basis, ρ)
 
+    # This copy is required so that `gaussians` is isbits and can be transferred to the GPU
+    # TODO See if there is a better option here ... this feels non-ideal for larger systems
+    gaussians = SVector{size(gaussians)[1]}(gaussians)
+
     # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx,
     # where f(x) is a weighted gaussian
     #
     # is formed from a superposition of atomic densities, each scaled by a prefactor
-    for (iG, G) in enumerate(G_vectors(basis))
-        # Ensure that we only set G-vectors that have a -G counterpart
-        if isnothing(index_G_vectors(basis, -G))
-            ρ[iG] = zero(complex(T))
-            continue
+    # Ensure that we only set G-vectors that have a -G counterpart to ensure ρ is real.
+    function build_ρ(G)
+        if isnothing(index_G_vectors(fft_size, -G))
+            return zero(complex(T))
         end
-
-        Gsq = sum(abs2, basis.model.recip_lattice * G)
+        Gsq = norm2(recip_lattice * G)
+        res = zero(complex(T))
         for (coeff, decay_length, r) in gaussians
             form_factor::T = exp(-Gsq * T(decay_length)^2)
-            ρ[iG] += T(coeff) * form_factor * cis2pi(-dot(G, r))
+            res += T(coeff) * form_factor * cis2pi(-dot(G, r))
         end
+        res
     end
+    ρ = map(build_ρ, basis.G_vectors)
+
     irfft(basis, ρ / sqrt(basis.model.unit_cell_volume))
 end
 
diff --git a/src/interpolation.jl b/src/interpolation.jl
index da5b08d939..8ed26d3777 100644
--- a/src/interpolation.jl
+++ b/src/interpolation.jl
@@ -94,6 +94,7 @@ function interpolate_kpoint(data_in::AbstractVecOrMat,
     n_bands  = size(data_in, 2)
     n_Gk_out = length(G_vectors(basis_out, kpoint_out))
     data_out = similar(data_in, n_Gk_out, n_bands) .= 0
+    # TODO: use a map, or this will not be GPU compatible (scalar indexing)
     for iin in 1:size(data_in, 1)
         idx_fft = kpoint_in.mapping[iin]
         idx_fft in keys(kpoint_out.mapping_inv) || continue
diff --git a/src/occupation.jl b/src/occupation.jl
index f57488f776..0e2793ed3f 100644
--- a/src/occupation.jl
+++ b/src/occupation.jl
@@ -29,8 +29,10 @@ function compute_occupation(basis::PlaneWaveBasis{T}, eigenvalues, εF;
     inverse_temperature = iszero(temperature) ? T(Inf) : 1/temperature
 
     filled_occ = filled_occupation(basis.model)
-    [filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature)
-     for εk in eigenvalues]
+    map(eigenvalues) do εk
+        occ = filled_occ * Smearing.occupation.(smearing, (εk .- εF) .* inverse_temperature)
+        to_device(basis.architecture, occ)
+    end
 end
 
 function compute_fermi_level(basis::PlaneWaveBasis{T}, eigenvalues; temperature) where {T}
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 73b8c525e4..45596f4f02 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -1,3 +1,5 @@
+using Random  # Used to have a generic API for CPU and GPU computations alike: see random_orbitals
+
 # Returns the occupied orbitals, the occupation array and optionally the eigenvalues without
 # virtual states (or states with small occupation level for metals).
 # threshold is a parameter to distinguish between states we want to keep and the
@@ -52,5 +54,12 @@ end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
-    ortho_qr(randn(Complex{T}, length(G_vectors(basis, kpt)), howmany))
+    @static if VERSION < v"1.7"  # TaskLocalRNG not yet available.
+        orbitals = randn(Complex{T}, length(G_vectors(basis, kpt)), howmany)
+        orbitals = to_device(basis.architecture, orbitals)
+    else
+        orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
+        randn!(TaskLocalRNG(), orbitals)  # use the RNG on the device if we're using a GPU
+    end
+    ortho_qr(orbitals)
 end
diff --git a/src/postprocess/stresses.jl b/src/postprocess/stresses.jl
index e4837202e6..2e00dad7ae 100644
--- a/src/postprocess/stresses.jl
+++ b/src/postprocess/stresses.jl
@@ -19,7 +19,7 @@ Compute the stresses (= 1/Vol dE/d(M*lattice), taken at M=I) of an obtained SCF
                                    basis.Ecut, basis.fft_size, basis.variational,
                                    basis.kcoords_global, basis.kweights_global,
                                    basis.kgrid, basis.kshift, basis.symmetries_respect_rgrid,
-                                   basis.comm_kpts)
+                                   basis.comm_kpts, basis.architecture)
         ρ = compute_density(new_basis, scfres.ψ, scfres.occupation)
         energies = energy_hamiltonian(new_basis, scfres.ψ, scfres.occupation;
                                       ρ, scfres.eigenvalues, scfres.εF).energies
diff --git a/src/scf/chi0models.jl b/src/scf/chi0models.jl
index df77bda5d3..1c478053fe 100644
--- a/src/scf/chi0models.jl
+++ b/src/scf/chi0models.jl
@@ -55,7 +55,7 @@ function (χ0::DielectricModel)(basis; kwargs...)
     C0  = 1 - εr
     iszero(C0) && return nothing  # Will yield no contribution
 
-    Gsq = [sum(abs2, G) for G in G_vectors_cart(basis)]
+    Gsq = norm2.(G_vectors_cart(basis))
     apply_sqrtL = identity
     if χ0.localization != identity
         sqrtL = sqrt.(χ0.localization.(r_vectors(basis)))
diff --git a/src/scf/mixing.jl b/src/scf/mixing.jl
index 1a85360195..82507016f3 100644
--- a/src/scf/mixing.jl
+++ b/src/scf/mixing.jl
@@ -50,7 +50,7 @@ end
 @timing "KerkerMixing" function mix_density(mixing::KerkerMixing, basis::PlaneWaveBasis,
                                             δF; kwargs...)
     T      = eltype(δF)
-    G²     = [sum(abs2, G) for G in G_vectors_cart(basis)]
+    G²     = norm2.(G_vectors_cart(basis))
     kTF    = T.(mixing.kTF)
     ΔDOS_Ω = T.(mixing.ΔDOS_Ω)
 
@@ -72,7 +72,6 @@ end
     δF_fourier     = fft(basis, δF)
     δFtot_fourier  = total_density(δF_fourier)
     δFspin_fourier = spin_density(δF_fourier)
-
     δρtot_fourier = δFtot_fourier .* G² ./ (kTF.^2 .+ G²)
     enforce_real!(basis, δρtot_fourier)
     δρtot = irfft(basis, δρtot_fourier)
@@ -137,7 +136,7 @@ end
     εr > 1 / sqrt(eps(T)) && return mix_density(KerkerMixing(; kTF), basis, δF)
 
     C0 = 1 - εr
-    Gsq = [sum(abs2, G) for G in G_vectors_cart(basis)]
+    Gsq = map(G -> norm2(G), G_vectors_cart(basis))
     δF_fourier = fft(basis, δF)
     δρ = @. δF_fourier * (kTF^2 - C0 * Gsq) / (εr * kTF^2 - C0 * Gsq)
     δρ = irfft(basis, δρ)
diff --git a/src/scf/nbands_algorithm.jl b/src/scf/nbands_algorithm.jl
index 2ebedfad7e..a74c6aff61 100644
--- a/src/scf/nbands_algorithm.jl
+++ b/src/scf/nbands_algorithm.jl
@@ -68,6 +68,8 @@ function determine_n_bands(bands::AdaptiveBands, occupation::AbstractVector,
     # TODO Could return different bands per k-Points
 
     # Determine number of bands to be actually converged
+    # Bring occupation on the CPU, or findlast will fail
+    occupation = [to_cpu(occk) for occk in occupation]
     n_bands_occ = maximum(occupation) do occk
         something(findlast(fnk -> fnk ≥ bands.occupation_threshold, occk), length(occk) + 1)
     end
diff --git a/src/scf/potential_mixing.jl b/src/scf/potential_mixing.jl
index 29eb2b85de..548333dd5b 100644
--- a/src/scf/potential_mixing.jl
+++ b/src/scf/potential_mixing.jl
@@ -61,6 +61,7 @@ function (anderson::AndersonAcceleration)(xₙ, αₙ, Pfxₙ)
 
     xₙ₊₁ = vec(xₙ) .+ αₙ .* vec(Pfxₙ)
     βs   = -(Mfac \ vec(Pfxₙ))
+    βs = to_cpu(βs)  # GPU computation only : get βs back on the CPU so we can iterate through it
     for (iβ, β) in enumerate(βs)
         xₙ₊₁ .+= β .* (xs[iβ] .- vec(xₙ) .+ αₙ .* (Pfxs[iβ] .- vec(Pfxₙ)))
     end
diff --git a/src/supercell.jl b/src/supercell.jl
index 4fb68ef511..cd78d7d606 100644
--- a/src/supercell.jl
+++ b/src/supercell.jl
@@ -43,7 +43,7 @@ function cell_to_supercell(basis::PlaneWaveBasis)
                    ones(3),              # kgrid = Γ point only
                    basis.kshift,         # kshift
                    symmetries_respect_rgrid,
-                   basis.comm_kpts)
+                   basis.comm_kpts, basis.architecture)
 end
 
 @doc raw"""
diff --git a/src/symmetry.jl b/src/symmetry.jl
index 724759b782..51e1228bc2 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -187,7 +187,7 @@ end
 end
 
 # Low-pass filters ρ (in Fourier) so that symmetry operations acting on it stay in the grid
-function lowpass_for_symmetry!(ρ, basis; symmetries=basis.symmetries)
+function lowpass_for_symmetry!(ρ::AbstractArray, basis; symmetries=basis.symmetries)
     for symop in symmetries
         isone(symop) && continue
         for (ig, G) in enumerate(G_vectors_generator(basis.fft_size))
@@ -269,7 +269,7 @@ function unfold_bz(basis::PlaneWaveBasis)
                               basis.Ecut, basis.fft_size, basis.variational,
                               kcoords, [1/length(kcoords) for _ in kcoords],
                               basis.kgrid, basis.kshift,
-                              basis.symmetries_respect_rgrid, basis.comm_kpts)
+                              basis.symmetries_respect_rgrid, basis.comm_kpts, basis.architecture)
     end
 end
 
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index 572465268f..8184f03b69 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -51,7 +51,7 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch
     end
 end
 function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T}
-    (ψ_reals=[zeros(complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], )
+    (ψ_reals=[zeros_like(basis.G_vectors, complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], )
 end
 
 Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ)
@@ -89,9 +89,8 @@ Base.:*(H::Hamiltonian, ψ) = mul!(deepcopy(ψ), H, ψ)
     T = eltype(H.basis)
     n_bands = size(ψ, 2)
     Hψ_fourier = similar(Hψ[:, 1])
-    ψ_real  = zeros(complex(T), H.basis.fft_size...)
-    Hψ_real = zeros(complex(T), H.basis.fft_size...)
-
+    ψ_real  = similar(ψ, complex(T), H.basis.fft_size...)
+    Hψ_real = similar(Hψ, complex(T), H.basis.fft_size...)
     # take ψi, IFFT it to ψ_real, apply each term to Hψ_fourier and Hψ_real, and add it to Hψ
     for iband = 1:n_bands
         Hψ_real .= 0
diff --git a/src/terms/anyonic.jl b/src/terms/anyonic.jl
index 54b07df3be..2c55ea4d58 100644
--- a/src/terms/anyonic.jl
+++ b/src/terms/anyonic.jl
@@ -49,7 +49,7 @@ function make_div_free(basis::PlaneWaveBasis{T}, A) where {T}
         # project on divergence free fields, ie in Fourier
         # project A(G) on the orthogonal of G
         if iG != 1
-            out[1][iG], out[2][iG] = vec - (G'vec) * G / sum(abs2, G)
+            out[1][iG], out[2][iG] = vec - (G'vec) * G / norm2(G)
         else
             out[1][iG], out[2][iG] = vec
         end
@@ -114,7 +114,7 @@ function ene_ops(term::TermAnyonic, basis::PlaneWaveBasis{T}, ψ, occupation;
     ρ_fourier = fft(basis, ρ[:, :, :, 1])
     ρref_fourier = fft(basis, term.ρref)  # TODO optimize
     for (iG, G) in enumerate(G_vectors_cart(basis))
-        G2 = sum(abs2, G)
+        G2 = norm2(G)
         if G2 != 0
             A1[iG] = +2T(π) * G[2] / G2 * (ρ_fourier[iG] - ρref_fourier[iG]) * im
             A2[iG] = -2T(π) * G[1] / G2 * (ρ_fourier[iG] - ρref_fourier[iG]) * im
@@ -142,7 +142,7 @@ function ene_ops(term::TermAnyonic, basis::PlaneWaveBasis{T}, ψ, occupation;
     eff_pot_fourier = zeros(complex(T), basis.fft_size)
     for (iG, Gred) in enumerate(G_vectors(basis))
         G = basis.model.recip_lattice * Gred
-        G2 = sum(abs2, G)
+        G2 = norm2(G)
         if G2 != 0
             eff_pot_fourier[iG] += -4T(π)*β * im * G[2] / G2 * eff_current_fourier[1][iG]
             eff_pot_fourier[iG] += +4T(π)*β * im * G[1] / G2 * eff_current_fourier[2][iG]
diff --git a/src/terms/ewald.jl b/src/terms/ewald.jl
index ddd85bb172..79dc949bba 100644
--- a/src/terms/ewald.jl
+++ b/src/terms/ewald.jl
@@ -91,7 +91,7 @@ function energy_ewald(lattice::AbstractArray{T}, charges, positions;
     for G1 in -Glims[1]:Glims[1], G2 in -Glims[2]:Glims[2], G3 in -Glims[3]:Glims[3]
         G = Vec3(G1, G2, G3)
         iszero(G) && continue
-        Gsq = sum(abs2, recip_lattice * G)
+        Gsq = norm2(recip_lattice * G)
         cos_strucfac = sum(Z * cos2pi(dot(r, G)) for (r, Z) in zip(positions, charges))
         sin_strucfac = sum(Z * sin2pi(dot(r, G)) for (r, Z) in zip(positions, charges))
         sum_strucfac = cos_strucfac^2 + sin_strucfac^2
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 06f99d7e47..81372ce5f1 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -29,9 +29,12 @@ end
 function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
-    poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)]
-    poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+
+    poisson_green_coeffs = 4T(π) ./ norm2.(G_vectors_cart(basis))
+    GPUArraysCore.@allowscalar poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+
     enforce_real!(basis, poisson_green_coeffs)  # Symmetrize Fourier coeffs to have real iFFT
+    poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs)
 
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index 39f456c818..6ceb2a7f0d 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -19,11 +19,21 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor, blowup) where {T}
-    kinetic_energies = [[T(scaling_factor) * sum(abs2, Gk)/2 * blowup(norm(Gk), basis.Ecut)
-                         for Gk in Gplusk_vectors_cart(basis, kpt)] for kpt in basis.kpoints]
+    kinetic_energies = [kinetic_energy(blowup, scaling_factor, basis.Ecut,
+                                       Gplusk_vectors_cart(basis, kpt))
+                        for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
 
+function kinetic_energy(blowup, scaling_factor, Ecut, q::AbstractArray{Vec3{T}}) where {T}
+    map(q) do qk
+        T(scaling_factor) * norm2(qk) / 2 * blowup(norm(qk), Ecut)
+    end
+end
+function kinetic_energy(kin::Kinetic, Ecut, q)
+    kinetic_energy(kin.blowup, kin.scaling_factor, Ecut, q)
+end
+
 @timing "ene_ops: kinetic" function ene_ops(term::TermKinetic, basis::PlaneWaveBasis{T},
                                             ψ, occupation; kwargs...) where {T}
     ops = [FourierMultiplication(basis, kpoint, term.kinetic_energies[ik])
@@ -31,6 +41,7 @@ end
     if isnothing(ψ) || isnothing(occupation)
         return (; E=T(Inf), ops)
     end
+    occupation = [to_cpu(occk) for occk in occupation]
 
     E = zero(T)
     for (ik, ψk) in enumerate(ψ)
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 57603414ad..f7aa4dc3ab 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -74,7 +74,10 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
     model = basis.model
-    G_cart = G_vectors_cart(basis)
+    G_cart = to_cpu(G_vectors_cart(basis))
+    # TODO Bring G_cart on the CPU for compatibility with the pseudopotentials which
+    #      are not isbits ... might be able to solve this by restructuring the loop
+
 
     # Pre-compute the form factors at unique values of |G| to speed up
     # the potential Fourier transform (by a lot). Using a hash map gives O(1)
@@ -90,7 +93,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         end
     end
 
-    pot_fourier = map(enumerate(G_vectors(basis))) do (iG, G)
+    Gs = to_cpu(G_vectors(basis))  # TODO Again for GPU compatibility
+    pot_fourier = map(enumerate(Gs)) do (iG, G)
         q = norm(G_cart[iG])
         pot = sum(enumerate(model.atom_groups)) do (igroup, group)
             structure_factor = sum(r -> cis2pi(-dot(G, r)), @view model.positions[group])
@@ -98,9 +102,10 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
         end
         pot / sqrt(model.unit_cell_volume)
     end
+
     enforce_real!(basis, pot_fourier)  # Symmetrize Fourier coeffs to have real iFFT
+    pot_real = irfft(basis, to_device(basis.architecture, pot_fourier))
 
-    pot_real = irfft(basis, pot_fourier)
     TermAtomicLocal(pot_real)
 end
 
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 4b23e20580..e9451afdf1 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -16,7 +16,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
         D = build_projection_coefficients_(T, psps, psp_positions)
-        NonlocalOperator(basis, kpt, P, D)
+        NonlocalOperator(basis, kpt, P, to_device(basis.architecture, D))
     end
     TermAtomicNonlocal(ops)
 end
@@ -63,8 +63,8 @@ end
         C = build_projection_coefficients_(T, element.psp)
         for (ik, kpt) in enumerate(basis.kpoints)
             # we compute the forces from the irreductible BZ; they are symmetrized later
-            qs_cart = Gplusk_vectors_cart(basis, kpt)
             qs = Gplusk_vectors(basis, kpt)
+            qs_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
             form_factors = build_form_factors(element.psp, qs_cart)
             for idx in group
                 r = model.positions[idx]
@@ -149,6 +149,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     n_proj = count_n_proj(psps, psp_positions)
     n_G    = length(G_vectors(basis, kpt))
     proj_vectors = zeros(Complex{T}, n_G, n_proj)
+    qs = to_cpu(Gplusk_vectors(basis, kpt))
 
     # Compute the columns of proj_vectors = 1/√Ω pihat(k+G)
     # Since the pi are translates of each others, pihat(k+G) decouples as
@@ -157,12 +158,13 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
     offset = 0  # offset into proj_vectors
     for (psp, positions) in zip(psps, psp_positions)
         # Compute position-independent form factors
-        form_factors = build_form_factors(psp, Gplusk_vectors_cart(basis, kpt))
+        qs_cart = to_cpu(Gplusk_vectors_cart(basis, kpt))
+        form_factors = build_form_factors(psp, qs_cart)
 
         # Combine with structure factors
         for r in positions
             # k+G in this formula can also be G, this only changes an unimportant phase factor
-            structure_factors = map(q -> cis2pi(-dot(q, r)), Gplusk_vectors(basis, kpt))
+            structure_factors = map(q -> cis2pi(-dot(q, r)), qs)
             @views for iproj = 1:count_n_proj(psp)
                 proj_vectors[:, offset+iproj] .= (
                     structure_factors .* form_factors[:, iproj] ./ sqrt(unit_cell_volume)
@@ -172,13 +174,15 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
-    proj_vectors
+
+    # Offload potential values to a device (like a GPU)
+    to_device(basis.architecture, proj_vectors)
 end
 
 """
 Build form factors (Fourier transforms of projectors) for an atom centered at 0.
 """
-function build_form_factors(psp, qs)
+function build_form_factors(psp, qs::Array)
     T = real(eltype(first(qs)))
 
     # Pre-compute the radial parts of the non-local projectors at unique |q| to speed up
diff --git a/src/terms/xc.jl b/src/terms/xc.jl
index bccbf9ba2f..2babe2ed9b 100644
--- a/src/terms/xc.jl
+++ b/src/terms/xc.jl
@@ -97,7 +97,7 @@ end
         end
         if haskey(terms, :Vl) && any(x -> abs(x) > term.potential_threshold, terms.Vl)
             @warn "Meta-GGAs with a Δρ term have not yet been thoroughly tested." maxlog=1
-            mG² = [-sum(abs2, G) for G in G_vectors_cart(basis)]
+            mG² = .-norm2.(G_vectors_cart(basis))
             Vl  = reshape(terms.Vl, n_spin, basis.fft_size...)
             Vl_fourier = fft(basis, Vl[s, :, :, :])
             # TODO: forcing real-valued ifft; should be enforced at creation of array
@@ -255,7 +255,7 @@ function LibxcDensities(basis, max_derivative::Integer, ρ, τ)
     # Compute Δρ
     if max_derivative > 1
         Δρ_real = similar(ρ_real, n_spin, basis.fft_size...)
-        mG² = [-sum(abs2, G) for G in G_vectors_cart(basis)]
+        mG² = .-norm2.(G_vectors_cart(basis))
         for σ = 1:n_spin
             # TODO: forcing real-valued ifft; should be enforced at creation of array
             Δρ_real[σ, :, :, :] .= irfft(basis, mG² .* @view ρ_fourier[σ, :, :, :];
diff --git a/src/workarounds/fft_generic.jl b/src/workarounds/fft_generic.jl
index 371ec2b061..a9b8c4ff0d 100644
--- a/src/workarounds/fft_generic.jl
+++ b/src/workarounds/fft_generic.jl
@@ -23,15 +23,13 @@ end
 default_primes(::Any) = (2, )
 
 # Generic fallback function, Float32 and Float64 specialization in fft.jl
-function build_fft_plans(T, fft_size)
-    tmp = Array{Complex{T}}(undef, fft_size...)
-
+function build_fft_plans!(tmp::AbstractArray{<:Complex})
     # Note: FourierTransforms has no support for in-place FFTs at the moment
     # ... also it's extension to multi-dimensional arrays is broken and
     #     the algo only works for some cases
-    @assert all(ispow2, fft_size)
+    @assert all(ispow2, size(tmp))
 
-    # opFFT = FourierTransforms.plan_fft(tmp)   # TODO When multidim works
+    # opFFT = AbstractFFTs.plan_fft(tmp)   # TODO When multidim works
     # opBFFT = inv(opFFT).p
     opFFT  = generic_plan_fft(tmp)               # Fallback for now
     opBFFT = generic_plan_bfft(tmp)
diff --git a/src/workarounds/forwarddiff_rules.jl b/src/workarounds/forwarddiff_rules.jl
index f17cf75ac4..22c50c6114 100644
--- a/src/workarounds/forwarddiff_rules.jl
+++ b/src/workarounds/forwarddiff_rules.jl
@@ -105,16 +105,11 @@ end
 
 next_working_fft_size(::Type{<:ForwardDiff.Dual}, size::Int) = size
 
-_fftw_flags(::Type{<:ForwardDiff.Dual}) = FFTW.MEASURE | FFTW.UNALIGNED
-
-function build_fft_plans(T::Type{<:Union{ForwardDiff.Dual,Complex{<:ForwardDiff.Dual}}}, fft_size)
-    tmp = Array{complex(T)}(undef, fft_size...) # TODO think about other Array types
-    opFFT  = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
-    opBFFT = FFTW.plan_bfft(tmp, flags=_fftw_flags(T))
-
+function build_fft_plans!(tmp::AbstractArray{Complex{T}}) where {T<:ForwardDiff.Dual}
+    opFFT  = AbstractFFTs.plan_fft(tmp)
+    opBFFT = AbstractFFTs.plan_bfft(tmp)
     ipFFT  = DummyInplace{typeof(opFFT)}(opFFT)
     ipBFFT = DummyInplace{typeof(opBFFT)}(opBFFT)
-    # backward by inverting and stripping off normalizations
     ipFFT, opFFT, ipBFFT, opBFFT
 end
 
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
index 5c9fc9a478..55272d34c3 100644
--- a/src/workarounds/gpu_arrays.jl
+++ b/src/workarounds/gpu_arrays.jl
@@ -3,3 +3,12 @@ using GPUArraysCore
 
 # https://github.com/JuliaGPU/CUDA.jl/issues/1565
 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)
+
+function lowpass_for_symmetry!(ρ::AT, basis;
+    symmetries=basis.symmetries) where {AT <: AbstractGPUArray}
+    all(isone, symmetries) && return ρ
+    # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly
+    # thing for cases where ρ sits on a device (e.g. GPU)
+    ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries)
+    to_device(basis.architecture, ρ_CPU)
+end