diff --git a/Project.toml b/Project.toml
index 4da5b59ed3..7e59806582 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,6 +13,7 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DftFunctionals = "6bd331d2-b28d-4fd3-880e-1a1c7f37947f"
 DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
@@ -44,6 +45,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
 UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
+WignerSymbols = "9f57e263-0b3d-5e2e-b1be-24f2bb48858b"
 
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -90,6 +92,7 @@ DftFunctionals = "0.3"
 DifferentiationInterface = "0.6.39, 0.7"
 DocStringExtensions = "0.9"
 DoubleFloats = "1"
+EzXML = "1.2.1"
 FFTW = "1.5"
 FiniteDiff = "2"
 FiniteDifferences = "0.12"
@@ -135,6 +138,7 @@ TimerOutputs = "0.5.29"
 Unitful = "1"
 UnitfulAtomic = "1"
 Wannier = "0.3.2"
+WignerSymbols = "2.0.0"
 WriteVTK = "1"
 julia = "1.10"
 wannier90_jll = "3.1"
diff --git a/src/DFTK.jl b/src/DFTK.jl
index a03a4e7c49..599d5f29dd 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -88,6 +88,7 @@ include("Kpoint.jl")
 include("PlaneWaveBasis.jl")
 include("orbitals.jl")
 include("input_output.jl")
+include("ultrasoft.jl")
 
 export create_supercell
 export cell_to_supercell
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 1688e85ac2..2a0be8145e 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -94,6 +94,8 @@ struct PlaneWaveBasis{T,
 
     ## Instantiated terms (<: Term). See Hamiltonian for high-level usage
     terms::Vector{Any}
+
+    augmentation_regions
 end
 
 
@@ -241,6 +243,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Real, fft_size::Tuple{Int, Int, I
     dvol  = model.unit_cell_volume ./ prod(fft_size)
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
+    augmentation_regions = precompute_augmentation_regions(model, fft_grid)
+
     basis = PlaneWaveBasis{T, value_type(T), Arch, typeof(fft_grid),
                            typeof(kpoints[1].G_vectors)}(
         model, fft_size, dvol,
@@ -250,7 +254,7 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Real, fft_size::Tuple{Int, Int, I
         kcoords_global, kweights_global, n_irreducible_kpoints,
         comm_kpts, krange_thisproc, krange_allprocs, krange_thisproc_allspin,
         architecture, symmetries, symmetries_respect_rgrid,
-        use_symmetries_for_kpoint_reduction, terms)
+        use_symmetries_for_kpoint_reduction, terms, augmentation_regions)
 
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
diff --git a/src/common/spherical_harmonics.jl b/src/common/spherical_harmonics.jl
index ec46b871eb..09b43e3710 100644
--- a/src/common/spherical_harmonics.jl
+++ b/src/common/spherical_harmonics.jl
@@ -45,5 +45,57 @@ function ylm_real(l::Integer, m::Integer, rvec::AbstractVector{T}) where {T}
         (m ==  3) && return sqrt( 35 / 32T(π)) * (x^2 - 3y^2) * x / r^3
     end
 
+    if l == 4
+        (m == -4) && return sqrt( 315 / 16T(π)) * x * y * (x^2 - y^2) / r^4
+        (m == -3) && return sqrt( 315 / 32T(π)) * y * (3x^2 - y^2) * z / r^4
+        (m == -2) && return sqrt(  45 / 16T(π)) * x * y * (7z^2 - r^2) / r^4
+        (m == -1) && return sqrt(  45 / 32T(π)) * y * (7z^3 - 3*z*r^2) / r^4
+        (m ==  0) && return sqrt(   9 /256T(π)) * (35z^4 - 30*z^2*r^2 + 3*r^4) / r^4
+        (m ==  1) && return sqrt(  45 / 32T(π)) * x * (7z^3 - 3*z*r^2) / r^4
+        (m ==  2) && return sqrt(  45 / 64T(π)) * (x^2 - y^2) * (7z^2 - r^2) / r^4
+        (m ==  3) && return sqrt( 315 / 32T(π)) * x * (x^2 - 3y^2) * z / r^4
+        (m ==  4) && return sqrt( 315 /256T(π)) * (x^2 * (x^2 - 3y^2) - y^2 * (3x^2 - y^2)) / r^4
+    end
+
     error("The case l = $l and m = $m is not implemented")
 end
+
+function pack_lm(l::Integer, m::Integer)
+    @assert 0 ≤ l
+    @assert -l ≤ m ≤ l
+    return l^2 + l + m + 1
+end
+
+# TODO: check this
+function unpack_lm(lm::Integer)
+    @assert lm ≥ 1
+    l = floor(Int, sqrt(lm - 1))
+    m = lm - (l^2 + l + 1)
+    return l, m
+end
+
+function gaunt_coefficients(T::Type, l::Integer)
+    nharm = (2l+1)^2
+    # Sample points randomly on a sphere
+    rng = Xoshiro(42)
+    points = rand(rng, T, 3, nharm)
+    points ./= sqrt.(sum(points.^2, dims=1))
+    # Evaluate spherical harmonics at these points
+    Y = zeros(T, nharm, nharm)
+    for j in 1:nharm
+        for i in 1:nharm
+            Y[i, j] = ylm_real(unpack_lm(j)..., points[:, i])
+        end
+    end
+    fact = factorize(Y)
+
+    coefs = zeros(T, nharm, (l+1)^2, (l+1)^2)
+
+    for j in 1:(l+1)^2
+        for i in 1:(l+1)^2
+            coefs[:, i, j] = fact \ (Y[:, i] .* Y[:, j])
+        end
+    end
+
+    coefs
+end
diff --git a/src/densities.jl b/src/densities.jl
index ae26fcfc97..fc2a2703f9 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -42,7 +42,51 @@ using an optional `occupation_threshold`. By default all occupation numbers are
     end
     ρ = sum(getfield.(storages, :ρ))
 
+    # Hacky augmentation charges
+    ρ_augmentation = zero(ρ)
+    nonloc_ops = AtomicNonlocal()(basis).ops
+    aug_coeffs = zeros(Complex{T}, 36, 36)
+    for (ik, n) in range
+        P = nonloc_ops[ik].P
+
+        projections = P' * ψ[ik][:, n]
+        for iatom in 1:length(basis.model.positions)
+            # TODO: only loop over iproj:18?
+            for iproj=1:18, jproj=1:18
+                coeff = projections[(iatom-1) * 18 + iproj]' *
+                        projections[(iatom-1) * 18 + jproj]
+                aug_coeffs[(iatom-1) * 18 + iproj, (iatom-1) * 18 + jproj] +=
+                                (real(coeff) * occupation[ik][n] # TODO: is the real call correct?
+                                * basis.kweights[ik])
+            end
+        end
+    end
+
+    # TODO: hardcoded for the Si psp I have
+    psp = basis.model.atoms[1].psp
+    # proj_indices = [1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6]
+    # proj_to_m = [0, 0, -1, 0, 1, -1, 0, 1, -2, -1, 0, 1, 2, -2, -1, 0, 1, 2]
+    proj_indices = [1, 2, 3, 4, 3, 4, 3, 4, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6]
+    proj_to_m = [0, 0, -1, -1, 0, 0, 1, 1, -2, -2, -1, -1, 0, 0, 1, 1, 2, 2]
+
+    for iatom in 1:length(basis.model.positions)
+        box = basis.augmentation_regions[iatom]
+        @info "Looping over grid for atom $iatom: box size $(length(box.grid_indices))"
+        for (xyz, Q) in zip(box.grid_indices, eachslice(box.augmentations; dims=1))
+            tot = zero(T)
+            # TODO: only loop over iproj:18?
+            for iproj=1:18, jproj=1:18
+                tot += aug_coeffs[(iatom-1) * 18 + iproj, (iatom-1) * 18 + jproj] * Q[iproj, jproj]
+            end
+            # TODO: assumes 1 spin channel
+            ρ_augmentation[xyz..., 1] += tot
+        end
+    end
+
+    ρ += ρ_augmentation
+
     mpi_sum!(ρ, basis.comm_kpts)
+    # TODO: is this fine to do after the charge augmentation?
     ρ = symmetrize_ρ(basis, ρ; do_lowpass=false)
 
     # There can always be small negative densities, e.g. due to numerical fluctuations
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index 116621d55f..86c1a2d21f 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -13,6 +13,28 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     kpoints = ham.basis.kpoints
     results = Vector{Any}(undef, length(kpoints))
 
+    # TODO: temporary plumbing :P
+    B = [I for _ in kpoints]
+    if any(ham.basis.model.atoms) do el el isa ElementPsp && el.psp isa PspUpf && el.psp.type == "US" end
+        model = ham.basis.model
+
+        # keep only pseudopotential atoms and positions
+        psp_groups = [group for group in model.atom_groups
+                        if model.atoms[first(group)] isa ElementPsp]
+        psps          = [model.atoms[first(group)].psp for group in psp_groups]
+        psp_positions = [model.positions[group] for group in psp_groups]
+
+        # We have an overlap term
+        B = map(ham.blocks) do block
+            op = block.nonlocal_op
+            Q = build_projection_coefficients(eltype(ham.basis), psps, psp_positions, "Q")
+            LinearMap(size(op.P, 1)) do Sψ, ψ
+                mul!(Sψ, op.P, (Q * (op.P' * ψ)), 1, 0)
+                Sψ .+= ψ # + identity
+            end
+        end
+    end
+
     for (ik, kpt) in enumerate(kpoints)
         n_Gk = length(G_vectors(ham.basis, kpt))
         if n_Gk < nev_per_kpoint
@@ -48,7 +70,7 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
         prec = nothing
         !isnothing(prec_type) && (prec = prec_type(ham[ik]))
         results[ik] = eigensolver(ham[ik], ψguessk;
-                                  prec, tol, miniter, maxiter, n_conv_check)
+                                  B=B[ik], prec, tol, miniter, maxiter, n_conv_check)
     end
 
     # Transform results into a nicer datastructure
diff --git a/src/eigen/diag_lobpcg_hyper.jl b/src/eigen/diag_lobpcg_hyper.jl
index 27795bb3c0..417c360392 100644
--- a/src/eigen/diag_lobpcg_hyper.jl
+++ b/src/eigen/diag_lobpcg_hyper.jl
@@ -4,11 +4,11 @@ include("lobpcg_hyper_impl.jl")
 # but X and the history on the device (for GPU runs)
 function lobpcg_hyper(A, X0; maxiter=100, prec=nothing,
                       tol=20size(A, 2)*eps(real(eltype(A))),
-                      largest=false, n_conv_check=nothing, kwargs...)
+                      largest=false, n_conv_check=nothing, B=I, kwargs...)
     prec === nothing && (prec = I)
 
     @assert !largest "Only seeking the smallest eigenpairs is implemented."
-    result = LOBPCG(A, X0, I, prec, tol, maxiter; n_conv_check, kwargs...)
+    result = LOBPCG(A, X0, B, prec, tol, maxiter; n_conv_check, kwargs...)
 
     n_conv_check === nothing && (n_conv_check = size(X0, 2))
     converged = maximum(result.residual_norms[1:n_conv_check]) < tol
diff --git a/src/pseudo/PspUpf.jl b/src/pseudo/PspUpf.jl
index 7cf9c2410c..2b8c37d081 100644
--- a/src/pseudo/PspUpf.jl
+++ b/src/pseudo/PspUpf.jl
@@ -1,9 +1,13 @@
 using LinearAlgebra
 using Interpolations: linear_interpolation
-using PseudoPotentialIO: load_upf
+using PseudoPotentialIO: load_upf, get_attr
 
+# TODO: I don't like the storage by AM channel, it doesn't match the structure of the file
 struct PspUpf{T,I} <: NormConservingPsp
+    # TODO remove, for now convenient just in case
+    upf
     ## From file
+    type::String       # Either NC or US. UPF: `pseudo_type`
     Zion::Int          # Pseudo-atomic (valence) charge. UPF: `z_valence`
     lmax::Int          # Maximal angular momentum in the non-local part. UPF: `l_max`
     rgrid::Vector{T}   # Radial grid, can be linear or logarithmic. UPF: `PP_MESH/PP_R`
@@ -15,6 +19,14 @@ struct PspUpf{T,I} <: NormConservingPsp
     # Kleinman-Bylander energies. Stored per AM channel `h[l+1][i,j]`.
     # UPF: `PP_DIJ`
     h::Vector{Matrix{T}}
+    # Ultrasoft overlap energies. Stored per AM channel `h[l+1][i,j]`.
+    # TODO: should probably be zeros for NC pseudos to easily support mixing NC and US
+    # UPF: `PP_Q`
+    Q::Vector{Matrix{T}}
+    # Ultrasoft charge augmentation function interpolations.
+    # TODO: storage format is TBD
+    # UPF: `PP_QIJL`
+    Qijl
     # (UNUSED) Pseudo-wavefunctions on the radial grid. Can be used for wavefunction
     # initialization and as projectors for PDOS and DFT+U(+V).
     # r^2 * χ where χ are pseudo-atomic wavefunctions on the radial grid.
@@ -66,20 +78,21 @@ Does not support:
 - Fully-realtivistic / spin-orbit pseudos
 - Bare Coulomb / all-electron potentials
 - Semilocal potentials
-- Ultrasoft potentials
 - Projector-augmented wave potentials
 - GIPAW reconstruction data
 """
 function PspUpf(path; identifier=path, rcut=nothing)
     pseudo = load_upf(path)
 
+    type = pseudo["header"]["pseudo_type"]
+    if type == "USPP"
+        type = "US" # I think both exist?
+    end
+
     unsupported = []
-    pseudo["header"]["has_so"]               && push!(unsupported, "spin-orbit coupling")
-    pseudo["header"]["pseudo_type"] == "SL"  && push!(unsupported, "semilocal potential")
-    pseudo["header"]["pseudo_type"] == "US"  && push!(unsupported, "ultrasoft")
-    pseudo["header"]["pseudo_type"] == "PAW" && push!(unsupported, "projector-augmented wave")
-    pseudo["header"]["has_gipaw"]            && push!(unsupported, "gipaw data")
-    pseudo["header"]["pseudo_type"] == "1/r" && push!(unsupported, "Coulomb")
+    pseudo["header"]["has_so"]    && push!(unsupported, "spin-orbit coupling")
+    type != "NC" && type != "US"  && push!(unsupported, "$type potential type")
+    pseudo["header"]["has_gipaw"] && push!(unsupported, "gipaw data")
     length(unsupported) > 0 && error("Pseudopotential contains the following unsupported" *
                                      " features/quantities: $(join(unsupported, ","))")
 
@@ -118,6 +131,30 @@ function PspUpf(path; identifier=path, rcut=nothing)
         push!(h, Dij_l)
         count += nproj_l
     end
+    Q = Matrix[]
+    Qijl = missing
+    if type == "US"
+        augmentation = read_augmentation(path, pseudo)
+        Qijl = map(augmentation.Qijl) do Qjl
+            map(Qjl) do Ql
+                map(Ql) do Qfun
+                    isnothing(Qfun) && return nothing
+                    # Q is actually given as r²Q(r) in the UPF file
+                    # Also convert from 1/Ry to 1/Ha (TODO: not really these units but whatever?)
+                    linear_interpolation((rgrid,), Qfun ./ rgrid.^2 .* 2 .* 2) # TODO: why *2 again?
+                end
+            end
+        end
+        isnothing(augmentation) && error("Ultrasoft pseudopotential $identifier does not contain augmentation data")
+        count = 1
+        for l = 0:lmax
+            nproj_l = length(r2_projs[l+1])
+            # 1/Ry -> 1/Ha
+            Qij_l = augmentation.Q[count:count+nproj_l-1, count:count+nproj_l-1] .* 2 .* 2 # TODO: why *2 again?
+            push!(Q, Qij_l)
+            count += nproj_l
+        end
+    end
 
     r2_pswfcs = [Vector{Float64}[] for _ = 0:lmax]
     pswfc_occs = [Float64[] for _ = 0:lmax]
@@ -145,14 +182,91 @@ function PspUpf(path; identifier=path, rcut=nothing)
     r2_ρcore_interp = linear_interpolation((rgrid,), r2_ρcore)
 
     PspUpf{eltype(rgrid),typeof(vloc_interp)}(
-        Zion, lmax, rgrid, drgrid,
-        vloc, r2_projs, h, r2_pswfcs, pswfc_occs, pswfc_energies, pswfc_labels,
+        pseudo,
+        type, Zion, lmax, rgrid, drgrid,
+        vloc, r2_projs, h, Q, Qijl, r2_pswfcs, pswfc_occs, pswfc_energies, pswfc_labels,
         r2_ρion, r2_ρcore,
         vloc_interp, r2_projs_interp, r2_ρion_interp, r2_ρcore_interp,
         rcut, ircut, identifier, description
     )
 end
 
+import EzXML
+function read_augmentation(path, upf)
+    # TODO: assumes UPF v2
+    text = read(path, String)
+    # Remove end-of-file junk (input data, etc.)
+    text = string(split(text, "</UPF>")[1], "</UPF>")
+    # Clean any errant `&` characters
+    text = replace(text, "&" => "")
+    doc_root = EzXML.root(EzXML.parsexml(text))
+
+    augmentation = EzXML.findfirst("PP_NONLOCAL/PP_AUGMENTATION", doc_root)
+    ismissing(augmentation) && return nothing
+    get_attr(Bool, augmentation, "q_with_l") || error("q_with_l not set in PP_AUGMENTATION")
+
+    # Read Q
+    qnode = EzXML.findfirst("PP_Q", augmentation)
+    ismissing(qnode) && error("PP_Q not found in PP_AUGMENTATION")
+    Q = parse.(Float64, split(strip(qnode.content)))
+    Q = reshape(Q, upf["header"]["number_of_proj"], upf["header"]["number_of_proj"])
+
+    # Read QIJL
+    nproj = upf["header"]["number_of_proj"]
+    Qijl = [[[] for _=1:nproj] for _=1:nproj]
+    for i=1:nproj
+        il = upf["beta_projectors"][i]["angular_momentum"]
+        for j=i:nproj
+            jl = upf["beta_projectors"][j]["angular_momentum"]
+            # TODO: I don't understand why we can go in steps of 2
+            for L=abs(il-jl):2:il+jl
+                qijlnode = EzXML.findfirst("PP_QIJL.$i.$j.$L", augmentation)
+                if isnothing(qijlnode)
+                    push!(Qijl[i][j], nothing)
+                else
+                    # TODO: truncate based on cutoff_radius_index
+                    data = parse.(Float64, split(strip(qijlnode.content)))
+                    push!(Qijl[i][j], data)
+                end
+            end
+        end
+    end
+
+    # Same units as D_ion I suppose
+    (; Q, Qijl)
+end
+
+import WignerSymbols: clebschgordan
+function eval_augmentation(psp::PspUpf{T}, gaunt_coefficients, n, n_mag, m, m_mag, r) where {T<:Real}
+    if n > m
+        n, n_mag, m, m_mag = m, m_mag, n, n_mag  # Ensure n <= m
+    end
+
+    out = zero(T)
+    nl = psp.upf["beta_projectors"][n]["angular_momentum"]
+    @assert abs(n_mag) <= nl "n_mag must be in [-l, l] for l=$nl, n_mag=$n_mag"
+    ml = psp.upf["beta_projectors"][m]["angular_momentum"]
+    @assert abs(m_mag) <= ml "m_mag must be in [-l, l] for l=$ml, m_mag=$m_mag"
+    for (iL, L) in enumerate(abs(nl-ml):2:nl+ml)
+        isnothing(psp.Qijl[n][m][iL]) && continue
+
+        for M in -L:L
+            # TODO remove
+            # L == 0 && M == 0 || continue
+            # TODO it's not clear if we need to normalize the result by √4π?
+            # TODO in the easy case where n=n_mag=m=m_mag=0, it seems we need to, to ensure that ∫Q(r)r² gives the right Q coef
+            # @show nl n_mag ml m_mag L M
+            coef = gaunt_coefficients[pack_lm(L, M), pack_lm(nl, n_mag), pack_lm(ml, m_mag)]
+            if abs(coef) > 1e-8 # TODO: arbitrary threshold?
+                out += (coef
+                        * ylm_real(L, M, r)
+                        * psp.Qijl[n][m][iL](max(1e-4, norm(r)))) # TODO: need to include 0 in the interp grid?
+            end
+        end
+    end
+    out
+end
+
 charge_ionic(psp::PspUpf) = psp.Zion
 has_valence_density(psp::PspUpf) = !all(iszero, psp.r2_ρion)
 has_core_density(psp::PspUpf) = !all(iszero, psp.r2_ρcore)
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index df120b566f..a83acc9212 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -48,6 +48,37 @@ function HamiltonianBlock(basis, kpoint, operators; scratch=nothing)
         scratch = @something scratch _ham_allocate_scratch(basis)
         nonlocal_op = isempty(nonlocal_ops) ? nothing : only(nonlocal_ops)
         divAgrad_op = isempty(divAgrad_ops) ? nothing : only(divAgrad_ops)
+
+        # TODO: hacky update of nonlocal D for ultrasoft
+        # TODO: doing this inside of each term is very wasteful,
+        #       the real potentials are the same across kpoints... (except for spin)
+        if !isnothing(nonlocal_op)
+            real_op = only(real_ops)
+            Dextra = zeros(36, 36)
+
+            for iatom in 1:length(basis.model.positions)
+                box = basis.augmentation_regions[iatom]
+                for (xyz, Q) in zip(box.grid_indices, eachslice(box.augmentations; dims=1))
+                    # TODO: only loop over iproj:18?
+                    for iproj=1:18, jproj=iproj:18
+                        # TODO: assumes 1 spin channel?
+                        Dextra[(iatom-1) * 18 + iproj, (iatom-1) * 18 + jproj] +=
+                            (real_op.potential[xyz...] * Q[iproj, jproj])
+                    end
+                end
+                for iproj=1:18, jproj=iproj+1:18
+                    Dextra[(iatom-1) * 18 + jproj, (iatom-1) * 18 + iproj] =
+                        Dextra[(iatom-1) * 18 + iproj, (iatom-1) * 18 + jproj]
+                end
+            end
+
+            nonlocal_op = NonlocalOperator(
+                basis,
+                kpoint,
+                nonlocal_op.P,
+                nonlocal_op.D + Dextra * basis.dvol)# * 2) # TODO: I don't understand why the 2 here
+        end
+
         DftHamiltonianBlock(basis, kpoint, operators,
                             only(fourier_ops), only(real_ops),
                             nonlocal_op, divAgrad_op, scratch)
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 55f57cd0ba..af38938a20 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -108,7 +108,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients(T, psps, psp_positions)
+function build_projection_coefficients(T, psps, psp_positions, matrix="h")
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     #      One could improve this by storing the blocks as a list or in a
     #      BlockDiagonal data structure
@@ -119,7 +119,7 @@ function build_projection_coefficients(T, psps, psp_positions)
     for (psp, positions) in zip(psps, psp_positions), _ in positions
         n_proj_psp = count_n_proj(psp)
         block = count+1:count+n_proj_psp
-        proj_coeffs[block, block] = build_projection_coefficients(T, psp)
+        proj_coeffs[block, block] = build_projection_coefficients(T, psp, matrix)
         count += n_proj_psp
     end
     @assert count == n_proj
@@ -131,14 +131,14 @@ end
 # The ordering of the projector indices is (l,m,i), where l, m are the
 # AM quantum numbers and i is running over all projectors for a given l.
 # The matrix is block-diagonal with non-zeros only if l and m agree.
-function build_projection_coefficients(T::Type, psp::NormConservingPsp)
+function build_projection_coefficients(T::Type, psp::NormConservingPsp, matrix="h")
     n_proj = count_n_proj(psp)
     proj_coeffs = zeros(T, n_proj, n_proj)
     count = 0
     for l = 0:psp.lmax, _ = -l:l
         n_proj_l = count_n_proj_radial(psp, l)  # Number of i's
         range = count .+ (1:n_proj_l)
-        proj_coeffs[range, range] = psp.h[l + 1]
+        proj_coeffs[range, range] = matrix == "h" ? psp.h[l + 1] : psp.Q[l + 1]
         count += n_proj_l
     end
     proj_coeffs
diff --git a/src/ultrasoft.jl b/src/ultrasoft.jl
new file mode 100644
index 0000000000..5075015e9b
--- /dev/null
+++ b/src/ultrasoft.jl
@@ -0,0 +1,59 @@
+function atom_local_grid(lattice::AbstractMatrix{T}, fft_grid::FFTGrid, position, radius) where {T}
+    # Find bounds on the points we have to check along each dimension
+    # Bounds from https://math.stackexchange.com/a/1230292
+    AtAinv = inv(lattice'lattice)
+    Atinv = inv(lattice')
+    xbound = radius * AtAinv[1, 1] / norm(Atinv * [1, 0, 0])
+    ybound = radius * AtAinv[2, 2] / norm(Atinv * [0, 1, 0])
+    zbound = radius * AtAinv[3, 3] / norm(Atinv * [0, 0, 1])
+
+    # +1 for inexact computation of the bound, +1 for the rounding of the atom position 
+    xmax = ceil(Int, xbound * fft_grid.fft_size[1]) + 2
+    ymax = ceil(Int, ybound * fft_grid.fft_size[2]) + 2
+    zmax = ceil(Int, zbound * fft_grid.fft_size[3]) + 2
+
+    grid_indices = Vec3{Int}[]
+    atom_distances = Vec3{T}[]
+
+    x0, y0, z0 = round.(position .* fft_grid.fft_size)
+    for iz=z0-zmax:z0+zmax, iy=y0-ymax:y0+ymax, ix=x0-xmax:x0+xmax
+        atom_distance = lattice * ([ix, iy, iz] ./ fft_grid.fft_size - position)
+
+        if norm(atom_distance) <= radius
+            # mod in case we wrap around grid boundaries
+            index = mod.([ix, iy, iz], fft_grid.fft_size) .+ 1
+            push!(grid_indices, index)
+            push!(atom_distances, atom_distance)
+        end
+    end
+
+    (; grid_indices, atom_distances)
+end
+
+function precompute_augmentation_regions(model::Model{T}, fft_grid::FFTGrid) where {T}
+    # TODO: hardcoded for the Si psp I have
+    psp = model.atoms[1].psp
+    # proj_indices = [1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6]
+    # proj_to_m = [0, 0, -1, 0, 1, -1, 0, 1, -2, -1, 0, 1, 2, -2, -1, 0, 1, 2]
+    proj_indices = [1, 2, 3, 4, 3, 4, 3, 4, 5, 6, 5, 6, 5, 6, 5, 6, 5, 6]
+    proj_to_m = [0, 0, -1, -1, 0, 0, 1, 1, -2, -2, -1, -1, 0, 0, 1, 1, 2, 2]
+
+    gaunt_coefs = gaunt_coefficients(T, psp.lmax)
+
+    map(enumerate(model.positions)) do (iatom, pos)
+        @info "Computing grid for atom $iatom"
+        # TODO: hardcoded ultrasoft_cutoff_radius (is this what this means?)
+        (; grid_indices, atom_distances) = atom_local_grid(model.lattice, fft_grid, pos, 1.8)
+        augmentations = zeros(T, length(grid_indices), 18, 18)
+        @info "Looping augmentation region for atom $iatom: box size $(length(grid_indices))"
+        for (r, Q) in zip(atom_distances, eachslice(augmentations; dims=1))
+            for iproj=1:18, jproj=iproj:18
+                Q[iproj, jproj] = eval_augmentation(psp, gaunt_coefs, proj_indices[iproj], proj_to_m[iproj], proj_indices[jproj], proj_to_m[jproj], r)
+                if iproj != jproj # By symmetry
+                    Q[jproj, iproj] = Q[iproj, jproj]
+                end
+            end
+        end
+        (; grid_indices, atom_distances, augmentations)
+    end
+end
\ No newline at end of file